baidu_web 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +6 -0
- data/README.md +36 -0
- data/Rakefile +1 -0
- data/baidu_web.gemspec +61 -0
- data/lib/baidu_web/extension_key.rb +9 -0
- data/lib/baidu_web/record.rb +24 -0
- data/lib/baidu_web/string_extension.rb +8 -0
- data/lib/baidu_web/strip.rb +5 -0
- data/lib/baidu_web/version.rb +3 -0
- data/lib/baidu_web.rb +120 -0
- data/lib/test.rb +60 -0
- metadata +68 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Baidu_web
|
2
|
+
|
3
|
+
Baidu_web是一个基于百度的元搜索引擎:输入关键字,返回百度搜索结果和相关的关键词。
|
4
|
+
在线Demo: http://www.inruby.com/search
|
5
|
+
|
6
|
+
## 用法
|
7
|
+
|
8
|
+
gem 'baidu_web'
|
9
|
+
|
10
|
+
## 调用
|
11
|
+
|
12
|
+
# http://www.baidu.com/s?wd=inruby&pn=50&rn=50
|
13
|
+
BaiduWeb.search("inruby", :per_page => 50, :page_index => 2)
|
14
|
+
|
15
|
+
## 返回结果
|
16
|
+
|
17
|
+
result = BaiduWeb.search('key words')
|
18
|
+
result[:record_arr].each do |record|
|
19
|
+
puts record.title
|
20
|
+
puts record.url
|
21
|
+
puts record.summary
|
22
|
+
puts record.updated_date
|
23
|
+
puts record.item_index
|
24
|
+
puts record.cached_url
|
25
|
+
end
|
26
|
+
result[:ext_key_arr].each do |ext_key|
|
27
|
+
puts ext_key.title
|
28
|
+
puts ext_key.url
|
29
|
+
end
|
30
|
+
|
31
|
+
## 在irb中测试:
|
32
|
+
|
33
|
+
$:.unshift(File.dirname(__FILE__))
|
34
|
+
require 'baidu_web'
|
35
|
+
require 'cgi'
|
36
|
+
result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/baidu_web.gemspec
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "baidu_web/version"
|
4
|
+
|
5
|
+
# Gem specification for baidu_web. The version is read from
# BaiduWeb::VERSION and the packaged file lists are taken from git.
Gem::Specification.new do |s|
  s.name        = "baidu_web"
  s.version     = BaiduWeb::VERSION
  s.authors     = ["krongk"]
  s.email       = ["kenrome@gmail.com"]
  s.homepage    = "https://github.com/krongk/baidu_web"
  s.summary     = %q{baidu_web is a meta search engine for 百度(www.baidu.com)}
  s.description = %q{
# Baidu_web

Baidu_web是一个基于百度的元搜索引擎,输入关键字,返还百度搜索结果和相关的关键词;
在线Demo: http://www.inruby.com/search

## 用法

gem 'baidu_web'

## 调用

# http://www.baidu.com/s?wd=inruby&pn=100&rn=50
BaiduWeb.search("inruby", :per_page => 50, :page_index => 2)

## 返还结果

result = BaiduWeb.search('key words')
result[:record_arr].each do |record|
puts record.title
puts record.url
puts record.summary
puts record.updated_date
puts record.item_index
puts record.cached_url
end
result[:ext_key_arr].each do |ext_key|
puts ext_key.title
puts ext_key.url
end

## 在irb中测试:

$:.unshift(File.dirname(__FILE__))
require 'baidu_web'
require 'cgi'
result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
}

  s.rubyforge_project = "baidu_web"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # lib/baidu_web.rb requires hpricot and instantiates Mechanize, so both
  # gems have to be installed together with this one.
  s.add_runtime_dependency "hpricot"
  s.add_runtime_dependency "mechanize"
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module BaiduWeb
  # One item of a search result.
  #
  # Plain value holder: every attribute is a read/write accessor and stays
  # nil until something assigns it.
  #
  # title::        Page title of the matched web page or document.
  # url::          URL of the matched web document.
  # summary::      Brief summary of the matched web document from Baidu.com.
  # updated_date:: Date string (YYYY-M-D) found in the result entry, when
  #                one is shown.
  # item_index::   Position of the record in the result list, starting at 1.
  #                Numbering continues across pages (page 2 at 50 per page
  #                starts at 51).
  # size::         Size info for the matched document. Note that it is not
  #                a number; it is a label such as '32K'.
  # cached_url::   URL of the cached version of the matched document on
  #                Baidu.com. Documents of types like DOC, PPT and XLS have
  #                no cached version, so this stays nil for them.
  class Record
    attr_accessor :title, :url, :summary, :updated_date, :item_index, :size, :cached_url
  end
end
|
data/lib/baidu_web.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.unshift(File.dirname(__FILE__))
|
3
|
+
|
4
|
+
require 'hpricot'
require 'mechanize'
require 'open-uri'
#require 'iconv'
#require 'cgi'
require "baidu_web/version"
require "baidu_web/record"
require "baidu_web/extension_key"
require "baidu_web/string_extension"
require "baidu_web/strip"
|
13
|
+
|
14
|
+
module BaiduWeb
|
15
|
+
class << self
|
16
|
+
# Runs a Baidu web search and returns the parsed result page.
#
# key_word - query text, placed verbatim into the +wd+ URL parameter; it is
#            not escaped here, so callers escape it themselves (the README
#            uses CGI.escape).
# options  - optional Hash:
#            :per_page   - records per page, sent as +rn+ (default 50)
#            :page_index - 1-based page number (default 1)
#
# Returns a Hash with the keys :record_arr (result of extract_item),
# :ext_key_arr (result of extract_extension_key) and :source (always 'web').
# Both arrays are empty when the keyword is nil or blank, or when the
# fetched document is blank.
def search(key_word, options = {})
  result = {:record_arr => [], :ext_key_arr => [], :source => 'web'}

  # Stored in an instance variable because extract_extension_key reads it.
  @key_word = key_word
  return result if @key_word.nil? || @key_word.blank?

  options ||= {}
  @per_page = options[:per_page] || 50
  @page_index = options[:page_index] || 1

  # Baidu's +pn+ parameter is the 0-based offset of the first record.
  item_index = (@page_index - 1) * @per_page

  url = "http://www.baidu.com/s?wd=#{@key_word}&rn=#{@per_page}&pn=#{item_index}"
  spage = Mechanize.new.get(url)

  # The body is parsed as received; the Iconv (GBK -> UTF-8) conversion that
  # used to sit here was already commented out.
  doc = Hpricot(spage.body)
  return result if doc.blank?

  result[:record_arr] = extract_item(doc, item_index)
  result[:ext_key_arr] = extract_extension_key(doc)
  result
end
|
68
|
+
|
69
|
+
private
|
70
|
+
# Builds Record objects from the result tables of a parsed result page.
#
# content    - parsed document. It is modified in place: the "result-op"
#              tables are removed before the results are read.
# item_index - offset of the first record on this page; numbering continues
#              from it, so the first record gets item_index + 1.
#
# Returns an Array of Record in page order. Result tables without an <h3>
# heading, or whose heading has no <a> link, are skipped.
def extract_item(content, item_index)
  # Remove "op" records, e.g. search by 'mysql' and see the second record.
  content.search("table[@class='result-op']").remove

  record_arr = []
  content.search("table[@class='result']").each do |res|
    heading = res.at("h3")
    next if heading.nil?

    # A heading without a link cannot yield a URL; skip it instead of
    # failing on nil.
    link = heading.at("a")
    next if link.nil?

    record = Record.new
    record.title = heading.inner_text
    record.url = link.attributes['href'].to_s

    # A result table may lack the td.f cell, or the cell may have no
    # children; both cases simply produce an empty summary.
    cell = res.at("td[@class='f']")
    children = cell.nil? ? [] : (cell.children || [])

    summary = []
    children.each do |elem|
      if elem.respond_to?(:attributes)
        href = elem.attributes['href']
        css_class = elem.attributes['class']

        # Link to Baidu's cached copy: keep the URL, leave it out of the summary.
        if href.to_s =~ %r{http://cache\.baidu\.com}
          record.cached_url = href
          next
        end

        # Class "g" element carrying a date: keep the date, leave the
        # element out of the summary. Without a date it falls through and
        # its text becomes part of the summary.
        if css_class == 'g' && (date = elem.to_s[/\d{4}-\d{1,2}-\d{1,2}/])
          record.updated_date = date
          next
        end

        # Class "t" is the heading (<h3 class="t">), already stored as title.
        next if css_class == 't'
      end
      summary << elem.inner_text
    end
    record.summary = summary.join(' ').gsub(/百度|百度快照|快照/, '')

    item_index += 1
    record.item_index = item_index
    record_arr << record
  end
  record_arr
end
|
104
|
+
|
105
|
+
# Collects the suggestion links found inside div#rs of a parsed result page.
#
# doc - parsed document.
#
# Returns an Array of ExtensionKey, empty when the page has no div#rs.
# Each key gets the link text as title, the current search keyword
# (@key_word) as parent_key and 'web' as source. The link's href is stored
# in url when ExtensionKey exposes a url writer.
def extract_extension_key(doc)
  rs = doc.at("//div#rs")
  return [] if rs.nil?

  rs.get_elements_by_tag_name("a").map do |link|
    ext_key = ExtensionKey.new
    ext_key.title = link.inner_text
    # The README shows ext_key.url being read, but it was never assigned.
    # The href is stored exactly as found in the page.
    # NOTE(review): it may be a relative URL -- confirm against a live page.
    ext_key.url = link.attributes['href'].to_s if ext_key.respond_to?(:url=)
    ext_key.parent_key = @key_word
    ext_key.source = 'web'
    ext_key
  end
end
|
118
|
+
|
119
|
+
end
|
120
|
+
end
|
data/lib/test.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding = utf-8
|
2
|
+
$:.unshift(File.dirname(__FILE__))
|
3
|
+
|
4
|
+
require 'baidu_web'
|
5
|
+
require 'cgi'
|
6
|
+
# Manual smoke test: runs one live search and lists the hits, so the script
# shows something useful without relying on output from inside the library.
result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
result[:record_arr].each do |record|
  puts "#{record.item_index}. #{record.title} - #{record.url}"
end
|
7
|
+
|
8
|
+
# class Test
|
9
|
+
# def self.search(key, ha)
|
10
|
+
# puts key
|
11
|
+
# puts ha
|
12
|
+
# puts ha.class
|
13
|
+
# puts ha.size
|
14
|
+
# puts ha[:key].to_s
|
15
|
+
# puts ha[:b].to_s
|
16
|
+
# puts ha[:c].to_s
|
17
|
+
|
18
|
+
# end
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Test.search('a', :a=>'a', :key => '444', :b=>44)
|
22
|
+
|
23
|
+
|
24
|
+
# 0.times do |i|
|
25
|
+
# puts i
|
26
|
+
# end
|
27
|
+
|
28
|
+
# http://www.baidu.com/s?wd=%C3%C0%C5%AE&rsv_bp=0&rsv_spt=3&inputT=930
|
29
|
+
# http://www.baidu.com/s?wd=%B3%C9%B6%BC&pn=0&rn=5&usm=4
|
30
|
+
# http://www.baidu.com/s?wd=%C3%C0%C5%AE&pn=50&rn=50&usm=4
|
31
|
+
# http://www.baidu.com/s?wd=%C3%C0%C5%AE&pn=100&rn=50&usm=4
|
32
|
+
|
33
|
+
# rn -> per page record
|
34
|
+
# pn -> start record
|
35
|
+
|
36
|
+
|
37
|
+
# html = Hpricot(doc);nil
|
38
|
+
|
39
|
+
|
40
|
+
# ic = Iconv.new("utf-8//IGNORE", "GB2312//IGNORE")
|
41
|
+
# doc = ""
|
42
|
+
# open("http://www.baidu.com/s?rn=50&bs=%C3%C0%C5%AE&f=8&rsv_bp=1&wd=mysql&inputT=2955", "r:gb2312:utf-8") {|f|
|
43
|
+
# f.each_line do |line|
|
44
|
+
# doc += ic.iconv(line)
|
45
|
+
# end
|
46
|
+
# };nil
|
47
|
+
|
48
|
+
# doc = Hpricot(doc);nil
|
49
|
+
|
50
|
+
# results = doc.at("div[@id='container']");nil
|
51
|
+
|
52
|
+
# results.search("table[@class='result']");nil
|
53
|
+
|
54
|
+
# doc.search("table[@class='result-op']").remove
|
55
|
+
|
56
|
+
|
57
|
+
# <table class="result-op"
|
58
|
+
# cellpadding="0" cellspacing="0" srcid="6669" id="2"
|
59
|
+
# mu="http://soft.baidu.com/softwaresearch/s?tn=software&rn=10&wd=mysql"
|
60
|
+
# data-op="{'y':'BF0FFF7F'}"><style>.op_mini_table01_content table{margin-top:4px;}.op_mini_table01_content th{text-align:left;white-space:nowrap;background:url("http://www.baidu.com/aladdin/img/table/bg.gif") repeat-x 0 -37px;font-weight:normal;height:26px;line-height:26px;font-size:13px;padding:0 10px 0 8px;}.op_mini_table01_content td{white-space:nowrap;font-size:14px;border-bottom:#eee 1px solid;}.OP_TABLE_COMMON{ width:100%;}.OP_TABLE_COMMON td{ padding:7px 10px 7px 8px; font-size:14px;}.OP_TABLE_COMMON a,.OP_TABLE_COMMON a em{ text-decoration:none;}.OP_TABLE_COMMON a:hover,.OP_TABLE_COMMON a:hover em{ text-decoration:underline;}</style><script>function jI(D){var C=D;var B=0;while(C=C.parentNode){B=parseInt(C.getAttribute("id"));if(B>0){break}}var A=C.getElementsByTagName("a");for(var B=0;B<A.length;B++){if(D==A[B]){return B}}return A.length-1}function _aMC(C){var B=C,A=-1;while(B=B.parentNode){A=parseInt(B.getAttribute("id"));if(A>0){return A}}};</script><tbody><tr><td class="f"><h3 class="t"><a onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this)})" href="http://soft.baidu.com/softwaresearch/s?tn=software&rn=10&wd=mysql" target="_blank"><font size="3"><em>mysql</em>_相关下载信息363条_百度软件搜索</font></a><span class="tsuf tsuf-op" data="{title : 'mysql_相关下载信息363条_百度软件搜索', link : 'http:\/\/soft.baidu.com\/softwaresearch\/s?tn=software&rn=10&wd=mysql'}"></span></h3><div class="op_mini_table01_content op_software"> <table cellspacing="0" class="OP_TABLE_COMMON"><tbody><tr><th style="border-left:0;">软件名称</th><th>软件大小</th><th>来源</th></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.skycn.com/soft/30406.html">Apache+Php+<em>Mysql</em> V1.3 绿色自动安装版</a></td><td width="120px;"> 13.44 M </td><td> 天空软件站 </td></tr> <tr><td> <a target="_blank" onmousedown="return 
c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.newhua.com/soft/3573.htm"><em>MYSQL</em> 5.5.15</a></td><td width="120px;"> 27.76 M </td><td> 华军软件园 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.duote.com/soft/3169.html"><em>MYSQL</em> For Windows V5.0.67(无毒无插件)</a></td><td width="120px;"> 23.27 M </td><td> 多特软件站 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://dl.pconline.com.cn/html_2/1/79/id=465&pn=0.html"><em>MYSQL</em> 5.1.59</a></td><td width="120px;"> 26.7 M </td><td> 太平洋下载 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://xiazai.zol.com.cn/detail/9/89874.shtml"><em>MySQL</em> 5.5.15官方下载</a></td><td width="120px;"> 28.43 M </td><td> ZOL软件下载 </td></tr> </tbody></table> <div style="padding:4px 0 2px;"><a onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" style="color:#7777CC;font-size:12px;" href="http://soft.baidu.com/softwaresearch/s?tn=software&rn=10&wd=mysql" target="_blank">查看全部363条结果<span style="font-family:simsun">>></span></a></div><font size="-1" color="#008000">soft.baidu.com/softwaresearch/s?tn=software&r... 2011-10-1</font></div> </td></tr></tbody></table>
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: baidu_web
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- krongk
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-11-29 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! "\n # Baidu_web\n\n Baidu_web是一个基于百度的元搜索引擎,输入关键字,返还百度搜索结果和相关的关键词;\n
|
15
|
+
\ 在线Demo: http://www.inruby.com/search\n \n ## 用法\n \n
|
16
|
+
\ gem 'baidu_web'\n \n ## 调用\n \n # http://www.baidu.com/s?wd=inruby&pn=100&rn=50\n
|
17
|
+
\ BaiduWeb.search(\"inruby\", :per_page => 50, :page_index => 2)\n \n
|
18
|
+
\ ## 返还结果\n \n result = BaiduWeb.search('key words')\n result[:record_arr].each
|
19
|
+
do |record|\n puts record.title\n puts record.url\n puts
|
20
|
+
record.summary\n puts record.updated_date\n puts record.item_index\n
|
21
|
+
\ puts record.cached_url\n end\n result[:ext_key_arr].each
|
22
|
+
do |ext_key|\n puts ext_key.title\n puts ext_key.url\n end\n
|
23
|
+
\ \n ## 在irb中测试:\n \n $:.unshift(File.dirname(__FILE__))\n
|
24
|
+
\ require 'baidu_web'\n require 'cgi'\n result =
|
25
|
+
BaiduWeb.search(CGI.escape(\"游戏\"), :per_page => 10, :page_index => 1)\n "
|
26
|
+
email:
|
27
|
+
- kenrome@gmail.com
|
28
|
+
executables: []
|
29
|
+
extensions: []
|
30
|
+
extra_rdoc_files: []
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- Gemfile
|
34
|
+
- README.md
|
35
|
+
- Rakefile
|
36
|
+
- baidu_web.gemspec
|
37
|
+
- lib/baidu_web.rb
|
38
|
+
- lib/baidu_web/extension_key.rb
|
39
|
+
- lib/baidu_web/record.rb
|
40
|
+
- lib/baidu_web/string_extension.rb
|
41
|
+
- lib/baidu_web/strip.rb
|
42
|
+
- lib/baidu_web/version.rb
|
43
|
+
- lib/test.rb
|
44
|
+
homepage: https://github.com/krongk/baidu_web
|
45
|
+
licenses: []
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirements: []
|
63
|
+
rubyforge_project: baidu_web
|
64
|
+
rubygems_version: 1.8.16
|
65
|
+
signing_key:
|
66
|
+
specification_version: 3
|
67
|
+
summary: baidu_web is a meta search engine for 百度(www.baidu.com)
|
68
|
+
test_files: []
|