baidu_web 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Bundler manifest for developing the baidu_web gem.
# Fetch gems over HTTPS rather than plain HTTP.
source "https://rubygems.org"

# Specify your gem's dependencies in baidu_web.gemspec
gemspec

# HTML parser required by lib/baidu_web.rb. The gem name must be a String;
# a bare `hpricot` identifier raises NameError when the Gemfile is evaluated.
gem "hpricot", "~> 0.8.4"
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # Baidu_web
2
+
3
+ Baidu_web是一个基于百度的元搜索引擎,输入关键字,返回百度搜索结果和相关的关键词。
4
+ 在线Demo: http://www.inruby.com/search
5
+
6
+ ## 用法
7
+
8
+ gem 'baidu_web'
9
+
10
+ ## 调用
11
+
12
+ # http://www.baidu.com/s?wd=inruby&pn=50&rn=50
13
+ BaiduWeb.search("inruby", :per_page => 50, :page_index => 2)
14
+
15
+ ## 返回结果
16
+
17
+ result = BaiduWeb.search('key words')
18
+ result[:record_arr].each do |record|
19
+ puts record.title
20
+ puts record.url
21
+ puts record.summary
22
+ puts record.updated_date
23
+ puts record.item_index
24
+ puts record.cached_url
25
+ end
26
+ result[:ext_key_arr].each do |ext_key|
27
+ puts ext_key.title
28
+ puts ext_key.url
29
+ end
30
+
31
+ ## 在irb中测试:
32
+
33
+ $:.unshift(File.dirname(__FILE__))
34
+ require 'baidu_web'
35
+ require 'cgi'
36
+ result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/baidu_web.gemspec ADDED
@@ -0,0 +1,61 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for baidu_web.
# Puts lib/ on the load path so the version constant can be read below.
$:.push File.expand_path("../lib", __FILE__)
require "baidu_web/version"

Gem::Specification.new do |s|
  s.name        = "baidu_web"
  s.version     = BaiduWeb::VERSION
  s.authors     = ["krongk"]
  s.email       = ["kenrome@gmail.com"]
  s.homepage    = "https://github.com/krongk/baidu_web"
  s.summary     = %q{baidu_web is a meta search engine for 百度(www.baidu.com)}
  # The description mirrors the README (kept verbatim, in Chinese).
  s.description = %q{
    # Baidu_web

    Baidu_web是一个基于百度的元搜索引擎,输入关键字,返还百度搜索结果和相关的关键词;
    在线Demo: http://www.inruby.com/search

    ## 用法

        gem 'baidu_web'

    ## 调用

        # http://www.baidu.com/s?wd=inruby&pn=100&rn=50
        BaiduWeb.search("inruby", :per_page => 50, :page_index => 2)

    ## 返还结果

        result = BaiduWeb.search('key words')
        result[:record_arr].each do |record|
          puts record.title
          puts record.url
          puts record.summary
          puts record.updated_date
          puts record.item_index
          puts record.cached_url
        end
        result[:ext_key_arr].each do |ext_key|
          puts ext_key.title
          puts ext_key.url
        end

    ## 在irb中测试:

        $:.unshift(File.dirname(__FILE__))
        require 'baidu_web'
        require 'cgi'
        result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
  }

  s.rubyforge_project = "baidu_web"

  # Package whatever git tracks; test files and executables are derived
  # from the same listing.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  # lib/baidu_web.rb does `require 'hpricot'` at load time, so the parser has
  # to be installed together with this gem.
  s.add_runtime_dependency "hpricot", "~> 0.8.4"
end
@@ -0,0 +1,9 @@
1
module BaiduWeb
  # A related keyword suggested for a search keyword
  # (see the '相关搜索' section on www.baidu.com).
  #
  # All attributes are plain read/write accessors and are nil until assigned:
  #
  # :title
  #   Text of the related keyword.
  # :url
  #   Link target of the related keyword. The README reads this attribute,
  #   so it is provided here.
  # :parent_key
  #   The keyword that was searched for when this suggestion was found.
  # :source
  #   Label for where the suggestion came from (e.g. 'web').
  class ExtensionKey
    attr_accessor :title, :url, :parent_key, :source
  end
end
@@ -0,0 +1,24 @@
1
module BaiduWeb
  # One item of a search result.
  #
  # All attributes are plain read/write accessors and are nil until assigned:
  #
  # :title
  #   Page title of the matched web page or document.
  # :url
  #   URL of the matched web document.
  # :summary
  #   A brief summary of the matched web document from Baidu.com.
  # :updated_date
  #   Date string (e.g. '2011-10-1') shown with the result.
  # :item_index
  #   1-based position of the record in the search results.
  # :size
  #   Size info for the matched document. Note that it is not a number; it is
  #   in the form of '32K' or similar.
  # :cached_url
  #   URL of the cached version of the matched document on Baidu.com. Document
  #   types such as DOC, PPT and XLS have no cached version, so this stays nil
  #   for them.
  class Record
    attr_accessor :title, :url, :summary, :updated_date, :item_index, :size, :cached_url
  end
end
@@ -0,0 +1,8 @@
1
# Mixed into String and NilClass (see the bottom of this file) so both answer
# a common emptiness check.
module StringExtension
  # Returns true when the receiver is nil, or is a string that is empty once
  # leading and trailing whitespace has been stripped.
  def blank?
    nil? || strip.empty?
  end
end

String.send :include, StringExtension
NilClass.send :include, StringExtension
@@ -0,0 +1,5 @@
1
class Object
  # Fallback so that `strip` can be called on any object: anything that does
  # not define its own `strip` is returned unchanged.
  #
  # Strings never reach this method because String#strip is found first, so
  # no String-specific branch is needed here.
  def strip
    self
  end
end
@@ -0,0 +1,3 @@
1
module BaiduWeb
  # Gem version, read by baidu_web.gemspec. Frozen so the shared constant
  # cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/baidu_web.rb ADDED
@@ -0,0 +1,120 @@
1
+ # encoding: utf-8
2
+ $:.unshift(File.dirname(__FILE__))
3
+
4
+ require 'hpricot'
5
+ require 'open-uri'
6
+ #require 'iconv'
7
+ #require 'cgi'
8
+ require "baidu_web/version"
9
+ require "baidu_web/record"
10
+ require "baidu_web/extension_key"
11
+ require "baidu_web/string_extension"
12
+ require "baidu_web/strip"
13
+
14
module BaiduWeb
  # Matches links that point at Baidu's cached copy of a result.
  CACHED_URL_PATTERN = %r{http://cache\.baidu\.com}
  # Captures a date such as 2011-10-1 from a result's footer element.
  UPDATED_DATE_PATTERN = /(\d{4}-\d{1,2}-\d{1,2})/

  class << self
    # Runs a Baidu web search and scrapes the returned result page.
    #
    # key_word - the search keyword. Already URL-escaped keywords (e.g. from
    #            CGI.escape) are sent unchanged; characters that are not safe
    #            in a URL are percent-encoded before the request is made.
    # options  - optional Hash:
    #            :per_page   - records per page, sent as `rn` (default 50)
    #            :page_index - 1-based page number (default 1); the start
    #                          offset `pn` is (page_index - 1) * per_page
    #
    # Returns a Hash with :record_arr (Array of Record), :ext_key_arr (Array
    # of ExtensionKey) and :source ('web'). Both arrays are empty when the
    # keyword is blank or the response body is blank.
    #
    # The page is fetched with open-uri; errors raised by the fetch are not
    # rescued here.
    def search(key_word, options = {})
      result = {:record_arr => [], :ext_key_arr => [], :source => 'web'}
      return result if key_word.to_s.strip.empty?

      options ||= {}
      per_page = options[:per_page] || 50
      page_index = options[:page_index] || 1

      # Offset of the first record on the requested page.
      item_index = (page_index - 1) * per_page

      url = "http://www.baidu.com/s?wd=#{escape_query(key_word)}&rn=#{per_page}&pn=#{item_index}"
      body = URI.parse(url).read
      return result if body.nil? || body.strip.empty?

      doc = Hpricot(body)
      result[:record_arr] = extract_item(doc, item_index)
      result[:ext_key_arr] = extract_extension_key(doc, key_word)
      result
    end

    private

    # Percent-encodes every byte that is not URL-safe. Existing '%XX' escapes
    # and '+' are left alone so pre-escaped keywords are not escaped twice.
    def escape_query(text)
      raw = text.to_s.dup.force_encoding(Encoding::BINARY)
      raw.gsub(/[^A-Za-z0-9\-._~%+]/n) { |byte| '%%%02X' % byte.ord }
    end

    # Builds one Record per result table found in +content+.
    #
    # content    - the parsed result page
    # item_index - offset of the first record; records are numbered from
    #              item_index + 1
    #
    # Returns an Array of Record.
    def extract_item(content, item_index)
      # Remove "result-op" tables (e.g. the second record when searching for
      # 'mysql') so only regular result tables are parsed.
      content.search("table[@class='result-op']").remove

      records = []
      content.search("table[@class='result']").each do |res|
        heading = res.at("h3")
        next if heading.nil?

        record = Record.new
        record.title = heading.inner_text
        link = heading.at("a")
        record.url = link ? link.attributes['href'].to_s : ''

        summary = []
        cell = res.at("td[@class='f']")
        nodes = cell && cell.children
        (nodes || []).each do |elem|
          if elem.respond_to?(:attributes)
            attrs = elem.attributes
            if attrs['href'] =~ CACHED_URL_PATTERN
              record.cached_url = attrs['href']
              next
            elsif attrs['class'] == 'g' && elem.to_s =~ UPDATED_DATE_PATTERN
              record.updated_date = Regexp.last_match(1)
              next
            end
            # The element with class 't' is skipped: the title was already
            # read from the h3 above.
            next if attrs['class'] == 't'
          end
          summary << elem.inner_text
        end
        record.summary = summary.join(' ').gsub(/百度|百度快照|快照/, '')

        item_index += 1
        record.item_index = item_index
        records << record
      end
      records
    end

    # Collects the related keywords listed in the page's div#rs block.
    #
    # doc        - the parsed result page
    # parent_key - the keyword that was searched for; stored on every entry
    #
    # Returns an Array of ExtensionKey (empty when the block is missing).
    def extract_extension_key(doc, parent_key)
      rs = doc.at("//div#rs")
      return [] if rs.nil?

      rs.get_elements_by_tag_name("a").map do |link|
        ext_key = ExtensionKey.new
        ext_key.title = link.inner_text
        ext_key.parent_key = parent_key
        ext_key.source = 'web'
        # Only set the link target when ExtensionKey exposes a url writer.
        ext_key.url = link.attributes['href'].to_s if ext_key.respond_to?(:url=)
        ext_key
      end
    end
  end
end
data/lib/test.rb ADDED
@@ -0,0 +1,60 @@
1
# encoding: utf-8
# Manual smoke test: runs one live search against www.baidu.com and prints a
# short summary of what was scraped. Needs network access.
$:.unshift(File.dirname(__FILE__))

require 'baidu_web'
require 'cgi'

result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)

# Print the outcome so the run shows something to check by eye.
puts "records: #{result[:record_arr].size}, related keys: #{result[:ext_key_arr].size}"
result[:record_arr].each do |record|
  puts "#{record.item_index}. #{record.title} (#{record.url})"
end
7
+
8
+ # class Test
9
+ # def self.search(key, ha)
10
+ # puts key
11
+ # puts ha
12
+ # puts ha.class
13
+ # puts ha.size
14
+ # puts ha[:key].to_s
15
+ # puts ha[:b].to_s
16
+ # puts ha[:c].to_s
17
+
18
+ # end
19
+ # end
20
+
21
+ # Test.search('a', :a=>'a', :key => '444', :b=>44)
22
+
23
+
24
+ # 0.times do |i|
25
+ # puts i
26
+ # end
27
+
28
+ # http://www.baidu.com/s?wd=%C3%C0%C5%AE&rsv_bp=0&rsv_spt=3&inputT=930
29
+ # http://www.baidu.com/s?wd=%B3%C9%B6%BC&pn=0&rn=5&usm=4
30
+ # http://www.baidu.com/s?wd=%C3%C0%C5%AE&pn=50&rn=50&usm=4
31
+ # http://www.baidu.com/s?wd=%C3%C0%C5%AE&pn=100&rn=50&usm=4
32
+
33
+ # rn -> per page record
34
+ # pn -> start record
35
+
36
+
37
+ # html = Hpricot(doc);nil
38
+
39
+
40
+ # ic = Iconv.new("utf-8//IGNORE", "GB2312//IGNORE")
41
+ # doc = ""
42
+ # open("http://www.baidu.com/s?rn=50&bs=%C3%C0%C5%AE&f=8&rsv_bp=1&wd=mysql&inputT=2955", "r:gb2312:utf-8") {|f|
43
+ # f.each_line do |line|
44
+ # doc += ic.iconv(line)
45
+ # end
46
+ # };nil
47
+
48
+ # doc = Hpricot(doc);nil
49
+
50
+ # results = doc.at("div[@id='container']");nil
51
+
52
+ # results.search("table[@class='result']");nil
53
+
54
+ # doc.search("table[@class='result-op']").remove
55
+
56
+
57
+ # <table class="result-op"
58
+ # cellpadding="0" cellspacing="0" srcid="6669" id="2"
59
+ # mu="http://soft.baidu.com/softwaresearch/s?tn=software&amp;rn=10&amp;wd=mysql"
60
+ # data-op="{'y':'BF0FFF7F'}"><style>.op_mini_table01_content table{margin-top:4px;}.op_mini_table01_content th{text-align:left;white-space:nowrap;background:url("http://www.baidu.com/aladdin/img/table/bg.gif") repeat-x 0 -37px;font-weight:normal;height:26px;line-height:26px;font-size:13px;padding:0 10px 0 8px;}.op_mini_table01_content td{white-space:nowrap;font-size:14px;border-bottom:#eee 1px solid;}.OP_TABLE_COMMON{ width:100%;}.OP_TABLE_COMMON td{ padding:7px 10px 7px 8px; font-size:14px;}.OP_TABLE_COMMON a,.OP_TABLE_COMMON a em{ text-decoration:none;}.OP_TABLE_COMMON a:hover,.OP_TABLE_COMMON a:hover em{ text-decoration:underline;}</style><script>function jI(D){var C=D;var B=0;while(C=C.parentNode){B=parseInt(C.getAttribute("id"));if(B>0){break}}var A=C.getElementsByTagName("a");for(var B=0;B<A.length;B++){if(D==A[B]){return B}}return A.length-1}function _aMC(C){var B=C,A=-1;while(B=B.parentNode){A=parseInt(B.getAttribute("id"));if(A>0){return A}}};</script><tbody><tr><td class="f"><h3 class="t"><a onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this)})" href="http://soft.baidu.com/softwaresearch/s?tn=software&amp;rn=10&amp;wd=mysql" target="_blank"><font size="3"><em>mysql</em>_相关下载信息363条_百度软件搜索</font></a><span class="tsuf tsuf-op" data="{title : 'mysql_相关下载信息363条_百度软件搜索', link : 'http:\/\/soft.baidu.com\/softwaresearch\/s?tn=software&amp;rn=10&amp;wd=mysql'}"></span></h3><div class="op_mini_table01_content op_software"> <table cellspacing="0" class="OP_TABLE_COMMON"><tbody><tr><th style="border-left:0;">软件名称</th><th>软件大小</th><th>来源</th></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.skycn.com/soft/30406.html">Apache+Php+<em>Mysql</em> V1.3 绿色自动安装版</a></td><td width="120px;"> 13.44 M </td><td> 天空软件站 </td></tr> <tr><td> <a target="_blank" onmousedown="return 
c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.newhua.com/soft/3573.htm"><em>MYSQL</em> 5.5.15</a></td><td width="120px;"> 27.76 M </td><td> 华军软件园 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.duote.com/soft/3169.html"><em>MYSQL</em> For Windows V5.0.67(无毒无插件)</a></td><td width="120px;"> 23.27 M </td><td> 多特软件站 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://dl.pconline.com.cn/html_2/1/79/id=465&amp;pn=0.html"><em>MYSQL</em> 5.1.59</a></td><td width="120px;"> 26.7 M </td><td> 太平洋下载 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://xiazai.zol.com.cn/detail/9/89874.shtml"><em>MySQL</em> 5.5.15官方下载</a></td><td width="120px;"> 28.43 M </td><td> ZOL软件下载 </td></tr> </tbody></table> <div style="padding:4px 0 2px;"><a onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" style="color:#7777CC;font-size:12px;" href="http://soft.baidu.com/softwaresearch/s?tn=software&amp;rn=10&amp;wd=mysql" target="_blank">查看全部363条结果<span style="font-family:simsun">&gt;&gt;</span></a></div><font size="-1" color="#008000">soft.baidu.com/softwaresearch/s?tn=software&amp;r... 2011-10-1</font></div> </td></tr></tbody></table>
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: baidu_web
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - krongk
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-29 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: ! "\n # Baidu_web\n\n Baidu_web是一个基于百度的元搜索引擎,输入关键字,返还百度搜索结果和相关的关键词;\n
15
+ \ 在线Demo: http://www.inruby.com/search\n \n ## 用法\n \n
16
+ \ gem 'baidu_web'\n \n ## 调用\n \n # http://www.baidu.com/s?wd=inruby&pn=100&rn=50\n
17
+ \ BaiduWeb.search(\"inruby\", :per_page => 50, :page_index => 2)\n \n
18
+ \ ## 返还结果\n \n result = BaiduWeb.search('key words')\n result[:record_arr].each
19
+ do |record|\n puts record.title\n puts record.url\n puts
20
+ record.summary\n puts record.updated_date\n puts record.item_index\n
21
+ \ puts record.cached_url\n end\n result[:ext_key_arr].each
22
+ do |ext_key|\n puts ext_key.title\n puts ext_key.url\n end\n
23
+ \ \n ## 在irb中测试:\n \n $:.unshift(File.dirname(__FILE__))\n
24
+ \ require 'baidu_web'\n require 'cgi'\n result =
25
+ BaiduWeb.search(CGI.escape(\"游戏\"), :per_page => 10, :page_index => 1)\n "
26
+ email:
27
+ - kenrome@gmail.com
28
+ executables: []
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - .gitignore
33
+ - Gemfile
34
+ - README.md
35
+ - Rakefile
36
+ - baidu_web.gemspec
37
+ - lib/baidu_web.rb
38
+ - lib/baidu_web/extension_key.rb
39
+ - lib/baidu_web/record.rb
40
+ - lib/baidu_web/string_extension.rb
41
+ - lib/baidu_web/strip.rb
42
+ - lib/baidu_web/version.rb
43
+ - lib/test.rb
44
+ homepage: https://github.com/krongk/baidu_web
45
+ licenses: []
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project: baidu_web
64
+ rubygems_version: 1.8.16
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: baidu_web is a meta search engine for 百度(www.baidu.com)
68
+ test_files: []