baidu_web 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in baidu_web.gemspec
gemspec

# HTML parser required by lib/baidu_web.rb. The gem name must be a String;
# a bare `hpricot` identifier raises NameError when Bundler evaluates this file.
gem "hpricot", "~> 0.8.4"
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # Baidu_web
2
+
3
+ Baidu_web是一个基于百度的元搜索引擎:输入关键字,返回百度搜索结果和相关的关键词。
4
+ 在线Demo: http://www.inruby.com/search
5
+
6
+ ## 用法
7
+
8
+ gem 'baidu_web'
9
+
10
+ ## 调用
11
+
12
+ # http://www.baidu.com/s?wd=inruby&pn=50&rn=50
13
+ BaiduWeb.search("inruby", :per_page => 50, :page_index => 2)
14
+
15
+ ## 返回结果
16
+
17
+ result = BaiduWeb.search('key words')
18
+ result[:record_arr].each do |record|
19
+ puts record.title
20
+ puts record.url
21
+ puts record.summary
22
+ puts record.updated_date
23
+ puts record.item_index
24
+ puts record.cached_url
25
+ end
26
+ result[:ext_key_arr].each do |ext_key|
27
+ puts ext_key.title
28
+ puts ext_key.url
29
+ end
30
+
31
+ ## 在irb中测试:
32
+
33
+ $:.unshift(File.dirname(__FILE__))
34
+ require 'baidu_web'
35
+ require 'cgi'
36
+ result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/baidu_web.gemspec ADDED
@@ -0,0 +1,61 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "baidu_web/version"
4
+
5
# Gem specification for baidu_web.
# The version comes from BaiduWeb::VERSION (lib/baidu_web/version.rb) and the
# packaged file lists are taken from `git ls-files`, so the gem must be built
# from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "baidu_web"
  s.version     = BaiduWeb::VERSION
  s.authors     = ["krongk"]
  s.email       = ["kenrome@gmail.com"]
  s.homepage    = "https://github.com/krongk/baidu_web"
  s.summary     = %q{baidu_web is a meta search engine for 百度(www.baidu.com)}
  s.description = %q{
    # Baidu_web

    Baidu_web是一个基于百度的元搜索引擎,输入关键字,返还百度搜索结果和相关的关键词;
    在线Demo: http://www.inruby.com/search

    ## 用法

    gem 'baidu_web'

    ## 调用

    # http://www.baidu.com/s?wd=inruby&pn=100&rn=50
    BaiduWeb.search("inruby", :per_page => 50, :page_index => 2)

    ## 返还结果

    result = BaiduWeb.search('key words')
    result[:record_arr].each do |record|
    puts record.title
    puts record.url
    puts record.summary
    puts record.updated_date
    puts record.item_index
    puts record.cached_url
    end
    result[:ext_key_arr].each do |ext_key|
    puts ext_key.title
    puts ext_key.url
    end

    ## 在irb中测试:

    $:.unshift(File.dirname(__FILE__))
    require 'baidu_web'
    require 'cgi'
    result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
  }

  s.rubyforge_project = "baidu_web"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  # lib/baidu_web.rb does `require 'hpricot'`, so it has to be a runtime
  # dependency of the gem itself; listing it only in the Gemfile does not
  # install it for users of the packaged gem.
  s.add_runtime_dependency "hpricot", "~> 0.8.4"

  # specify any further dependencies here; for example:
  # s.add_development_dependency "rspec"
end
@@ -0,0 +1,9 @@
1
module BaiduWeb
  # A related search keyword suggested for the searched key word
  # (see the '相关搜索' section of a www.baidu.com result page).
  #
  # :title
  #   Text of the suggested keyword.
  # :url
  #   Link target of the suggestion. The README usage example reads
  #   `ext_key.url`; it is nil unless the producer assigns it.
  # :parent_key
  #   The key word the suggestion was produced for.
  # :source
  #   Origin of the suggestion (BaiduWeb.search sets it to 'web').
  class ExtensionKey
    attr_accessor :title, :url, :parent_key, :source
  end
end
@@ -0,0 +1,24 @@
1
module BaiduWeb
  # One item of a search result.
  #
  # :title
  #   Page title of the matched web page or document.
  # :url
  #   Absolute URL of the matched web document.
  # :summary
  #   A brief summary of the matched web document, taken from Baidu.com.
  # :updated_date
  #   Date shown for the result, as a 'YYYY-M-D' style string when one is found.
  # :item_index
  #   Position of the record in the search results, starting with 1.
  #   BaiduWeb.search keeps counting across pages; the page number itself
  #   (page_index) is not stored on the record.
  # :size
  #   Size info for the matched document. Note that it is not a number; it is
  #   a string in the form of '32K' or similar. BaiduWeb.search does not
  #   currently fill it in.
  # :cached_url
  #   URL pointing to the cached version of the matched document on Baidu.com.
  #   Documents of types like DOC, PPT and XLS have no cached version, so this
  #   attribute stays nil in those cases.
  class Record
    attr_accessor :title, :url, :summary, :updated_date, :item_index, :size, :cached_url
  end
end
@@ -0,0 +1,8 @@
1
# Adds a `blank?` predicate, mixed into String and NilClass below.
module StringExtension
  # Returns true when the receiver is nil or contains nothing but whitespace
  # (as removed by String#strip); false otherwise.
  def blank?
    # The nil? check short-circuits, so strip is only called on real strings.
    nil? || strip.empty?
  end
end

# `send` is used because Module#include is private on older Rubies.
String.send :include, StringExtension
NilClass.send :include, StringExtension
@@ -0,0 +1,5 @@
1
class Object
  # Fallback `strip` for objects that do not define their own: returns the
  # receiver unchanged, so callers can strip a value without checking its type.
  #
  # String defines its own #strip, which is found before this method, so
  # strings never reach it. The former `super` branch for String receivers
  # could therefore never run, and would have raised NoMethodError if it had.
  def strip
    self
  end
end
@@ -0,0 +1,3 @@
1
# Release number of the baidu_web gem.
module BaiduWeb
  # Read by baidu_web.gemspec to set the gem version.
  VERSION = '0.0.1'
end
data/lib/baidu_web.rb ADDED
@@ -0,0 +1,120 @@
1
+ # encoding: utf-8
2
+ $:.unshift(File.dirname(__FILE__))
3
+
4
+ require 'hpricot'
5
+ require 'open-uri'
6
+ #require 'iconv'
7
+ #require 'cgi'
8
+ require "baidu_web/version"
9
+ require "baidu_web/record"
10
+ require "baidu_web/extension_key"
11
+ require "baidu_web/string_extension"
12
+ require "baidu_web/strip"
13
+
14
module BaiduWeb
  class << self
    # Searches www.baidu.com for +key_word+ and scrapes one page of results.
    #
    # key_word - the query. It is interpolated verbatim into the request URL,
    #            so the caller must URL-escape it first, e.g.
    #            CGI.escape("游戏").
    # options  - optional Hash:
    #            :per_page   - records per page, sent as the "rn" parameter
    #                          (default 50).
    #            :page_index - page to fetch, starting with 1 (default 1);
    #                          converted into the "pn" start offset.
    #
    # Returns a Hash with:
    #   :record_arr  - Array of Record, one per parsed result
    #   :ext_key_arr - Array of ExtensionKey (related keywords)
    #   :source      - always 'web'
    # A nil or whitespace-only key word returns the empty result without
    # fetching anything. Errors raised while downloading the page are not
    # rescued here.
    def search(key_word, options = {})
      result = {:record_arr => [], :ext_key_arr => [], :source => 'web'}
      return result if blank_key_word?(key_word)

      options ||= {}
      per_page   = options[:per_page] || 50
      page_index = options[:page_index] || 1

      # Index of the last record on the previous page; Baidu's "pn" parameter.
      item_index = (page_index - 1) * per_page

      url = "http://www.baidu.com/s?wd=#{key_word}&rn=#{per_page}&pn=#{item_index}"
      doc = Hpricot(fetch_page(url))
      return result if doc.nil?

      result[:record_arr]  = extract_item(doc, item_index)
      result[:ext_key_arr] = extract_extension_key(doc, key_word)
      result
    end

    private

    # True when there is nothing to search for (nil or only whitespace).
    def blank_key_word?(key_word)
      key_word.nil? || key_word.to_s.strip.empty?
    end

    # Downloads +url+ and returns the response body.
    # Uses open-uri, which this file already requires; the previous code
    # called Mechanize, which is neither required nor a declared dependency.
    def fetch_page(url)
      URI.parse(url).read
    end

    # Builds a Record for every "result" table that has an <h3> title link.
    #
    # content    - parsed result page.
    # item_index - index of the last record on the previous page; records
    #              are numbered from item_index + 1.
    #
    # Returns an Array of Record.
    def extract_item(content, item_index)
      # Remove the 'result-op' tables first (e.g. the second record when
      # searching for 'mysql'); they are not ordinary results.
      content.search("table[@class='result-op']").remove

      record_arr = []
      content.search("table[@class='result']").each do |res|
        heading = res.at("h3")
        next if heading.nil?

        link = heading.at("a")
        next if link.nil? # a title without a link cannot yield a url

        record = Record.new
        record.title   = heading.inner_text
        record.url     = link.attributes['href'].to_s
        record.summary = extract_summary(res, record)

        item_index += 1
        record.item_index = item_index
        record_arr << record
      end
      record_arr
    end

    # Collects the summary text from the children of a result's td.f cell.
    # As a side effect it fills record.cached_url and record.updated_date
    # when it meets the corresponding child elements.
    #
    # Returns the summary String ('' when the cell is missing or empty).
    def extract_summary(res, record)
      cell = res.at("td[@class='f']")
      children = cell && cell.children
      return '' if children.nil?

      parts = []
      children.each do |elem|
        if elem.respond_to?(:attributes)
          attrs = elem.attributes
          if attrs['href'] =~ %r{http://cache\.baidu\.com}
            # Link to Baidu's cached copy; not part of the summary.
            record.cached_url = attrs['href']
            next
          elsif attrs['class'] == 'g' && elem.to_s =~ /(\d{4}-\d{1,2}-\d{1,2})/
            # Class 'g' element carrying a date; keep only the date.
            record.updated_date = Regexp.last_match(1)
            next
          end
          # Class 't' is the title block, which is stored separately.
          next if attrs['class'] == 't'
        end
        parts << elem.inner_text
      end
      # Strip the "百度" / "快照" (snapshot) link captions from the text.
      parts.join(' ').gsub(/百度|快照/, '')
    end

    # Builds an ExtensionKey for every link inside div#rs (related searches).
    #
    # doc        - parsed result page.
    # parent_key - the key word that was searched for.
    #
    # Returns an Array of ExtensionKey; empty when the page has no div#rs.
    def extract_extension_key(doc, parent_key = nil)
      rs = doc.at("//div#rs")
      return [] if rs.nil?

      rs.get_elements_by_tag_name("a").map do |link|
        ext_key = ExtensionKey.new
        ext_key.title      = link.inner_text
        ext_key.parent_key = parent_key
        ext_key.source     = 'web'
        # Only set the link target when ExtensionKey exposes a url attribute.
        ext_key.url = link.attributes['href'].to_s if ext_key.respond_to?(:url=)
        ext_key
      end
    end
  end
end
data/lib/test.rb ADDED
@@ -0,0 +1,60 @@
1
+ # encoding = utf-8
2
+ $:.unshift(File.dirname(__FILE__))
3
+
4
+ require 'baidu_web'
5
+ require 'cgi'
6
# Manual smoke test: runs one live search for an escaped key word and keeps
# the returned hash in `result` for inspection (e.g. from irb).
keyword = CGI.escape("游戏")
result = BaiduWeb.search(keyword, :per_page => 10, :page_index => 1)
7
+
8
+ # class Test
9
+ # def self.search(key, ha)
10
+ # puts key
11
+ # puts ha
12
+ # puts ha.class
13
+ # puts ha.size
14
+ # puts ha[:key].to_s
15
+ # puts ha[:b].to_s
16
+ # puts ha[:c].to_s
17
+
18
+ # end
19
+ # end
20
+
21
+ # Test.search('a', :a=>'a', :key => '444', :b=>44)
22
+
23
+
24
+ # 0.times do |i|
25
+ # puts i
26
+ # end
27
+
28
+ # http://www.baidu.com/s?wd=%C3%C0%C5%AE&rsv_bp=0&rsv_spt=3&inputT=930
29
+ # http://www.baidu.com/s?wd=%B3%C9%B6%BC&pn=0&rn=5&usm=4
30
+ # http://www.baidu.com/s?wd=%C3%C0%C5%AE&pn=50&rn=50&usm=4
31
+ # http://www.baidu.com/s?wd=%C3%C0%C5%AE&pn=100&rn=50&usm=4
32
+
33
+ # rn -> per page record
34
+ # pn -> start record
35
+
36
+
37
+ # html = Hpricot(doc);nil
38
+
39
+
40
+ # ic = Iconv.new("utf-8//IGNORE", "GB2312//IGNORE")
41
+ # doc = ""
42
+ # open("http://www.baidu.com/s?rn=50&bs=%C3%C0%C5%AE&f=8&rsv_bp=1&wd=mysql&inputT=2955", "r:gb2312:utf-8") {|f|
43
+ # f.each_line do |line|
44
+ # doc += ic.iconv(line)
45
+ # end
46
+ # };nil
47
+
48
+ # doc = Hpricot(doc);nil
49
+
50
+ # results = doc.at("div[@id='container']");nil
51
+
52
+ # results.search("table[@class='result']");nil
53
+
54
+ # doc.search("table[@class='result-op']").remove
55
+
56
+
57
+ # <table class="result-op"
58
+ # cellpadding="0" cellspacing="0" srcid="6669" id="2"
59
+ # mu="http://soft.baidu.com/softwaresearch/s?tn=software&amp;rn=10&amp;wd=mysql"
60
+ # data-op="{'y':'BF0FFF7F'}"><style>.op_mini_table01_content table{margin-top:4px;}.op_mini_table01_content th{text-align:left;white-space:nowrap;background:url("http://www.baidu.com/aladdin/img/table/bg.gif") repeat-x 0 -37px;font-weight:normal;height:26px;line-height:26px;font-size:13px;padding:0 10px 0 8px;}.op_mini_table01_content td{white-space:nowrap;font-size:14px;border-bottom:#eee 1px solid;}.OP_TABLE_COMMON{ width:100%;}.OP_TABLE_COMMON td{ padding:7px 10px 7px 8px; font-size:14px;}.OP_TABLE_COMMON a,.OP_TABLE_COMMON a em{ text-decoration:none;}.OP_TABLE_COMMON a:hover,.OP_TABLE_COMMON a:hover em{ text-decoration:underline;}</style><script>function jI(D){var C=D;var B=0;while(C=C.parentNode){B=parseInt(C.getAttribute("id"));if(B>0){break}}var A=C.getElementsByTagName("a");for(var B=0;B<A.length;B++){if(D==A[B]){return B}}return A.length-1}function _aMC(C){var B=C,A=-1;while(B=B.parentNode){A=parseInt(B.getAttribute("id"));if(A>0){return A}}};</script><tbody><tr><td class="f"><h3 class="t"><a onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this)})" href="http://soft.baidu.com/softwaresearch/s?tn=software&amp;rn=10&amp;wd=mysql" target="_blank"><font size="3"><em>mysql</em>_相关下载信息363条_百度软件搜索</font></a><span class="tsuf tsuf-op" data="{title : 'mysql_相关下载信息363条_百度软件搜索', link : 'http:\/\/soft.baidu.com\/softwaresearch\/s?tn=software&amp;rn=10&amp;wd=mysql'}"></span></h3><div class="op_mini_table01_content op_software"> <table cellspacing="0" class="OP_TABLE_COMMON"><tbody><tr><th style="border-left:0;">软件名称</th><th>软件大小</th><th>来源</th></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.skycn.com/soft/30406.html">Apache+Php+<em>Mysql</em> V1.3 绿色自动安装版</a></td><td width="120px;"> 13.44 M </td><td> 天空软件站 </td></tr> <tr><td> <a target="_blank" onmousedown="return 
c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.newhua.com/soft/3573.htm"><em>MYSQL</em> 5.5.15</a></td><td width="120px;"> 27.76 M </td><td> 华军软件园 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.duote.com/soft/3169.html"><em>MYSQL</em> For Windows V5.0.67(无毒无插件)</a></td><td width="120px;"> 23.27 M </td><td> 多特软件站 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://dl.pconline.com.cn/html_2/1/79/id=465&amp;pn=0.html"><em>MYSQL</em> 5.1.59</a></td><td width="120px;"> 26.7 M </td><td> 太平洋下载 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://xiazai.zol.com.cn/detail/9/89874.shtml"><em>MySQL</em> 5.5.15官方下载</a></td><td width="120px;"> 28.43 M </td><td> ZOL软件下载 </td></tr> </tbody></table> <div style="padding:4px 0 2px;"><a onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" style="color:#7777CC;font-size:12px;" href="http://soft.baidu.com/softwaresearch/s?tn=software&amp;rn=10&amp;wd=mysql" target="_blank">查看全部363条结果<span style="font-family:simsun">&gt;&gt;</span></a></div><font size="-1" color="#008000">soft.baidu.com/softwaresearch/s?tn=software&amp;r... 2011-10-1</font></div> </td></tr></tbody></table>
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: baidu_web
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - krongk
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-29 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: ! "\n # Baidu_web\n\n Baidu_web是一个基于百度的元搜索引擎,输入关键字,返还百度搜索结果和相关的关键词;\n
15
+ \ 在线Demo: http://www.inruby.com/search\n \n ## 用法\n \n
16
+ \ gem 'baidu_web'\n \n ## 调用\n \n # http://www.baidu.com/s?wd=inruby&pn=100&rn=50\n
17
+ \ BaiduWeb.search(\"inruby\", :per_page => 50, :page_index => 2)\n \n
18
+ \ ## 返还结果\n \n result = BaiduWeb.search('key words')\n result[:record_arr].each
19
+ do |record|\n puts record.title\n puts record.url\n puts
20
+ record.summary\n puts record.updated_date\n puts record.item_index\n
21
+ \ puts record.cached_url\n end\n result[:ext_key_arr].each
22
+ do |ext_key|\n puts ext_key.title\n puts ext_key.url\n end\n
23
+ \ \n ## 在irb中测试:\n \n $:.unshift(File.dirname(__FILE__))\n
24
+ \ require 'baidu_web'\n require 'cgi'\n result =
25
+ BaiduWeb.search(CGI.escape(\"游戏\"), :per_page => 10, :page_index => 1)\n "
26
+ email:
27
+ - kenrome@gmail.com
28
+ executables: []
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - .gitignore
33
+ - Gemfile
34
+ - README.md
35
+ - Rakefile
36
+ - baidu_web.gemspec
37
+ - lib/baidu_web.rb
38
+ - lib/baidu_web/extension_key.rb
39
+ - lib/baidu_web/record.rb
40
+ - lib/baidu_web/string_extension.rb
41
+ - lib/baidu_web/strip.rb
42
+ - lib/baidu_web/version.rb
43
+ - lib/test.rb
44
+ homepage: https://github.com/krongk/baidu_web
45
+ licenses: []
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project: baidu_web
64
+ rubygems_version: 1.8.16
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: baidu_web is a meta search engine for 百度(www.baidu.com)
68
+ test_files: []