baidu_web 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +6 -0
- data/README.md +36 -0
- data/Rakefile +1 -0
- data/baidu_web.gemspec +61 -0
- data/lib/baidu_web/extension_key.rb +9 -0
- data/lib/baidu_web/record.rb +24 -0
- data/lib/baidu_web/string_extension.rb +8 -0
- data/lib/baidu_web/strip.rb +5 -0
- data/lib/baidu_web/version.rb +3 -0
- data/lib/baidu_web.rb +120 -0
- data/lib/test.rb +60 -0
- metadata +68 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Baidu_web
|
2
|
+
|
3
|
+
Baidu_web 是一个基于百度的元搜索引擎:输入关键字,返回百度搜索结果和相关的关键词。
|
4
|
+
在线Demo: http://www.inruby.com/search
|
5
|
+
|
6
|
+
## 用法
|
7
|
+
|
8
|
+
gem 'baidu_web'
|
9
|
+
|
10
|
+
## 调用
|
11
|
+
|
12
|
+
# http://www.baidu.com/s?wd=inruby&rn=50&pn=50
|
13
|
+
BaiduWeb.search("inruby", :per_page => 50, :page_index => 2)
|
14
|
+
|
15
|
+
## 返回结果
|
16
|
+
|
17
|
+
result = BaiduWeb.search('key words')
|
18
|
+
result[:record_arr].each do |record|
|
19
|
+
puts record.title
|
20
|
+
puts record.url
|
21
|
+
puts record.summary
|
22
|
+
puts record.updated_date
|
23
|
+
puts record.item_index
|
24
|
+
puts record.cached_url
|
25
|
+
end
|
26
|
+
result[:ext_key_arr].each do |ext_key|
|
27
|
+
puts ext_key.title
|
28
|
+
puts ext_key.url
|
29
|
+
end
|
30
|
+
|
31
|
+
## 在irb中测试:
|
32
|
+
|
33
|
+
$:.unshift(File.dirname(__FILE__))
|
34
|
+
require 'baidu_web'
|
35
|
+
require 'cgi'
|
36
|
+
result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/baidu_web.gemspec
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "baidu_web/version"
|
4
|
+
|
5
|
+
# Gem specification for baidu_web, a meta search client for Baidu web search.
Gem::Specification.new do |s|
  s.name        = "baidu_web"
  s.version     = BaiduWeb::VERSION
  s.authors     = ["krongk"]
  s.email       = ["kenrome@gmail.com"]
  s.homepage    = "https://github.com/krongk/baidu_web"
  s.summary     = %q{baidu_web is a meta search engine for 百度(www.baidu.com)}
  s.description = %q{
    # Baidu_web

    Baidu_web是一个基于百度的元搜索引擎,输入关键字,返还百度搜索结果和相关的关键词;
    在线Demo: http://www.inruby.com/search

    ## 用法

    gem 'baidu_web'

    ## 调用

    # http://www.baidu.com/s?wd=inruby&pn=100&rn=50
    BaiduWeb.search("inruby", :per_page => 50, :page_index => 2)

    ## 返还结果

    result = BaiduWeb.search('key words')
    result[:record_arr].each do |record|
      puts record.title
      puts record.url
      puts record.summary
      puts record.updated_date
      puts record.item_index
      puts record.cached_url
    end
    result[:ext_key_arr].each do |ext_key|
      puts ext_key.title
      puts ext_key.url
    end

    ## 在irb中测试:

    $:.unshift(File.dirname(__FILE__))
    require 'baidu_web'
    require 'cgi'
    result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
  }

  s.rubyforge_project = "baidu_web"

  # The file lists are taken from git, so the gem has to be built from a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # lib/baidu_web.rb requires hpricot and instantiates Mechanize; declare both so
  # that installing the gem also installs what it needs to load and to search.
  s.add_runtime_dependency "hpricot"
  s.add_runtime_dependency "mechanize"
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module BaiduWeb
  # One item of a search result page. A plain value holder: every attribute is
  # readable and writable and is nil until something assigns it.
  #
  # Attributes:
  #
  # title::        Page title of the matched web page or document.
  # url::          Absolute URL of the matched web document.
  # summary::      A brief summary of the matched web document, as shown by Baidu.com.
  # updated_date:: Date string (e.g. "2011-10-1") picked out of the result entry;
  #                nil when the entry shows no date.
  # item_index::   1-based position of the record in the overall result list. The
  #                parser continues counting from the page's start offset, so the
  #                first record of page 2 with 50 records per page is 51. The page
  #                number itself is not stored on the record.
  # size::         Size info for the matched document. Note that it is not a number;
  #                it is a string in the form of '32K' or similar.
  # cached_url::   URL of the cached version of the matched document on Baidu.com.
  #                Documents of types such as DOC, PPT and XLS have no cached
  #                version, so this stays nil for them.
  class Record
    attr_accessor :title,
                  :url,
                  :summary,
                  :updated_date,
                  :item_index,
                  :size,
                  :cached_url
  end
end
|
data/lib/baidu_web.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.unshift(File.dirname(__FILE__))
|
3
|
+
|
4
|
+
require 'hpricot'
require 'mechanize'
require 'open-uri'
#require 'iconv'
#require 'cgi'
require "baidu_web/version"
require "baidu_web/record"
require "baidu_web/extension_key"
require "baidu_web/string_extension"
require "baidu_web/strip"
|
13
|
+
|
14
|
+
module BaiduWeb
  class << self
    # Run a Baidu web search and parse the result page.
    #
    # key_word - The search term. It is interpolated into the request URL as-is,
    #            so the caller is responsible for URL-escaping it (for example
    #            with CGI.escape). A nil or whitespace-only term performs no
    #            request.
    # options  - Optional Hash:
    #            :per_page   - records per page, sent as the `rn` parameter
    #                          (default: 50).
    #            :page_index - 1-based page number (default: 1). It is turned
    #                          into the `pn` start offset, (page_index - 1) * per_page.
    #
    # Returns a Hash with :record_arr (Array of Record), :ext_key_arr (Array of
    # ExtensionKey) and :source (always 'web'). Network errors raised by the
    # HTTP client are not rescued here.
    def search(key_word, options = {})
      result = {:record_arr => [], :ext_key_arr => [], :source => 'web'}

      # Stored on the module because extract_extension_key reads it back.
      @key_word = key_word
      return result if key_word.nil? || key_word.to_s.strip.empty?

      options ||= {}
      @per_page = options[:per_page] || 50
      @page_index = options[:page_index] || 1

      # Offset of the first record on the requested page.
      item_index = (@page_index - 1) * @per_page

      url = "http://www.baidu.com/s?wd=#{@key_word}&rn=#{@per_page}&pn=#{item_index}"
      doc = Hpricot(Mechanize.new.get(url).body)
      return result if doc.nil?

      result[:record_arr] = extract_item(doc, item_index)
      result[:ext_key_arr] = extract_extension_key(doc)
      result
    end

    private

    # Build Record objects from the ordinary result tables of a parsed page.
    #
    # content    - The parsed document. Tables with class "result-op" (special
    #              boxes such as the software-download box shown for 'mysql')
    #              are removed from it before parsing.
    # item_index - Offset of the first record on this page; each record gets the
    #              next number after it, so numbering is 1-based and continues
    #              across pages.
    #
    # Returns an Array of Record. Tables without an <h3> heading are skipped and
    # do not consume an index.
    def extract_item(content, item_index)
      content.search("table[@class='result-op']").remove

      record_arr = []
      content.search("table[@class='result']").each do |res|
        heading = res.at("h3")
        next if heading.nil?

        record = Record.new
        record.title = heading.inner_text

        # A heading without a link yields an empty URL instead of a nil error.
        link = heading.at("a")
        record.url = link.nil? ? '' : link.attributes['href'].to_s

        record.summary = extract_summary(res.at("td[@class='f']"), record)

        item_index += 1
        record.item_index = item_index
        record_arr << record
      end
      record_arr
    end

    # Collect the summary text of one result cell. While walking the cell's
    # children it also fills record.cached_url and record.updated_date.
    #
    # cell   - The td.f element of a result table; may be nil.
    # record - The Record being populated.
    #
    # Returns the summary String ('' when the cell is missing or empty).
    def extract_summary(cell, record)
      return '' if cell.nil? || cell.children.nil?

      summary = []
      cell.children.each do |elem|
        if elem.respond_to?(:attributes)
          href = elem.attributes['href']
          # The link to Baidu's cached copy is metadata, not summary text.
          if href =~ /http:\/\/cache\.baidu\.com/
            record.cached_url = href
            next
          end

          css_class = elem.attributes['class']
          # The class "g" element carries the display URL and the date.
          if css_class == 'g' && elem.to_s =~ /(\d{4}-\d{1,2}-\d{1,2})/
            record.updated_date = Regexp.last_match(1)
            next
          end

          # The class "t" element is the title, which is already stored.
          next if css_class == 't'
        end
        summary << elem.inner_text if elem.respond_to?(:inner_text)
      end
      summary.join(' ').gsub(/百度|百度快照|快照/, '')
    end

    # Build ExtensionKey objects from the links inside the page's div#rs block.
    #
    # doc - The parsed document.
    #
    # Returns an Array of ExtensionKey; empty when the page has no div#rs.
    def extract_extension_key(doc)
      rs = doc.at("//div#rs")
      return [] if rs.nil?

      rs.get_elements_by_tag_name("a").map do |link|
        ext_key = ExtensionKey.new
        ext_key.title = link.inner_text
        # The README documents ext_key.url; ExtensionKey is defined in another
        # file, so only assign when it actually exposes a url writer.
        ext_key.url = link.attributes['href'].to_s if ext_key.respond_to?(:url=)
        ext_key.parent_key = @key_word
        ext_key.source = 'web'
        ext_key
      end
    end

  end
end
|
data/lib/test.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding = utf-8
|
2
|
+
$:.unshift(File.dirname(__FILE__))
|
3
|
+
|
4
|
+
require 'baidu_web'
|
5
|
+
require 'cgi'
|
6
|
+
# Manual smoke test: performs a live search for the URL-escaped keyword "游戏",
# asking for the first page with 10 records per page.
result = BaiduWeb.search(CGI.escape("游戏"), :per_page => 10, :page_index => 1)
|
7
|
+
|
8
|
+
# class Test
|
9
|
+
# def self.search(key, ha)
|
10
|
+
# puts key
|
11
|
+
# puts ha
|
12
|
+
# puts ha.class
|
13
|
+
# puts ha.size
|
14
|
+
# puts ha[:key].to_s
|
15
|
+
# puts ha[:b].to_s
|
16
|
+
# puts ha[:c].to_s
|
17
|
+
|
18
|
+
# end
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Test.search('a', :a=>'a', :key => '444', :b=>44)
|
22
|
+
|
23
|
+
|
24
|
+
# 0.times do |i|
|
25
|
+
# puts i
|
26
|
+
# end
|
27
|
+
|
28
|
+
# http://www.baidu.com/s?wd=%C3%C0%C5%AE&rsv_bp=0&rsv_spt=3&inputT=930
|
29
|
+
# http://www.baidu.com/s?wd=%B3%C9%B6%BC&pn=0&rn=5&usm=4
|
30
|
+
# http://www.baidu.com/s?wd=%C3%C0%C5%AE&pn=50&rn=50&usm=4
|
31
|
+
# http://www.baidu.com/s?wd=%C3%C0%C5%AE&pn=100&rn=50&usm=4
|
32
|
+
|
33
|
+
# rn -> number of records per page
|
34
|
+
# pn -> zero-based offset of the first record on the page
|
35
|
+
|
36
|
+
|
37
|
+
# html = Hpricot(doc);nil
|
38
|
+
|
39
|
+
|
40
|
+
# ic = Iconv.new("utf-8//IGNORE", "GB2312//IGNORE")
|
41
|
+
# doc = ""
|
42
|
+
# open("http://www.baidu.com/s?rn=50&bs=%C3%C0%C5%AE&f=8&rsv_bp=1&wd=mysql&inputT=2955", "r:gb2312:utf-8") {|f|
|
43
|
+
# f.each_line do |line|
|
44
|
+
# doc += ic.iconv(line)
|
45
|
+
# end
|
46
|
+
# };nil
|
47
|
+
|
48
|
+
# doc = Hpricot(doc);nil
|
49
|
+
|
50
|
+
# results = doc.at("div[@id='container']");nil
|
51
|
+
|
52
|
+
# results.search("table[@class='result']");nil
|
53
|
+
|
54
|
+
# doc.search("table[@class='result-op']").remove
|
55
|
+
|
56
|
+
|
57
|
+
# <table class="result-op"
|
58
|
+
# cellpadding="0" cellspacing="0" srcid="6669" id="2"
|
59
|
+
# mu="http://soft.baidu.com/softwaresearch/s?tn=software&rn=10&wd=mysql"
|
60
|
+
# data-op="{'y':'BF0FFF7F'}"><style>.op_mini_table01_content table{margin-top:4px;}.op_mini_table01_content th{text-align:left;white-space:nowrap;background:url("http://www.baidu.com/aladdin/img/table/bg.gif") repeat-x 0 -37px;font-weight:normal;height:26px;line-height:26px;font-size:13px;padding:0 10px 0 8px;}.op_mini_table01_content td{white-space:nowrap;font-size:14px;border-bottom:#eee 1px solid;}.OP_TABLE_COMMON{ width:100%;}.OP_TABLE_COMMON td{ padding:7px 10px 7px 8px; font-size:14px;}.OP_TABLE_COMMON a,.OP_TABLE_COMMON a em{ text-decoration:none;}.OP_TABLE_COMMON a:hover,.OP_TABLE_COMMON a:hover em{ text-decoration:underline;}</style><script>function jI(D){var C=D;var B=0;while(C=C.parentNode){B=parseInt(C.getAttribute("id"));if(B>0){break}}var A=C.getElementsByTagName("a");for(var B=0;B<A.length;B++){if(D==A[B]){return B}}return A.length-1}function _aMC(C){var B=C,A=-1;while(B=B.parentNode){A=parseInt(B.getAttribute("id"));if(A>0){return A}}};</script><tbody><tr><td class="f"><h3 class="t"><a onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this)})" href="http://soft.baidu.com/softwaresearch/s?tn=software&rn=10&wd=mysql" target="_blank"><font size="3"><em>mysql</em>_相关下载信息363条_百度软件搜索</font></a><span class="tsuf tsuf-op" data="{title : 'mysql_相关下载信息363条_百度软件搜索', link : 'http:\/\/soft.baidu.com\/softwaresearch\/s?tn=software&rn=10&wd=mysql'}"></span></h3><div class="op_mini_table01_content op_software"> <table cellspacing="0" class="OP_TABLE_COMMON"><tbody><tr><th style="border-left:0;">软件名称</th><th>软件大小</th><th>来源</th></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.skycn.com/soft/30406.html">Apache+Php+<em>Mysql</em> V1.3 绿色自动安装版</a></td><td width="120px;"> 13.44 M </td><td> 天空软件站 </td></tr> <tr><td> <a target="_blank" onmousedown="return 
c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.newhua.com/soft/3573.htm"><em>MYSQL</em> 5.5.15</a></td><td width="120px;"> 27.76 M </td><td> 华军软件园 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://www.duote.com/soft/3169.html"><em>MYSQL</em> For Windows V5.0.67(无毒无插件)</a></td><td width="120px;"> 23.27 M </td><td> 多特软件站 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://dl.pconline.com.cn/html_2/1/79/id=465&pn=0.html"><em>MYSQL</em> 5.1.59</a></td><td width="120px;"> 26.7 M </td><td> 太平洋下载 </td></tr> <tr><td> <a target="_blank" onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" href="http://xiazai.zol.com.cn/detail/9/89874.shtml"><em>MySQL</em> 5.5.15官方下载</a></td><td width="120px;"> 28.43 M </td><td> ZOL软件下载 </td></tr> </tbody></table> <div style="padding:4px 0 2px;"><a onmousedown="return c({'fm':'alop','title':this.innerHTML,'url':this.href,'p1':_aMC(this),'p2':jI(this)})" style="color:#7777CC;font-size:12px;" href="http://soft.baidu.com/softwaresearch/s?tn=software&rn=10&wd=mysql" target="_blank">查看全部363条结果<span style="font-family:simsun">>></span></a></div><font size="-1" color="#008000">soft.baidu.com/softwaresearch/s?tn=software&r... 2011-10-1</font></div> </td></tr></tbody></table>
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: baidu_web
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- krongk
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-11-29 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! "\n # Baidu_web\n\n Baidu_web是一个基于百度的元搜索引擎,输入关键字,返还百度搜索结果和相关的关键词;\n
|
15
|
+
\ 在线Demo: http://www.inruby.com/search\n \n ## 用法\n \n
|
16
|
+
\ gem 'baidu_web'\n \n ## 调用\n \n # http://www.baidu.com/s?wd=inruby&pn=100&rn=50\n
|
17
|
+
\ BaiduWeb.search(\"inruby\", :per_page => 50, :page_index => 2)\n \n
|
18
|
+
\ ## 返还结果\n \n result = BaiduWeb.search('key words')\n result[:record_arr].each
|
19
|
+
do |record|\n puts record.title\n puts record.url\n puts
|
20
|
+
record.summary\n puts record.updated_date\n puts record.item_index\n
|
21
|
+
\ puts record.cached_url\n end\n result[:ext_key_arr].each
|
22
|
+
do |ext_key|\n puts ext_key.title\n puts ext_key.url\n end\n
|
23
|
+
\ \n ## 在irb中测试:\n \n $:.unshift(File.dirname(__FILE__))\n
|
24
|
+
\ require 'baidu_web'\n require 'cgi'\n result =
|
25
|
+
BaiduWeb.search(CGI.escape(\"游戏\"), :per_page => 10, :page_index => 1)\n "
|
26
|
+
email:
|
27
|
+
- kenrome@gmail.com
|
28
|
+
executables: []
|
29
|
+
extensions: []
|
30
|
+
extra_rdoc_files: []
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- Gemfile
|
34
|
+
- README.md
|
35
|
+
- Rakefile
|
36
|
+
- baidu_web.gemspec
|
37
|
+
- lib/baidu_web.rb
|
38
|
+
- lib/baidu_web/extension_key.rb
|
39
|
+
- lib/baidu_web/record.rb
|
40
|
+
- lib/baidu_web/string_extension.rb
|
41
|
+
- lib/baidu_web/strip.rb
|
42
|
+
- lib/baidu_web/version.rb
|
43
|
+
- lib/test.rb
|
44
|
+
homepage: https://github.com/krongk/baidu_web
|
45
|
+
licenses: []
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirements: []
|
63
|
+
rubyforge_project: baidu_web
|
64
|
+
rubygems_version: 1.8.16
|
65
|
+
signing_key:
|
66
|
+
specification_version: 3
|
67
|
+
summary: baidu_web is a meta search engine for 百度(www.baidu.com)
|
68
|
+
test_files: []
|