baiduserp 2.1.1 → 2.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/baiduserp.rb +4 -0
- data/lib/baiduserp/client.rb +1 -1
- data/lib/baiduserp/parser.rb +7 -1
- data/lib/baiduserp/version.rb +1 -1
- data/lib/parsers/ads_top.rb +18 -14
- data/lib/parsers/ranks.rb +11 -2
- data/lib/parsers/zhixin.rb +5 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 333ba9624a535b422d93a318e5b5db9f50681fab
|
4
|
+
data.tar.gz: fcc4bce281984552a81f1ecf138b91146ce3042d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad3edca21e77a528e13d2231737d8d78dd0fa4c221d6107a5e7ea933e0c51ee1e785c5fda5ffb79306e914b687a04dd6aa48ee8a34911474219d4bb6d0b81514
|
7
|
+
data.tar.gz: 5702393d3254700fd2ff721363ab68a7a394b7f1e3cae00a54adfbd4bff18a8cf4715f977729a90531c7b29977a9bfe1825980b4e8244d5d366092baa57e5d54
|
data/lib/baiduserp.rb
CHANGED
data/lib/baiduserp/client.rb
CHANGED
data/lib/baiduserp/parser.rb
CHANGED
@@ -40,10 +40,16 @@ module Baiduserp
|
|
40
40
|
end
|
41
41
|
|
42
42
|
def search(keyword,page=1)
|
43
|
+
html = get_search_html(keyword,page)
|
44
|
+
parse html
|
45
|
+
end
|
46
|
+
|
47
|
+
def get_search_html(keyword,page=1)
|
43
48
|
keyword = keyword.gsub(" ","+")
|
44
49
|
page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
|
45
50
|
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
|
46
|
-
|
51
|
+
p serp_url
|
52
|
+
Client.get_serp(serp_url)
|
47
53
|
end
|
48
54
|
|
49
55
|
def parse_file(file_path)
|
data/lib/baiduserp/version.rb
CHANGED
data/lib/parsers/ads_top.rb
CHANGED
@@ -1,29 +1,33 @@
|
|
1
1
|
class Baiduserp::Parser
|
2
2
|
def _parse_ads_top(file)
|
3
3
|
result = []
|
4
|
-
|
4
|
+
rank = 0
|
5
|
+
|
6
|
+
file[:doc].search('div#content_left').first.children.each do |div|
|
5
7
|
id = div['id'].to_i
|
6
|
-
|
7
|
-
|
8
|
+
break if id > 0 && id < 3000
|
9
|
+
next unless div['class'].to_s.include?('ec_pp_f')
|
10
|
+
rank += 1
|
8
11
|
|
9
|
-
|
12
|
+
if div.name == 'div'
|
13
|
+
r = {rank: rank, id: id}
|
10
14
|
|
11
|
-
|
15
|
+
r[:title] = Baiduserp::Helper.get_content_safe(div.search('div.ec_title'))
|
12
16
|
|
13
|
-
|
17
|
+
r[:content] = Baiduserp::Helper.get_content_safe(div.search('div.ec_desc'))
|
14
18
|
|
15
|
-
|
16
|
-
|
19
|
+
r[:site] = Baiduserp::Helper.get_content_safe(div.search('span.ec_url'))
|
20
|
+
|
21
|
+
result << r
|
17
22
|
|
18
|
-
|
19
|
-
|
20
|
-
r = {rank: i + 1}
|
23
|
+
else # div.name == 'table'
|
24
|
+
r = {rank: rank, id: id}
|
21
25
|
|
22
|
-
r[:title] = Baiduserp::Helper.get_content_safe(
|
26
|
+
r[:title] = Baiduserp::Helper.get_content_safe(div.search('td.EC_header/a'))
|
23
27
|
|
24
|
-
r[:content] = Baiduserp::Helper.get_content_safe(
|
28
|
+
r[:content] = Baiduserp::Helper.get_content_safe(div.search('a.EC_desc'))
|
25
29
|
|
26
|
-
r[:site] = Baiduserp::Helper.get_content_safe(
|
30
|
+
r[:site] = Baiduserp::Helper.get_content_safe(div.search('a.EC_url'))
|
27
31
|
|
28
32
|
result << r
|
29
33
|
|
data/lib/parsers/ranks.rb
CHANGED
@@ -5,8 +5,19 @@ class Baiduserp::Parser
|
|
5
5
|
next if table.nil?
|
6
6
|
id = table['id'].to_i
|
7
7
|
next unless id > 0 && id < 3000
|
8
|
+
|
8
9
|
r = {:rank => id}
|
9
10
|
|
11
|
+
r[:result_op] = table['class'].to_s.include?('result-op')
|
12
|
+
|
13
|
+
r[:fk] = table['fk']
|
14
|
+
|
15
|
+
r[:srcid] = table['srcid']
|
16
|
+
|
17
|
+
r[:tpl] = table['tpl']
|
18
|
+
|
19
|
+
r[:mu] = table['mu']
|
20
|
+
|
10
21
|
url = table.search('h3/a').first
|
11
22
|
unless url.nil?
|
12
23
|
url = url['href']
|
@@ -23,8 +34,6 @@ class Baiduserp::Parser
|
|
23
34
|
|
24
35
|
r[:content] = Baiduserp::Helper.get_content_safe(table.search('div.c-abstract'))
|
25
36
|
|
26
|
-
r[:mu] = table['mu']
|
27
|
-
|
28
37
|
table.search('a').each do |link|
|
29
38
|
r[:baiduopen] = true if link['href'].to_s.include?('open.baidu.com')
|
30
39
|
end
|
data/lib/parsers/zhixin.rb
CHANGED
@@ -2,7 +2,11 @@ class Baiduserp::Parser
|
|
2
2
|
def _parse_zhixin(file)
|
3
3
|
result = []
|
4
4
|
file[:doc].search("div#content_left .result-zxl").each do |zxl|
|
5
|
-
result << {:id => zxl['id'],
|
5
|
+
result << {:id => zxl['id'],
|
6
|
+
:srcid => zxl['srcid'],
|
7
|
+
:fk => zxl['fk'],
|
8
|
+
:tpl => zxl['tpl'],
|
9
|
+
:mu => zxl['mu'] }
|
6
10
|
end
|
7
11
|
result
|
8
12
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baiduserp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.1
|
4
|
+
version: 2.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- MingQian Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-11-
|
11
|
+
date: 2013-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|