baiduserp 2.1.1 → 2.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/baiduserp.rb +4 -0
- data/lib/baiduserp/client.rb +1 -1
- data/lib/baiduserp/parser.rb +7 -1
- data/lib/baiduserp/version.rb +1 -1
- data/lib/parsers/ads_top.rb +18 -14
- data/lib/parsers/ranks.rb +11 -2
- data/lib/parsers/zhixin.rb +5 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 333ba9624a535b422d93a318e5b5db9f50681fab
|
4
|
+
data.tar.gz: fcc4bce281984552a81f1ecf138b91146ce3042d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad3edca21e77a528e13d2231737d8d78dd0fa4c221d6107a5e7ea933e0c51ee1e785c5fda5ffb79306e914b687a04dd6aa48ee8a34911474219d4bb6d0b81514
|
7
|
+
data.tar.gz: 5702393d3254700fd2ff721363ab68a7a394b7f1e3cae00a54adfbd4bff18a8cf4715f977729a90531c7b29977a9bfe1825980b4e8244d5d366092baa57e5d54
|
data/lib/baiduserp.rb
CHANGED
data/lib/baiduserp/client.rb
CHANGED
data/lib/baiduserp/parser.rb
CHANGED
@@ -40,10 +40,16 @@ module Baiduserp
|
|
40
40
|
end
|
41
41
|
|
42
42
|
def search(keyword,page=1)
|
43
|
+
html = get_search_html(keyword,page)
|
44
|
+
parse html
|
45
|
+
end
|
46
|
+
|
47
|
+
def get_search_html(keyword,page=1)
|
43
48
|
keyword = keyword.gsub(" ","+")
|
44
49
|
page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
|
45
50
|
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
|
46
|
-
|
51
|
+
p serp_url
|
52
|
+
Client.get_serp(serp_url)
|
47
53
|
end
|
48
54
|
|
49
55
|
def parse_file(file_path)
|
data/lib/baiduserp/version.rb
CHANGED
data/lib/parsers/ads_top.rb
CHANGED
@@ -1,29 +1,33 @@
|
|
1
1
|
class Baiduserp::Parser
|
2
2
|
def _parse_ads_top(file)
|
3
3
|
result = []
|
4
|
-
|
4
|
+
rank = 0
|
5
|
+
|
6
|
+
file[:doc].search('div#content_left').first.children.each do |div|
|
5
7
|
id = div['id'].to_i
|
6
|
-
|
7
|
-
|
8
|
+
break if id > 0 && id < 3000
|
9
|
+
next unless div['class'].to_s.include?('ec_pp_f')
|
10
|
+
rank += 1
|
8
11
|
|
9
|
-
|
12
|
+
if div.name == 'div'
|
13
|
+
r = {rank: rank, id: id}
|
10
14
|
|
11
|
-
|
15
|
+
r[:title] = Baiduserp::Helper.get_content_safe(div.search('div.ec_title'))
|
12
16
|
|
13
|
-
|
17
|
+
r[:content] = Baiduserp::Helper.get_content_safe(div.search('div.ec_desc'))
|
14
18
|
|
15
|
-
|
16
|
-
|
19
|
+
r[:site] = Baiduserp::Helper.get_content_safe(div.search('span.ec_url'))
|
20
|
+
|
21
|
+
result << r
|
17
22
|
|
18
|
-
|
19
|
-
|
20
|
-
r = {rank: i + 1}
|
23
|
+
else # div.name == 'table'
|
24
|
+
r = {rank: rank, id: id}
|
21
25
|
|
22
|
-
r[:title] = Baiduserp::Helper.get_content_safe(
|
26
|
+
r[:title] = Baiduserp::Helper.get_content_safe(div.search('td.EC_header/a'))
|
23
27
|
|
24
|
-
r[:content] = Baiduserp::Helper.get_content_safe(
|
28
|
+
r[:content] = Baiduserp::Helper.get_content_safe(div.search('a.EC_desc'))
|
25
29
|
|
26
|
-
r[:site] = Baiduserp::Helper.get_content_safe(
|
30
|
+
r[:site] = Baiduserp::Helper.get_content_safe(div.search('a.EC_url'))
|
27
31
|
|
28
32
|
result << r
|
29
33
|
|
data/lib/parsers/ranks.rb
CHANGED
@@ -5,8 +5,19 @@ class Baiduserp::Parser
|
|
5
5
|
next if table.nil?
|
6
6
|
id = table['id'].to_i
|
7
7
|
next unless id > 0 && id < 3000
|
8
|
+
|
8
9
|
r = {:rank => id}
|
9
10
|
|
11
|
+
r[:result_op] = table['class'].to_s.include?('result-op')
|
12
|
+
|
13
|
+
r[:fk] = table['fk']
|
14
|
+
|
15
|
+
r[:srcid] = table['srcid']
|
16
|
+
|
17
|
+
r[:tpl] = table['tpl']
|
18
|
+
|
19
|
+
r[:mu] = table['mu']
|
20
|
+
|
10
21
|
url = table.search('h3/a').first
|
11
22
|
unless url.nil?
|
12
23
|
url = url['href']
|
@@ -23,8 +34,6 @@ class Baiduserp::Parser
|
|
23
34
|
|
24
35
|
r[:content] = Baiduserp::Helper.get_content_safe(table.search('div.c-abstract'))
|
25
36
|
|
26
|
-
r[:mu] = table['mu']
|
27
|
-
|
28
37
|
table.search('a').each do |link|
|
29
38
|
r[:baiduopen] = true if link['href'].to_s.include?('open.baidu.com')
|
30
39
|
end
|
data/lib/parsers/zhixin.rb
CHANGED
@@ -2,7 +2,11 @@ class Baiduserp::Parser
|
|
2
2
|
def _parse_zhixin(file)
|
3
3
|
result = []
|
4
4
|
file[:doc].search("div#content_left .result-zxl").each do |zxl|
|
5
|
-
result << {:id => zxl['id'],
|
5
|
+
result << {:id => zxl['id'],
|
6
|
+
:srcid => zxl['srcid'],
|
7
|
+
:fk => zxl['fk'],
|
8
|
+
:tpl => zxl['tpl'],
|
9
|
+
:mu => zxl['mu'] }
|
6
10
|
end
|
7
11
|
result
|
8
12
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baiduserp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.1
|
4
|
+
version: 2.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- MingQian Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-11-
|
11
|
+
date: 2013-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|