ro_crawler 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ro_crawler.rb +1 -3
- data/lib/ro_crawler/base.rb +14 -10
- data/lib/ro_crawler/version.rb +1 -1
- data/spec/ro_crawler/base_spec.rb +17 -3
- metadata +31 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2bc31485c675436fdef8b64dfc058c8109509de6
|
4
|
+
data.tar.gz: 3090f40f7e0ce91a531b4b29c26fdda660e64988
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f8c5d1fa858d404688779e1b58618259736e27bc024b6a935c3fba53d32b9d70c720b875a524efe9b41a8da6bc0013ab13efa9624fb380adc84610bbd923f327
|
7
|
+
data.tar.gz: 2b71ebef44fec7ce565550c3322d9400087f0ed95733720153655a6931c553421d2429d4bf80fc67c8581a05c72bdf520ffed224bc4a7f76c79658bec12b40e0
|
data/lib/ro_crawler.rb
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
# require all files in dir name is same with __FILE_-
|
2
|
-
|
3
|
-
autoload :Array, 'ro_support/array'
|
4
|
-
autoload :Log, 'ro_support/log'
|
2
|
+
require 'ro_support/array'
|
5
3
|
|
6
4
|
include RoSupport::FileActions
|
7
5
|
include RoSupport::Array
|
data/lib/ro_crawler/base.rb
CHANGED
@@ -18,8 +18,8 @@ module RoCrawler
|
|
18
18
|
|
19
19
|
def get_contents
|
20
20
|
ro_raise(err "@link_titles is nil", output: ['@url']) if @link_titles.nil?
|
21
|
-
@
|
22
|
-
@
|
21
|
+
@offers = @link_titles.dup
|
22
|
+
@offers.each do |link_content|
|
23
23
|
if link_content[0][/http/]
|
24
24
|
link = link_content[0]
|
25
25
|
else
|
@@ -28,14 +28,23 @@ module RoCrawler
|
|
28
28
|
|
29
29
|
link_content << intr = get_tags_attrs_from(link, @intr_selector, 'text')
|
30
30
|
unless intr.is_a? String
|
31
|
-
|
31
|
+
puts_log 'intr must be a string', 'ro_crawler_base.log'
|
32
32
|
end
|
33
33
|
link_content
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
def handler
|
38
|
-
|
37
|
+
def handler
|
38
|
+
ro_raise(err '@offers is empty') if @offers.empty?
|
39
|
+
if block_given?
|
40
|
+
if @offers.is_a?(Array)
|
41
|
+
@offers.each do |offer|
|
42
|
+
yield offer
|
43
|
+
end
|
44
|
+
else
|
45
|
+
raise "@offers is not a Array, @offers is #{@offers.class}"
|
46
|
+
end
|
47
|
+
end
|
39
48
|
end
|
40
49
|
|
41
50
|
def open_browser(driver)
|
@@ -126,11 +135,6 @@ module RoCrawler
|
|
126
135
|
end
|
127
136
|
|
128
137
|
def handle(results)
|
129
|
-
if results.is_a?(Array)
|
130
|
-
results.each do |result|
|
131
|
-
yield result
|
132
|
-
end
|
133
|
-
end
|
134
138
|
end
|
135
139
|
|
136
140
|
def get_home_url(url)
|
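data/lib/ro_crawler/version.rb
CHANGED
[The +1 -1 hunk for this file is not present in this extract.]
data/spec/ro_crawler/base_spec.rb
CHANGED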
@@ -90,13 +90,13 @@ describe RoCrawler::Base do
|
|
90
90
|
end
|
91
91
|
|
92
92
|
|
93
|
-
it '
|
93
|
+
it 'get_tags_from_url' do
|
94
94
|
expect(
|
95
95
|
@b.get_tags_from("http://baidu.com", 'body').inner_html
|
96
96
|
).not_to be_nil
|
97
97
|
end
|
98
98
|
|
99
|
-
describe '
|
99
|
+
describe 'get_attrs_in_tags' do
|
100
100
|
it 'when get two attribute' do
|
101
101
|
tags = @b.get_tags_from("http://baidu.com", '#m p#nv a')
|
102
102
|
@b.get_attrs_in(tags, 'href', 'text').each do |attrs|
|
@@ -121,9 +121,23 @@ describe RoCrawler::Base do
|
|
121
121
|
end
|
122
122
|
end
|
123
123
|
|
124
|
-
it '
|
124
|
+
it 'v2ex' do
|
125
125
|
@b.get_tags_attrs_from("http://www.v2ex.com/t/80954#reply6", '.topic_content', 'text')
|
126
126
|
end
|
127
|
+
|
128
|
+
it 'ruby_china' do
|
129
|
+
url = "http://ruby-china.org/topics/node25"
|
130
|
+
anchor_selector = ".title>a"
|
131
|
+
results = @b.get_tags_attrs_from(url, anchor_selector, 'text')
|
132
|
+
expect(results.count).to be == 15
|
133
|
+
end
|
134
|
+
|
135
|
+
it 'yingjiesheng' do
|
136
|
+
url = "http://s.yingjiesheng.com/result.jsp?keyword=%E5%89%8D%E7%AB%AF&city=0&jobtype=0&do=1&stype=0"
|
137
|
+
anchor_selector = "h3.title>a"
|
138
|
+
results = @b.get_tags_attrs_from(url, anchor_selector, 'text')
|
139
|
+
expect(results.count).to be == 10
|
140
|
+
end
|
127
141
|
end
|
128
142
|
|
129
143
|
it 'handle result' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ro_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ro
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -24,6 +24,34 @@ dependencies:
|
|
24
24
|
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 4.0.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: ro_support
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: term-ansicolor
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
27
55
|
- !ruby/object:Gem::Dependency
|
28
56
|
name: watir-rails
|
29
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -125,7 +153,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
125
153
|
version: '0'
|
126
154
|
requirements: []
|
127
155
|
rubyforge_project:
|
128
|
-
rubygems_version: 2.
|
156
|
+
rubygems_version: 2.1.5
|
129
157
|
signing_key:
|
130
158
|
specification_version: 4
|
131
159
|
summary: ''
|