micro_spider 0.1.16 → 0.1.17
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/micro_spider.rb +106 -18
- data/lib/spider_core/behavior.rb +3 -1
- data/lib/spider_core/field_dsl.rb +6 -4
- data/lib/spider_core/follow_dsl.rb +2 -1
- data/lib/spider_core/pagination_dsl.rb +2 -1
- data/lib/spider_core/version.rb +1 -1
- data/test/micro_spider_test.rb +1 -0
- metadata +55 -22
- checksums.yaml +0 -7
data/lib/micro_spider.rb
CHANGED
@@ -2,10 +2,14 @@ require 'capybara'
|
|
2
2
|
require 'capybara-webkit'
|
3
3
|
require 'capybara/dsl'
|
4
4
|
|
5
|
-
Capybara.run_server = false
|
6
5
|
Capybara.current_driver = :webkit
|
6
|
+
Capybara.configure do |config|
|
7
|
+
config.ignore_hidden_elements = false
|
8
|
+
config.run_server = false
|
9
|
+
end
|
7
10
|
|
8
11
|
require 'logger'
|
12
|
+
require 'set'
|
9
13
|
require 'spider_core'
|
10
14
|
|
11
15
|
class MicroSpider
|
@@ -16,14 +20,16 @@ class MicroSpider
|
|
16
20
|
include SpiderCore::FollowDSL
|
17
21
|
include SpiderCore::PaginationDSL
|
18
22
|
|
19
|
-
attr_reader :excretion, :paths, :delay, :current_location
|
23
|
+
attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
|
20
24
|
attr_accessor :logger, :actions, :recipe, :skip_set_entrance
|
21
25
|
|
22
26
|
def initialize(excretion = nil)
|
23
|
-
@paths
|
24
|
-
@actions
|
27
|
+
@paths = []
|
28
|
+
@actions = []
|
25
29
|
@excretion = excretion || { status: 'inprogress', results: [] }
|
26
|
-
@logger
|
30
|
+
@logger = Logger.new(STDOUT)
|
31
|
+
@visited_paths = Set.new
|
32
|
+
@broken_paths = []
|
27
33
|
end
|
28
34
|
|
29
35
|
# The number of seconds between two consecutive requests.
|
@@ -37,8 +43,9 @@ class MicroSpider
|
|
37
43
|
# Visit the path.
|
38
44
|
#
|
39
45
|
# @param path [String] the path to visit, can be absolute path or relative path.
|
46
|
+
#
|
40
47
|
# @example Visit a path
|
41
|
-
# spider =
|
48
|
+
# spider = MicroSpider.new
|
42
49
|
# spider.visit('/example')
|
43
50
|
# spider.visit('http://google.com')
|
44
51
|
#
|
@@ -50,13 +57,37 @@ class MicroSpider
|
|
50
57
|
logger.info "Current location is #{path}."
|
51
58
|
end
|
52
59
|
|
60
|
+
# Click the locator. This will trigger visit action and change current location.
|
61
|
+
# @param locator [String] the text or id of the link.
|
62
|
+
#
|
53
63
|
def click(locator, opts = {})
|
54
|
-
actions << lambda {
|
64
|
+
actions << lambda {
|
55
65
|
path = find_link(locator, opts)[:href]
|
56
66
|
visit(path)
|
57
67
|
}
|
58
68
|
end
|
59
|
-
|
69
|
+
|
70
|
+
# Teach the spider behaviors and it will repeat to the end.
|
71
|
+
# @param recipe [String, Proc] the recipe to be learned.
|
72
|
+
#
|
73
|
+
# @example
|
74
|
+
# spider = MicroSpider.new
|
75
|
+
# spider.learn do
|
76
|
+
# entrance 'http://google.com'
|
77
|
+
# end
|
78
|
+
# spider.crawl
|
79
|
+
#
|
80
|
+
# @example
|
81
|
+
# spider.learn("entrance 'http://google.com'")
|
82
|
+
# spider.crawl
|
83
|
+
#
|
84
|
+
# @example
|
85
|
+
# recipe = lambda {
|
86
|
+
# entrance 'http://google.com'
|
87
|
+
# }
|
88
|
+
# spider.learn(recipe)
|
89
|
+
# spider.crawl
|
90
|
+
#
|
60
91
|
def learn(recipe = nil, &block)
|
61
92
|
if block_given?
|
62
93
|
instance_eval(&block)
|
@@ -76,14 +107,33 @@ class MicroSpider
|
|
76
107
|
return if @site
|
77
108
|
Capybara.app_host = @excretion[:site] = @site = url
|
78
109
|
end
|
79
|
-
|
110
|
+
|
111
|
+
# This will be the first path for spider to visit.
|
112
|
+
# If there is more than one entrance, the spider will crawl them one by one.
|
113
|
+
# @param path_or_paths [String] one or more entrances
|
114
|
+
#
|
115
|
+
# @example
|
116
|
+
# spider = MicroSpider.new
|
117
|
+
# spider.site('http://google.com')
|
118
|
+
# spider.entrance('/a')
|
119
|
+
# spider.entrance('/b')
|
120
|
+
#
|
80
121
|
def entrance(*path_or_paths)
|
81
122
|
return if @skip_set_entrance
|
82
123
|
@paths += path_or_paths
|
83
124
|
end
|
84
125
|
|
85
|
-
|
126
|
+
# Sometimes the entrances are on the page.
|
127
|
+
# @param path [String] path to visit
|
128
|
+
# @param pattern [String, Regexp] links pattern
|
129
|
+
#
|
130
|
+
# @example
|
131
|
+
# spider = MicroSpider.new
|
132
|
+
# spider.entrance_on_path('http://google.com', '.links a')
|
133
|
+
#
|
134
|
+
def entrance_on_path(path, pattern, opts = {}, &block)
|
86
135
|
return if @skip_set_entrance
|
136
|
+
kind = opts[:kind] || :css
|
87
137
|
visit(path)
|
88
138
|
entrances = scan_all(kind, pattern, opts).map do |element|
|
89
139
|
block_given? ? yield(element) : element[:href]
|
@@ -95,24 +145,60 @@ class MicroSpider
|
|
95
145
|
return excretion if completed?
|
96
146
|
|
97
147
|
@paths.compact!
|
98
|
-
path =
|
148
|
+
path = nil
|
149
|
+
loop do
|
150
|
+
path = @paths.shift
|
151
|
+
break if path.nil?
|
152
|
+
break unless @visited_paths.include?(path)
|
153
|
+
end
|
154
|
+
|
99
155
|
if path.nil?
|
100
156
|
excretion[:status] = 'completed'
|
101
157
|
return excretion
|
102
158
|
end
|
103
159
|
|
104
|
-
|
105
|
-
execute_actions
|
106
|
-
yield(@current_location) if block_given?
|
107
|
-
excretion[:results] << @current_location
|
160
|
+
learn(@recipe) if @actions.empty?
|
108
161
|
|
109
|
-
|
110
|
-
|
111
|
-
|
162
|
+
begin
|
163
|
+
visit(path)
|
164
|
+
rescue Timeout::Error => err
|
165
|
+
@broken_paths << path
|
166
|
+
logger.fatal("Timeout!!! execution expired when visit `#{path}`")
|
167
|
+
logger.fatal(err)
|
168
|
+
rescue SystemExit, Interrupt
|
169
|
+
logger.fatal("SystemExit && Interrupt")
|
170
|
+
exit!
|
171
|
+
rescue Exception => err
|
172
|
+
@broken_paths << path
|
173
|
+
logger.fatal("Caught exception when visit `#{path}`")
|
174
|
+
logger.fatal(err)
|
175
|
+
else
|
176
|
+
@visited_paths << path
|
177
|
+
execute_actions
|
178
|
+
yield(@current_location) if block_given?
|
179
|
+
excretion[:results] << @current_location
|
180
|
+
ensure
|
181
|
+
@actions = []
|
182
|
+
@skip_set_entrance = true
|
183
|
+
crawl(&block)
|
184
|
+
end
|
112
185
|
|
113
186
|
excretion
|
114
187
|
end
|
115
188
|
|
189
|
+
# Spider can create custom action when it is crawling.
|
190
|
+
# @param name [String] the name of action
|
191
|
+
# @param block [Proc] the actions
|
192
|
+
#
|
193
|
+
# @example
|
194
|
+
# spider = MicroSpider.new
|
195
|
+
#
|
196
|
+
# spider.create_action :save do |result|
|
197
|
+
# SomeClass.save(result)
|
198
|
+
# end
|
199
|
+
#
|
200
|
+
# spider.save
|
201
|
+
#
|
116
202
|
def create_action(name, &block)
|
117
203
|
action = proc { actions << lambda { block.call(current_location) } }
|
118
204
|
metaclass.send :define_method, name, &action
|
@@ -126,6 +212,8 @@ class MicroSpider
|
|
126
212
|
spider = self.clone
|
127
213
|
spider.instance_variable_set(:@paths, [])
|
128
214
|
spider.instance_variable_set(:@actions, [])
|
215
|
+
spider.instance_variable_set(:@visited_paths, [])
|
216
|
+
spider.instance_variable_set(:@broken_paths, Set.new)
|
129
217
|
spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
|
130
218
|
spider.skip_set_entrance = false
|
131
219
|
spider
|
data/lib/spider_core/behavior.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
|
+
require 'enumerable/lazy' if RUBY_VERSION < '2.0'
|
2
|
+
|
1
3
|
module SpiderCore
|
2
4
|
module Behavior
|
3
5
|
|
4
6
|
protected
|
5
7
|
|
6
|
-
def scan_all(kind, pattern,
|
8
|
+
def scan_all(kind, pattern, opts = {})
|
7
9
|
if pattern.is_a?(String)
|
8
10
|
elements = all(kind, pattern).lazy
|
9
11
|
if opts[:limit] && opts[:limit].to_i > 0
|
@@ -36,10 +36,12 @@ module SpiderCore
|
|
36
36
|
|
37
37
|
protected
|
38
38
|
def handle_element(element)
|
39
|
-
if element
|
40
|
-
element.text
|
41
|
-
else
|
39
|
+
if element.is_a?(String)
|
42
40
|
element
|
41
|
+
elsif element.tag_name == 'input'
|
42
|
+
element.value
|
43
|
+
else
|
44
|
+
element.text
|
43
45
|
end
|
44
46
|
end
|
45
47
|
|
@@ -63,7 +65,7 @@ module SpiderCore
|
|
63
65
|
when :field
|
64
66
|
scan_first(action_opts[:kind], action_opts[:pattern])
|
65
67
|
when :fields
|
66
|
-
scan_all(action_opts[:kind], action_opts[:pattern], opts)
|
68
|
+
scan_all(action_opts[:kind], action_opts[:pattern], opts)
|
67
69
|
else
|
68
70
|
raise 'Unknow action.'
|
69
71
|
end
|
@@ -3,8 +3,9 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :skip_followers
|
5
5
|
|
6
|
-
def follow(pattern,
|
6
|
+
def follow(pattern, opts = {}, &block)
|
7
7
|
return unless block_given?
|
8
|
+
kind = opts[:kind] || :css
|
8
9
|
actions << lambda {
|
9
10
|
spider = self.spawn
|
10
11
|
spider.learn(&block)
|
@@ -3,7 +3,8 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :next_page, :skip_pages
|
5
5
|
|
6
|
-
def keep_eyes_on_next_page(pattern,
|
6
|
+
def keep_eyes_on_next_page(pattern, opts = {})
|
7
|
+
kind = opts[:kind] || :css
|
7
8
|
actions << lambda {
|
8
9
|
@next_page = first(kind, pattern)[:href] rescue nil
|
9
10
|
@paths.unshift(@next_page) if @next_page
|
data/lib/spider_core/version.rb
CHANGED
data/test/micro_spider_test.rb
CHANGED
metadata
CHANGED
@@ -1,111 +1,142 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: micro_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.17
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- zires
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-25 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: capybara
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
16
18
|
requirements:
|
17
|
-
- - '>='
|
19
|
+
- - ! '>='
|
18
20
|
- !ruby/object:Gem::Version
|
19
21
|
version: '0'
|
20
22
|
type: :runtime
|
21
23
|
prerelease: false
|
22
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
23
26
|
requirements:
|
24
|
-
- - '>='
|
27
|
+
- - ! '>='
|
25
28
|
- !ruby/object:Gem::Version
|
26
29
|
version: '0'
|
27
30
|
- !ruby/object:Gem::Dependency
|
28
31
|
name: capybara-webkit
|
29
32
|
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
30
34
|
requirements:
|
31
|
-
- - '>='
|
35
|
+
- - ! '>='
|
32
36
|
- !ruby/object:Gem::Version
|
33
37
|
version: '0'
|
34
38
|
type: :runtime
|
35
39
|
prerelease: false
|
36
40
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
37
42
|
requirements:
|
38
|
-
- - '>='
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: enumerable-lazy
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
39
60
|
- !ruby/object:Gem::Version
|
40
61
|
version: '0'
|
41
62
|
- !ruby/object:Gem::Dependency
|
42
63
|
name: pry
|
43
64
|
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
44
66
|
requirements:
|
45
|
-
- - '>='
|
67
|
+
- - ! '>='
|
46
68
|
- !ruby/object:Gem::Version
|
47
69
|
version: '0'
|
48
70
|
type: :development
|
49
71
|
prerelease: false
|
50
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
51
74
|
requirements:
|
52
|
-
- - '>='
|
75
|
+
- - ! '>='
|
53
76
|
- !ruby/object:Gem::Version
|
54
77
|
version: '0'
|
55
78
|
- !ruby/object:Gem::Dependency
|
56
79
|
name: yard
|
57
80
|
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
58
82
|
requirements:
|
59
|
-
- - '>='
|
83
|
+
- - ! '>='
|
60
84
|
- !ruby/object:Gem::Version
|
61
85
|
version: '0'
|
62
86
|
type: :development
|
63
87
|
prerelease: false
|
64
88
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
65
90
|
requirements:
|
66
|
-
- - '>='
|
91
|
+
- - ! '>='
|
67
92
|
- !ruby/object:Gem::Version
|
68
93
|
version: '0'
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: rake
|
71
96
|
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
72
98
|
requirements:
|
73
|
-
- - '>='
|
99
|
+
- - ! '>='
|
74
100
|
- !ruby/object:Gem::Version
|
75
101
|
version: '0'
|
76
102
|
type: :development
|
77
103
|
prerelease: false
|
78
104
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
79
106
|
requirements:
|
80
|
-
- - '>='
|
107
|
+
- - ! '>='
|
81
108
|
- !ruby/object:Gem::Version
|
82
109
|
version: '0'
|
83
110
|
- !ruby/object:Gem::Dependency
|
84
111
|
name: turn
|
85
112
|
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
86
114
|
requirements:
|
87
|
-
- - '>='
|
115
|
+
- - ! '>='
|
88
116
|
- !ruby/object:Gem::Version
|
89
117
|
version: '0'
|
90
118
|
type: :development
|
91
119
|
prerelease: false
|
92
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
93
122
|
requirements:
|
94
|
-
- - '>='
|
123
|
+
- - ! '>='
|
95
124
|
- !ruby/object:Gem::Version
|
96
125
|
version: '0'
|
97
126
|
- !ruby/object:Gem::Dependency
|
98
127
|
name: sinatra
|
99
128
|
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
100
130
|
requirements:
|
101
|
-
- - '>='
|
131
|
+
- - ! '>='
|
102
132
|
- !ruby/object:Gem::Version
|
103
133
|
version: '0'
|
104
134
|
type: :development
|
105
135
|
prerelease: false
|
106
136
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
107
138
|
requirements:
|
108
|
-
- - '>='
|
139
|
+
- - ! '>='
|
109
140
|
- !ruby/object:Gem::Version
|
110
141
|
version: '0'
|
111
142
|
description: A DSL to write web spider. Depend on capybara and capybara-webkit.
|
@@ -128,27 +159,29 @@ files:
|
|
128
159
|
- test/micro_spider_test.rb
|
129
160
|
- test/test_helper.rb
|
130
161
|
homepage: https://github.com/zires/micro-spider
|
131
|
-
licenses:
|
132
|
-
|
162
|
+
licenses:
|
163
|
+
- MIT
|
133
164
|
post_install_message:
|
134
165
|
rdoc_options: []
|
135
166
|
require_paths:
|
136
167
|
- lib
|
137
168
|
required_ruby_version: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
138
170
|
requirements:
|
139
|
-
- - '>='
|
171
|
+
- - ! '>='
|
140
172
|
- !ruby/object:Gem::Version
|
141
173
|
version: '0'
|
142
174
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
175
|
+
none: false
|
143
176
|
requirements:
|
144
|
-
- - '>='
|
177
|
+
- - ! '>='
|
145
178
|
- !ruby/object:Gem::Version
|
146
179
|
version: '0'
|
147
180
|
requirements: []
|
148
181
|
rubyforge_project:
|
149
|
-
rubygems_version:
|
182
|
+
rubygems_version: 1.8.23
|
150
183
|
signing_key:
|
151
|
-
specification_version:
|
184
|
+
specification_version: 3
|
152
185
|
summary: A DSL to write web spider.
|
153
186
|
test_files:
|
154
187
|
- test/micro_spider_test.rb
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: f9363e70b57c95de9256ea2549cf2ee76c2669c0
|
4
|
-
data.tar.gz: 7f8b0bc18fde686058c2b426f84c05b26ba3a1b3
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: eb6a2ca107f788c95b4244b06de2ac8b7c94e983e0306dbcce71872cee37dd5946784cb900681fdd9d0ee35f6e82c2c6f7abbfed4916fabbdcc97a1ee27c849b
|
7
|
-
data.tar.gz: 06ea26fcfd3b53edbb461927772a50d4320525e89f9d7c19e250cc93ba33937362dd60d7d5467dd5d7a0c8950fa67e50c0a7d2df6f5fd5c81a51ca1a9e78c9cb
|