micro_spider 0.1.16 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/micro_spider.rb +106 -18
- data/lib/spider_core/behavior.rb +3 -1
- data/lib/spider_core/field_dsl.rb +6 -4
- data/lib/spider_core/follow_dsl.rb +2 -1
- data/lib/spider_core/pagination_dsl.rb +2 -1
- data/lib/spider_core/version.rb +1 -1
- data/test/micro_spider_test.rb +1 -0
- metadata +55 -22
- checksums.yaml +0 -7
data/lib/micro_spider.rb
CHANGED
@@ -2,10 +2,14 @@ require 'capybara'
|
|
2
2
|
require 'capybara-webkit'
|
3
3
|
require 'capybara/dsl'
|
4
4
|
|
5
|
-
Capybara.run_server = false
|
6
5
|
Capybara.current_driver = :webkit
|
6
|
+
Capybara.configure do |config|
|
7
|
+
config.ignore_hidden_elements = false
|
8
|
+
config.run_server = false
|
9
|
+
end
|
7
10
|
|
8
11
|
require 'logger'
|
12
|
+
require 'set'
|
9
13
|
require 'spider_core'
|
10
14
|
|
11
15
|
class MicroSpider
|
@@ -16,14 +20,16 @@ class MicroSpider
|
|
16
20
|
include SpiderCore::FollowDSL
|
17
21
|
include SpiderCore::PaginationDSL
|
18
22
|
|
19
|
-
attr_reader :excretion, :paths, :delay, :current_location
|
23
|
+
attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
|
20
24
|
attr_accessor :logger, :actions, :recipe, :skip_set_entrance
|
21
25
|
|
22
26
|
def initialize(excretion = nil)
|
23
|
-
@paths
|
24
|
-
@actions
|
27
|
+
@paths = []
|
28
|
+
@actions = []
|
25
29
|
@excretion = excretion || { status: 'inprogress', results: [] }
|
26
|
-
@logger
|
30
|
+
@logger = Logger.new(STDOUT)
|
31
|
+
@visited_paths = Set.new
|
32
|
+
@broken_paths = []
|
27
33
|
end
|
28
34
|
|
29
35
|
# The number of seconds to wait between two consecutive requests.
|
@@ -37,8 +43,9 @@ class MicroSpider
|
|
37
43
|
# Visit the path.
|
38
44
|
#
|
39
45
|
# @param path [String] the path to visit, can be absolute path or relative path.
|
46
|
+
#
|
40
47
|
# @example Visit a path
|
41
|
-
# spider =
|
48
|
+
# spider = MicroSpider.new
|
42
49
|
# spider.visit('/example')
|
43
50
|
# spider.visit('http://google.com')
|
44
51
|
#
|
@@ -50,13 +57,37 @@ class MicroSpider
|
|
50
57
|
logger.info "Current location is #{path}."
|
51
58
|
end
|
52
59
|
|
60
|
+
# Click the locator. This will trigger visit action and change current location.
|
61
|
+
# @param locator [String] the text or id of the link.
|
62
|
+
#
|
53
63
|
def click(locator, opts = {})
|
54
|
-
actions << lambda {
|
64
|
+
actions << lambda {
|
55
65
|
path = find_link(locator, opts)[:href]
|
56
66
|
visit(path)
|
57
67
|
}
|
58
68
|
end
|
59
|
-
|
69
|
+
|
70
|
+
# Teach the spider behaviors and it will repeat to the end.
|
71
|
+
# @param recipe [String, Proc] the recipe to be learned.
|
72
|
+
#
|
73
|
+
# @example
|
74
|
+
# spider = MicroSpider.new
|
75
|
+
# spider.learn do
|
76
|
+
# entrance 'http://google.com'
|
77
|
+
# end
|
78
|
+
# spider.crawl
|
79
|
+
#
|
80
|
+
# @example
|
81
|
+
# spider.learn("entrance 'http://google.com'")
|
82
|
+
# spider.crawl
|
83
|
+
#
|
84
|
+
# @example
|
85
|
+
# recipe = lambda {
|
86
|
+
# entrance 'http://google.com'
|
87
|
+
# }
|
88
|
+
# spider.learn(recipe)
|
89
|
+
# spider.crawl
|
90
|
+
#
|
60
91
|
def learn(recipe = nil, &block)
|
61
92
|
if block_given?
|
62
93
|
instance_eval(&block)
|
@@ -76,14 +107,33 @@ class MicroSpider
|
|
76
107
|
return if @site
|
77
108
|
Capybara.app_host = @excretion[:site] = @site = url
|
78
109
|
end
|
79
|
-
|
110
|
+
|
111
|
+
# This will be the first path for spider to visit.
|
112
|
+
# If there is more than one entrance, the spider will crawl them one by one.
|
113
|
+
# @param path_or_paths [String] one or more entrances
|
114
|
+
#
|
115
|
+
# @example
|
116
|
+
# spider = MicroSpider.new
|
117
|
+
# spider.site('http://google.com')
|
118
|
+
# spider.entrance('/a')
|
119
|
+
# spider.entrance('/b')
|
120
|
+
#
|
80
121
|
def entrance(*path_or_paths)
|
81
122
|
return if @skip_set_entrance
|
82
123
|
@paths += path_or_paths
|
83
124
|
end
|
84
125
|
|
85
|
-
|
126
|
+
# Sometimes the entrances are on the page.
|
127
|
+
# @param path [String] path to visit
|
128
|
+
# @param pattern [String, Regexp] links pattern
|
129
|
+
#
|
130
|
+
# @example
|
131
|
+
# spider = MicroSpider.new
|
132
|
+
# spider.entrance_on_path('http://google.com', '.links a')
|
133
|
+
#
|
134
|
+
def entrance_on_path(path, pattern, opts = {}, &block)
|
86
135
|
return if @skip_set_entrance
|
136
|
+
kind = opts[:kind] || :css
|
87
137
|
visit(path)
|
88
138
|
entrances = scan_all(kind, pattern, opts).map do |element|
|
89
139
|
block_given? ? yield(element) : element[:href]
|
@@ -95,24 +145,60 @@ class MicroSpider
|
|
95
145
|
return excretion if completed?
|
96
146
|
|
97
147
|
@paths.compact!
|
98
|
-
path =
|
148
|
+
path = nil
|
149
|
+
loop do
|
150
|
+
path = @paths.shift
|
151
|
+
break if path.nil?
|
152
|
+
break unless @visited_paths.include?(path)
|
153
|
+
end
|
154
|
+
|
99
155
|
if path.nil?
|
100
156
|
excretion[:status] = 'completed'
|
101
157
|
return excretion
|
102
158
|
end
|
103
159
|
|
104
|
-
|
105
|
-
execute_actions
|
106
|
-
yield(@current_location) if block_given?
|
107
|
-
excretion[:results] << @current_location
|
160
|
+
learn(@recipe) if @actions.empty?
|
108
161
|
|
109
|
-
|
110
|
-
|
111
|
-
|
162
|
+
begin
|
163
|
+
visit(path)
|
164
|
+
rescue Timeout::Error => err
|
165
|
+
@broken_paths << path
|
166
|
+
logger.fatal("Timeout!!! execution expired when visit `#{path}`")
|
167
|
+
logger.fatal(err)
|
168
|
+
rescue SystemExit, Interrupt
|
169
|
+
logger.fatal("SystemExit && Interrupt")
|
170
|
+
exit!
|
171
|
+
rescue Exception => err
|
172
|
+
@broken_paths << path
|
173
|
+
logger.fatal("Caught exception when visit `#{path}`")
|
174
|
+
logger.fatal(err)
|
175
|
+
else
|
176
|
+
@visited_paths << path
|
177
|
+
execute_actions
|
178
|
+
yield(@current_location) if block_given?
|
179
|
+
excretion[:results] << @current_location
|
180
|
+
ensure
|
181
|
+
@actions = []
|
182
|
+
@skip_set_entrance = true
|
183
|
+
crawl(&block)
|
184
|
+
end
|
112
185
|
|
113
186
|
excretion
|
114
187
|
end
|
115
188
|
|
189
|
+
# Spider can create custom action when it is crawling.
|
190
|
+
# @param name [String] the name of action
|
191
|
+
# @param block [Proc] the actions
|
192
|
+
#
|
193
|
+
# @example
|
194
|
+
# spider = MicroSpider.new
|
195
|
+
#
|
196
|
+
# spider.create_action :save do |result|
|
197
|
+
# SomeClass.save(result)
|
198
|
+
# end
|
199
|
+
#
|
200
|
+
# spider.save
|
201
|
+
#
|
116
202
|
def create_action(name, &block)
|
117
203
|
action = proc { actions << lambda { block.call(current_location) } }
|
118
204
|
metaclass.send :define_method, name, &action
|
@@ -126,6 +212,8 @@ class MicroSpider
|
|
126
212
|
spider = self.clone
|
127
213
|
spider.instance_variable_set(:@paths, [])
|
128
214
|
spider.instance_variable_set(:@actions, [])
|
215
|
+
spider.instance_variable_set(:@visited_paths, [])
|
216
|
+
spider.instance_variable_set(:@broken_paths, Set.new)
|
129
217
|
spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
|
130
218
|
spider.skip_set_entrance = false
|
131
219
|
spider
|
data/lib/spider_core/behavior.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
|
+
require 'enumerable/lazy' if RUBY_VERSION < '2.0'
|
2
|
+
|
1
3
|
module SpiderCore
|
2
4
|
module Behavior
|
3
5
|
|
4
6
|
protected
|
5
7
|
|
6
|
-
def scan_all(kind, pattern,
|
8
|
+
def scan_all(kind, pattern, opts = {})
|
7
9
|
if pattern.is_a?(String)
|
8
10
|
elements = all(kind, pattern).lazy
|
9
11
|
if opts[:limit] && opts[:limit].to_i > 0
|
@@ -36,10 +36,12 @@ module SpiderCore
|
|
36
36
|
|
37
37
|
protected
|
38
38
|
def handle_element(element)
|
39
|
-
if element
|
40
|
-
element.text
|
41
|
-
else
|
39
|
+
if element.is_a?(String)
|
42
40
|
element
|
41
|
+
elsif element.tag_name == 'input'
|
42
|
+
element.value
|
43
|
+
else
|
44
|
+
element.text
|
43
45
|
end
|
44
46
|
end
|
45
47
|
|
@@ -63,7 +65,7 @@ module SpiderCore
|
|
63
65
|
when :field
|
64
66
|
scan_first(action_opts[:kind], action_opts[:pattern])
|
65
67
|
when :fields
|
66
|
-
scan_all(action_opts[:kind], action_opts[:pattern], opts)
|
68
|
+
scan_all(action_opts[:kind], action_opts[:pattern], opts)
|
67
69
|
else
|
68
70
|
raise 'Unknow action.'
|
69
71
|
end
|
@@ -3,8 +3,9 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :skip_followers
|
5
5
|
|
6
|
-
def follow(pattern,
|
6
|
+
def follow(pattern, opts = {}, &block)
|
7
7
|
return unless block_given?
|
8
|
+
kind = opts[:kind] || :css
|
8
9
|
actions << lambda {
|
9
10
|
spider = self.spawn
|
10
11
|
spider.learn(&block)
|
@@ -3,7 +3,8 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :next_page, :skip_pages
|
5
5
|
|
6
|
-
def keep_eyes_on_next_page(pattern,
|
6
|
+
def keep_eyes_on_next_page(pattern, opts = {})
|
7
|
+
kind = opts[:kind] || :css
|
7
8
|
actions << lambda {
|
8
9
|
@next_page = first(kind, pattern)[:href] rescue nil
|
9
10
|
@paths.unshift(@next_page) if @next_page
|
data/lib/spider_core/version.rb
CHANGED
data/test/micro_spider_test.rb
CHANGED
metadata
CHANGED
@@ -1,111 +1,142 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: micro_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.16
|
4
|
+
version: 0.1.17
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- zires
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-25 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: capybara
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
16
18
|
requirements:
|
17
|
-
- - '>='
|
19
|
+
- - ! '>='
|
18
20
|
- !ruby/object:Gem::Version
|
19
21
|
version: '0'
|
20
22
|
type: :runtime
|
21
23
|
prerelease: false
|
22
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
23
26
|
requirements:
|
24
|
-
- - '>='
|
27
|
+
- - ! '>='
|
25
28
|
- !ruby/object:Gem::Version
|
26
29
|
version: '0'
|
27
30
|
- !ruby/object:Gem::Dependency
|
28
31
|
name: capybara-webkit
|
29
32
|
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
30
34
|
requirements:
|
31
|
-
- - '>='
|
35
|
+
- - ! '>='
|
32
36
|
- !ruby/object:Gem::Version
|
33
37
|
version: '0'
|
34
38
|
type: :runtime
|
35
39
|
prerelease: false
|
36
40
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
37
42
|
requirements:
|
38
|
-
- - '>='
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: enumerable-lazy
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
39
60
|
- !ruby/object:Gem::Version
|
40
61
|
version: '0'
|
41
62
|
- !ruby/object:Gem::Dependency
|
42
63
|
name: pry
|
43
64
|
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
44
66
|
requirements:
|
45
|
-
- - '>='
|
67
|
+
- - ! '>='
|
46
68
|
- !ruby/object:Gem::Version
|
47
69
|
version: '0'
|
48
70
|
type: :development
|
49
71
|
prerelease: false
|
50
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
51
74
|
requirements:
|
52
|
-
- - '>='
|
75
|
+
- - ! '>='
|
53
76
|
- !ruby/object:Gem::Version
|
54
77
|
version: '0'
|
55
78
|
- !ruby/object:Gem::Dependency
|
56
79
|
name: yard
|
57
80
|
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
58
82
|
requirements:
|
59
|
-
- - '>='
|
83
|
+
- - ! '>='
|
60
84
|
- !ruby/object:Gem::Version
|
61
85
|
version: '0'
|
62
86
|
type: :development
|
63
87
|
prerelease: false
|
64
88
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
65
90
|
requirements:
|
66
|
-
- - '>='
|
91
|
+
- - ! '>='
|
67
92
|
- !ruby/object:Gem::Version
|
68
93
|
version: '0'
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: rake
|
71
96
|
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
72
98
|
requirements:
|
73
|
-
- - '>='
|
99
|
+
- - ! '>='
|
74
100
|
- !ruby/object:Gem::Version
|
75
101
|
version: '0'
|
76
102
|
type: :development
|
77
103
|
prerelease: false
|
78
104
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
79
106
|
requirements:
|
80
|
-
- - '>='
|
107
|
+
- - ! '>='
|
81
108
|
- !ruby/object:Gem::Version
|
82
109
|
version: '0'
|
83
110
|
- !ruby/object:Gem::Dependency
|
84
111
|
name: turn
|
85
112
|
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
86
114
|
requirements:
|
87
|
-
- - '>='
|
115
|
+
- - ! '>='
|
88
116
|
- !ruby/object:Gem::Version
|
89
117
|
version: '0'
|
90
118
|
type: :development
|
91
119
|
prerelease: false
|
92
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
93
122
|
requirements:
|
94
|
-
- - '>='
|
123
|
+
- - ! '>='
|
95
124
|
- !ruby/object:Gem::Version
|
96
125
|
version: '0'
|
97
126
|
- !ruby/object:Gem::Dependency
|
98
127
|
name: sinatra
|
99
128
|
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
100
130
|
requirements:
|
101
|
-
- - '>='
|
131
|
+
- - ! '>='
|
102
132
|
- !ruby/object:Gem::Version
|
103
133
|
version: '0'
|
104
134
|
type: :development
|
105
135
|
prerelease: false
|
106
136
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
107
138
|
requirements:
|
108
|
-
- - '>='
|
139
|
+
- - ! '>='
|
109
140
|
- !ruby/object:Gem::Version
|
110
141
|
version: '0'
|
111
142
|
description: A DSL to write web spider. Depend on capybara and capybara-webkit.
|
@@ -128,27 +159,29 @@ files:
|
|
128
159
|
- test/micro_spider_test.rb
|
129
160
|
- test/test_helper.rb
|
130
161
|
homepage: https://github.com/zires/micro-spider
|
131
|
-
licenses:
|
132
|
-
|
162
|
+
licenses:
|
163
|
+
- MIT
|
133
164
|
post_install_message:
|
134
165
|
rdoc_options: []
|
135
166
|
require_paths:
|
136
167
|
- lib
|
137
168
|
required_ruby_version: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
138
170
|
requirements:
|
139
|
-
- - '>='
|
171
|
+
- - ! '>='
|
140
172
|
- !ruby/object:Gem::Version
|
141
173
|
version: '0'
|
142
174
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
175
|
+
none: false
|
143
176
|
requirements:
|
144
|
-
- - '>='
|
177
|
+
- - ! '>='
|
145
178
|
- !ruby/object:Gem::Version
|
146
179
|
version: '0'
|
147
180
|
requirements: []
|
148
181
|
rubyforge_project:
|
149
|
-
rubygems_version:
|
182
|
+
rubygems_version: 1.8.23
|
150
183
|
signing_key:
|
151
|
-
specification_version:
|
184
|
+
specification_version: 3
|
152
185
|
summary: A DSL to write web spider.
|
153
186
|
test_files:
|
154
187
|
- test/micro_spider_test.rb
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: f9363e70b57c95de9256ea2549cf2ee76c2669c0
|
4
|
-
data.tar.gz: 7f8b0bc18fde686058c2b426f84c05b26ba3a1b3
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: eb6a2ca107f788c95b4244b06de2ac8b7c94e983e0306dbcce71872cee37dd5946784cb900681fdd9d0ee35f6e82c2c6f7abbfed4916fabbdcc97a1ee27c849b
|
7
|
-
data.tar.gz: 06ea26fcfd3b53edbb461927772a50d4320525e89f9d7c19e250cc93ba33937362dd60d7d5467dd5d7a0c8950fa67e50c0a7d2df6f5fd5c81a51ca1a9e78c9cb
|