micro_spider 0.1.19 → 0.1.20
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/micro_spider.rb +35 -9
- data/lib/spider_core/behavior.rb +36 -0
- data/lib/spider_core/field_dsl.rb +0 -21
- data/lib/spider_core/version.rb +1 -1
- metadata +2 -2
data/lib/micro_spider.rb
CHANGED
@@ -35,13 +35,14 @@ class MicroSpider
|
|
35
35
|
attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
|
36
36
|
|
37
37
|
def initialize(excretion = nil)
|
38
|
-
@paths
|
39
|
-
@actions
|
40
|
-
@
|
41
|
-
@
|
42
|
-
@
|
43
|
-
@
|
44
|
-
@
|
38
|
+
@paths = []
|
39
|
+
@actions = []
|
40
|
+
@setted_variables = {}
|
41
|
+
@timeout = 120
|
42
|
+
@excretion = excretion || { status: 'inprogress', results: [] }
|
43
|
+
@logger = Logger.new(STDOUT)
|
44
|
+
@visited_paths = Set.new
|
45
|
+
@broken_paths = []
|
45
46
|
end
|
46
47
|
|
47
48
|
# The number of seconds between two consecutive requests.
|
@@ -70,6 +71,31 @@ class MicroSpider
|
|
70
71
|
logger.info "Current location is #{path}."
|
71
72
|
end
|
72
73
|
|
74
|
+
# Set a variable. You can use it later.
|
75
|
+
#
|
76
|
+
# @param name [String] the variable name
|
77
|
+
# @param value [String] the variable value
|
78
|
+
# @param opts [Hash] the options. can set selector with css or xpath
|
79
|
+
#
|
80
|
+
# @example Set a variable
|
81
|
+
# spider = MicroSpider.new
|
82
|
+
# spider.set :id, '645'
|
83
|
+
# spider.set :table, '.tb a', selector: :css
|
84
|
+
# spider.set :table, '.tb a', selector: :css do |e|
|
85
|
+
# e['src']
|
86
|
+
# end
|
87
|
+
def set(name, value, opts = {}, &block)
|
88
|
+
selector = opts.delete(:selector)
|
89
|
+
if selector.nil?
|
90
|
+
@setted_variables[name.to_s] = value
|
91
|
+
else
|
92
|
+
actions << lambda {
|
93
|
+
elements = scan_all(selector, value, opts)
|
94
|
+
@setted_variables[name.to_s] = block_given? ? yield(elements) : handle_element(elements.first)
|
95
|
+
}
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
73
99
|
# Click the locator. This will trigger visit action and change current location.
|
74
100
|
# @param locator [String] the text or id of the link.
|
75
101
|
#
|
@@ -129,7 +155,7 @@ class MicroSpider
|
|
129
155
|
return if @site
|
130
156
|
Capybara.app_host = @excretion[:site] = @site = url
|
131
157
|
end
|
132
|
-
|
158
|
+
|
133
159
|
# This will be the first path for spider to visit.
|
134
160
|
# If more than one entrance, the spider will crawl them one by one.
|
135
161
|
# @param path_or_paths [String] one or more entrances
|
@@ -248,7 +274,7 @@ class MicroSpider
|
|
248
274
|
spider.instance_variable_set(:@paths, [])
|
249
275
|
spider.instance_variable_set(:@actions, [])
|
250
276
|
spider.instance_variable_set(:@visited_paths, Set.new)
|
251
|
-
spider.instance_variable_set(:@broken_paths,
|
277
|
+
spider.instance_variable_set(:@broken_paths, Set.new)
|
252
278
|
spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
|
253
279
|
spider.skip_set_entrance = false
|
254
280
|
spider
|
data/lib/spider_core/behavior.rb
CHANGED
@@ -6,6 +6,7 @@ module SpiderCore
|
|
6
6
|
protected
|
7
7
|
|
8
8
|
def scan_all(kind, pattern, opts = {})
|
9
|
+
pattern = handle_pattern(pattern)
|
9
10
|
if pattern.is_a?(String)
|
10
11
|
elements = all(kind, pattern).lazy
|
11
12
|
if opts[:limit] && opts[:limit].to_i > 0
|
@@ -18,6 +19,7 @@ module SpiderCore
|
|
18
19
|
end
|
19
20
|
|
20
21
|
def scan_first(kind, pattern)
|
22
|
+
pattern = handle_pattern(pattern)
|
21
23
|
if pattern.is_a?(String)
|
22
24
|
first(kind, pattern)
|
23
25
|
elsif pattern.is_a?(Regexp)
|
@@ -25,5 +27,39 @@ module SpiderCore
|
|
25
27
|
end
|
26
28
|
end
|
27
29
|
|
30
|
+
def handle_element(element)
|
31
|
+
if element.is_a?(String)
|
32
|
+
element
|
33
|
+
elsif element.tag_name == 'input'
|
34
|
+
element.value
|
35
|
+
else
|
36
|
+
element.text
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def handle_elements(elements, &block)
|
41
|
+
if elements.respond_to?(:map) && block_given?
|
42
|
+
elements.map { |element| yield(element) }.force
|
43
|
+
elsif elements.respond_to?(:map)
|
44
|
+
elements.map { |element| handle_element(element) }.force
|
45
|
+
elsif block_given?
|
46
|
+
yield(elements)
|
47
|
+
else
|
48
|
+
handle_element(elements)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# @example Handle pattern
|
53
|
+
# handle_pattern('.a') # =>'.a'
|
54
|
+
# set :id, 'a'
|
55
|
+
# handle_pattern('.%{id}bc') # =>'.abc'
|
56
|
+
def handle_pattern(pattern)
|
57
|
+
scan_results = pattern.scan(/(?<=%{)[^}]*(?=})/)
|
58
|
+
unless scan_results.empty?
|
59
|
+
scan_results.each { |v| pattern = pattern.sub(/%\{#{v}\}/, @setted_variables[v]) }
|
60
|
+
end
|
61
|
+
pattern
|
62
|
+
end
|
63
|
+
|
28
64
|
end
|
29
65
|
end
|
@@ -35,27 +35,6 @@ module SpiderCore
|
|
35
35
|
end
|
36
36
|
|
37
37
|
protected
|
38
|
-
def handle_element(element)
|
39
|
-
if element.is_a?(String)
|
40
|
-
element
|
41
|
-
elsif element.tag_name == 'input'
|
42
|
-
element.value
|
43
|
-
else
|
44
|
-
element.text
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def handle_elements(elements, &block)
|
49
|
-
if elements.respond_to?(:map) && block_given?
|
50
|
-
elements.map { |element| yield(element) }.force
|
51
|
-
elsif elements.respond_to?(:map)
|
52
|
-
elements.map { |element| handle_element(element) }.force
|
53
|
-
elsif block_given?
|
54
|
-
yield(elements)
|
55
|
-
else
|
56
|
-
handle_element(elements)
|
57
|
-
end
|
58
|
-
end
|
59
38
|
|
60
39
|
def action_for(action, action_opts = {}, opts = {}, &block)
|
61
40
|
begin
|
data/lib/spider_core/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: micro_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.19
|
4
|
+
version: 0.1.20
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-09-
|
12
|
+
date: 2013-09-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: capybara
|