micro_spider 0.1.19 → 0.1.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/micro_spider.rb +35 -9
- data/lib/spider_core/behavior.rb +36 -0
- data/lib/spider_core/field_dsl.rb +0 -21
- data/lib/spider_core/version.rb +1 -1
- metadata +2 -2
data/lib/micro_spider.rb
CHANGED
@@ -35,13 +35,14 @@ class MicroSpider
|
|
35
35
|
attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
|
36
36
|
|
37
37
|
def initialize(excretion = nil)
|
38
|
-
@paths
|
39
|
-
@actions
|
40
|
-
@
|
41
|
-
@
|
42
|
-
@
|
43
|
-
@
|
44
|
-
@
|
38
|
+
@paths = []
|
39
|
+
@actions = []
|
40
|
+
@setted_variables = {}
|
41
|
+
@timeout = 120
|
42
|
+
@excretion = excretion || { status: 'inprogress', results: [] }
|
43
|
+
@logger = Logger.new(STDOUT)
|
44
|
+
@visited_paths = Set.new
|
45
|
+
@broken_paths = []
|
45
46
|
end
|
46
47
|
|
47
48
|
# The number of seconds to wait between two consecutive requests.
|
@@ -70,6 +71,31 @@ class MicroSpider
|
|
70
71
|
logger.info "Current location is #{path}."
|
71
72
|
end
|
72
73
|
|
74
|
+
# Set a variable. You can use it later, e.g. as `%{name}` inside a pattern.
#
# Without a :selector option the value is stored immediately. With a
# :selector the lookup is queued in `actions`, and the variable is only
# assigned when that queued action is run.
#
# @param name [String, Symbol] the variable name (stored under name.to_s)
# @param value [String] the literal value, or the pattern to scan for when
#   a selector is given
# @param opts [Hash] the options. :selector can be set to css or xpath; the
#   remaining options are passed on to scan_all
# @yield [elements] the scanned elements (the whole collection, not a single
#   element); the block's result becomes the variable value
#
# @example Set a variable
#   spider = MicroSpider.new
#   spider.set :id, '645'
#   spider.set :table, '.tb a', selector: :css
#   spider.set :table, '.tb a', selector: :css do |elements|
#     elements.first['src']
#   end
def set(name, value, opts = {}, &block)
  # Work on a copy so the caller's hash is not stripped of :selector.
  opts = opts.dup
  selector = opts.delete(:selector)
  key = name.to_s

  if selector.nil?
    @setted_variables[key] = value
  else
    # Deferred: the page is only scanned when the queued action is called.
    actions << lambda {
      elements = scan_all(selector, value, opts)
      @setted_variables[key] = block ? block.call(elements) : handle_element(elements.first)
    }
  end
end
|
98
|
+
|
73
99
|
# Click the locator. This will trigger visit action and change current location.
|
74
100
|
# @param locator [String] the text or id of the link.
|
75
101
|
#
|
@@ -129,7 +155,7 @@ class MicroSpider
|
|
129
155
|
return if @site
|
130
156
|
Capybara.app_host = @excretion[:site] = @site = url
|
131
157
|
end
|
132
|
-
|
158
|
+
|
133
159
|
# This will be the first path for spider to visit.
|
134
160
|
# If there is more than one entrance, the spider will crawl them one by one.
|
135
161
|
# @param path_or_paths [String] one or more entrances
|
@@ -248,7 +274,7 @@ class MicroSpider
|
|
248
274
|
spider.instance_variable_set(:@paths, [])
|
249
275
|
spider.instance_variable_set(:@actions, [])
|
250
276
|
spider.instance_variable_set(:@visited_paths, Set.new)
|
251
|
-
spider.instance_variable_set(:@broken_paths,
|
277
|
+
spider.instance_variable_set(:@broken_paths, Set.new)
|
252
278
|
spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
|
253
279
|
spider.skip_set_entrance = false
|
254
280
|
spider
|
data/lib/spider_core/behavior.rb
CHANGED
@@ -6,6 +6,7 @@ module SpiderCore
|
|
6
6
|
protected
|
7
7
|
|
8
8
|
def scan_all(kind, pattern, opts = {})
|
9
|
+
pattern = handle_pattern(pattern)
|
9
10
|
if pattern.is_a?(String)
|
10
11
|
elements = all(kind, pattern).lazy
|
11
12
|
if opts[:limit] && opts[:limit].to_i > 0
|
@@ -18,6 +19,7 @@ module SpiderCore
|
|
18
19
|
end
|
19
20
|
|
20
21
|
def scan_first(kind, pattern)
|
22
|
+
pattern = handle_pattern(pattern)
|
21
23
|
if pattern.is_a?(String)
|
22
24
|
first(kind, pattern)
|
23
25
|
elsif pattern.is_a?(Regexp)
|
@@ -25,5 +27,39 @@ module SpiderCore
|
|
25
27
|
end
|
26
28
|
end
|
27
29
|
|
30
|
+
# Reduce a scanned element to its value.
#
# @param element [String, #tag_name, nil] a plain string, an element-like
#   object responding to tag_name/value/text, or nil when nothing was found
# @return [String, nil] the string itself, the `value` of an input element,
#   the `text` of any other element, or nil for a nil element
def handle_element(element)
  # Nothing matched (e.g. `first` on an empty result): avoid calling
  # tag_name on nil.
  return if element.nil?
  return element if element.is_a?(String)

  element.tag_name == 'input' ? element.value : element.text
end
|
39
|
+
|
40
|
+
# Convert a scan result into plain values.
#
# A collection (anything responding to `map`) is mapped element by element;
# any other object is handled as one element. When a block is given it
# replaces handle_element as the conversion.
#
# @param elements [#map, Object] a lazy or eager collection, or one element
# @yield [element] optional custom conversion for each element
# @return [Array, Object] an Array for collections, otherwise the single
#   converted value
def handle_elements(elements, &block)
  if elements.respond_to?(:map)
    converted =
      if block
        elements.map { |element| block.call(element) }
      else
        elements.map { |element| handle_element(element) }
      end
    # to_a evaluates a lazy enumerator (same as force) and also accepts an
    # already-eager Array, which has no `force` method.
    converted.to_a
  elsif block
    block.call(elements)
  else
    handle_element(elements)
  end
end
|
51
|
+
|
52
|
+
# Interpolate variables stored by `set` into a pattern.
#
# Every `%{name}` placeholder is replaced with the value stored under
# "name" in @setted_variables. A pattern that is not a String (scan_first
# also branches on Regexp patterns) is returned untouched.
#
# @param pattern [String, Regexp] the pattern, possibly with placeholders
# @return [String, Regexp] the interpolated string, or the original
#   non-String pattern
# @raise [KeyError] when a placeholder names a variable that is unset or nil
#
# @example Handle pattern
#   handle_pattern('.a')        # => '.a'
#   set :id, 'a'
#   handle_pattern('.%{id}bc')  # => '.abc'
def handle_pattern(pattern)
  return pattern unless pattern.is_a?(String)

  # The block form of gsub inserts the value verbatim (no backslash
  # processing) and never needs the variable name embedded in a regexp.
  pattern.gsub(/%\{([^}]*)\}/) do
    name = Regexp.last_match(1)
    value = @setted_variables[name]
    if value.nil?
      raise KeyError, "variable #{name.inspect} used in pattern #{pattern.inspect} has not been set"
    end

    value.to_s
  end
end
|
63
|
+
|
28
64
|
end
|
29
65
|
end
|
@@ -35,27 +35,6 @@ module SpiderCore
|
|
35
35
|
end
|
36
36
|
|
37
37
|
protected
|
38
|
-
def handle_element(element)
|
39
|
-
if element.is_a?(String)
|
40
|
-
element
|
41
|
-
elsif element.tag_name == 'input'
|
42
|
-
element.value
|
43
|
-
else
|
44
|
-
element.text
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def handle_elements(elements, &block)
|
49
|
-
if elements.respond_to?(:map) && block_given?
|
50
|
-
elements.map { |element| yield(element) }.force
|
51
|
-
elsif elements.respond_to?(:map)
|
52
|
-
elements.map { |element| handle_element(element) }.force
|
53
|
-
elsif block_given?
|
54
|
-
yield(elements)
|
55
|
-
else
|
56
|
-
handle_element(elements)
|
57
|
-
end
|
58
|
-
end
|
59
38
|
|
60
39
|
def action_for(action, action_opts = {}, opts = {}, &block)
|
61
40
|
begin
|
data/lib/spider_core/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: micro_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.20
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-09-
|
12
|
+
date: 2013-09-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: capybara
|