micro_spider 0.1.19 → 0.1.20

Sign up to get free protection for your applications and to get access to all the features.
data/lib/micro_spider.rb CHANGED
@@ -35,13 +35,14 @@ class MicroSpider
35
35
  attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
36
36
 
37
37
  def initialize(excretion = nil)
38
- @paths = []
39
- @actions = []
40
- @timeout = 120
41
- @excretion = excretion || { status: 'inprogress', results: [] }
42
- @logger = Logger.new(STDOUT)
43
- @visited_paths = Set.new
44
- @broken_paths = []
38
+ @paths = []
39
+ @actions = []
40
+ @setted_variables = {}
41
+ @timeout = 120
42
+ @excretion = excretion || { status: 'inprogress', results: [] }
43
+ @logger = Logger.new(STDOUT)
44
+ @visited_paths = Set.new
45
+ @broken_paths = []
45
46
  end
46
47
 
47
48
  # The number of seconds to wait between every two requests.
@@ -70,6 +71,31 @@ class MicroSpider
70
71
  logger.info "Current location is #{path}."
71
72
  end
72
73
 
74
+ # Set a variable. You can use it later.
75
+ #
76
+ # @param name [String] the variable name
77
+ # @param value [String] the variable value, or the pattern to scan for when opts[:selector] is given
78
+ # @param opts [Hash] the options. Pass :selector (css or xpath) to take the value from the page.
79
+ #
80
+ # @example Set a variable
81
+ # spider = MicroSpider.new
82
+ # spider.set :id, '645'
83
+ # spider.set :table, '.tb a', selector: :css
84
+ # spider.set :table, '.tb a', selector: :css do |e|
85
+ # e['src']
86
+ # end
87
+ def set(name, value, opts = {}, &block)
88
+ selector = opts.delete(:selector)
89
+ if selector.nil?
90
+ @setted_variables[name.to_s] = value
91
+ else
92
+ actions << lambda {
93
+ elements = scan_all(selector, value, opts)
94
+ @setted_variables[name.to_s] = block_given? ? yield(elements) : handle_element(elements.first)
95
+ }
96
+ end
97
+ end
98
+
73
99
  # Click the locator. This will trigger visit action and change current location.
74
100
  # @param locator [String] the text or id of the link.
75
101
  #
@@ -129,7 +155,7 @@ class MicroSpider
129
155
  return if @site
130
156
  Capybara.app_host = @excretion[:site] = @site = url
131
157
  end
132
-
158
+
133
159
  # This will be the first path for spider to visit.
134
160
  # If there is more than one entrance, the spider will crawl them one by one.
135
161
  # @param path_or_paths [String] one or more entrances
@@ -248,7 +274,7 @@ class MicroSpider
248
274
  spider.instance_variable_set(:@paths, [])
249
275
  spider.instance_variable_set(:@actions, [])
250
276
  spider.instance_variable_set(:@visited_paths, Set.new)
251
- spider.instance_variable_set(:@broken_paths, Set.new)
277
+ spider.instance_variable_set(:@broken_paths, Set.new)
252
278
  spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
253
279
  spider.skip_set_entrance = false
254
280
  spider
@@ -6,6 +6,7 @@ module SpiderCore
6
6
  protected
7
7
 
8
8
  def scan_all(kind, pattern, opts = {})
9
+ pattern = handle_pattern(pattern)
9
10
  if pattern.is_a?(String)
10
11
  elements = all(kind, pattern).lazy
11
12
  if opts[:limit] && opts[:limit].to_i > 0
@@ -18,6 +19,7 @@ module SpiderCore
18
19
  end
19
20
 
20
21
  def scan_first(kind, pattern)
22
+ pattern = handle_pattern(pattern)
21
23
  if pattern.is_a?(String)
22
24
  first(kind, pattern)
23
25
  elsif pattern.is_a?(Regexp)
@@ -25,5 +27,39 @@ module SpiderCore
25
27
  end
26
28
  end
27
29
 
30
+ def handle_element(element)
31
+ if element.is_a?(String)
32
+ element
33
+ elsif element.tag_name == 'input'
34
+ element.value
35
+ else
36
+ element.text
37
+ end
38
+ end
39
+
40
+ def handle_elements(elements, &block)
41
+ if elements.respond_to?(:map) && block_given?
42
+ elements.map { |element| yield(element) }.force
43
+ elsif elements.respond_to?(:map)
44
+ elements.map { |element| handle_element(element) }.force
45
+ elsif block_given?
46
+ yield(elements)
47
+ else
48
+ handle_element(elements)
49
+ end
50
+ end
51
+
52
+ # @example Handle pattern
53
+ # handle_pattern('.a') # =>'.a'
54
+ # set :id, 'a'
55
+ # handle_pattern('.%{id}bc') # =>'.abc'
56
+ def handle_pattern(pattern)
57
+ scan_results = pattern.scan(/(?<=%{)[^}]*(?=})/)
58
+ unless scan_results.empty?
59
+ scan_results.each { |v| pattern = pattern.sub(/%\{#{v}\}/, @setted_variables[v]) }
60
+ end
61
+ pattern
62
+ end
63
+
28
64
  end
29
65
  end
@@ -35,27 +35,6 @@ module SpiderCore
35
35
  end
36
36
 
37
37
  protected
38
- def handle_element(element)
39
- if element.is_a?(String)
40
- element
41
- elsif element.tag_name == 'input'
42
- element.value
43
- else
44
- element.text
45
- end
46
- end
47
-
48
- def handle_elements(elements, &block)
49
- if elements.respond_to?(:map) && block_given?
50
- elements.map { |element| yield(element) }.force
51
- elsif elements.respond_to?(:map)
52
- elements.map { |element| handle_element(element) }.force
53
- elsif block_given?
54
- yield(elements)
55
- else
56
- handle_element(elements)
57
- end
58
- end
59
38
 
60
39
  def action_for(action, action_opts = {}, opts = {}, &block)
61
40
  begin
@@ -1,3 +1,3 @@
1
1
  module SpiderCore
2
- VERSION = "0.1.19"
2
+ VERSION = "0.1.20"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: micro_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.19
4
+ version: 0.1.20
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-12 00:00:00.000000000 Z
12
+ date: 2013-09-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: capybara