micro_spider 0.1.19 → 0.1.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/micro_spider.rb CHANGED
@@ -35,13 +35,14 @@ class MicroSpider
35
35
  attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
36
36
 
37
37
  def initialize(excretion = nil)
38
- @paths = []
39
- @actions = []
40
- @timeout = 120
41
- @excretion = excretion || { status: 'inprogress', results: [] }
42
- @logger = Logger.new(STDOUT)
43
- @visited_paths = Set.new
44
- @broken_paths = []
38
+ @paths = []
39
+ @actions = []
40
+ @setted_variables = {}
41
+ @timeout = 120
42
+ @excretion = excretion || { status: 'inprogress', results: [] }
43
+ @logger = Logger.new(STDOUT)
44
+ @visited_paths = Set.new
45
+ @broken_paths = []
45
46
  end
46
47
 
47
48
  # The number of seconds to wait between two consecutive requests.
@@ -70,6 +71,31 @@ class MicroSpider
70
71
  logger.info "Current location is #{path}."
71
72
  end
72
73
 
74
+ # Set a variable. You can use it later.
75
+ #
76
+ # @param name [String] the variable name
77
+ # @param value [String] the variable value, or the pattern to scan for when opts[:selector] is given
78
+ # @param opts [Hash] the options. Pass :selector (css or xpath) to read the value from the page.
79
+ #
80
+ # @example Set a variable
81
+ # spider = MicroSpider.new
82
+ # spider.set :id, '645'
83
+ # spider.set :table, '.tb a', selector: :css
84
+ # spider.set :table, '.tb a', selector: :css do |e|
85
+ # e['src']
86
+ # end
87
+ def set(name, value, opts = {}, &block)
88
+ selector = opts.delete(:selector)
89
+ if selector.nil?
90
+ @setted_variables[name.to_s] = value
91
+ else
92
+ actions << lambda {
93
+ elements = scan_all(selector, value, opts)
94
+ @setted_variables[name.to_s] = block_given? ? yield(elements) : handle_element(elements.first)
95
+ }
96
+ end
97
+ end
98
+
73
99
  # Click the locator. This will trigger visit action and change current location.
74
100
  # @param locator [String] the text or id of the link.
75
101
  #
@@ -129,7 +155,7 @@ class MicroSpider
129
155
  return if @site
130
156
  Capybara.app_host = @excretion[:site] = @site = url
131
157
  end
132
-
158
+
133
159
  # This will be the first path for spider to visit.
134
160
  # If there is more than one entrance, the spider will crawl them one by one.
135
161
  # @param path_or_paths [String] one or more entrances
@@ -248,7 +274,7 @@ class MicroSpider
248
274
  spider.instance_variable_set(:@paths, [])
249
275
  spider.instance_variable_set(:@actions, [])
250
276
  spider.instance_variable_set(:@visited_paths, Set.new)
251
- spider.instance_variable_set(:@broken_paths, Set.new)
277
+ spider.instance_variable_set(:@broken_paths, Set.new)
252
278
  spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
253
279
  spider.skip_set_entrance = false
254
280
  spider
@@ -6,6 +6,7 @@ module SpiderCore
6
6
  protected
7
7
 
8
8
  def scan_all(kind, pattern, opts = {})
9
+ pattern = handle_pattern(pattern)
9
10
  if pattern.is_a?(String)
10
11
  elements = all(kind, pattern).lazy
11
12
  if opts[:limit] && opts[:limit].to_i > 0
@@ -18,6 +19,7 @@ module SpiderCore
18
19
  end
19
20
 
20
21
  def scan_first(kind, pattern)
22
+ pattern = handle_pattern(pattern)
21
23
  if pattern.is_a?(String)
22
24
  first(kind, pattern)
23
25
  elsif pattern.is_a?(Regexp)
@@ -25,5 +27,39 @@ module SpiderCore
25
27
  end
26
28
  end
27
29
 
30
+ def handle_element(element)
31
+ if element.is_a?(String)
32
+ element
33
+ elsif element.tag_name == 'input'
34
+ element.value
35
+ else
36
+ element.text
37
+ end
38
+ end
39
+
40
+ def handle_elements(elements, &block)
41
+ if elements.respond_to?(:map) && block_given?
42
+ elements.map { |element| yield(element) }.force
43
+ elsif elements.respond_to?(:map)
44
+ elements.map { |element| handle_element(element) }.force
45
+ elsif block_given?
46
+ yield(elements)
47
+ else
48
+ handle_element(elements)
49
+ end
50
+ end
51
+
52
+ # @example Handle pattern
53
+ # handle_pattern('.a') # =>'.a'
54
+ # set :id, 'a'
55
+ # handle_pattern('.%{id}bc') # =>'.abc'
56
+ def handle_pattern(pattern)
57
+ scan_results = pattern.scan(/(?<=%{)[^}]*(?=})/)
58
+ unless scan_results.empty?
59
+ scan_results.each { |v| pattern = pattern.sub(/%\{#{v}\}/, @setted_variables[v]) }
60
+ end
61
+ pattern
62
+ end
63
+
28
64
  end
29
65
  end
@@ -35,27 +35,6 @@ module SpiderCore
35
35
  end
36
36
 
37
37
  protected
38
- def handle_element(element)
39
- if element.is_a?(String)
40
- element
41
- elsif element.tag_name == 'input'
42
- element.value
43
- else
44
- element.text
45
- end
46
- end
47
-
48
- def handle_elements(elements, &block)
49
- if elements.respond_to?(:map) && block_given?
50
- elements.map { |element| yield(element) }.force
51
- elsif elements.respond_to?(:map)
52
- elements.map { |element| handle_element(element) }.force
53
- elsif block_given?
54
- yield(elements)
55
- else
56
- handle_element(elements)
57
- end
58
- end
59
38
 
60
39
  def action_for(action, action_opts = {}, opts = {}, &block)
61
40
  begin
@@ -1,3 +1,3 @@
1
1
  module SpiderCore
2
- VERSION = "0.1.19"
2
+ VERSION = "0.1.20"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: micro_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.19
4
+ version: 0.1.20
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-12 00:00:00.000000000 Z
12
+ date: 2013-09-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: capybara