micro_spider 0.1.16 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/micro_spider.rb CHANGED
@@ -2,10 +2,14 @@ require 'capybara'
2
2
  require 'capybara-webkit'
3
3
  require 'capybara/dsl'
4
4
 
5
- Capybara.run_server = false
6
5
  Capybara.current_driver = :webkit
6
+ Capybara.configure do |config|
7
+ config.ignore_hidden_elements = false
8
+ config.run_server = false
9
+ end
7
10
 
8
11
  require 'logger'
12
+ require 'set'
9
13
  require 'spider_core'
10
14
 
11
15
  class MicroSpider
@@ -16,14 +20,16 @@ class MicroSpider
16
20
  include SpiderCore::FollowDSL
17
21
  include SpiderCore::PaginationDSL
18
22
 
19
- attr_reader :excretion, :paths, :delay, :current_location
23
+ attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
20
24
  attr_accessor :logger, :actions, :recipe, :skip_set_entrance
21
25
 
22
26
  def initialize(excretion = nil)
23
- @paths = []
24
- @actions = []
27
+ @paths = []
28
+ @actions = []
25
29
  @excretion = excretion || { status: 'inprogress', results: [] }
26
- @logger = Logger.new(STDOUT)
30
+ @logger = Logger.new(STDOUT)
31
+ @visited_paths = Set.new
32
+ @broken_paths = []
27
33
  end
28
34
 
29
35
  # The number of seconds to wait between two consecutive requests.
@@ -37,8 +43,9 @@ class MicroSpider
37
43
  # Visit the path.
38
44
  #
39
45
  # @param path [String] the path to visit, can be absolute path or relative path.
46
+ #
40
47
  # @example Visit a path
41
- # spider = TinySpider.new
48
+ # spider = MicroSpider.new
42
49
  # spider.visit('/example')
43
50
  # spider.visit('http://google.com')
44
51
  #
@@ -50,13 +57,37 @@ class MicroSpider
50
57
  logger.info "Current location is #{path}."
51
58
  end
52
59
 
60
+ # Click the locator. This will trigger visit action and change current location.
61
+ # @param locator [String] the text or id of the link.
62
+ #
53
63
  def click(locator, opts = {})
54
- actions << lambda {
64
+ actions << lambda {
55
65
  path = find_link(locator, opts)[:href]
56
66
  visit(path)
57
67
  }
58
68
  end
59
-
69
+
70
+ # Teach the spider behaviors and it will repeat to the end.
71
+ # @param recipe [String, Proc] the recipe to be learned.
72
+ #
73
+ # @example
74
+ # spider = MicroSpider.new
75
+ # spider.learn do
76
+ # entrance 'http://google.com'
77
+ # end
78
+ # spider.crawl
79
+ #
80
+ # @example
81
+ # spider.learn("entrance 'http://google.com'")
82
+ # spider.crawl
83
+ #
84
+ # @example
85
+ # recipe = lambda {
86
+ # entrance 'http://google.com'
87
+ # }
88
+ # spider.learn(recipe)
89
+ # spider.crawl
90
+ #
60
91
  def learn(recipe = nil, &block)
61
92
  if block_given?
62
93
  instance_eval(&block)
@@ -76,14 +107,33 @@ class MicroSpider
76
107
  return if @site
77
108
  Capybara.app_host = @excretion[:site] = @site = url
78
109
  end
79
-
110
+
111
+ # This will be the first path for spider to visit.
112
+ # If there is more than one entrance, the spider will crawl them one by one.
113
+ # @param path_or_paths [String] one or more entrances
114
+ #
115
+ # @example
116
+ # spider = MicroSpider.new
117
+ # spider.site('http://google.com')
118
+ # spider.entrance('/a')
119
+ # spider.entrance('/b')
120
+ #
80
121
  def entrance(*path_or_paths)
81
122
  return if @skip_set_entrance
82
123
  @paths += path_or_paths
83
124
  end
84
125
 
85
- def entrance_on_path(path, pattern, kind: :css, **opts, &block)
126
+ # Sometimes the entrances are on the page.
127
+ # @param path [String] path to visit
128
+ # @param pattern [String, Regexp] links pattern
129
+ #
130
+ # @example
131
+ # spider = MicroSpider.new
132
+ # spider.entrance_on_path('http://google.com', '.links a')
133
+ #
134
+ def entrance_on_path(path, pattern, opts = {}, &block)
86
135
  return if @skip_set_entrance
136
+ kind = opts[:kind] || :css
87
137
  visit(path)
88
138
  entrances = scan_all(kind, pattern, opts).map do |element|
89
139
  block_given? ? yield(element) : element[:href]
@@ -95,24 +145,60 @@ class MicroSpider
95
145
  return excretion if completed?
96
146
 
97
147
  @paths.compact!
98
- path = @paths.shift
148
+ path = nil
149
+ loop do
150
+ path = @paths.shift
151
+ break if path.nil?
152
+ break unless @visited_paths.include?(path)
153
+ end
154
+
99
155
  if path.nil?
100
156
  excretion[:status] = 'completed'
101
157
  return excretion
102
158
  end
103
159
 
104
- visit(path)
105
- execute_actions
106
- yield(@current_location) if block_given?
107
- excretion[:results] << @current_location
160
+ learn(@recipe) if @actions.empty?
108
161
 
109
- @skip_set_entrance = true
110
- learn(@recipe)
111
- crawl(&block)
162
+ begin
163
+ visit(path)
164
+ rescue Timeout::Error => err
165
+ @broken_paths << path
166
+ logger.fatal("Timeout!!! execution expired when visit `#{path}`")
167
+ logger.fatal(err)
168
+ rescue SystemExit, Interrupt
169
+ logger.fatal("SystemExit && Interrupt")
170
+ exit!
171
+ rescue Exception => err
172
+ @broken_paths << path
173
+ logger.fatal("Caught exception when visit `#{path}`")
174
+ logger.fatal(err)
175
+ else
176
+ @visited_paths << path
177
+ execute_actions
178
+ yield(@current_location) if block_given?
179
+ excretion[:results] << @current_location
180
+ ensure
181
+ @actions = []
182
+ @skip_set_entrance = true
183
+ crawl(&block)
184
+ end
112
185
 
113
186
  excretion
114
187
  end
115
188
 
189
+ # Spider can create custom action when it is crawling.
190
+ # @param name [String] the name of action
191
+ # @param block [Proc] the actions
192
+ #
193
+ # @example
194
+ # spider = MicroSpider.new
195
+ #
196
+ # spider.create_action :save do |result|
197
+ # SomeClass.save(result)
198
+ # end
199
+ #
200
+ # spider.save
201
+ #
116
202
  def create_action(name, &block)
117
203
  action = proc { actions << lambda { block.call(current_location) } }
118
204
  metaclass.send :define_method, name, &action
@@ -126,6 +212,8 @@ class MicroSpider
126
212
  spider = self.clone
127
213
  spider.instance_variable_set(:@paths, [])
128
214
  spider.instance_variable_set(:@actions, [])
215
+ spider.instance_variable_set(:@visited_paths, [])
216
+ spider.instance_variable_set(:@broken_paths, Set.new)
129
217
  spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
130
218
  spider.skip_set_entrance = false
131
219
  spider
@@ -1,9 +1,11 @@
1
+ require 'enumerable/lazy' if RUBY_VERSION < '2.0'
2
+
1
3
  module SpiderCore
2
4
  module Behavior
3
5
 
4
6
  protected
5
7
 
6
- def scan_all(kind, pattern, **opts)
8
+ def scan_all(kind, pattern, opts = {})
7
9
  if pattern.is_a?(String)
8
10
  elements = all(kind, pattern).lazy
9
11
  if opts[:limit] && opts[:limit].to_i > 0
@@ -36,10 +36,12 @@ module SpiderCore
36
36
 
37
37
  protected
38
38
  def handle_element(element)
39
- if element && element.respond_to?(:text)
40
- element.text
41
- else
39
+ if element.is_a?(String)
42
40
  element
41
+ elsif element.tag_name == 'input'
42
+ element.value
43
+ else
44
+ element.text
43
45
  end
44
46
  end
45
47
 
@@ -63,7 +65,7 @@ module SpiderCore
63
65
  when :field
64
66
  scan_first(action_opts[:kind], action_opts[:pattern])
65
67
  when :fields
66
- scan_all(action_opts[:kind], action_opts[:pattern], opts).lazy
68
+ scan_all(action_opts[:kind], action_opts[:pattern], opts)
67
69
  else
68
70
  raise 'Unknow action.'
69
71
  end
@@ -3,8 +3,9 @@ module SpiderCore
3
3
 
4
4
  attr_accessor :skip_followers
5
5
 
6
- def follow(pattern, kind: :css, **opts, &block)
6
+ def follow(pattern, opts = {}, &block)
7
7
  return unless block_given?
8
+ kind = opts[:kind] || :css
8
9
  actions << lambda {
9
10
  spider = self.spawn
10
11
  spider.learn(&block)
@@ -3,7 +3,8 @@ module SpiderCore
3
3
 
4
4
  attr_accessor :next_page, :skip_pages
5
5
 
6
- def keep_eyes_on_next_page(pattern, kind: :css)
6
+ def keep_eyes_on_next_page(pattern, opts = {})
7
+ kind = opts[:kind] || :css
7
8
  actions << lambda {
8
9
  @next_page = first(kind, pattern)[:href] rescue nil
9
10
  @paths.unshift(@next_page) if @next_page
@@ -1,3 +1,3 @@
1
1
  module SpiderCore
2
- VERSION = "0.1.16"
2
+ VERSION = "0.1.17"
3
3
  end
@@ -4,6 +4,7 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
4
4
 
5
5
  def setup
6
6
  @spider = MicroSpider.new
7
+ @spider.logger.level = Logger::WARN
7
8
  end
8
9
 
9
10
  def test_spider_can_visit_path_with_some_delays
metadata CHANGED
@@ -1,111 +1,142 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: micro_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.16
4
+ version: 0.1.17
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - zires
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-07-23 00:00:00.000000000 Z
12
+ date: 2013-07-25 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: capybara
15
16
  requirement: !ruby/object:Gem::Requirement
17
+ none: false
16
18
  requirements:
17
- - - '>='
19
+ - - ! '>='
18
20
  - !ruby/object:Gem::Version
19
21
  version: '0'
20
22
  type: :runtime
21
23
  prerelease: false
22
24
  version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
23
26
  requirements:
24
- - - '>='
27
+ - - ! '>='
25
28
  - !ruby/object:Gem::Version
26
29
  version: '0'
27
30
  - !ruby/object:Gem::Dependency
28
31
  name: capybara-webkit
29
32
  requirement: !ruby/object:Gem::Requirement
33
+ none: false
30
34
  requirements:
31
- - - '>='
35
+ - - ! '>='
32
36
  - !ruby/object:Gem::Version
33
37
  version: '0'
34
38
  type: :runtime
35
39
  prerelease: false
36
40
  version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
37
42
  requirements:
38
- - - '>='
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: enumerable-lazy
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
39
60
  - !ruby/object:Gem::Version
40
61
  version: '0'
41
62
  - !ruby/object:Gem::Dependency
42
63
  name: pry
43
64
  requirement: !ruby/object:Gem::Requirement
65
+ none: false
44
66
  requirements:
45
- - - '>='
67
+ - - ! '>='
46
68
  - !ruby/object:Gem::Version
47
69
  version: '0'
48
70
  type: :development
49
71
  prerelease: false
50
72
  version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
51
74
  requirements:
52
- - - '>='
75
+ - - ! '>='
53
76
  - !ruby/object:Gem::Version
54
77
  version: '0'
55
78
  - !ruby/object:Gem::Dependency
56
79
  name: yard
57
80
  requirement: !ruby/object:Gem::Requirement
81
+ none: false
58
82
  requirements:
59
- - - '>='
83
+ - - ! '>='
60
84
  - !ruby/object:Gem::Version
61
85
  version: '0'
62
86
  type: :development
63
87
  prerelease: false
64
88
  version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
65
90
  requirements:
66
- - - '>='
91
+ - - ! '>='
67
92
  - !ruby/object:Gem::Version
68
93
  version: '0'
69
94
  - !ruby/object:Gem::Dependency
70
95
  name: rake
71
96
  requirement: !ruby/object:Gem::Requirement
97
+ none: false
72
98
  requirements:
73
- - - '>='
99
+ - - ! '>='
74
100
  - !ruby/object:Gem::Version
75
101
  version: '0'
76
102
  type: :development
77
103
  prerelease: false
78
104
  version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
79
106
  requirements:
80
- - - '>='
107
+ - - ! '>='
81
108
  - !ruby/object:Gem::Version
82
109
  version: '0'
83
110
  - !ruby/object:Gem::Dependency
84
111
  name: turn
85
112
  requirement: !ruby/object:Gem::Requirement
113
+ none: false
86
114
  requirements:
87
- - - '>='
115
+ - - ! '>='
88
116
  - !ruby/object:Gem::Version
89
117
  version: '0'
90
118
  type: :development
91
119
  prerelease: false
92
120
  version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
93
122
  requirements:
94
- - - '>='
123
+ - - ! '>='
95
124
  - !ruby/object:Gem::Version
96
125
  version: '0'
97
126
  - !ruby/object:Gem::Dependency
98
127
  name: sinatra
99
128
  requirement: !ruby/object:Gem::Requirement
129
+ none: false
100
130
  requirements:
101
- - - '>='
131
+ - - ! '>='
102
132
  - !ruby/object:Gem::Version
103
133
  version: '0'
104
134
  type: :development
105
135
  prerelease: false
106
136
  version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
107
138
  requirements:
108
- - - '>='
139
+ - - ! '>='
109
140
  - !ruby/object:Gem::Version
110
141
  version: '0'
111
142
  description: A DSL to write web spider. Depend on capybara and capybara-webkit.
@@ -128,27 +159,29 @@ files:
128
159
  - test/micro_spider_test.rb
129
160
  - test/test_helper.rb
130
161
  homepage: https://github.com/zires/micro-spider
131
- licenses: []
132
- metadata: {}
162
+ licenses:
163
+ - MIT
133
164
  post_install_message:
134
165
  rdoc_options: []
135
166
  require_paths:
136
167
  - lib
137
168
  required_ruby_version: !ruby/object:Gem::Requirement
169
+ none: false
138
170
  requirements:
139
- - - '>='
171
+ - - ! '>='
140
172
  - !ruby/object:Gem::Version
141
173
  version: '0'
142
174
  required_rubygems_version: !ruby/object:Gem::Requirement
175
+ none: false
143
176
  requirements:
144
- - - '>='
177
+ - - ! '>='
145
178
  - !ruby/object:Gem::Version
146
179
  version: '0'
147
180
  requirements: []
148
181
  rubyforge_project:
149
- rubygems_version: 2.0.0.rc.2
182
+ rubygems_version: 1.8.23
150
183
  signing_key:
151
- specification_version: 4
184
+ specification_version: 3
152
185
  summary: A DSL to write web spider.
153
186
  test_files:
154
187
  - test/micro_spider_test.rb
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: f9363e70b57c95de9256ea2549cf2ee76c2669c0
4
- data.tar.gz: 7f8b0bc18fde686058c2b426f84c05b26ba3a1b3
5
- SHA512:
6
- metadata.gz: eb6a2ca107f788c95b4244b06de2ac8b7c94e983e0306dbcce71872cee37dd5946784cb900681fdd9d0ee35f6e82c2c6f7abbfed4916fabbdcc97a1ee27c849b
7
- data.tar.gz: 06ea26fcfd3b53edbb461927772a50d4320525e89f9d7c19e250cc93ba33937362dd60d7d5467dd5d7a0c8950fa67e50c0a7d2df6f5fd5c81a51ca1a9e78c9cb