micro_spider 0.1.16 → 0.1.17

Sign up to get free protection for your applications and to get access to all the features.
data/lib/micro_spider.rb CHANGED
@@ -2,10 +2,14 @@ require 'capybara'
2
2
  require 'capybara-webkit'
3
3
  require 'capybara/dsl'
4
4
 
5
- Capybara.run_server = false
6
5
  Capybara.current_driver = :webkit
6
+ Capybara.configure do |config|
7
+ config.ignore_hidden_elements = false
8
+ config.run_server = false
9
+ end
7
10
 
8
11
  require 'logger'
12
+ require 'set'
9
13
  require 'spider_core'
10
14
 
11
15
  class MicroSpider
@@ -16,14 +20,16 @@ class MicroSpider
16
20
  include SpiderCore::FollowDSL
17
21
  include SpiderCore::PaginationDSL
18
22
 
19
- attr_reader :excretion, :paths, :delay, :current_location
23
+ attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
20
24
  attr_accessor :logger, :actions, :recipe, :skip_set_entrance
21
25
 
22
26
  def initialize(excretion = nil)
23
- @paths = []
24
- @actions = []
27
+ @paths = []
28
+ @actions = []
25
29
  @excretion = excretion || { status: 'inprogress', results: [] }
26
- @logger = Logger.new(STDOUT)
30
+ @logger = Logger.new(STDOUT)
31
+ @visited_paths = Set.new
32
+ @broken_paths = []
27
33
  end
28
34
 
29
35
  # The seconds between every two requests.
@@ -37,8 +43,9 @@ class MicroSpider
37
43
  # Visit the path.
38
44
  #
39
45
  # @param path [String] the path to visit, can be absolute path or relative path.
46
+ #
40
47
  # @example Visit a path
41
- # spider = TinySpider.new
48
+ # spider = MicroSpider.new
42
49
  # spider.visit('/example')
43
50
  # spider.visit('http://google.com')
44
51
  #
@@ -50,13 +57,37 @@ class MicroSpider
50
57
  logger.info "Current location is #{path}."
51
58
  end
52
59
 
60
+ # Click the locator. This will trigger visit action and change current location.
61
+ # @param locator [String] the text or id of the link.
62
+ #
53
63
  def click(locator, opts = {})
54
- actions << lambda {
64
+ actions << lambda {
55
65
  path = find_link(locator, opts)[:href]
56
66
  visit(path)
57
67
  }
58
68
  end
59
-
69
+
70
+ # Teach the spider behaviors and it will repeat to the end.
71
+ # @param recipe [String, Proc] the recipe to be learned.
72
+ #
73
+ # @example
74
+ # spider = MicroSpider.new
75
+ # spider.learn do
76
+ # entrance 'http://google.com'
77
+ # end
78
+ # spider.crawl
79
+ #
80
+ # @example
81
+ # spider.learn("entrance 'http://google.com'")
82
+ # spider.crawl
83
+ #
84
+ # @example
85
+ # recipe = lambda {
86
+ # entrance 'http://google.com'
87
+ # }
88
+ # spider.learn(recipe)
89
+ # spider.crawl
90
+ #
60
91
  def learn(recipe = nil, &block)
61
92
  if block_given?
62
93
  instance_eval(&block)
@@ -76,14 +107,33 @@ class MicroSpider
76
107
  return if @site
77
108
  Capybara.app_host = @excretion[:site] = @site = url
78
109
  end
79
-
110
+
111
+ # This will be the first path for spider to visit.
112
+ # If there is more than one entrance, the spider will crawl them one by one.
113
+ # @param path_or_paths [String] one or more entrances
114
+ #
115
+ # @example
116
+ # spider = MicroSpider.new
117
+ # spider.site('http://google.com')
118
+ # spider.entrance('/a')
119
+ # spider.entrance('/b')
120
+ #
80
121
  def entrance(*path_or_paths)
81
122
  return if @skip_set_entrance
82
123
  @paths += path_or_paths
83
124
  end
84
125
 
85
- def entrance_on_path(path, pattern, kind: :css, **opts, &block)
126
+ # Sometimes the entrances are on the page.
127
+ # @param path [String] path to visit
128
+ # @param pattern [String, Regexp] links pattern
129
+ #
130
+ # @example
131
+ # spider = MicroSpider.new
132
+ # spider.entrance_on_path('http://google.com', '.links a')
133
+ #
134
+ def entrance_on_path(path, pattern, opts = {}, &block)
86
135
  return if @skip_set_entrance
136
+ kind = opts[:kind] || :css
87
137
  visit(path)
88
138
  entrances = scan_all(kind, pattern, opts).map do |element|
89
139
  block_given? ? yield(element) : element[:href]
@@ -95,24 +145,60 @@ class MicroSpider
95
145
  return excretion if completed?
96
146
 
97
147
  @paths.compact!
98
- path = @paths.shift
148
+ path = nil
149
+ loop do
150
+ path = @paths.shift
151
+ break if path.nil?
152
+ break unless @visited_paths.include?(path)
153
+ end
154
+
99
155
  if path.nil?
100
156
  excretion[:status] = 'completed'
101
157
  return excretion
102
158
  end
103
159
 
104
- visit(path)
105
- execute_actions
106
- yield(@current_location) if block_given?
107
- excretion[:results] << @current_location
160
+ learn(@recipe) if @actions.empty?
108
161
 
109
- @skip_set_entrance = true
110
- learn(@recipe)
111
- crawl(&block)
162
+ begin
163
+ visit(path)
164
+ rescue Timeout::Error => err
165
+ @broken_paths << path
166
+ logger.fatal("Timeout!!! execution expired when visit `#{path}`")
167
+ logger.fatal(err)
168
+ rescue SystemExit, Interrupt
169
+ logger.fatal("SystemExit && Interrupt")
170
+ exit!
171
+ rescue Exception => err
172
+ @broken_paths << path
173
+ logger.fatal("Caught exception when visit `#{path}`")
174
+ logger.fatal(err)
175
+ else
176
+ @visited_paths << path
177
+ execute_actions
178
+ yield(@current_location) if block_given?
179
+ excretion[:results] << @current_location
180
+ ensure
181
+ @actions = []
182
+ @skip_set_entrance = true
183
+ crawl(&block)
184
+ end
112
185
 
113
186
  excretion
114
187
  end
115
188
 
189
+ # The spider can create custom actions while it is crawling.
190
+ # @param name [String] the name of action
191
+ # @param block [Proc] the actions
192
+ #
193
+ # @example
194
+ # spider = MicroSpider.new
195
+ #
196
+ # spider.create_action :save do |result|
197
+ # SomeClass.save(result)
198
+ # end
199
+ #
200
+ # spider.save
201
+ #
116
202
  def create_action(name, &block)
117
203
  action = proc { actions << lambda { block.call(current_location) } }
118
204
  metaclass.send :define_method, name, &action
@@ -126,6 +212,8 @@ class MicroSpider
126
212
  spider = self.clone
127
213
  spider.instance_variable_set(:@paths, [])
128
214
  spider.instance_variable_set(:@actions, [])
215
+ spider.instance_variable_set(:@visited_paths, [])
216
+ spider.instance_variable_set(:@broken_paths, Set.new)
129
217
  spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
130
218
  spider.skip_set_entrance = false
131
219
  spider
@@ -1,9 +1,11 @@
1
+ require 'enumerable/lazy' if RUBY_VERSION < '2.0'
2
+
1
3
  module SpiderCore
2
4
  module Behavior
3
5
 
4
6
  protected
5
7
 
6
- def scan_all(kind, pattern, **opts)
8
+ def scan_all(kind, pattern, opts = {})
7
9
  if pattern.is_a?(String)
8
10
  elements = all(kind, pattern).lazy
9
11
  if opts[:limit] && opts[:limit].to_i > 0
@@ -36,10 +36,12 @@ module SpiderCore
36
36
 
37
37
  protected
38
38
  def handle_element(element)
39
- if element && element.respond_to?(:text)
40
- element.text
41
- else
39
+ if element.is_a?(String)
42
40
  element
41
+ elsif element.tag_name == 'input'
42
+ element.value
43
+ else
44
+ element.text
43
45
  end
44
46
  end
45
47
 
@@ -63,7 +65,7 @@ module SpiderCore
63
65
  when :field
64
66
  scan_first(action_opts[:kind], action_opts[:pattern])
65
67
  when :fields
66
- scan_all(action_opts[:kind], action_opts[:pattern], opts).lazy
68
+ scan_all(action_opts[:kind], action_opts[:pattern], opts)
67
69
  else
68
70
  raise 'Unknow action.'
69
71
  end
@@ -3,8 +3,9 @@ module SpiderCore
3
3
 
4
4
  attr_accessor :skip_followers
5
5
 
6
- def follow(pattern, kind: :css, **opts, &block)
6
+ def follow(pattern, opts = {}, &block)
7
7
  return unless block_given?
8
+ kind = opts[:kind] || :css
8
9
  actions << lambda {
9
10
  spider = self.spawn
10
11
  spider.learn(&block)
@@ -3,7 +3,8 @@ module SpiderCore
3
3
 
4
4
  attr_accessor :next_page, :skip_pages
5
5
 
6
- def keep_eyes_on_next_page(pattern, kind: :css)
6
+ def keep_eyes_on_next_page(pattern, opts = {})
7
+ kind = opts[:kind] || :css
7
8
  actions << lambda {
8
9
  @next_page = first(kind, pattern)[:href] rescue nil
9
10
  @paths.unshift(@next_page) if @next_page
@@ -1,3 +1,3 @@
1
1
  module SpiderCore
2
- VERSION = "0.1.16"
2
+ VERSION = "0.1.17"
3
3
  end
@@ -4,6 +4,7 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
4
4
 
5
5
  def setup
6
6
  @spider = MicroSpider.new
7
+ @spider.logger.level = Logger::WARN
7
8
  end
8
9
 
9
10
  def test_spider_can_visit_path_with_some_delays
metadata CHANGED
@@ -1,111 +1,142 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: micro_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.16
4
+ version: 0.1.17
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - zires
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-07-23 00:00:00.000000000 Z
12
+ date: 2013-07-25 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: capybara
15
16
  requirement: !ruby/object:Gem::Requirement
17
+ none: false
16
18
  requirements:
17
- - - '>='
19
+ - - ! '>='
18
20
  - !ruby/object:Gem::Version
19
21
  version: '0'
20
22
  type: :runtime
21
23
  prerelease: false
22
24
  version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
23
26
  requirements:
24
- - - '>='
27
+ - - ! '>='
25
28
  - !ruby/object:Gem::Version
26
29
  version: '0'
27
30
  - !ruby/object:Gem::Dependency
28
31
  name: capybara-webkit
29
32
  requirement: !ruby/object:Gem::Requirement
33
+ none: false
30
34
  requirements:
31
- - - '>='
35
+ - - ! '>='
32
36
  - !ruby/object:Gem::Version
33
37
  version: '0'
34
38
  type: :runtime
35
39
  prerelease: false
36
40
  version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
37
42
  requirements:
38
- - - '>='
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: enumerable-lazy
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
39
60
  - !ruby/object:Gem::Version
40
61
  version: '0'
41
62
  - !ruby/object:Gem::Dependency
42
63
  name: pry
43
64
  requirement: !ruby/object:Gem::Requirement
65
+ none: false
44
66
  requirements:
45
- - - '>='
67
+ - - ! '>='
46
68
  - !ruby/object:Gem::Version
47
69
  version: '0'
48
70
  type: :development
49
71
  prerelease: false
50
72
  version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
51
74
  requirements:
52
- - - '>='
75
+ - - ! '>='
53
76
  - !ruby/object:Gem::Version
54
77
  version: '0'
55
78
  - !ruby/object:Gem::Dependency
56
79
  name: yard
57
80
  requirement: !ruby/object:Gem::Requirement
81
+ none: false
58
82
  requirements:
59
- - - '>='
83
+ - - ! '>='
60
84
  - !ruby/object:Gem::Version
61
85
  version: '0'
62
86
  type: :development
63
87
  prerelease: false
64
88
  version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
65
90
  requirements:
66
- - - '>='
91
+ - - ! '>='
67
92
  - !ruby/object:Gem::Version
68
93
  version: '0'
69
94
  - !ruby/object:Gem::Dependency
70
95
  name: rake
71
96
  requirement: !ruby/object:Gem::Requirement
97
+ none: false
72
98
  requirements:
73
- - - '>='
99
+ - - ! '>='
74
100
  - !ruby/object:Gem::Version
75
101
  version: '0'
76
102
  type: :development
77
103
  prerelease: false
78
104
  version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
79
106
  requirements:
80
- - - '>='
107
+ - - ! '>='
81
108
  - !ruby/object:Gem::Version
82
109
  version: '0'
83
110
  - !ruby/object:Gem::Dependency
84
111
  name: turn
85
112
  requirement: !ruby/object:Gem::Requirement
113
+ none: false
86
114
  requirements:
87
- - - '>='
115
+ - - ! '>='
88
116
  - !ruby/object:Gem::Version
89
117
  version: '0'
90
118
  type: :development
91
119
  prerelease: false
92
120
  version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
93
122
  requirements:
94
- - - '>='
123
+ - - ! '>='
95
124
  - !ruby/object:Gem::Version
96
125
  version: '0'
97
126
  - !ruby/object:Gem::Dependency
98
127
  name: sinatra
99
128
  requirement: !ruby/object:Gem::Requirement
129
+ none: false
100
130
  requirements:
101
- - - '>='
131
+ - - ! '>='
102
132
  - !ruby/object:Gem::Version
103
133
  version: '0'
104
134
  type: :development
105
135
  prerelease: false
106
136
  version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
107
138
  requirements:
108
- - - '>='
139
+ - - ! '>='
109
140
  - !ruby/object:Gem::Version
110
141
  version: '0'
111
142
  description: A DSL to write web spider. Depend on capybara and capybara-webkit.
@@ -128,27 +159,29 @@ files:
128
159
  - test/micro_spider_test.rb
129
160
  - test/test_helper.rb
130
161
  homepage: https://github.com/zires/micro-spider
131
- licenses: []
132
- metadata: {}
162
+ licenses:
163
+ - MIT
133
164
  post_install_message:
134
165
  rdoc_options: []
135
166
  require_paths:
136
167
  - lib
137
168
  required_ruby_version: !ruby/object:Gem::Requirement
169
+ none: false
138
170
  requirements:
139
- - - '>='
171
+ - - ! '>='
140
172
  - !ruby/object:Gem::Version
141
173
  version: '0'
142
174
  required_rubygems_version: !ruby/object:Gem::Requirement
175
+ none: false
143
176
  requirements:
144
- - - '>='
177
+ - - ! '>='
145
178
  - !ruby/object:Gem::Version
146
179
  version: '0'
147
180
  requirements: []
148
181
  rubyforge_project:
149
- rubygems_version: 2.0.0.rc.2
182
+ rubygems_version: 1.8.23
150
183
  signing_key:
151
- specification_version: 4
184
+ specification_version: 3
152
185
  summary: A DSL to write web spider.
153
186
  test_files:
154
187
  - test/micro_spider_test.rb
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: f9363e70b57c95de9256ea2549cf2ee76c2669c0
4
- data.tar.gz: 7f8b0bc18fde686058c2b426f84c05b26ba3a1b3
5
- SHA512:
6
- metadata.gz: eb6a2ca107f788c95b4244b06de2ac8b7c94e983e0306dbcce71872cee37dd5946784cb900681fdd9d0ee35f6e82c2c6f7abbfed4916fabbdcc97a1ee27c849b
7
- data.tar.gz: 06ea26fcfd3b53edbb461927772a50d4320525e89f9d7c19e250cc93ba33937362dd60d7d5467dd5d7a0c8950fa67e50c0a7d2df6f5fd5c81a51ca1a9e78c9cb