klepto 0.5.3 → 0.5.5

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -165,7 +165,7 @@ end
165
165
 
166
166
  ## Configuration Options
167
167
  * config.headers - Hash; Sets request headers
168
- * config.urls - Array(String); Sets URLs to structure
168
+ * config.url - String; Set URL to structure
169
169
  * config.abort_on_failure - Boolean(Default: true); Should structuring be aborted on 4xx or 5xx
170
170
 
171
171
  ## Callbacks & Processing
data/lib/klepto/bot.rb CHANGED
@@ -1,24 +1,13 @@
1
1
  module Klepto
2
2
  class Bot
3
3
  attr_reader :config
4
- @@_bots = {}
5
- class << self
6
- def run(name,*urls)
7
- urls.each do |url|
8
- @@_bots[name].parse! url
9
- end
10
- end
11
- def make(name, &block)
12
- @@_bots[name] = Klepto::Bot.new(&block)
13
- end
14
- end
15
4
 
16
- def initialize(*urls, &block)
5
+ def initialize(url=nil, &block)
17
6
  @config = Klepto::Config.new
18
- @config.urls urls
7
+ @config.url url
19
8
  @queue = []
20
- @pages = {}
21
-
9
+ @browser = Klepto::Browser.new
10
+
22
11
  # Evaluate the block as DSL, proxy off anything that isn't on #config
23
12
  # to a queue, then apply that queue to the top-level Klepto::Structure
24
13
  instance_eval &block
@@ -27,9 +16,9 @@ module Klepto
27
16
  # and restore method_missing (for sanity sake)
28
17
  instance_eval <<-EOS
29
18
  def queue; @queue; end;
30
- def pages; @pages; end;
31
- def parse!(*_urls); __process!(*_urls); end;
32
- def resources; @resources; end;
19
+ def browser; @browser; end;
20
+ def url=(_url); @config.url(_url); end;
21
+ def structure; @structure; end;
33
22
  def method_missing(meth, *args, &block)
34
23
  raise NoMethodError.new("undefined method: Klepto::Bot#" + meth.to_s)
35
24
  end
@@ -39,54 +28,47 @@ EOS
39
28
  end
40
29
 
41
30
  # Structure all the pages
42
- def __process!(*_urls)
43
- @resources = []
44
-
45
- (_urls + config.urls).each do |url|
46
- browser = Klepto::Browser.new
31
+ def __process!
32
+ @structure = nil
33
+ @browser.set_headers @config.headers
34
+ #browser.set_driver config.driver
47
35
 
48
- browser.set_headers config.headers
49
- #browser.set_driver config.driver
36
+ # Call before(:each) handlers...
37
+ @config.before_handlers[:each].each { |bh|
38
+ bh.call(url, browser)
39
+ }
40
+
41
+ begin
42
+ @browser.fetch! @config.url
50
43
 
51
- # Call before(:each) handlers...
52
- config.before_handlers[:each].each { |bh|
53
- bh.call(url, browser)
54
- }
44
+ # Fire callbacks on GET
45
+ @config.after_handlers[:get].each do |ah|
46
+ ah.call(@browser.page, @browser, @config.url)
47
+ end
48
+
49
+ # Dispatch all the handlers for HTTP Status Codes.
50
+ @browser.statuses.each do |status|
51
+ @config.dispatch_status_handlers(status, @browser.page)
52
+ end
55
53
 
56
- begin
57
- browser.fetch! url
58
-
59
- @pages[url] = browser.page if config.keep_pages
60
-
61
- # Fire callbacks on GET
62
- config.after_handlers[:get].each do |ah|
63
- ah.call(browser.page, browser, url)
64
- end
65
-
66
- # Dispatch all the handlers for HTTP Status Codes.
67
- browser.statuses.each do |status|
68
- config.dispatch_status_handlers(status, browser.page)
69
- end
70
-
71
- # If the page was not a failure or if not aborting, structure that bad boy.
72
- if (browser.failure? && config.abort_on_failure?) || (config.abort_on_redirect? && browser.was_redirected?)
73
- config.after_handlers[:abort].each do |ah|
74
- ah.call(browser.page,{
75
- browser_failure: browser.failure?,
76
- abort_on_failure: config.abort_on_failure?,
77
- abort_on_redirect: config.abort_on_redirect?,
78
- redirect: browser.was_redirected?
79
- })
80
- end
81
- else
82
- @resources << __structure(browser.page)
54
+ # If the page was not a failure or if not aborting, structure that bad boy.
55
+ if (@browser.failure? && @config.abort_on_failure?) || (@config.abort_on_redirect? && @browser.was_redirected?)
56
+ @config.after_handlers[:abort].each do |ah|
57
+ ah.call(browser.page,{
58
+ browser_failure: @browser.failure?,
59
+ abort_on_failure: @config.abort_on_failure?,
60
+ abort_on_redirect: @config.abort_on_redirect?,
61
+ redirect: @browser.was_redirected?
62
+ })
83
63
  end
84
- rescue Capybara::Poltergeist::TimeoutError => ex
85
- config.dispatch_timeout_handler(ex, url)
86
- end
64
+ else
65
+ @structure = __structure(@browser.page)
66
+ end
67
+ rescue Capybara::Poltergeist::TimeoutError => ex
68
+ config.dispatch_timeout_handler(ex, url)
87
69
  end
88
70
 
89
- @resources
71
+ @structure
90
72
  end
91
73
 
92
74
  def __structure(context)
data/lib/klepto/config.rb CHANGED
@@ -2,14 +2,11 @@ module Klepto
2
2
  class Config
3
3
  attr_reader :after_handlers
4
4
  attr_reader :before_handlers
5
- attr_reader :keep_pages
6
5
 
7
6
  def initialize
8
7
  @headers = {}
9
- @keep_pages = false
10
8
  @abort_on_failure = true
11
9
  @abort_on_redirect = false
12
- @urls = []
13
10
  @after_handlers = {
14
11
  :each => [], #after each call to
15
12
  :get => [], #after GET, before structure
@@ -32,11 +29,6 @@ module Klepto
32
29
  # @default_driver
33
30
  # end
34
31
 
35
- def keep_pages(_keep = nil)
36
- @keep_pages = _keep if _keep != nil
37
- @keep_pages
38
- end
39
-
40
32
  def headers(_headers=nil)
41
33
  @headers = _headers if _headers
42
34
  @headers
@@ -103,12 +95,9 @@ module Klepto
103
95
  @before_handlers[which].push block
104
96
  end
105
97
 
106
- def url(*args)
107
- @urls += args
108
- @urls.flatten!
109
- @urls.uniq!
110
- @urls
98
+ def url(url=nil)
99
+ @url = url if url
100
+ @url
111
101
  end
112
- alias :urls :url
113
102
  end
114
103
  end
@@ -61,8 +61,8 @@ module Klepto
61
61
  Klepto.logger.debug("\t\t\tAs: block (match all), Result? #{!result.nil?}")
62
62
  @_hash[meth] = []
63
63
  options[:limit] ||= result.length
64
- result[0, options[:limit]].each do |node|
65
- @_hash[meth] << block.call( node )
64
+ result[0, options[:limit]].each do |_node|
65
+ @_hash[meth] << block.call( _node )
66
66
  end
67
67
  else
68
68
  if result
@@ -81,8 +81,8 @@ module Klepto
81
81
  Klepto.logger.debug("\t\t\tAs: simple (match all), Result? #{!result.nil?}")
82
82
  @_hash[meth] = []
83
83
  options[:limit] ||= result.length
84
- result[0, options[:limit]].each do |node|
85
- @_hash[meth] << (node[options[:attr]] || node.try(:text))
84
+ result[0, options[:limit]].each do |_node|
85
+ @_hash[meth] << (_node[options[:attr]] || _node.try(:text))
86
86
  end
87
87
  elsif result
88
88
  Klepto.logger.debug("\t\t\tAs: block (match one)")
@@ -1,3 +1,3 @@
1
1
  module Klepto
2
- VERSION = "0.5.3"
2
+ VERSION = "0.5.5"
3
3
  end
@@ -13,11 +13,10 @@ describe Klepto::Bot do
13
13
  StatusLog.create message: 'Abort!'
14
14
  }
15
15
  }
16
- @structure = @bot.resources
17
16
  end
18
17
 
19
18
  it 'should structure not have structured the data' do
20
- @structure.should be_empty
19
+ @bot.structure.should be_nil
21
20
  end
22
21
 
23
22
  it 'should have dispatched abort handlers' do
@@ -37,11 +36,10 @@ describe Klepto::Bot do
37
36
  StatusLog.create message: '200'
38
37
  }
39
38
  }
40
- @structure = @bot.resources
41
39
  end
42
40
 
43
41
  it 'should structure the data' do
44
- @structure.first[:name].should match(/Justin/i)
42
+ @bot.structure[:name].should match(/Justin/i)
45
43
  end
46
44
 
47
45
  it 'should have dispatched status handlers' do
@@ -62,7 +60,6 @@ describe Klepto::Bot do
62
60
  StatusLog.create message: 'Aborted.'
63
61
  end
64
62
  }
65
- @structure = @bot.resources
66
63
  end
67
64
 
68
65
  it 'should abort after a 4xx or 5xx' do
@@ -79,11 +76,10 @@ describe Klepto::Bot do
79
76
  StatusLog.create message: 'Aborted.'
80
77
  end
81
78
  }
82
- @structure = @bot.resources
83
79
  end
84
80
 
85
81
  it 'should perform structuring' do
86
- @structure.first[:title].should == 'Not Found'
82
+ @bot.structure[:title].should == 'Not Found'
87
83
  end
88
84
 
89
85
  it 'should not abort after a 4xx or 5xx' do
@@ -91,21 +87,6 @@ describe Klepto::Bot do
91
87
  end
92
88
  end
93
89
 
94
- describe 'structuring multiple pages' do
95
- before(:each) do
96
- @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
97
- config.urls "https://twitter.com/ladygaga"
98
- name 'h1.fullname'
99
- }
100
- @structure = @bot.resources
101
- end
102
-
103
- it 'should have both pages data' do
104
- @structure.first[:name].should match(/Justin/i)
105
- @structure.last[:name].should match(/Lady/i)
106
- end
107
- end
108
-
109
90
  describe 'creating a bot' do
110
91
  before(:each) do
111
92
  @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
@@ -114,8 +95,6 @@ describe Klepto::Bot do
114
95
  'X-Sup-Dawg' => "Yo, What's up?"
115
96
  })
116
97
 
117
- config.keep_pages true
118
-
119
98
  # Structure that stuff
120
99
  name 'h1.fullname'
121
100
  username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
@@ -169,28 +148,18 @@ describe Klepto::Bot do
169
148
  end
170
149
  end
171
150
  }
172
- @structure = @bot.resources
173
151
  end
174
152
 
175
153
  it 'should structure the data' do
176
- @structure.first[:name].should match(/Justin/i)
177
- @structure.first[:links].first.should match(/^http:/i)
178
- #@structure.first[:links].should == ["http://t.co/2oSNE36kNM"]
179
- @structure.first[:username].should eq '@justinbieber'
180
- @structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
181
- end
182
-
183
- it 'should have the pages stored' do
184
- @bot.pages["https://twitter.com/justinbieber"].should_not be_nil
185
- end
186
-
187
- it 'should be able to #parse! a url' do
188
- @new_structure = @bot.parse!("https://twitter.com/justinbieber")
189
- @new_structure.first[:name].should match(/Justin/i)
154
+ @bot.structure[:name].should match(/Justin/i)
155
+ @bot.structure[:links].first.should match(/^http:/i)
156
+ #@bot.structure[:links].should == ["http://t.co/2oSNE36kNM"]
157
+ @bot.structure[:username].should eq '@justinbieber'
158
+ @bot.structure[:last_tweet][:twitter_id].should == @bot.structure[:tweets].first[:twitter_id]
190
159
  end
191
160
 
192
161
  it 'should store the data' do
193
- User.first.name.should eq( @structure.first[:name] )
162
+ User.first.name.should eq( @bot.structure[:name] )
194
163
  User.count.should be(1)
195
164
  Tweet.count.should_not be(0)
196
165
  end
@@ -229,13 +198,12 @@ describe Klepto::Bot do
229
198
  # end
230
199
  # end
231
200
  # }
232
- # @structure = @bot.resources
233
201
  # end
234
202
 
235
203
  # it 'should set the value to nil when an exception is raised' do
236
- # @structure.first[:name].should match(/Justin/i)
237
- # @structure.first[:tweets].first.keys.should include(:timestamp)
238
- # @structure.first[:tweets].first[:timestamp].should be(nil)
204
+ # @bot.structure[:name].should match(/Justin/i)
205
+ # @bot.structure[:tweets].first.keys.should include(:timestamp)
206
+ # @bot.structure[:tweets].first[:timestamp].should be(nil)
239
207
  # end
240
208
  # end
241
209
 
@@ -245,11 +213,10 @@ describe Klepto::Bot do
245
213
  name 'h1.fullname'
246
214
  username "span.screen-NOPE", default: "CHICKENS"
247
215
  }
248
- @structure = @bot.resources
249
216
  end
250
217
 
251
218
  it 'should have a sensible default for the structure' do
252
- @structure.first[:username].should eq('CHICKENS')
219
+ @bot.structure[:username].should eq('CHICKENS')
253
220
  end
254
221
  end
255
222
 
@@ -259,12 +226,11 @@ describe Klepto::Bot do
259
226
  name 'h1.fullname', parser: TextParser
260
227
  links 'span.url a', :match => :all, :parser => HrefParser
261
228
  }
262
- @structure = @bot.resources
263
229
  end
264
230
 
265
231
  it 'should structure the data' do
266
- @structure.first[:name].should match(/Justin/i)
267
- @structure.first[:links].first.should match(/^http:/i)
232
+ @bot.structure[:name].should match(/Justin/i)
233
+ @bot.structure[:links].first.should match(/^http:/i)
268
234
  end
269
235
  end
270
236
 
@@ -300,7 +266,6 @@ describe Klepto::Bot do
300
266
  end
301
267
  end
302
268
  }
303
- @structure = @bot.resources
304
269
  end
305
270
 
306
271
  it 'should limit the nodes structured' do
@@ -308,7 +273,5 @@ describe Klepto::Bot do
308
273
  Tweet.count.should be(5)
309
274
  end
310
275
  end
311
-
312
-
313
276
  end
314
277
  end
@@ -4,7 +4,7 @@ describe Klepto::Config do
4
4
  before(:each) do
5
5
  @config = Klepto::Config.new
6
6
  @config.headers({'Referer' => 'http://example.com'})
7
- @config.urls 'http://example.com', 'http://www.iana.org'
7
+ @config.url 'http://example.com'
8
8
  @config.on_http_status(200){
9
9
  "Its 200"
10
10
  }
@@ -39,8 +39,8 @@ describe Klepto::Config do
39
39
  @config.instance_variable_get("@status_handlers")['4xx'].first.call.should eq ('Its crazy.')
40
40
  end
41
41
 
42
- it 'should be able to set URLs' do
43
- @config.urls.should == ['http://example.com', 'http://www.iana.org']
42
+ it 'should be able to set a URL' do
43
+ @config.url.should == 'http://example.com'
44
44
  end
45
45
 
46
46
  it 'should have an abort on 4xx/5xx option' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: klepto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.5.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2013-05-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: poltergeist
16
- requirement: &70314273924640 !ruby/object:Gem::Requirement
16
+ requirement: &70303258668580 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - =
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.1.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70314273924640
24
+ version_requirements: *70303258668580
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: capybara
27
- requirement: &70314260798000 !ruby/object:Gem::Requirement
27
+ requirement: &70303258666700 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - =
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 2.0.2
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70314260798000
35
+ version_requirements: *70303258666700
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70314260796320 !ruby/object:Gem::Requirement
38
+ requirement: &70303259419200 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.5.6
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70314260796320
46
+ version_requirements: *70303259419200
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: activesupport
49
- requirement: &70314260795500 !ruby/object:Gem::Requirement
49
+ requirement: &70303259418820 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70314260795500
57
+ version_requirements: *70303259418820
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: multi_json
60
- requirement: &70314260792000 !ruby/object:Gem::Requirement
60
+ requirement: &70303259418280 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '1.0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70314260792000
68
+ version_requirements: *70303259418280
69
69
  description: Tearing up web pages into ActiveRecord resources
70
70
  email:
71
71
  - github@coryodaniel.com