klepto 0.5.3 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -165,7 +165,7 @@ end
165
165
 
166
166
  ## Configuration Options
167
167
  * config.headers - Hash; Sets request headers
168
- * config.urls - Array(String); Sets URLs to structure
168
+ * config.url - String; Set URL to structure
169
169
  * config.abort_on_failure - Boolean(Default: true); Should structuring be aborted on 4xx or 5xx
170
170
 
171
171
  ## Callbacks & Processing
data/lib/klepto/bot.rb CHANGED
@@ -1,24 +1,13 @@
1
1
  module Klepto
2
2
  class Bot
3
3
  attr_reader :config
4
- @@_bots = {}
5
- class << self
6
- def run(name,*urls)
7
- urls.each do |url|
8
- @@_bots[name].parse! url
9
- end
10
- end
11
- def make(name, &block)
12
- @@_bots[name] = Klepto::Bot.new(&block)
13
- end
14
- end
15
4
 
16
- def initialize(*urls, &block)
5
+ def initialize(url=nil, &block)
17
6
  @config = Klepto::Config.new
18
- @config.urls urls
7
+ @config.url url
19
8
  @queue = []
20
- @pages = {}
21
-
9
+ @browser = Klepto::Browser.new
10
+
22
11
  # Evaluate the block as DSL, proxy off anything that isn't on #config
23
12
  # to a queue, then apply that queue to the top-level Klepto::Structure
24
13
  instance_eval &block
@@ -27,9 +16,9 @@ module Klepto
27
16
  # and restore method_missing (for sanity sake)
28
17
  instance_eval <<-EOS
29
18
  def queue; @queue; end;
30
- def pages; @pages; end;
31
- def parse!(*_urls); __process!(*_urls); end;
32
- def resources; @resources; end;
19
+ def browser; @browser; end;
20
+ def url=(_url); @config.url(_url); end;
21
+ def structure; @structure; end;
33
22
  def method_missing(meth, *args, &block)
34
23
  raise NoMethodError.new("undefined method: Klepto::Bot#" + meth.to_s)
35
24
  end
@@ -39,54 +28,47 @@ EOS
39
28
  end
40
29
 
41
30
  # Structure all the pages
42
- def __process!(*_urls)
43
- @resources = []
44
-
45
- (_urls + config.urls).each do |url|
46
- browser = Klepto::Browser.new
31
+ def __process!
32
+ @structure = nil
33
+ @browser.set_headers @config.headers
34
+ #browser.set_driver config.driver
47
35
 
48
- browser.set_headers config.headers
49
- #browser.set_driver config.driver
36
+ # Call before(:each) handlers...
37
+ @config.before_handlers[:each].each { |bh|
38
+ bh.call(url, browser)
39
+ }
40
+
41
+ begin
42
+ @browser.fetch! @config.url
50
43
 
51
- # Call before(:each) handlers...
52
- config.before_handlers[:each].each { |bh|
53
- bh.call(url, browser)
54
- }
44
+ # Fire callbacks on GET
45
+ @config.after_handlers[:get].each do |ah|
46
+ ah.call(@browser.page, @browser, @config.url)
47
+ end
48
+
49
+ # Dispatch all the handlers for HTTP Status Codes.
50
+ @browser.statuses.each do |status|
51
+ @config.dispatch_status_handlers(status, @browser.page)
52
+ end
55
53
 
56
- begin
57
- browser.fetch! url
58
-
59
- @pages[url] = browser.page if config.keep_pages
60
-
61
- # Fire callbacks on GET
62
- config.after_handlers[:get].each do |ah|
63
- ah.call(browser.page, browser, url)
64
- end
65
-
66
- # Dispatch all the handlers for HTTP Status Codes.
67
- browser.statuses.each do |status|
68
- config.dispatch_status_handlers(status, browser.page)
69
- end
70
-
71
- # If the page was not a failure or if not aborting, structure that bad boy.
72
- if (browser.failure? && config.abort_on_failure?) || (config.abort_on_redirect? && browser.was_redirected?)
73
- config.after_handlers[:abort].each do |ah|
74
- ah.call(browser.page,{
75
- browser_failure: browser.failure?,
76
- abort_on_failure: config.abort_on_failure?,
77
- abort_on_redirect: config.abort_on_redirect?,
78
- redirect: browser.was_redirected?
79
- })
80
- end
81
- else
82
- @resources << __structure(browser.page)
54
+ # If the page was not a failure or if not aborting, structure that bad boy.
55
+ if (@browser.failure? && @config.abort_on_failure?) || (@config.abort_on_redirect? && @browser.was_redirected?)
56
+ @config.after_handlers[:abort].each do |ah|
57
+ ah.call(browser.page,{
58
+ browser_failure: @browser.failure?,
59
+ abort_on_failure: @config.abort_on_failure?,
60
+ abort_on_redirect: @config.abort_on_redirect?,
61
+ redirect: @browser.was_redirected?
62
+ })
83
63
  end
84
- rescue Capybara::Poltergeist::TimeoutError => ex
85
- config.dispatch_timeout_handler(ex, url)
86
- end
64
+ else
65
+ @structure = __structure(@browser.page)
66
+ end
67
+ rescue Capybara::Poltergeist::TimeoutError => ex
68
+ config.dispatch_timeout_handler(ex, url)
87
69
  end
88
70
 
89
- @resources
71
+ @structure
90
72
  end
91
73
 
92
74
  def __structure(context)
data/lib/klepto/config.rb CHANGED
@@ -2,14 +2,11 @@ module Klepto
2
2
  class Config
3
3
  attr_reader :after_handlers
4
4
  attr_reader :before_handlers
5
- attr_reader :keep_pages
6
5
 
7
6
  def initialize
8
7
  @headers = {}
9
- @keep_pages = false
10
8
  @abort_on_failure = true
11
9
  @abort_on_redirect = false
12
- @urls = []
13
10
  @after_handlers = {
14
11
  :each => [], #after each call to
15
12
  :get => [], #after GET, before structure
@@ -32,11 +29,6 @@ module Klepto
32
29
  # @default_driver
33
30
  # end
34
31
 
35
- def keep_pages(_keep = nil)
36
- @keep_pages = _keep if _keep != nil
37
- @keep_pages
38
- end
39
-
40
32
  def headers(_headers=nil)
41
33
  @headers = _headers if _headers
42
34
  @headers
@@ -103,12 +95,9 @@ module Klepto
103
95
  @before_handlers[which].push block
104
96
  end
105
97
 
106
- def url(*args)
107
- @urls += args
108
- @urls.flatten!
109
- @urls.uniq!
110
- @urls
98
+ def url(url=nil)
99
+ @url = url if url
100
+ @url
111
101
  end
112
- alias :urls :url
113
102
  end
114
103
  end
@@ -61,8 +61,8 @@ module Klepto
61
61
  Klepto.logger.debug("\t\t\tAs: block (match all), Result? #{!result.nil?}")
62
62
  @_hash[meth] = []
63
63
  options[:limit] ||= result.length
64
- result[0, options[:limit]].each do |node|
65
- @_hash[meth] << block.call( node )
64
+ result[0, options[:limit]].each do |_node|
65
+ @_hash[meth] << block.call( _node )
66
66
  end
67
67
  else
68
68
  if result
@@ -81,8 +81,8 @@ module Klepto
81
81
  Klepto.logger.debug("\t\t\tAs: simple (match all), Result? #{!result.nil?}")
82
82
  @_hash[meth] = []
83
83
  options[:limit] ||= result.length
84
- result[0, options[:limit]].each do |node|
85
- @_hash[meth] << (node[options[:attr]] || node.try(:text))
84
+ result[0, options[:limit]].each do |_node|
85
+ @_hash[meth] << (_node[options[:attr]] || _node.try(:text))
86
86
  end
87
87
  elsif result
88
88
  Klepto.logger.debug("\t\t\tAs: block (match one)")
@@ -1,3 +1,3 @@
1
1
  module Klepto
2
- VERSION = "0.5.3"
2
+ VERSION = "0.5.5"
3
3
  end
@@ -13,11 +13,10 @@ describe Klepto::Bot do
13
13
  StatusLog.create message: 'Abort!'
14
14
  }
15
15
  }
16
- @structure = @bot.resources
17
16
  end
18
17
 
19
18
  it 'should structure not have structured the data' do
20
- @structure.should be_empty
19
+ @bot.structure.should be_nil
21
20
  end
22
21
 
23
22
  it 'should have dispatched abort handlers' do
@@ -37,11 +36,10 @@ describe Klepto::Bot do
37
36
  StatusLog.create message: '200'
38
37
  }
39
38
  }
40
- @structure = @bot.resources
41
39
  end
42
40
 
43
41
  it 'should structure the data' do
44
- @structure.first[:name].should match(/Justin/i)
42
+ @bot.structure[:name].should match(/Justin/i)
45
43
  end
46
44
 
47
45
  it 'should have dispatched status handlers' do
@@ -62,7 +60,6 @@ describe Klepto::Bot do
62
60
  StatusLog.create message: 'Aborted.'
63
61
  end
64
62
  }
65
- @structure = @bot.resources
66
63
  end
67
64
 
68
65
  it 'should abort after a 4xx or 5xx' do
@@ -79,11 +76,10 @@ describe Klepto::Bot do
79
76
  StatusLog.create message: 'Aborted.'
80
77
  end
81
78
  }
82
- @structure = @bot.resources
83
79
  end
84
80
 
85
81
  it 'should perform structuring' do
86
- @structure.first[:title].should == 'Not Found'
82
+ @bot.structure[:title].should == 'Not Found'
87
83
  end
88
84
 
89
85
  it 'should not abort after a 4xx or 5xx' do
@@ -91,21 +87,6 @@ describe Klepto::Bot do
91
87
  end
92
88
  end
93
89
 
94
- describe 'structuring multiple pages' do
95
- before(:each) do
96
- @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
97
- config.urls "https://twitter.com/ladygaga"
98
- name 'h1.fullname'
99
- }
100
- @structure = @bot.resources
101
- end
102
-
103
- it 'should have both pages data' do
104
- @structure.first[:name].should match(/Justin/i)
105
- @structure.last[:name].should match(/Lady/i)
106
- end
107
- end
108
-
109
90
  describe 'creating a bot' do
110
91
  before(:each) do
111
92
  @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
@@ -114,8 +95,6 @@ describe Klepto::Bot do
114
95
  'X-Sup-Dawg' => "Yo, What's up?"
115
96
  })
116
97
 
117
- config.keep_pages true
118
-
119
98
  # Structure that stuff
120
99
  name 'h1.fullname'
121
100
  username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
@@ -169,28 +148,18 @@ describe Klepto::Bot do
169
148
  end
170
149
  end
171
150
  }
172
- @structure = @bot.resources
173
151
  end
174
152
 
175
153
  it 'should structure the data' do
176
- @structure.first[:name].should match(/Justin/i)
177
- @structure.first[:links].first.should match(/^http:/i)
178
- #@structure.first[:links].should == ["http://t.co/2oSNE36kNM"]
179
- @structure.first[:username].should eq '@justinbieber'
180
- @structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
181
- end
182
-
183
- it 'should have the pages stored' do
184
- @bot.pages["https://twitter.com/justinbieber"].should_not be_nil
185
- end
186
-
187
- it 'should be able to #parse! a url' do
188
- @new_structure = @bot.parse!("https://twitter.com/justinbieber")
189
- @new_structure.first[:name].should match(/Justin/i)
154
+ @bot.structure[:name].should match(/Justin/i)
155
+ @bot.structure[:links].first.should match(/^http:/i)
156
+ #@bot.structure[:links].should == ["http://t.co/2oSNE36kNM"]
157
+ @bot.structure[:username].should eq '@justinbieber'
158
+ @bot.structure[:last_tweet][:twitter_id].should == @bot.structure[:tweets].first[:twitter_id]
190
159
  end
191
160
 
192
161
  it 'should store the data' do
193
- User.first.name.should eq( @structure.first[:name] )
162
+ User.first.name.should eq( @bot.structure[:name] )
194
163
  User.count.should be(1)
195
164
  Tweet.count.should_not be(0)
196
165
  end
@@ -229,13 +198,12 @@ describe Klepto::Bot do
229
198
  # end
230
199
  # end
231
200
  # }
232
- # @structure = @bot.resources
233
201
  # end
234
202
 
235
203
  # it 'should set the value to nil when an exception is raised' do
236
- # @structure.first[:name].should match(/Justin/i)
237
- # @structure.first[:tweets].first.keys.should include(:timestamp)
238
- # @structure.first[:tweets].first[:timestamp].should be(nil)
204
+ # @bot.structure[:name].should match(/Justin/i)
205
+ # @bot.structure[:tweets].first.keys.should include(:timestamp)
206
+ # @bot.structure[:tweets].first[:timestamp].should be(nil)
239
207
  # end
240
208
  # end
241
209
 
@@ -245,11 +213,10 @@ describe Klepto::Bot do
245
213
  name 'h1.fullname'
246
214
  username "span.screen-NOPE", default: "CHICKENS"
247
215
  }
248
- @structure = @bot.resources
249
216
  end
250
217
 
251
218
  it 'should have a sensible default for the structure' do
252
- @structure.first[:username].should eq('CHICKENS')
219
+ @bot.structure[:username].should eq('CHICKENS')
253
220
  end
254
221
  end
255
222
 
@@ -259,12 +226,11 @@ describe Klepto::Bot do
259
226
  name 'h1.fullname', parser: TextParser
260
227
  links 'span.url a', :match => :all, :parser => HrefParser
261
228
  }
262
- @structure = @bot.resources
263
229
  end
264
230
 
265
231
  it 'should structure the data' do
266
- @structure.first[:name].should match(/Justin/i)
267
- @structure.first[:links].first.should match(/^http:/i)
232
+ @bot.structure[:name].should match(/Justin/i)
233
+ @bot.structure[:links].first.should match(/^http:/i)
268
234
  end
269
235
  end
270
236
 
@@ -300,7 +266,6 @@ describe Klepto::Bot do
300
266
  end
301
267
  end
302
268
  }
303
- @structure = @bot.resources
304
269
  end
305
270
 
306
271
  it 'should limit the nodes structured' do
@@ -308,7 +273,5 @@ describe Klepto::Bot do
308
273
  Tweet.count.should be(5)
309
274
  end
310
275
  end
311
-
312
-
313
276
  end
314
277
  end
@@ -4,7 +4,7 @@ describe Klepto::Config do
4
4
  before(:each) do
5
5
  @config = Klepto::Config.new
6
6
  @config.headers({'Referer' => 'http://example.com'})
7
- @config.urls 'http://example.com', 'http://www.iana.org'
7
+ @config.url 'http://example.com'
8
8
  @config.on_http_status(200){
9
9
  "Its 200"
10
10
  }
@@ -39,8 +39,8 @@ describe Klepto::Config do
39
39
  @config.instance_variable_get("@status_handlers")['4xx'].first.call.should eq ('Its crazy.')
40
40
  end
41
41
 
42
- it 'should be able to set URLs' do
43
- @config.urls.should == ['http://example.com', 'http://www.iana.org']
42
+ it 'should be able to set a URL' do
43
+ @config.url.should == 'http://example.com'
44
44
  end
45
45
 
46
46
  it 'should have an abort on 4xx/5xx option' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: klepto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.5.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2013-05-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: poltergeist
16
- requirement: &70314273924640 !ruby/object:Gem::Requirement
16
+ requirement: &70303258668580 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - =
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.1.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70314273924640
24
+ version_requirements: *70303258668580
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: capybara
27
- requirement: &70314260798000 !ruby/object:Gem::Requirement
27
+ requirement: &70303258666700 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - =
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 2.0.2
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70314260798000
35
+ version_requirements: *70303258666700
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70314260796320 !ruby/object:Gem::Requirement
38
+ requirement: &70303259419200 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.5.6
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70314260796320
46
+ version_requirements: *70303259419200
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: activesupport
49
- requirement: &70314260795500 !ruby/object:Gem::Requirement
49
+ requirement: &70303259418820 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70314260795500
57
+ version_requirements: *70303259418820
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: multi_json
60
- requirement: &70314260792000 !ruby/object:Gem::Requirement
60
+ requirement: &70303259418280 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '1.0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70314260792000
68
+ version_requirements: *70303259418280
69
69
  description: Tearing up web pages into ActiveRecord resources
70
70
  email:
71
71
  - github@coryodaniel.com