klepto 0.5.3 → 0.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -1
- data/lib/klepto/bot.rb +42 -60
- data/lib/klepto/config.rb +3 -14
- data/lib/klepto/structure.rb +4 -4
- data/lib/klepto/version.rb +1 -1
- data/spec/lib/klepto/bot_spec.rb +15 -52
- data/spec/lib/klepto/config_spec.rb +3 -3
- metadata +11 -11
data/README.md
CHANGED
@@ -165,7 +165,7 @@ end
|
|
165
165
|
|
166
166
|
## Configuration Options
|
167
167
|
* config.headers - Hash; Sets request headers
|
168
|
-
* config.
|
168
|
+
* config.url - String; Set URL to structure
|
169
169
|
* config.abort_on_failure - Boolean(Default: true); Should structuring be aborted on 4xx or 5xx
|
170
170
|
|
171
171
|
## Callbacks & Processing
|
data/lib/klepto/bot.rb
CHANGED
@@ -1,24 +1,13 @@
|
|
1
1
|
module Klepto
|
2
2
|
class Bot
|
3
3
|
attr_reader :config
|
4
|
-
@@_bots = {}
|
5
|
-
class << self
|
6
|
-
def run(name,*urls)
|
7
|
-
urls.each do |url|
|
8
|
-
@@_bots[name].parse! url
|
9
|
-
end
|
10
|
-
end
|
11
|
-
def make(name, &block)
|
12
|
-
@@_bots[name] = Klepto::Bot.new(&block)
|
13
|
-
end
|
14
|
-
end
|
15
4
|
|
16
|
-
def initialize(
|
5
|
+
def initialize(url=nil, &block)
|
17
6
|
@config = Klepto::Config.new
|
18
|
-
@config.
|
7
|
+
@config.url url
|
19
8
|
@queue = []
|
20
|
-
@
|
21
|
-
|
9
|
+
@browser = Klepto::Browser.new
|
10
|
+
|
22
11
|
# Evaluate the block as DSL, proxy off anything that isn't on #config
|
23
12
|
# to a queue, then apply that queue to the top-level Klepto::Structure
|
24
13
|
instance_eval &block
|
@@ -27,9 +16,9 @@ module Klepto
|
|
27
16
|
# and restore method_missing (for sanity sake)
|
28
17
|
instance_eval <<-EOS
|
29
18
|
def queue; @queue; end;
|
30
|
-
def
|
31
|
-
def
|
32
|
-
def
|
19
|
+
def browser; @browser; end;
|
20
|
+
def url=(_url); @config.url(_url); end;
|
21
|
+
def structure; @structure; end;
|
33
22
|
def method_missing(meth, *args, &block)
|
34
23
|
raise NoMethodError.new("undefined method: Klepto::Bot#" + meth.to_s)
|
35
24
|
end
|
@@ -39,54 +28,47 @@ EOS
|
|
39
28
|
end
|
40
29
|
|
41
30
|
# Structure all the pages
|
42
|
-
def __process!
|
43
|
-
@
|
44
|
-
|
45
|
-
|
46
|
-
browser = Klepto::Browser.new
|
31
|
+
def __process!
|
32
|
+
@structure = nil
|
33
|
+
@browser.set_headers @config.headers
|
34
|
+
#browser.set_driver config.driver
|
47
35
|
|
48
|
-
|
49
|
-
|
36
|
+
# Call before(:each) handlers...
|
37
|
+
@config.before_handlers[:each].each { |bh|
|
38
|
+
bh.call(url, browser)
|
39
|
+
}
|
40
|
+
|
41
|
+
begin
|
42
|
+
@browser.fetch! @config.url
|
50
43
|
|
51
|
-
#
|
52
|
-
config.
|
53
|
-
|
54
|
-
|
44
|
+
# Fire callbacks on GET
|
45
|
+
@config.after_handlers[:get].each do |ah|
|
46
|
+
ah.call(@browser.page, @browser, @config.url)
|
47
|
+
end
|
48
|
+
|
49
|
+
# Dispatch all the handlers for HTTP Status Codes.
|
50
|
+
@browser.statuses.each do |status|
|
51
|
+
@config.dispatch_status_handlers(status, @browser.page)
|
52
|
+
end
|
55
53
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
# Dispatch all the handlers for HTTP Status Codes.
|
67
|
-
browser.statuses.each do |status|
|
68
|
-
config.dispatch_status_handlers(status, browser.page)
|
69
|
-
end
|
70
|
-
|
71
|
-
# If the page was not a failure or if not aborting, structure that bad boy.
|
72
|
-
if (browser.failure? && config.abort_on_failure?) || (config.abort_on_redirect? && browser.was_redirected?)
|
73
|
-
config.after_handlers[:abort].each do |ah|
|
74
|
-
ah.call(browser.page,{
|
75
|
-
browser_failure: browser.failure?,
|
76
|
-
abort_on_failure: config.abort_on_failure?,
|
77
|
-
abort_on_redirect: config.abort_on_redirect?,
|
78
|
-
redirect: browser.was_redirected?
|
79
|
-
})
|
80
|
-
end
|
81
|
-
else
|
82
|
-
@resources << __structure(browser.page)
|
54
|
+
# If the page was not a failure or if not aborting, structure that bad boy.
|
55
|
+
if (@browser.failure? && @config.abort_on_failure?) || (@config.abort_on_redirect? && @browser.was_redirected?)
|
56
|
+
@config.after_handlers[:abort].each do |ah|
|
57
|
+
ah.call(browser.page,{
|
58
|
+
browser_failure: @browser.failure?,
|
59
|
+
abort_on_failure: @config.abort_on_failure?,
|
60
|
+
abort_on_redirect: @config.abort_on_redirect?,
|
61
|
+
redirect: @browser.was_redirected?
|
62
|
+
})
|
83
63
|
end
|
84
|
-
|
85
|
-
|
86
|
-
end
|
64
|
+
else
|
65
|
+
@structure = __structure(@browser.page)
|
66
|
+
end
|
67
|
+
rescue Capybara::Poltergeist::TimeoutError => ex
|
68
|
+
config.dispatch_timeout_handler(ex, url)
|
87
69
|
end
|
88
70
|
|
89
|
-
@
|
71
|
+
@structure
|
90
72
|
end
|
91
73
|
|
92
74
|
def __structure(context)
|
data/lib/klepto/config.rb
CHANGED
@@ -2,14 +2,11 @@ module Klepto
|
|
2
2
|
class Config
|
3
3
|
attr_reader :after_handlers
|
4
4
|
attr_reader :before_handlers
|
5
|
-
attr_reader :keep_pages
|
6
5
|
|
7
6
|
def initialize
|
8
7
|
@headers = {}
|
9
|
-
@keep_pages = false
|
10
8
|
@abort_on_failure = true
|
11
9
|
@abort_on_redirect = false
|
12
|
-
@urls = []
|
13
10
|
@after_handlers = {
|
14
11
|
:each => [], #after each call to
|
15
12
|
:get => [], #after GET, before structure
|
@@ -32,11 +29,6 @@ module Klepto
|
|
32
29
|
# @default_driver
|
33
30
|
# end
|
34
31
|
|
35
|
-
def keep_pages(_keep = nil)
|
36
|
-
@keep_pages = _keep if _keep != nil
|
37
|
-
@keep_pages
|
38
|
-
end
|
39
|
-
|
40
32
|
def headers(_headers=nil)
|
41
33
|
@headers = _headers if _headers
|
42
34
|
@headers
|
@@ -103,12 +95,9 @@ module Klepto
|
|
103
95
|
@before_handlers[which].push block
|
104
96
|
end
|
105
97
|
|
106
|
-
def url(
|
107
|
-
@
|
108
|
-
@
|
109
|
-
@urls.uniq!
|
110
|
-
@urls
|
98
|
+
def url(url=nil)
|
99
|
+
@url = url if url
|
100
|
+
@url
|
111
101
|
end
|
112
|
-
alias :urls :url
|
113
102
|
end
|
114
103
|
end
|
data/lib/klepto/structure.rb
CHANGED
@@ -61,8 +61,8 @@ module Klepto
|
|
61
61
|
Klepto.logger.debug("\t\t\tAs: block (match all), Result? #{!result.nil?}")
|
62
62
|
@_hash[meth] = []
|
63
63
|
options[:limit] ||= result.length
|
64
|
-
result[0, options[:limit]].each do |
|
65
|
-
@_hash[meth] << block.call(
|
64
|
+
result[0, options[:limit]].each do |_node|
|
65
|
+
@_hash[meth] << block.call( _node )
|
66
66
|
end
|
67
67
|
else
|
68
68
|
if result
|
@@ -81,8 +81,8 @@ module Klepto
|
|
81
81
|
Klepto.logger.debug("\t\t\tAs: simple (match all), Result? #{!result.nil?}")
|
82
82
|
@_hash[meth] = []
|
83
83
|
options[:limit] ||= result.length
|
84
|
-
result[0, options[:limit]].each do |
|
85
|
-
@_hash[meth] << (
|
84
|
+
result[0, options[:limit]].each do |_node|
|
85
|
+
@_hash[meth] << (_node[options[:attr]] || _node.try(:text))
|
86
86
|
end
|
87
87
|
elsif result
|
88
88
|
Klepto.logger.debug("\t\t\tAs: block (match one)")
|
data/lib/klepto/version.rb
CHANGED
data/spec/lib/klepto/bot_spec.rb
CHANGED
@@ -13,11 +13,10 @@ describe Klepto::Bot do
|
|
13
13
|
StatusLog.create message: 'Abort!'
|
14
14
|
}
|
15
15
|
}
|
16
|
-
@structure = @bot.resources
|
17
16
|
end
|
18
17
|
|
19
18
|
it 'should structure not have structured the data' do
|
20
|
-
@structure.should
|
19
|
+
@bot.structure.should be_nil
|
21
20
|
end
|
22
21
|
|
23
22
|
it 'should have dispatched abort handlers' do
|
@@ -37,11 +36,10 @@ describe Klepto::Bot do
|
|
37
36
|
StatusLog.create message: '200'
|
38
37
|
}
|
39
38
|
}
|
40
|
-
@structure = @bot.resources
|
41
39
|
end
|
42
40
|
|
43
41
|
it 'should structure the data' do
|
44
|
-
@structure
|
42
|
+
@bot.structure[:name].should match(/Justin/i)
|
45
43
|
end
|
46
44
|
|
47
45
|
it 'should have dispatched status handlers' do
|
@@ -62,7 +60,6 @@ describe Klepto::Bot do
|
|
62
60
|
StatusLog.create message: 'Aborted.'
|
63
61
|
end
|
64
62
|
}
|
65
|
-
@structure = @bot.resources
|
66
63
|
end
|
67
64
|
|
68
65
|
it 'should abort after a 4xx or 5xx' do
|
@@ -79,11 +76,10 @@ describe Klepto::Bot do
|
|
79
76
|
StatusLog.create message: 'Aborted.'
|
80
77
|
end
|
81
78
|
}
|
82
|
-
@structure = @bot.resources
|
83
79
|
end
|
84
80
|
|
85
81
|
it 'should perform structuring' do
|
86
|
-
@structure
|
82
|
+
@bot.structure[:title].should == 'Not Found'
|
87
83
|
end
|
88
84
|
|
89
85
|
it 'should not abort after a 4xx or 5xx' do
|
@@ -91,21 +87,6 @@ describe Klepto::Bot do
|
|
91
87
|
end
|
92
88
|
end
|
93
89
|
|
94
|
-
describe 'structuring multiple pages' do
|
95
|
-
before(:each) do
|
96
|
-
@bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
97
|
-
config.urls "https://twitter.com/ladygaga"
|
98
|
-
name 'h1.fullname'
|
99
|
-
}
|
100
|
-
@structure = @bot.resources
|
101
|
-
end
|
102
|
-
|
103
|
-
it 'should have both pages data' do
|
104
|
-
@structure.first[:name].should match(/Justin/i)
|
105
|
-
@structure.last[:name].should match(/Lady/i)
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
90
|
describe 'creating a bot' do
|
110
91
|
before(:each) do
|
111
92
|
@bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
@@ -114,8 +95,6 @@ describe Klepto::Bot do
|
|
114
95
|
'X-Sup-Dawg' => "Yo, What's up?"
|
115
96
|
})
|
116
97
|
|
117
|
-
config.keep_pages true
|
118
|
-
|
119
98
|
# Structure that stuff
|
120
99
|
name 'h1.fullname'
|
121
100
|
username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
|
@@ -169,28 +148,18 @@ describe Klepto::Bot do
|
|
169
148
|
end
|
170
149
|
end
|
171
150
|
}
|
172
|
-
@structure = @bot.resources
|
173
151
|
end
|
174
152
|
|
175
153
|
it 'should structure the data' do
|
176
|
-
@structure
|
177
|
-
@structure
|
178
|
-
#@structure
|
179
|
-
@structure
|
180
|
-
@structure
|
181
|
-
end
|
182
|
-
|
183
|
-
it 'should have the pages stored' do
|
184
|
-
@bot.pages["https://twitter.com/justinbieber"].should_not be_nil
|
185
|
-
end
|
186
|
-
|
187
|
-
it 'should be able to #parse! a url' do
|
188
|
-
@new_structure = @bot.parse!("https://twitter.com/justinbieber")
|
189
|
-
@new_structure.first[:name].should match(/Justin/i)
|
154
|
+
@bot.structure[:name].should match(/Justin/i)
|
155
|
+
@bot.structure[:links].first.should match(/^http:/i)
|
156
|
+
#@bot.structure[:links].should == ["http://t.co/2oSNE36kNM"]
|
157
|
+
@bot.structure[:username].should eq '@justinbieber'
|
158
|
+
@bot.structure[:last_tweet][:twitter_id].should == @bot.structure[:tweets].first[:twitter_id]
|
190
159
|
end
|
191
160
|
|
192
161
|
it 'should store the data' do
|
193
|
-
User.first.name.should eq( @structure
|
162
|
+
User.first.name.should eq( @bot.structure[:name] )
|
194
163
|
User.count.should be(1)
|
195
164
|
Tweet.count.should_not be(0)
|
196
165
|
end
|
@@ -229,13 +198,12 @@ describe Klepto::Bot do
|
|
229
198
|
# end
|
230
199
|
# end
|
231
200
|
# }
|
232
|
-
# @structure = @bot.resources
|
233
201
|
# end
|
234
202
|
|
235
203
|
# it 'should set the value to nil when an exception is raised' do
|
236
|
-
# @structure
|
237
|
-
# @structure
|
238
|
-
# @structure
|
204
|
+
# @bot.structure[:name].should match(/Justin/i)
|
205
|
+
# @bot.structure[:tweets].first.keys.should include(:timestamp)
|
206
|
+
# @bot.structure[:tweets].first[:timestamp].should be(nil)
|
239
207
|
# end
|
240
208
|
# end
|
241
209
|
|
@@ -245,11 +213,10 @@ describe Klepto::Bot do
|
|
245
213
|
name 'h1.fullname'
|
246
214
|
username "span.screen-NOPE", default: "CHICKENS"
|
247
215
|
}
|
248
|
-
@structure = @bot.resources
|
249
216
|
end
|
250
217
|
|
251
218
|
it 'should have a sensible default for the structure' do
|
252
|
-
@structure
|
219
|
+
@bot.structure[:username].should eq('CHICKENS')
|
253
220
|
end
|
254
221
|
end
|
255
222
|
|
@@ -259,12 +226,11 @@ describe Klepto::Bot do
|
|
259
226
|
name 'h1.fullname', parser: TextParser
|
260
227
|
links 'span.url a', :match => :all, :parser => HrefParser
|
261
228
|
}
|
262
|
-
@structure = @bot.resources
|
263
229
|
end
|
264
230
|
|
265
231
|
it 'should structure the data' do
|
266
|
-
@structure
|
267
|
-
@structure
|
232
|
+
@bot.structure[:name].should match(/Justin/i)
|
233
|
+
@bot.structure[:links].first.should match(/^http:/i)
|
268
234
|
end
|
269
235
|
end
|
270
236
|
|
@@ -300,7 +266,6 @@ describe Klepto::Bot do
|
|
300
266
|
end
|
301
267
|
end
|
302
268
|
}
|
303
|
-
@structure = @bot.resources
|
304
269
|
end
|
305
270
|
|
306
271
|
it 'should limit the nodes structured' do
|
@@ -308,7 +273,5 @@ describe Klepto::Bot do
|
|
308
273
|
Tweet.count.should be(5)
|
309
274
|
end
|
310
275
|
end
|
311
|
-
|
312
|
-
|
313
276
|
end
|
314
277
|
end
|
@@ -4,7 +4,7 @@ describe Klepto::Config do
|
|
4
4
|
before(:each) do
|
5
5
|
@config = Klepto::Config.new
|
6
6
|
@config.headers({'Referer' => 'http://example.com'})
|
7
|
-
@config.
|
7
|
+
@config.url 'http://example.com'
|
8
8
|
@config.on_http_status(200){
|
9
9
|
"Its 200"
|
10
10
|
}
|
@@ -39,8 +39,8 @@ describe Klepto::Config do
|
|
39
39
|
@config.instance_variable_get("@status_handlers")['4xx'].first.call.should eq ('Its crazy.')
|
40
40
|
end
|
41
41
|
|
42
|
-
it 'should be able to set
|
43
|
-
@config.
|
42
|
+
it 'should be able to set a URL' do
|
43
|
+
@config.url.should == 'http://example.com'
|
44
44
|
end
|
45
45
|
|
46
46
|
it 'should have an abort on 4xx/5xx option' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: klepto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2013-05-31 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: poltergeist
|
16
|
-
requirement: &
|
16
|
+
requirement: &70303258668580 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - =
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70303258668580
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: capybara
|
27
|
-
requirement: &
|
27
|
+
requirement: &70303258666700 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - =
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.0.2
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70303258666700
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70303259419200 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.5.6
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70303259419200
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: activesupport
|
49
|
-
requirement: &
|
49
|
+
requirement: &70303259418820 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70303259418820
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: multi_json
|
60
|
-
requirement: &
|
60
|
+
requirement: &70303259418280 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: '1.0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70303259418280
|
69
69
|
description: Tearing up web pages into ActiveRecord resources
|
70
70
|
email:
|
71
71
|
- github@coryodaniel.com
|