klepto 0.5.3 → 0.5.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +1 -1
- data/lib/klepto/bot.rb +42 -60
- data/lib/klepto/config.rb +3 -14
- data/lib/klepto/structure.rb +4 -4
- data/lib/klepto/version.rb +1 -1
- data/spec/lib/klepto/bot_spec.rb +15 -52
- data/spec/lib/klepto/config_spec.rb +3 -3
- metadata +11 -11
data/README.md
CHANGED
@@ -165,7 +165,7 @@ end
|
|
165
165
|
|
166
166
|
## Configuration Options
|
167
167
|
* config.headers - Hash; Sets request headers
|
168
|
-
* config.
|
168
|
+
* config.url - String; Set URL to structure
|
169
169
|
* config.abort_on_failure - Boolean(Default: true); Should structuring be aborted on 4xx or 5xx
|
170
170
|
|
171
171
|
## Callbacks & Processing
|
data/lib/klepto/bot.rb
CHANGED
@@ -1,24 +1,13 @@
|
|
1
1
|
module Klepto
|
2
2
|
class Bot
|
3
3
|
attr_reader :config
|
4
|
-
@@_bots = {}
|
5
|
-
class << self
|
6
|
-
def run(name,*urls)
|
7
|
-
urls.each do |url|
|
8
|
-
@@_bots[name].parse! url
|
9
|
-
end
|
10
|
-
end
|
11
|
-
def make(name, &block)
|
12
|
-
@@_bots[name] = Klepto::Bot.new(&block)
|
13
|
-
end
|
14
|
-
end
|
15
4
|
|
16
|
-
def initialize(
|
5
|
+
def initialize(url=nil, &block)
|
17
6
|
@config = Klepto::Config.new
|
18
|
-
@config.
|
7
|
+
@config.url url
|
19
8
|
@queue = []
|
20
|
-
@
|
21
|
-
|
9
|
+
@browser = Klepto::Browser.new
|
10
|
+
|
22
11
|
# Evaluate the block as DSL, proxy off anything that isn't on #config
|
23
12
|
# to a queue, then apply that queue to the top-level Klepto::Structure
|
24
13
|
instance_eval &block
|
@@ -27,9 +16,9 @@ module Klepto
|
|
27
16
|
# and restore method_missing (for sanity sake)
|
28
17
|
instance_eval <<-EOS
|
29
18
|
def queue; @queue; end;
|
30
|
-
def
|
31
|
-
def
|
32
|
-
def
|
19
|
+
def browser; @browser; end;
|
20
|
+
def url=(_url); @config.url(_url); end;
|
21
|
+
def structure; @structure; end;
|
33
22
|
def method_missing(meth, *args, &block)
|
34
23
|
raise NoMethodError.new("undefined method: Klepto::Bot#" + meth.to_s)
|
35
24
|
end
|
@@ -39,54 +28,47 @@ EOS
|
|
39
28
|
end
|
40
29
|
|
41
30
|
# Structure all the pages
|
42
|
-
def __process!
|
43
|
-
@
|
44
|
-
|
45
|
-
|
46
|
-
browser = Klepto::Browser.new
|
31
|
+
def __process!
|
32
|
+
@structure = nil
|
33
|
+
@browser.set_headers @config.headers
|
34
|
+
#browser.set_driver config.driver
|
47
35
|
|
48
|
-
|
49
|
-
|
36
|
+
# Call before(:each) handlers...
|
37
|
+
@config.before_handlers[:each].each { |bh|
|
38
|
+
bh.call(url, browser)
|
39
|
+
}
|
40
|
+
|
41
|
+
begin
|
42
|
+
@browser.fetch! @config.url
|
50
43
|
|
51
|
-
#
|
52
|
-
config.
|
53
|
-
|
54
|
-
|
44
|
+
# Fire callbacks on GET
|
45
|
+
@config.after_handlers[:get].each do |ah|
|
46
|
+
ah.call(@browser.page, @browser, @config.url)
|
47
|
+
end
|
48
|
+
|
49
|
+
# Dispatch all the handlers for HTTP Status Codes.
|
50
|
+
@browser.statuses.each do |status|
|
51
|
+
@config.dispatch_status_handlers(status, @browser.page)
|
52
|
+
end
|
55
53
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
# Dispatch all the handlers for HTTP Status Codes.
|
67
|
-
browser.statuses.each do |status|
|
68
|
-
config.dispatch_status_handlers(status, browser.page)
|
69
|
-
end
|
70
|
-
|
71
|
-
# If the page was not a failure or if not aborting, structure that bad boy.
|
72
|
-
if (browser.failure? && config.abort_on_failure?) || (config.abort_on_redirect? && browser.was_redirected?)
|
73
|
-
config.after_handlers[:abort].each do |ah|
|
74
|
-
ah.call(browser.page,{
|
75
|
-
browser_failure: browser.failure?,
|
76
|
-
abort_on_failure: config.abort_on_failure?,
|
77
|
-
abort_on_redirect: config.abort_on_redirect?,
|
78
|
-
redirect: browser.was_redirected?
|
79
|
-
})
|
80
|
-
end
|
81
|
-
else
|
82
|
-
@resources << __structure(browser.page)
|
54
|
+
# If the page was not a failure or if not aborting, structure that bad boy.
|
55
|
+
if (@browser.failure? && @config.abort_on_failure?) || (@config.abort_on_redirect? && @browser.was_redirected?)
|
56
|
+
@config.after_handlers[:abort].each do |ah|
|
57
|
+
ah.call(browser.page,{
|
58
|
+
browser_failure: @browser.failure?,
|
59
|
+
abort_on_failure: @config.abort_on_failure?,
|
60
|
+
abort_on_redirect: @config.abort_on_redirect?,
|
61
|
+
redirect: @browser.was_redirected?
|
62
|
+
})
|
83
63
|
end
|
84
|
-
|
85
|
-
|
86
|
-
end
|
64
|
+
else
|
65
|
+
@structure = __structure(@browser.page)
|
66
|
+
end
|
67
|
+
rescue Capybara::Poltergeist::TimeoutError => ex
|
68
|
+
config.dispatch_timeout_handler(ex, url)
|
87
69
|
end
|
88
70
|
|
89
|
-
@
|
71
|
+
@structure
|
90
72
|
end
|
91
73
|
|
92
74
|
def __structure(context)
|
data/lib/klepto/config.rb
CHANGED
@@ -2,14 +2,11 @@ module Klepto
|
|
2
2
|
class Config
|
3
3
|
attr_reader :after_handlers
|
4
4
|
attr_reader :before_handlers
|
5
|
-
attr_reader :keep_pages
|
6
5
|
|
7
6
|
def initialize
|
8
7
|
@headers = {}
|
9
|
-
@keep_pages = false
|
10
8
|
@abort_on_failure = true
|
11
9
|
@abort_on_redirect = false
|
12
|
-
@urls = []
|
13
10
|
@after_handlers = {
|
14
11
|
:each => [], #after each call to
|
15
12
|
:get => [], #after GET, before structure
|
@@ -32,11 +29,6 @@ module Klepto
|
|
32
29
|
# @default_driver
|
33
30
|
# end
|
34
31
|
|
35
|
-
def keep_pages(_keep = nil)
|
36
|
-
@keep_pages = _keep if _keep != nil
|
37
|
-
@keep_pages
|
38
|
-
end
|
39
|
-
|
40
32
|
def headers(_headers=nil)
|
41
33
|
@headers = _headers if _headers
|
42
34
|
@headers
|
@@ -103,12 +95,9 @@ module Klepto
|
|
103
95
|
@before_handlers[which].push block
|
104
96
|
end
|
105
97
|
|
106
|
-
def url(
|
107
|
-
@
|
108
|
-
@
|
109
|
-
@urls.uniq!
|
110
|
-
@urls
|
98
|
+
def url(url=nil)
|
99
|
+
@url = url if url
|
100
|
+
@url
|
111
101
|
end
|
112
|
-
alias :urls :url
|
113
102
|
end
|
114
103
|
end
|
data/lib/klepto/structure.rb
CHANGED
@@ -61,8 +61,8 @@ module Klepto
|
|
61
61
|
Klepto.logger.debug("\t\t\tAs: block (match all), Result? #{!result.nil?}")
|
62
62
|
@_hash[meth] = []
|
63
63
|
options[:limit] ||= result.length
|
64
|
-
result[0, options[:limit]].each do |
|
65
|
-
@_hash[meth] << block.call(
|
64
|
+
result[0, options[:limit]].each do |_node|
|
65
|
+
@_hash[meth] << block.call( _node )
|
66
66
|
end
|
67
67
|
else
|
68
68
|
if result
|
@@ -81,8 +81,8 @@ module Klepto
|
|
81
81
|
Klepto.logger.debug("\t\t\tAs: simple (match all), Result? #{!result.nil?}")
|
82
82
|
@_hash[meth] = []
|
83
83
|
options[:limit] ||= result.length
|
84
|
-
result[0, options[:limit]].each do |
|
85
|
-
@_hash[meth] << (
|
84
|
+
result[0, options[:limit]].each do |_node|
|
85
|
+
@_hash[meth] << (_node[options[:attr]] || _node.try(:text))
|
86
86
|
end
|
87
87
|
elsif result
|
88
88
|
Klepto.logger.debug("\t\t\tAs: block (match one)")
|
data/lib/klepto/version.rb
CHANGED
data/spec/lib/klepto/bot_spec.rb
CHANGED
@@ -13,11 +13,10 @@ describe Klepto::Bot do
|
|
13
13
|
StatusLog.create message: 'Abort!'
|
14
14
|
}
|
15
15
|
}
|
16
|
-
@structure = @bot.resources
|
17
16
|
end
|
18
17
|
|
19
18
|
it 'should structure not have structured the data' do
|
20
|
-
@structure.should
|
19
|
+
@bot.structure.should be_nil
|
21
20
|
end
|
22
21
|
|
23
22
|
it 'should have dispatched abort handlers' do
|
@@ -37,11 +36,10 @@ describe Klepto::Bot do
|
|
37
36
|
StatusLog.create message: '200'
|
38
37
|
}
|
39
38
|
}
|
40
|
-
@structure = @bot.resources
|
41
39
|
end
|
42
40
|
|
43
41
|
it 'should structure the data' do
|
44
|
-
@structure
|
42
|
+
@bot.structure[:name].should match(/Justin/i)
|
45
43
|
end
|
46
44
|
|
47
45
|
it 'should have dispatched status handlers' do
|
@@ -62,7 +60,6 @@ describe Klepto::Bot do
|
|
62
60
|
StatusLog.create message: 'Aborted.'
|
63
61
|
end
|
64
62
|
}
|
65
|
-
@structure = @bot.resources
|
66
63
|
end
|
67
64
|
|
68
65
|
it 'should abort after a 4xx or 5xx' do
|
@@ -79,11 +76,10 @@ describe Klepto::Bot do
|
|
79
76
|
StatusLog.create message: 'Aborted.'
|
80
77
|
end
|
81
78
|
}
|
82
|
-
@structure = @bot.resources
|
83
79
|
end
|
84
80
|
|
85
81
|
it 'should perform structuring' do
|
86
|
-
@structure
|
82
|
+
@bot.structure[:title].should == 'Not Found'
|
87
83
|
end
|
88
84
|
|
89
85
|
it 'should not abort after a 4xx or 5xx' do
|
@@ -91,21 +87,6 @@ describe Klepto::Bot do
|
|
91
87
|
end
|
92
88
|
end
|
93
89
|
|
94
|
-
describe 'structuring multiple pages' do
|
95
|
-
before(:each) do
|
96
|
-
@bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
97
|
-
config.urls "https://twitter.com/ladygaga"
|
98
|
-
name 'h1.fullname'
|
99
|
-
}
|
100
|
-
@structure = @bot.resources
|
101
|
-
end
|
102
|
-
|
103
|
-
it 'should have both pages data' do
|
104
|
-
@structure.first[:name].should match(/Justin/i)
|
105
|
-
@structure.last[:name].should match(/Lady/i)
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
90
|
describe 'creating a bot' do
|
110
91
|
before(:each) do
|
111
92
|
@bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
@@ -114,8 +95,6 @@ describe Klepto::Bot do
|
|
114
95
|
'X-Sup-Dawg' => "Yo, What's up?"
|
115
96
|
})
|
116
97
|
|
117
|
-
config.keep_pages true
|
118
|
-
|
119
98
|
# Structure that stuff
|
120
99
|
name 'h1.fullname'
|
121
100
|
username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
|
@@ -169,28 +148,18 @@ describe Klepto::Bot do
|
|
169
148
|
end
|
170
149
|
end
|
171
150
|
}
|
172
|
-
@structure = @bot.resources
|
173
151
|
end
|
174
152
|
|
175
153
|
it 'should structure the data' do
|
176
|
-
@structure
|
177
|
-
@structure
|
178
|
-
#@structure
|
179
|
-
@structure
|
180
|
-
@structure
|
181
|
-
end
|
182
|
-
|
183
|
-
it 'should have the pages stored' do
|
184
|
-
@bot.pages["https://twitter.com/justinbieber"].should_not be_nil
|
185
|
-
end
|
186
|
-
|
187
|
-
it 'should be able to #parse! a url' do
|
188
|
-
@new_structure = @bot.parse!("https://twitter.com/justinbieber")
|
189
|
-
@new_structure.first[:name].should match(/Justin/i)
|
154
|
+
@bot.structure[:name].should match(/Justin/i)
|
155
|
+
@bot.structure[:links].first.should match(/^http:/i)
|
156
|
+
#@bot.structure[:links].should == ["http://t.co/2oSNE36kNM"]
|
157
|
+
@bot.structure[:username].should eq '@justinbieber'
|
158
|
+
@bot.structure[:last_tweet][:twitter_id].should == @bot.structure[:tweets].first[:twitter_id]
|
190
159
|
end
|
191
160
|
|
192
161
|
it 'should store the data' do
|
193
|
-
User.first.name.should eq( @structure
|
162
|
+
User.first.name.should eq( @bot.structure[:name] )
|
194
163
|
User.count.should be(1)
|
195
164
|
Tweet.count.should_not be(0)
|
196
165
|
end
|
@@ -229,13 +198,12 @@ describe Klepto::Bot do
|
|
229
198
|
# end
|
230
199
|
# end
|
231
200
|
# }
|
232
|
-
# @structure = @bot.resources
|
233
201
|
# end
|
234
202
|
|
235
203
|
# it 'should set the value to nil when an exception is raised' do
|
236
|
-
# @structure
|
237
|
-
# @structure
|
238
|
-
# @structure
|
204
|
+
# @bot.structure[:name].should match(/Justin/i)
|
205
|
+
# @bot.structure[:tweets].first.keys.should include(:timestamp)
|
206
|
+
# @bot.structure[:tweets].first[:timestamp].should be(nil)
|
239
207
|
# end
|
240
208
|
# end
|
241
209
|
|
@@ -245,11 +213,10 @@ describe Klepto::Bot do
|
|
245
213
|
name 'h1.fullname'
|
246
214
|
username "span.screen-NOPE", default: "CHICKENS"
|
247
215
|
}
|
248
|
-
@structure = @bot.resources
|
249
216
|
end
|
250
217
|
|
251
218
|
it 'should have a sensible default for the structure' do
|
252
|
-
@structure
|
219
|
+
@bot.structure[:username].should eq('CHICKENS')
|
253
220
|
end
|
254
221
|
end
|
255
222
|
|
@@ -259,12 +226,11 @@ describe Klepto::Bot do
|
|
259
226
|
name 'h1.fullname', parser: TextParser
|
260
227
|
links 'span.url a', :match => :all, :parser => HrefParser
|
261
228
|
}
|
262
|
-
@structure = @bot.resources
|
263
229
|
end
|
264
230
|
|
265
231
|
it 'should structure the data' do
|
266
|
-
@structure
|
267
|
-
@structure
|
232
|
+
@bot.structure[:name].should match(/Justin/i)
|
233
|
+
@bot.structure[:links].first.should match(/^http:/i)
|
268
234
|
end
|
269
235
|
end
|
270
236
|
|
@@ -300,7 +266,6 @@ describe Klepto::Bot do
|
|
300
266
|
end
|
301
267
|
end
|
302
268
|
}
|
303
|
-
@structure = @bot.resources
|
304
269
|
end
|
305
270
|
|
306
271
|
it 'should limit the nodes structured' do
|
@@ -308,7 +273,5 @@ describe Klepto::Bot do
|
|
308
273
|
Tweet.count.should be(5)
|
309
274
|
end
|
310
275
|
end
|
311
|
-
|
312
|
-
|
313
276
|
end
|
314
277
|
end
|
@@ -4,7 +4,7 @@ describe Klepto::Config do
|
|
4
4
|
before(:each) do
|
5
5
|
@config = Klepto::Config.new
|
6
6
|
@config.headers({'Referer' => 'http://example.com'})
|
7
|
-
@config.
|
7
|
+
@config.url 'http://example.com'
|
8
8
|
@config.on_http_status(200){
|
9
9
|
"Its 200"
|
10
10
|
}
|
@@ -39,8 +39,8 @@ describe Klepto::Config do
|
|
39
39
|
@config.instance_variable_get("@status_handlers")['4xx'].first.call.should eq ('Its crazy.')
|
40
40
|
end
|
41
41
|
|
42
|
-
it 'should be able to set
|
43
|
-
@config.
|
42
|
+
it 'should be able to set a URL' do
|
43
|
+
@config.url.should == 'http://example.com'
|
44
44
|
end
|
45
45
|
|
46
46
|
it 'should have an abort on 4xx/5xx option' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: klepto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2013-05-31 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: poltergeist
|
16
|
-
requirement: &
|
16
|
+
requirement: &70303258668580 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - =
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70303258668580
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: capybara
|
27
|
-
requirement: &
|
27
|
+
requirement: &70303258666700 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - =
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.0.2
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70303258666700
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70303259419200 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.5.6
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70303259419200
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: activesupport
|
49
|
-
requirement: &
|
49
|
+
requirement: &70303259418820 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70303259418820
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: multi_json
|
60
|
-
requirement: &
|
60
|
+
requirement: &70303259418280 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: '1.0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70303259418280
|
69
69
|
description: Tearing up web pages into ActiveRecord resources
|
70
70
|
email:
|
71
71
|
- github@coryodaniel.com
|