klepto 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +21 -7
- data/lib/klepto/bot.rb +4 -7
- data/lib/klepto/config.rb +12 -1
- data/lib/klepto/version.rb +1 -1
- data/spec/lib/klepto/bot_spec.rb +6 -1
- data/spec/lib/klepto/config_spec.rb +5 -1
- metadata +11 -11
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Klepto
|
2
2
|
|
3
|
-
A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data in…
|
3
|
+
A mean little DSL'd capybara (poltergeist) based web scraper that structures data into ActiveRecord or wherever(TM).
|
4
4
|
|
5
5
|
## Features
|
6
6
|
|
@@ -8,7 +8,6 @@ A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data in
|
|
8
8
|
* Full javascript processing via phantomjs / poltergeist
|
9
9
|
* All the fun of capybara
|
10
10
|
* Scrape multiple pages with a single bot
|
11
|
-
* Scrape individuals pages with multiple 'crawlers', see Bieber example.
|
12
11
|
* Pretty nifty DSL
|
13
12
|
* Test coverage!
|
14
13
|
|
@@ -16,8 +15,8 @@ A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data in
|
|
16
15
|
Say you want a bunch of Bieb tweets! How is there not profit in that?
|
17
16
|
|
18
17
|
```ruby
|
19
|
-
#
|
20
|
-
@
|
18
|
+
# Fetch a web site or multiple. Bot#new takes a *splat!
|
19
|
+
@bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
21
20
|
# By default, it uses CSS selectors
|
22
21
|
name 'h1.fullname'
|
23
22
|
|
@@ -73,6 +72,10 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
|
|
73
72
|
puts "HOLY CRAP!"
|
74
73
|
}
|
75
74
|
|
75
|
+
config.after(:get) do |page|
|
76
|
+
# This is fired after each HTTP GET. It receives a Capybara::Node
|
77
|
+
end
|
78
|
+
|
76
79
|
# If you want to do something with each resource, like stick it in AR
|
77
80
|
# go for it here...
|
78
81
|
config.after do |resource|
|
@@ -87,10 +90,10 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
|
|
87
90
|
end #=> Profit!
|
88
91
|
}
|
89
92
|
|
90
|
-
#
|
93
|
+
# You can get an array of hashes(resources), so if you wanted to do something else
|
91
94
|
# you could do it here...
|
92
|
-
@
|
93
|
-
pp
|
95
|
+
@bot.resources.each do |resource|
|
96
|
+
pp resource
|
94
97
|
end
|
95
98
|
```
|
96
99
|
|
@@ -118,8 +121,19 @@ end
|
|
118
121
|
}
|
119
122
|
```
|
120
123
|
|
124
|
+
## Callbacks & Processing
|
125
|
+
|
126
|
+
* before
|
127
|
+
* n/a
|
128
|
+
* after
|
129
|
+
* :each (resource, Hash) - called for each resource parsed
|
130
|
+
* :get (page, Capybara::Node) - called after each HTTP GET
|
121
131
|
|
122
132
|
## Stuff I'm going to add.
|
133
|
+
Async
|
134
|
+
--------
|
135
|
+
-> https://github.com/igrigorik/em-synchrony
|
136
|
+
|
123
137
|
Cookie Stuffing
|
124
138
|
-------------------
|
125
139
|
```ruby
|
data/lib/klepto/bot.rb
CHANGED
@@ -17,13 +17,6 @@ EOS
|
|
17
17
|
__process!
|
18
18
|
end
|
19
19
|
|
20
|
-
def __dispatch_handlers_for(status_code)
|
21
|
-
if status_code.is_a?(Fixnum)
|
22
|
-
elsif status_code.is_a?(Symbol)
|
23
|
-
elsif status_code.is_a?(String)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
20
|
def __process!
|
28
21
|
@resources = []
|
29
22
|
|
@@ -32,6 +25,10 @@ EOS
|
|
32
25
|
|
33
26
|
browser.set_headers config.headers
|
34
27
|
browser.fetch! url
|
28
|
+
|
29
|
+
config.after_handlers[:get].each do |ah|
|
30
|
+
ah.call(browser.page)
|
31
|
+
end
|
35
32
|
|
36
33
|
statuses = [browser.status, browser.statusx]
|
37
34
|
statuses.push :redirect if url != browser.page.current_url
|
data/lib/klepto/config.rb
CHANGED
@@ -4,8 +4,9 @@ module Klepto
|
|
4
4
|
|
5
5
|
def initialize
|
6
6
|
@headers = {}
|
7
|
+
@abort_on_failure = true
|
7
8
|
@urls = []
|
8
|
-
@after_handlers = {:each => []}
|
9
|
+
@after_handlers = {:each => [], :get => []}
|
9
10
|
@before_handlers = {:each => []}
|
10
11
|
@status_handlers = {}
|
11
12
|
end
|
@@ -15,6 +16,11 @@ module Klepto
|
|
15
16
|
@headers
|
16
17
|
end
|
17
18
|
|
19
|
+
# 4xx, 5xx
|
20
|
+
def abort_on_failure(aof)
|
21
|
+
@abort_on_failure = aof
|
22
|
+
end
|
23
|
+
|
18
24
|
def on_http_status(*statuses,&block)
|
19
25
|
statuses.each do |status|
|
20
26
|
@status_handlers[status] ||= []
|
@@ -36,6 +42,11 @@ module Klepto
|
|
36
42
|
@after_handlers[which].push block
|
37
43
|
end
|
38
44
|
|
45
|
+
def before(which, &block)
|
46
|
+
@before_handlers[which] ||= []
|
47
|
+
@before_handlers[which].push block
|
48
|
+
end
|
49
|
+
|
39
50
|
def url(*args)
|
40
51
|
@urls += args
|
41
52
|
@urls.flatten!
|
data/lib/klepto/version.rb
CHANGED
data/spec/lib/klepto/bot_spec.rb
CHANGED
@@ -27,7 +27,7 @@ describe Klepto::Bot do
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
-
describe '
|
30
|
+
describe 'structuring multiple pages' do
|
31
31
|
before(:each) do
|
32
32
|
@bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
33
33
|
config.urls "https://twitter.com/ladygaga"
|
@@ -76,6 +76,10 @@ describe Klepto::Bot do
|
|
76
76
|
permalink '.time a', :css, :attr => :href
|
77
77
|
end
|
78
78
|
|
79
|
+
config.after(:get) do |page|
|
80
|
+
StatusLog.create message: 'got a page'
|
81
|
+
end
|
82
|
+
|
79
83
|
config.on_http_status('2xx'){
|
80
84
|
StatusLog.create message: '2xx'
|
81
85
|
}
|
@@ -120,6 +124,7 @@ describe Klepto::Bot do
|
|
120
124
|
statuses.should_not include 'redirect'
|
121
125
|
statuses.should include '200'
|
122
126
|
statuses.should include '2xx'
|
127
|
+
statuses.should include 'got a page'
|
123
128
|
end
|
124
129
|
end
|
125
130
|
end
|
@@ -14,6 +14,7 @@ describe Klepto::Config do
|
|
14
14
|
@config.on_http_status('5xx','4xx'){
|
15
15
|
"Its crazy."
|
16
16
|
}
|
17
|
+
@config.abort_on_failure(false)
|
17
18
|
end
|
18
19
|
|
19
20
|
it 'should be able to set headers' do
|
@@ -37,7 +38,10 @@ describe Klepto::Config do
|
|
37
38
|
@config.urls.should == ['http://example.com', 'http://www.iana.org']
|
38
39
|
end
|
39
40
|
|
40
|
-
|
41
|
+
it 'should have an abort on 4xx/5xx option' do
|
42
|
+
@config.instance_variable_get("@abort_on_failure").should be false
|
43
|
+
end
|
44
|
+
|
41
45
|
pending 'should be able to set cookies'
|
42
46
|
pending 'should be able to set steps'
|
43
47
|
pending 'should be able to set assertions'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: klepto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2013-04-19 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: poltergeist
|
16
|
-
requirement: &
|
16
|
+
requirement: &70237982927260 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - =
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70237982927260
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: capybara
|
27
|
-
requirement: &
|
27
|
+
requirement: &70237982925860 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - =
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.0.2
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70237982925860
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70237982924820 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.5.6
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70237982924820
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: activesupport
|
49
|
-
requirement: &
|
49
|
+
requirement: &70237982923900 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70237982923900
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: multi_json
|
60
|
-
requirement: &
|
60
|
+
requirement: &70237982922600 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: '1.0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70237982922600
|
69
69
|
description: Tearing up web pages into ActiveRecord resources
|
70
70
|
email:
|
71
71
|
- github@coryodaniel.com
|