klepto 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +48 -0
- data/lib/klepto/bot.rb +24 -11
- data/lib/klepto/browser.rb +5 -0
- data/lib/klepto/config.rb +9 -1
- data/lib/klepto/version.rb +1 -1
- data/spec/lib/klepto/bot_spec.rb +45 -0
- metadata +11 -11
data/README.md
CHANGED
@@ -11,6 +11,47 @@ A mean little DSL'd capybara (poltergeist) based web scraper that structures dat
|
|
11
11
|
* Pretty nifty DSL
|
12
12
|
* Test coverage!
|
13
13
|
|
14
|
+
## Installing
|
15
|
+
You need at least PhantomJS 1.8.1. There are *no other external
|
16
|
+
dependencies* (you don't need Qt, or a running X server, etc.)
|
17
|
+
|
18
|
+
### Mac ###
|
19
|
+
|
20
|
+
* *Homebrew*: `brew install phantomjs`
|
21
|
+
* *MacPorts*: `sudo port install phantomjs`
|
22
|
+
* *Manual install*: [Download this](http://code.google.com/p/phantomjs/downloads/detail?name=phantomjs-1.8.1-macosx.zip&can=2&q=)
|
23
|
+
|
24
|
+
### Linux ###
|
25
|
+
|
26
|
+
* Download the [32
|
27
|
+
bit](http://code.google.com/p/phantomjs/downloads/detail?name=phantomjs-1.8.1-linux-i686.tar.bz2&can=2&q=)
|
28
|
+
or [64
|
29
|
+
bit](http://code.google.com/p/phantomjs/downloads/detail?name=phantomjs-1.8.1-linux-x86_64.tar.bz2&can=2&q=)
|
30
|
+
binary.
|
31
|
+
* Extract the tarball and copy `bin/phantomjs` into your `PATH`
|
32
|
+
|
33
|
+
### Windows ###
|
34
|
+
* Download the [precompiled binary](http://phantomjs.org/download.html) for Windows
|
35
|
+
|
36
|
+
### Manual compilation ###
|
37
|
+
|
38
|
+
Do this as a last resort if the binaries don't work for you. It will
|
39
|
+
take quite a long time as it has to build WebKit.
|
40
|
+
|
41
|
+
* Download [the source tarball](http://code.google.com/p/phantomjs/downloads/detail?name=phantomjs-1.8.1-source.zip&can=2&q=)
|
42
|
+
* Extract and cd in
|
43
|
+
* `./build.sh`
|
44
|
+
|
45
|
+
(See also the [PhantomJS building guide](http://phantomjs.org/build.html).)
|
46
|
+
|
47
|
+
Then put klepto in your gemfile.
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
gem 'klepto', '>= 0.2.5'
|
51
|
+
```
|
52
|
+
|
53
|
+
|
54
|
+
|
14
55
|
## Usage (All your content are belong to us)
|
15
56
|
Say you want a bunch of Bieb tweets! How is there not profit in that?
|
16
57
|
|
@@ -121,6 +162,11 @@ end
|
|
121
162
|
}
|
122
163
|
```
|
123
164
|
|
165
|
+
## Configuration Options
|
166
|
+
* config.headers - Hash; Sets request headers
|
167
|
+
* config.urls - Array(String); Sets URLs to structure
|
168
|
+
* config.abort_on_failure - Boolean(Default: true); Should structuring be aborted on 4xx or 5xx
|
169
|
+
|
124
170
|
## Callbacks & Processing
|
125
171
|
|
126
172
|
* before
|
@@ -128,6 +174,8 @@ end
|
|
128
174
|
* after
|
129
175
|
* :each (resource, Hash) - called for each resource parsed
|
130
176
|
* :get (page, Capybara::Node) - called after each HTTP GET
|
177
|
+
* :abort (page, Capybara::Node) - called after a 4xx or 5xx if config.abort_on_failure is true (default)
|
178
|
+
|
131
179
|
|
132
180
|
## Stuff I'm going to add.
|
133
181
|
Async
|
data/lib/klepto/bot.rb
CHANGED
@@ -12,6 +12,9 @@ module Klepto
|
|
12
12
|
instance_eval <<-EOS
|
13
13
|
def queue; @queue; end;
|
14
14
|
def resources; @resources; end;
|
15
|
+
def method_missing(meth, *args, &block)
|
16
|
+
raise NoMethodError.new("undefined method: Klepto::Bot#" + meth.to_s)
|
17
|
+
end
|
15
18
|
EOS
|
16
19
|
|
17
20
|
__process!
|
@@ -36,24 +39,34 @@ EOS
|
|
36
39
|
config.dispatch_status_handlers(status, browser.page)
|
37
40
|
end
|
38
41
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
else
|
45
|
-
structure.send instruction[0], *instruction[1]
|
42
|
+
if !browser.failure? || (browser.failure? && !config.abort_on_failure?)
|
43
|
+
resources << __structure(browser.page)
|
44
|
+
else
|
45
|
+
config.after_handlers[:abort].each do |ah|
|
46
|
+
ah.call(browser.page)
|
46
47
|
end
|
47
48
|
end
|
49
|
+
end
|
50
|
+
|
51
|
+
@resources
|
52
|
+
end
|
53
|
+
|
54
|
+
def __structure(context)
|
55
|
+
structure = Structure.new(context)
|
48
56
|
|
49
|
-
|
50
|
-
|
57
|
+
queue.each do |instruction|
|
58
|
+
if instruction[2]
|
59
|
+
structure.send instruction[0], *instruction[1], &instruction[2]
|
60
|
+
else
|
61
|
+
structure.send instruction[0], *instruction[1]
|
51
62
|
end
|
63
|
+
end
|
52
64
|
|
53
|
-
|
65
|
+
config.after_handlers[:each].each do |ah|
|
66
|
+
ah.call(structure._hash)
|
54
67
|
end
|
55
68
|
|
56
|
-
|
69
|
+
structure._hash
|
57
70
|
end
|
58
71
|
|
59
72
|
def method_missing(meth, *args, &block)
|
data/lib/klepto/browser.rb
CHANGED
data/lib/klepto/config.rb
CHANGED
@@ -6,7 +6,11 @@ module Klepto
|
|
6
6
|
@headers = {}
|
7
7
|
@abort_on_failure = true
|
8
8
|
@urls = []
|
9
|
-
@after_handlers = {
|
9
|
+
@after_handlers = {
|
10
|
+
:each => [],
|
11
|
+
:get => [],
|
12
|
+
:abort=> []
|
13
|
+
}
|
10
14
|
@before_handlers = {:each => []}
|
11
15
|
@status_handlers = {}
|
12
16
|
end
|
@@ -16,6 +20,10 @@ module Klepto
|
|
16
20
|
@headers
|
17
21
|
end
|
18
22
|
|
23
|
+
def abort_on_failure?
|
24
|
+
!!@abort_on_failure
|
25
|
+
end
|
26
|
+
|
19
27
|
# 4xx, 5xx
|
20
28
|
def abort_on_failure(aof)
|
21
29
|
@abort_on_failure = aof
|
data/lib/klepto/version.rb
CHANGED
data/spec/lib/klepto/bot_spec.rb
CHANGED
@@ -27,6 +27,44 @@ describe Klepto::Bot do
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
+
describe 'aborting after a failure' do
|
31
|
+
before(:each) do
|
32
|
+
@bot = Klepto::Bot.new("https://github.com/coryodaniel/nonexistentproject"){
|
33
|
+
name 'h1.fullname'
|
34
|
+
config.abort_on_failure true
|
35
|
+
config.after(:abort) do |page|
|
36
|
+
StatusLog.create message: 'Aborted.'
|
37
|
+
end
|
38
|
+
}
|
39
|
+
@structure = @bot.resources
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should abort after a 4xx or 5xx' do
|
43
|
+
StatusLog.first.message.should eq("Aborted.")
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
describe 'structuring a 4xx or 5xx response' do
|
48
|
+
before(:each) do
|
49
|
+
@bot = Klepto::Bot.new("https://github.com/coryodaniel/nonexistentproject"){
|
50
|
+
cat 'img#parallax_octocat', :attr => :alt
|
51
|
+
config.abort_on_failure false
|
52
|
+
config.after(:abort) do |page|
|
53
|
+
StatusLog.create message: 'Aborted.'
|
54
|
+
end
|
55
|
+
}
|
56
|
+
@structure = @bot.resources
|
57
|
+
end
|
58
|
+
|
59
|
+
it 'should perform structuring' do
|
60
|
+
@structure.first[:cat].should eq('Octobi Wan Catnobi')
|
61
|
+
end
|
62
|
+
|
63
|
+
it 'should not abort after a 4xx or 5xx' do
|
64
|
+
StatusLog.first.should be(nil)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
30
68
|
describe 'structuring multiple pages' do
|
31
69
|
before(:each) do
|
32
70
|
@bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
@@ -118,6 +156,13 @@ describe Klepto::Bot do
|
|
118
156
|
Tweet.count.should_not be(0)
|
119
157
|
end
|
120
158
|
|
159
|
+
it 'should not have the DSL once its been processed' do
|
160
|
+
lambda{
|
161
|
+
@bot.i_dont_exist
|
162
|
+
}.should raise_error(NoMethodError)
|
163
|
+
|
164
|
+
end
|
165
|
+
|
121
166
|
it 'should have dispatched status handlers' do
|
122
167
|
statuses = StatusLog.all.map(&:message)
|
123
168
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: klepto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2013-04-19 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: poltergeist
|
16
|
-
requirement: &
|
16
|
+
requirement: &70341397514440 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - =
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70341397514440
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: capybara
|
27
|
-
requirement: &
|
27
|
+
requirement: &70341397512360 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - =
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.0.2
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70341397512360
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70341397509940 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.5.6
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70341397509940
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: activesupport
|
49
|
-
requirement: &
|
49
|
+
requirement: &70341397506060 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70341397506060
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: multi_json
|
60
|
-
requirement: &
|
60
|
+
requirement: &70341397504900 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: '1.0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70341397504900
|
69
69
|
description: Tearing up web pages into ActiveRecord resources
|
70
70
|
email:
|
71
71
|
- github@coryodaniel.com
|