klepto 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -11,6 +11,47 @@ A mean little DSL'd capybara (poltergeist) based web scraper that structures dat
11
11
  * Pretty nifty DSL
12
12
  * Test coverage!
13
13
 
14
+ ## Installing
15
+ You need at least PhantomJS 1.8.1. There are *no other external
16
+ dependencies* (you don't need Qt, or a running X server, etc.)
17
+
18
+ ### Mac ###
19
+
20
+ * *Homebrew*: `brew install phantomjs`
21
+ * *MacPorts*: `sudo port install phantomjs`
22
+ * *Manual install*: [Download this](http://code.google.com/p/phantomjs/downloads/detail?name=phantomjs-1.8.1-macosx.zip&can=2&q=)
23
+
24
+ ### Linux ###
25
+
26
+ * Download the [32
27
+ bit](http://code.google.com/p/phantomjs/downloads/detail?name=phantomjs-1.8.1-linux-i686.tar.bz2&can=2&q=)
28
+ or [64
29
+ bit](http://code.google.com/p/phantomjs/downloads/detail?name=phantomjs-1.8.1-linux-x86_64.tar.bz2&can=2&q=)
30
+ binary.
31
+ * Extract the tarball and copy `bin/phantomjs` into your `PATH`
32
+
33
+ ### Windows ###
34
+ * Download the [precompiled binary](http://phantomjs.org/download.html) for Windows
35
+
36
+ ### Manual compilation ###
37
+
38
+ Do this as a last resort if the binaries don't work for you. It will
39
+ take quite a long time as it has to build WebKit.
40
+
41
+ * Download [the source tarball](http://code.google.com/p/phantomjs/downloads/detail?name=phantomjs-1.8.1-source.zip&can=2&q=)
42
+ * Extract it and `cd` into the extracted directory
43
+ * `./build.sh`
44
+
45
+ (See also the [PhantomJS building guide](http://phantomjs.org/build.html).)
46
+
47
+ Then add klepto to your Gemfile.
48
+
49
+ ```ruby
50
+ gem 'klepto', '>= 0.2.5'
51
+ ```
52
+
53
+
54
+
14
55
  ## Usage (All your content are belong to us)
15
56
  Say you want a bunch of Bieb tweets! How is there not profit in that?
16
57
 
@@ -121,6 +162,11 @@ end
121
162
  }
122
163
  ```
123
164
 
165
+ ## Configuration Options
166
+ * config.headers - Hash; Sets request headers
167
+ * config.urls - Array(String); Sets URLs to structure
168
+ * config.abort_on_failure - Boolean (Default: true); Whether structuring should be aborted when a page returns a 4xx or 5xx response
169
+
124
170
  ## Callbacks & Processing
125
171
 
126
172
  * before
@@ -128,6 +174,8 @@ end
128
174
  * after
129
175
  * :each (resource, Hash) - called for each resource parsed
130
176
  * :get (page, Capybara::Node) - called after each HTTP GET
177
+ * :abort (page, Capybara::Node) - called after a 4xx or 5xx response when config.abort_on_failure is true (the default)
178
+
131
179
 
132
180
  ## Stuff I'm going to add.
133
181
  Async
@@ -12,6 +12,9 @@ module Klepto
12
12
  instance_eval <<-EOS
13
13
  def queue; @queue; end;
14
14
  def resources; @resources; end;
15
+ def method_missing(meth, *args, &block)
16
+ raise NoMethodError.new("undefined method: Klepto::Bot#" + meth.to_s)
17
+ end
15
18
  EOS
16
19
 
17
20
  __process!
@@ -36,24 +39,34 @@ EOS
36
39
  config.dispatch_status_handlers(status, browser.page)
37
40
  end
38
41
 
39
- structure = Structure.new(browser.page)
40
-
41
- queue.each do |instruction|
42
- if instruction[2]
43
- structure.send instruction[0], *instruction[1], &instruction[2]
44
- else
45
- structure.send instruction[0], *instruction[1]
42
+ if !browser.failure? || (browser.failure? && !config.abort_on_failure?)
43
+ resources << __structure(browser.page)
44
+ else
45
+ config.after_handlers[:abort].each do |ah|
46
+ ah.call(browser.page)
46
47
  end
47
48
  end
49
+ end
50
+
51
+ @resources
52
+ end
53
+
54
+ def __structure(context)
55
+ structure = Structure.new(context)
48
56
 
49
- config.after_handlers[:each].each do |ah|
50
- ah.call(structure._hash)
57
+ queue.each do |instruction|
58
+ if instruction[2]
59
+ structure.send instruction[0], *instruction[1], &instruction[2]
60
+ else
61
+ structure.send instruction[0], *instruction[1]
51
62
  end
63
+ end
52
64
 
53
- resources << structure._hash
65
+ config.after_handlers[:each].each do |ah|
66
+ ah.call(structure._hash)
54
67
  end
55
68
 
56
- @resources
69
+ structure._hash
57
70
  end
58
71
 
59
72
  def method_missing(meth, *args, &block)
@@ -14,6 +14,11 @@ module Klepto
14
14
  page.status_code
15
15
  end
16
16
 
17
+ # 4xx or 5xx
18
+ def failure?
19
+ ['4xx', '5xx'].include? statusx
20
+ end
21
+
17
22
  def statusx
18
23
  page.status_code.to_s[0..-3] + "xx"
19
24
  end
@@ -6,7 +6,11 @@ module Klepto
6
6
  @headers = {}
7
7
  @abort_on_failure = true
8
8
  @urls = []
9
- @after_handlers = {:each => [], :get => []}
9
+ @after_handlers = {
10
+ :each => [],
11
+ :get => [],
12
+ :abort=> []
13
+ }
10
14
  @before_handlers = {:each => []}
11
15
  @status_handlers = {}
12
16
  end
@@ -16,6 +20,10 @@ module Klepto
16
20
  @headers
17
21
  end
18
22
 
23
+ def abort_on_failure?
24
+ !!@abort_on_failure
25
+ end
26
+
19
27
  # 4xx, 5xx
20
28
  def abort_on_failure(aof)
21
29
  @abort_on_failure = aof
@@ -1,3 +1,3 @@
1
1
  module Klepto
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.5"
3
3
  end
@@ -27,6 +27,44 @@ describe Klepto::Bot do
27
27
  end
28
28
  end
29
29
 
30
+ describe 'aborting after a failure' do
31
+ before(:each) do
32
+ @bot = Klepto::Bot.new("https://github.com/coryodaniel/nonexistentproject"){
33
+ name 'h1.fullname'
34
+ config.abort_on_failure true
35
+ config.after(:abort) do |page|
36
+ StatusLog.create message: 'Aborted.'
37
+ end
38
+ }
39
+ @structure = @bot.resources
40
+ end
41
+
42
+ it 'should abort after a 4xx or 5xx' do
43
+ StatusLog.first.message.should eq("Aborted.")
44
+ end
45
+ end
46
+
47
+ describe 'structuring a 4xx or 5xx response' do
48
+ before(:each) do
49
+ @bot = Klepto::Bot.new("https://github.com/coryodaniel/nonexistentproject"){
50
+ cat 'img#parallax_octocat', :attr => :alt
51
+ config.abort_on_failure false
52
+ config.after(:abort) do |page|
53
+ StatusLog.create message: 'Aborted.'
54
+ end
55
+ }
56
+ @structure = @bot.resources
57
+ end
58
+
59
+ it 'should perform structuring' do
60
+ @structure.first[:cat].should eq('Octobi Wan Catnobi')
61
+ end
62
+
63
+ it 'should not abort after a 4xx or 5xx' do
64
+ StatusLog.first.should be(nil)
65
+ end
66
+ end
67
+
30
68
  describe 'structuring multiple pages' do
31
69
  before(:each) do
32
70
  @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
@@ -118,6 +156,13 @@ describe Klepto::Bot do
118
156
  Tweet.count.should_not be(0)
119
157
  end
120
158
 
159
+ it 'should not have the DSL once its been processed' do
160
+ lambda{
161
+ @bot.i_dont_exist
162
+ }.should raise_error(NoMethodError)
163
+
164
+ end
165
+
121
166
  it 'should have dispatched status handlers' do
122
167
  statuses = StatusLog.all.map(&:message)
123
168
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: klepto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2013-04-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: poltergeist
16
- requirement: &70237982927260 !ruby/object:Gem::Requirement
16
+ requirement: &70341397514440 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - =
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.1.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70237982927260
24
+ version_requirements: *70341397514440
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: capybara
27
- requirement: &70237982925860 !ruby/object:Gem::Requirement
27
+ requirement: &70341397512360 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - =
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 2.0.2
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70237982925860
35
+ version_requirements: *70341397512360
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70237982924820 !ruby/object:Gem::Requirement
38
+ requirement: &70341397509940 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.5.6
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70237982924820
46
+ version_requirements: *70341397509940
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: activesupport
49
- requirement: &70237982923900 !ruby/object:Gem::Requirement
49
+ requirement: &70341397506060 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70237982923900
57
+ version_requirements: *70341397506060
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: multi_json
60
- requirement: &70237982922600 !ruby/object:Gem::Requirement
60
+ requirement: &70341397504900 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '1.0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70237982922600
68
+ version_requirements: *70341397504900
69
69
  description: Tearing up web pages into ActiveRecord resources
70
70
  email:
71
71
  - github@coryodaniel.com