klepto 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Klepto
2
2
 
3
- A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data into ActiveRecord or wherever(TM).
3
+ A mean little DSL'd capybara (poltergeist) based web scraper that structures data into ActiveRecord or wherever(TM).
4
4
 
5
5
  ## Features
6
6
 
@@ -8,7 +8,6 @@ A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data in
8
8
  * Full javascript processing via phantomjs / poltergeist
9
9
  * All the fun of capybara
10
10
  * Scrape multiple pages with a single bot
11
- * Scrape individuals pages with multiple 'crawlers', see Bieber example.
12
11
  * Pretty nifty DSL
13
12
  * Test coverage!
14
13
 
@@ -16,8 +15,8 @@ A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data in
16
15
  Say you want a bunch of Bieb tweets! How is there not profit in that?
17
16
 
18
17
  ```ruby
19
- # Crawl a web site or multiple. Structure#crawl takes a *splat!
20
- @structures = Klepto::Bot.new("https://twitter.com/justinbieber"){
18
+ # Fetch one web site or several. Bot.new takes a *splat!
19
+ @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
21
20
  # By default, it uses CSS selectors
22
21
  name 'h1.fullname'
23
22
 
@@ -73,6 +72,10 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
73
72
  puts "HOLY CRAP!"
74
73
  }
75
74
 
75
+ config.after(:get) do |page|
76
+ # This is fired after each HTTP GET. It receives a Capybara::Node
77
+ end
78
+
76
79
  # If you want to do something with each resource, like stick it in AR
77
80
  # go for it here...
78
81
  config.after do |resource|
@@ -87,10 +90,10 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
87
90
  end #=> Profit!
88
91
  }
89
92
 
90
- # An array of hashes is returned, so if you wanted to do something else
93
+ # You can get an array of hashes (resources), so if you want to do something else
91
94
  # you could do it here...
92
- @structures.each do |structure|
93
- pp structure
95
+ @bot.resources.each do |resource|
96
+ pp resource
94
97
  end
95
98
  ```
96
99
 
@@ -118,8 +121,19 @@ end
118
121
  }
119
122
  ```
120
123
 
124
+ ## Callbacks & Processing
125
+
126
+ * before
127
+ * n/a
128
+ * after
129
+ * :each (resource, Hash) - called for each resource parsed
130
+ * :get (page, Capybara::Node) - called after each HTTP GET
121
131
 
122
132
  ## Stuff I'm going to add.
133
+ Async
134
+ --------
135
+ -> https://github.com/igrigorik/em-synchrony
136
+
123
137
  Cookie Stuffing
124
138
  -------------------
125
139
  ```ruby
data/lib/klepto/bot.rb CHANGED
@@ -17,13 +17,6 @@ EOS
17
17
  __process!
18
18
  end
19
19
 
20
- def __dispatch_handlers_for(status_code)
21
- if status_code.is_a?(Fixnum)
22
- elsif status_code.is_a?(Symbol)
23
- elsif status_code.is_a?(String)
24
- end
25
- end
26
-
27
20
  def __process!
28
21
  @resources = []
29
22
 
@@ -32,6 +25,10 @@ EOS
32
25
 
33
26
  browser.set_headers config.headers
34
27
  browser.fetch! url
28
+
29
+ config.after_handlers[:get].each do |ah|
30
+ ah.call(browser.page)
31
+ end
35
32
 
36
33
  statuses = [browser.status, browser.statusx]
37
34
  statuses.push :redirect if url != browser.page.current_url
data/lib/klepto/config.rb CHANGED
@@ -4,8 +4,9 @@ module Klepto
4
4
 
5
5
  def initialize
6
6
  @headers = {}
7
+ @abort_on_failure = true
7
8
  @urls = []
8
- @after_handlers = {:each => []}
9
+ @after_handlers = {:each => [], :get => []}
9
10
  @before_handlers = {:each => []}
10
11
  @status_handlers = {}
11
12
  end
@@ -15,6 +16,11 @@ module Klepto
15
16
  @headers
16
17
  end
17
18
 
19
+ # 4xx, 5xx
20
+ def abort_on_failure(aof)
21
+ @abort_on_failure = aof
22
+ end
23
+
18
24
  def on_http_status(*statuses,&block)
19
25
  statuses.each do |status|
20
26
  @status_handlers[status] ||= []
@@ -36,6 +42,11 @@ module Klepto
36
42
  @after_handlers[which].push block
37
43
  end
38
44
 
45
+ def before(which, &block)
46
+ @before_handlers[which] ||= []
47
+ @before_handlers[which].push block
48
+ end
49
+
39
50
  def url(*args)
40
51
  @urls += args
41
52
  @urls.flatten!
@@ -1,3 +1,3 @@
1
1
  module Klepto
2
- VERSION = "0.2.2"
2
+ VERSION = "0.2.3"
3
3
  end
@@ -27,7 +27,7 @@ describe Klepto::Bot do
27
27
  end
28
28
  end
29
29
 
30
- describe 'crawling multiple pages' do
30
+ describe 'structuring multiple pages' do
31
31
  before(:each) do
32
32
  @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
33
33
  config.urls "https://twitter.com/ladygaga"
@@ -76,6 +76,10 @@ describe Klepto::Bot do
76
76
  permalink '.time a', :css, :attr => :href
77
77
  end
78
78
 
79
+ config.after(:get) do |page|
80
+ StatusLog.create message: 'got a page'
81
+ end
82
+
79
83
  config.on_http_status('2xx'){
80
84
  StatusLog.create message: '2xx'
81
85
  }
@@ -120,6 +124,7 @@ describe Klepto::Bot do
120
124
  statuses.should_not include 'redirect'
121
125
  statuses.should include '200'
122
126
  statuses.should include '2xx'
127
+ statuses.should include 'got a page'
123
128
  end
124
129
  end
125
130
  end
@@ -14,6 +14,7 @@ describe Klepto::Config do
14
14
  @config.on_http_status('5xx','4xx'){
15
15
  "Its crazy."
16
16
  }
17
+ @config.abort_on_failure(false)
17
18
  end
18
19
 
19
20
  it 'should be able to set headers' do
@@ -37,7 +38,10 @@ describe Klepto::Config do
37
38
  @config.urls.should == ['http://example.com', 'http://www.iana.org']
38
39
  end
39
40
 
40
- pending 'should be able to set before handlers'
41
+ it 'should have an abort on 4xx/5xx option' do
42
+ @config.instance_variable_get("@abort_on_failure").should be false
43
+ end
44
+
41
45
  pending 'should be able to set cookies'
42
46
  pending 'should be able to set steps'
43
47
  pending 'should be able to set assertions'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: klepto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2013-04-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: poltergeist
16
- requirement: &70235243869100 !ruby/object:Gem::Requirement
16
+ requirement: &70237982927260 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - =
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.1.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70235243869100
24
+ version_requirements: *70237982927260
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: capybara
27
- requirement: &70235243868100 !ruby/object:Gem::Requirement
27
+ requirement: &70237982925860 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - =
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 2.0.2
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70235243868100
35
+ version_requirements: *70237982925860
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70235243867320 !ruby/object:Gem::Requirement
38
+ requirement: &70237982924820 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.5.6
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70235243867320
46
+ version_requirements: *70237982924820
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: activesupport
49
- requirement: &70235243866680 !ruby/object:Gem::Requirement
49
+ requirement: &70237982923900 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70235243866680
57
+ version_requirements: *70237982923900
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: multi_json
60
- requirement: &70235243865240 !ruby/object:Gem::Requirement
60
+ requirement: &70237982922600 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '1.0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70235243865240
68
+ version_requirements: *70237982922600
69
69
  description: Tearing up web pages into ActiveRecord resources
70
70
  email:
71
71
  - github@coryodaniel.com