klepto 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Klepto
2
2
 
3
- A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data into ActiveRecord or wherever(TM).
3
+ A mean little DSL'd capybara (poltergeist) based web scraper that structures data into ActiveRecord or wherever(TM).
4
4
 
5
5
  ## Features
6
6
 
@@ -8,7 +8,6 @@ A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data in
8
8
  * Full javascript processing via phantomjs / poltergeist
9
9
  * All the fun of capybara
10
10
  * Scrape multiple pages with a single bot
11
- * Scrape individuals pages with multiple 'crawlers', see Bieber example.
12
11
  * Pretty nifty DSL
13
12
  * Test coverage!
14
13
 
@@ -16,8 +15,8 @@ A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data in
16
15
  Say you want a bunch of Bieb tweets! How is there not profit in that?
17
16
 
18
17
  ```ruby
19
- # Crawl a web site or multiple. Structure#crawl takes a *splat!
20
- @structures = Klepto::Bot.new("https://twitter.com/justinbieber"){
18
+ # Fetch one web site or several. Bot.new takes a *splat!
19
+ @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
21
20
  # By default, it uses CSS selectors
22
21
  name 'h1.fullname'
23
22
 
@@ -73,6 +72,10 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
73
72
  puts "HOLY CRAP!"
74
73
  }
75
74
 
75
+ config.after(:get) do |page|
76
+ # This is fired after each HTTP GET. It receives a Capybara::Node
77
+ end
78
+
76
79
  # If you want to do something with each resource, like stick it in AR
77
80
  # go for it here...
78
81
  config.after do |resource|
@@ -87,10 +90,10 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
87
90
  end #=> Profit!
88
91
  }
89
92
 
90
- # An array of hashes is returned, so if you wanted to do something else
93
+ # You can get an array of hashes (resources), so if you wanted to do something else
91
94
  # you could do it here...
92
- @structures.each do |structure|
93
- pp structure
95
+ @bot.resources.each do |resource|
96
+ pp resource
94
97
  end
95
98
  ```
96
99
 
@@ -118,8 +121,19 @@ end
118
121
  }
119
122
  ```
120
123
 
124
+ ## Callbacks & Processing
125
+
126
+ * before
127
+ * n/a
128
+ * after
129
+ * :each (resource, Hash) - called for each resource parsed
130
+ * :get (page, Capybara::Node) - called after each HTTP GET
121
131
 
122
132
  ## Stuff I'm going to add.
133
+ Async
134
+ --------
135
+ -> https://github.com/igrigorik/em-synchrony
136
+
123
137
  Cookie Stuffing
124
138
  -------------------
125
139
  ```ruby
data/lib/klepto/bot.rb CHANGED
@@ -17,13 +17,6 @@ EOS
17
17
  __process!
18
18
  end
19
19
 
20
- def __dispatch_handlers_for(status_code)
21
- if status_code.is_a?(Fixnum)
22
- elsif status_code.is_a?(Symbol)
23
- elsif status_code.is_a?(String)
24
- end
25
- end
26
-
27
20
  def __process!
28
21
  @resources = []
29
22
 
@@ -32,6 +25,10 @@ EOS
32
25
 
33
26
  browser.set_headers config.headers
34
27
  browser.fetch! url
28
+
29
+ config.after_handlers[:get].each do |ah|
30
+ ah.call(browser.page)
31
+ end
35
32
 
36
33
  statuses = [browser.status, browser.statusx]
37
34
  statuses.push :redirect if url != browser.page.current_url
data/lib/klepto/config.rb CHANGED
@@ -4,8 +4,9 @@ module Klepto
4
4
 
5
5
  def initialize
6
6
  @headers = {}
7
+ @abort_on_failure = true
7
8
  @urls = []
8
- @after_handlers = {:each => []}
9
+ @after_handlers = {:each => [], :get => []}
9
10
  @before_handlers = {:each => []}
10
11
  @status_handlers = {}
11
12
  end
@@ -15,6 +16,11 @@ module Klepto
15
16
  @headers
16
17
  end
17
18
 
19
+ # 4xx, 5xx
20
+ def abort_on_failure(aof)
21
+ @abort_on_failure = aof
22
+ end
23
+
18
24
  def on_http_status(*statuses,&block)
19
25
  statuses.each do |status|
20
26
  @status_handlers[status] ||= []
@@ -36,6 +42,11 @@ module Klepto
36
42
  @after_handlers[which].push block
37
43
  end
38
44
 
45
+ def before(which, &block)
46
+ @before_handlers[which] ||= []
47
+ @before_handlers[which].push block
48
+ end
49
+
39
50
  def url(*args)
40
51
  @urls += args
41
52
  @urls.flatten!
@@ -1,3 +1,3 @@
1
1
  module Klepto
2
- VERSION = "0.2.2"
2
+ VERSION = "0.2.3"
3
3
  end
@@ -27,7 +27,7 @@ describe Klepto::Bot do
27
27
  end
28
28
  end
29
29
 
30
- describe 'crawling multiple pages' do
30
+ describe 'structuring multiple pages' do
31
31
  before(:each) do
32
32
  @bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
33
33
  config.urls "https://twitter.com/ladygaga"
@@ -76,6 +76,10 @@ describe Klepto::Bot do
76
76
  permalink '.time a', :css, :attr => :href
77
77
  end
78
78
 
79
+ config.after(:get) do |page|
80
+ StatusLog.create message: 'got a page'
81
+ end
82
+
79
83
  config.on_http_status('2xx'){
80
84
  StatusLog.create message: '2xx'
81
85
  }
@@ -120,6 +124,7 @@ describe Klepto::Bot do
120
124
  statuses.should_not include 'redirect'
121
125
  statuses.should include '200'
122
126
  statuses.should include '2xx'
127
+ statuses.should include 'got a page'
123
128
  end
124
129
  end
125
130
  end
@@ -14,6 +14,7 @@ describe Klepto::Config do
14
14
  @config.on_http_status('5xx','4xx'){
15
15
  "Its crazy."
16
16
  }
17
+ @config.abort_on_failure(false)
17
18
  end
18
19
 
19
20
  it 'should be able to set headers' do
@@ -37,7 +38,10 @@ describe Klepto::Config do
37
38
  @config.urls.should == ['http://example.com', 'http://www.iana.org']
38
39
  end
39
40
 
40
- pending 'should be able to set before handlers'
41
+ it 'should have an abort on 4xx/5xx option' do
42
+ @config.instance_variable_get("@abort_on_failure").should be false
43
+ end
44
+
41
45
  pending 'should be able to set cookies'
42
46
  pending 'should be able to set steps'
43
47
  pending 'should be able to set assertions'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: klepto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2013-04-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: poltergeist
16
- requirement: &70235243869100 !ruby/object:Gem::Requirement
16
+ requirement: &70237982927260 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - =
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.1.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70235243869100
24
+ version_requirements: *70237982927260
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: capybara
27
- requirement: &70235243868100 !ruby/object:Gem::Requirement
27
+ requirement: &70237982925860 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - =
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 2.0.2
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70235243868100
35
+ version_requirements: *70237982925860
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70235243867320 !ruby/object:Gem::Requirement
38
+ requirement: &70237982924820 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.5.6
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70235243867320
46
+ version_requirements: *70237982924820
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: activesupport
49
- requirement: &70235243866680 !ruby/object:Gem::Requirement
49
+ requirement: &70237982923900 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70235243866680
57
+ version_requirements: *70237982923900
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: multi_json
60
- requirement: &70235243865240 !ruby/object:Gem::Requirement
60
+ requirement: &70237982922600 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '1.0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70235243865240
68
+ version_requirements: *70237982922600
69
69
  description: Tearing up web pages into ActiveRecord resources
70
70
  email:
71
71
  - github@coryodaniel.com