klepto 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +23 -17
- data/lib/klepto/bot.rb +66 -0
- data/lib/klepto/browser.rb +8 -0
- data/lib/klepto/config.rb +29 -0
- data/lib/klepto/structure.rb +3 -24
- data/lib/klepto/version.rb +1 -1
- data/lib/klepto.rb +2 -1
- data/samples/concept.rb +30 -24
- data/spec/lib/klepto/bot_spec.rb +126 -0
- data/spec/lib/klepto/config_spec.rb +23 -8
- data/spec/lib/klepto/structure_spec.rb +0 -61
- data/spec/orm/active_record.rb +10 -0
- data/spec/spec_helper.rb +1 -0
- metadata +15 -12
data/README.md
CHANGED
@@ -17,7 +17,7 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
|
|
17
17
|
|
18
18
|
```ruby
|
19
19
|
# Crawl a web site or multiple. Structure#crawl takes a *splat!
|
20
|
-
@structures = Klepto::Structure.crawl("https://twitter.com/justinbieber"){
|
20
|
+
@structures = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
21
21
|
# By default, it uses CSS selectors
|
22
22
|
name 'h1.fullname'
|
23
23
|
|
@@ -53,9 +53,29 @@ Say you want a bunch of Bieb tweets! How is there not profit in that?
|
|
53
53
|
permalink '.time a', :css, :attr => :href
|
54
54
|
end
|
55
55
|
|
56
|
+
# Set some headers, why not.
|
57
|
+
config.headers({
|
58
|
+
'Referer' => 'http://www.twitter.com'
|
59
|
+
})
|
60
|
+
|
61
|
+
# on_http_status can take a splat of statuses or ~statuses(4xx,5xx)
|
62
|
+
# you can also have multiple handlers on a status
|
63
|
+
# Note: Capybara automatically follows redirects, so the statuses 3xx
|
64
|
+
# are never present. If you want to watch for a redirect, pass :redirect (see below).
|
65
|
+
config.on_http_status(:redirect){
|
66
|
+
puts "Something redirected..."
|
67
|
+
}
|
68
|
+
config.on_http_status(200){
|
69
|
+
puts "Expected this, NBD."
|
70
|
+
}
|
71
|
+
|
72
|
+
config.on_http_status('5xx','4xx'){
|
73
|
+
puts "HOLY CRAP!"
|
74
|
+
}
|
75
|
+
|
56
76
|
# If you want to do something with each resource, like stick it in AR
|
57
77
|
# go for it here...
|
58
|
-
|
78
|
+
config.after do |resource|
|
59
79
|
@user = User.new
|
60
80
|
@user.name = resource[:name]
|
61
81
|
@user.username = resource[:username]
|
@@ -79,7 +99,7 @@ end
|
|
79
99
|
```ruby
|
80
100
|
@html = Capybara::Node::Simple.new(@html_string)
|
81
101
|
@structure = Klepto::Structure.build(@html){
|
82
|
-
# inside the build method, everything works the same as Structure.crawl
|
102
|
+
# inside the build method, everything works the same as Bot.new
|
83
103
|
name 'h1.fullname'
|
84
104
|
username 'span.screen-name'
|
85
105
|
|
@@ -98,20 +118,6 @@ end
|
|
98
118
|
}
|
99
119
|
```
|
100
120
|
|
101
|
-
## Extra Configuration
|
102
|
-
```ruby
|
103
|
-
config = {
|
104
|
-
:headers => {
|
105
|
-
'Referer' => 'http://www.twitter.com',
|
106
|
-
'X-Sup-Dawg' => "Yo, What's up?"
|
107
|
-
}
|
108
|
-
}
|
109
|
-
@structures = Klepto::Structure.crawl("https://twitter.com/justinbieber",config){
|
110
|
-
#... yada, yada
|
111
|
-
}
|
112
|
-
```
|
113
|
-
|
114
|
-
|
115
121
|
|
116
122
|
## Stuff I'm going to add.
|
117
123
|
Cookie Stuffing
|
data/lib/klepto/bot.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
module Klepto
|
2
|
+
class Bot
|
3
|
+
attr_reader :config
|
4
|
+
|
5
|
+
def initialize(*urls, &block)
|
6
|
+
@config = Klepto::Config.new
|
7
|
+
@config.urls urls
|
8
|
+
@queue = []
|
9
|
+
|
10
|
+
instance_eval &block
|
11
|
+
|
12
|
+
instance_eval <<-EOS
|
13
|
+
def queue; @queue; end;
|
14
|
+
def resources; @resources; end;
|
15
|
+
EOS
|
16
|
+
|
17
|
+
__process!
|
18
|
+
end
|
19
|
+
|
20
|
+
def __dispatch_handlers_for(status_code)
|
21
|
+
if status_code.is_a?(Fixnum)
|
22
|
+
elsif status_code.is_a?(Symbol)
|
23
|
+
elsif status_code.is_a?(String)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def __process!
|
28
|
+
@resources = []
|
29
|
+
|
30
|
+
config.urls.each do |url|
|
31
|
+
browser = Klepto::Browser.new
|
32
|
+
|
33
|
+
browser.set_headers config.headers
|
34
|
+
browser.fetch! url
|
35
|
+
|
36
|
+
statuses = [browser.status, browser.statusx]
|
37
|
+
statuses.push :redirect if url != browser.page.current_url
|
38
|
+
statuses.each do |status|
|
39
|
+
config.dispatch_status_handlers(status, browser.page)
|
40
|
+
end
|
41
|
+
|
42
|
+
structure = Structure.new(browser.page)
|
43
|
+
|
44
|
+
queue.each do |instruction|
|
45
|
+
if instruction[2]
|
46
|
+
structure.send instruction[0], *instruction[1], &instruction[2]
|
47
|
+
else
|
48
|
+
structure.send instruction[0], *instruction[1]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
config.after_handlers[:each].each do |ah|
|
53
|
+
ah.call(structure._hash)
|
54
|
+
end
|
55
|
+
|
56
|
+
resources << structure._hash
|
57
|
+
end
|
58
|
+
|
59
|
+
@resources
|
60
|
+
end
|
61
|
+
|
62
|
+
def method_missing(meth, *args, &block)
|
63
|
+
@queue.push([meth, args, block])
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
data/lib/klepto/browser.rb
CHANGED
data/lib/klepto/config.rb
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
module Klepto
|
2
2
|
class Config
|
3
|
+
attr_reader :after_handlers
|
4
|
+
|
3
5
|
def initialize
|
4
6
|
@headers = {}
|
5
7
|
@urls = []
|
8
|
+
@after_handlers = {:each => []}
|
9
|
+
@before_handlers = {:each => []}
|
10
|
+
@status_handlers = {}
|
6
11
|
end
|
7
12
|
|
8
13
|
def headers(_headers=nil)
|
@@ -10,8 +15,32 @@ module Klepto
|
|
10
15
|
@headers
|
11
16
|
end
|
12
17
|
|
18
|
+
def on_http_status(*statuses,&block)
|
19
|
+
statuses.each do |status|
|
20
|
+
@status_handlers[status] ||= []
|
21
|
+
@status_handlers[status].push block
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def dispatch_status_handlers(status, page)
|
26
|
+
handlers = @status_handlers[status]
|
27
|
+
if handlers.present?
|
28
|
+
@status_handlers[status].each do |handler|
|
29
|
+
handler.call(page)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def after(which = :each, &block)
|
35
|
+
@after_handlers[which] ||= []
|
36
|
+
@after_handlers[which].push block
|
37
|
+
end
|
38
|
+
|
13
39
|
def url(*args)
|
14
40
|
@urls += args
|
41
|
+
@urls.flatten!
|
42
|
+
@urls.uniq!
|
43
|
+
@urls
|
15
44
|
end
|
16
45
|
alias :urls :url
|
17
46
|
end
|
data/lib/klepto/structure.rb
CHANGED
@@ -1,31 +1,14 @@
|
|
1
1
|
module Klepto
|
2
2
|
class Structure
|
3
3
|
def self.build(_context=nil, _parent=nil, &block)
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
hb._after_handler.call(hb._hash)
|
8
|
-
end
|
9
|
-
|
10
|
-
hb._hash
|
11
|
-
end
|
12
|
-
|
13
|
-
def self.crawl(*urls, &block)
|
14
|
-
config = urls.last.is_a?(Hash) ? urls.pop : {}
|
15
|
-
resources = []
|
16
|
-
urls.each do |url|
|
17
|
-
browser = Klepto::Browser.new
|
18
|
-
browser.set_headers config[:headers]
|
19
|
-
browser.fetch! url
|
20
|
-
resources << Structure.build(browser.page, &block)
|
21
|
-
end
|
22
|
-
resources
|
4
|
+
structure = Structure.new(_context, _parent)
|
5
|
+
structure.instance_eval &block
|
6
|
+
structure._hash
|
23
7
|
end
|
24
8
|
|
25
9
|
attr_reader :_parent
|
26
10
|
attr_reader :_hash
|
27
11
|
attr_reader :_context
|
28
|
-
attr_reader :_after_handler
|
29
12
|
|
30
13
|
def initialize(_context=nil, _parent=nil)
|
31
14
|
@_context = _context
|
@@ -34,10 +17,6 @@ module Klepto
|
|
34
17
|
@_after_handler = nil
|
35
18
|
end
|
36
19
|
|
37
|
-
def after_crawl(&block)
|
38
|
-
@_after_handler = block
|
39
|
-
end
|
40
|
-
|
41
20
|
#options[:as] :collection, :resource
|
42
21
|
#options[:match] :first, :all
|
43
22
|
#options[:syntax] :xpath, :css
|
data/lib/klepto/version.rb
CHANGED
data/lib/klepto.rb
CHANGED
data/samples/concept.rb
CHANGED
@@ -2,30 +2,41 @@
|
|
2
2
|
require 'bundler/setup'
|
3
3
|
require 'klepto'
|
4
4
|
|
5
|
-
|
5
|
+
Klepto::Bot.new do
|
6
6
|
config.headers 'Referer' => 'http://www.twitter.com'
|
7
|
+
config.on_http_status('5xx','4xx'){
|
8
|
+
puts "HOLY CRAP!"
|
9
|
+
}
|
10
|
+
|
11
|
+
# If you want to do something with each resource, like stick it in AR
|
12
|
+
# go for it here...
|
13
|
+
config.after do |resource|
|
14
|
+
@user = User.new
|
15
|
+
@user.name = resource[:name]
|
16
|
+
@user.username = resource[:username]
|
17
|
+
@user.save
|
18
|
+
|
19
|
+
resource[:tweets].each do |tweet|
|
20
|
+
Tweet.create(tweet)
|
21
|
+
end
|
22
|
+
end
|
7
23
|
|
8
|
-
config.steps [
|
9
|
-
[:GET, 'https://twitter.com/login'],
|
10
|
-
[:POST,'https://twitter.com/sessions',
|
11
|
-
{
|
12
|
-
session: {
|
13
|
-
username_or_email: 'example',
|
14
|
-
password:'123456'
|
15
|
-
}
|
16
|
-
}
|
17
|
-
]
|
18
|
-
]
|
19
24
|
config.urls 'https://twitter.com/justinbieber',
|
20
25
|
'https://twitter.com/ladygaga'
|
21
|
-
# config.cookies 'jsession' => 'abcdefg1234567890'
|
22
|
-
# config.on_http_status(500,404){}
|
23
|
-
# assertions do
|
24
|
-
# end
|
25
|
-
# config.on_failed_assertion(){}
|
26
26
|
|
27
|
+
# config.steps [
|
28
|
+
# [:GET, 'https://twitter.com/login'],
|
29
|
+
# [:POST,'https://twitter.com/sessions',
|
30
|
+
# {
|
31
|
+
# session: {
|
32
|
+
# username_or_email: 'example',
|
33
|
+
# password:'123456'
|
34
|
+
# }
|
35
|
+
# }
|
36
|
+
# ]
|
37
|
+
# ]
|
27
38
|
|
28
|
-
#
|
39
|
+
# Structure the content
|
29
40
|
name 'h1.fullname'
|
30
41
|
username '.username span.screen-name'
|
31
42
|
links 'span.url a', :list, :attr => 'href'
|
@@ -42,9 +53,4 @@ require 'klepto'
|
|
42
53
|
|
43
54
|
permalink '.time a', :css, :attr => :href
|
44
55
|
end
|
45
|
-
end
|
46
|
-
|
47
|
-
# @resources = @structure.parse! #=> Array[Hash]
|
48
|
-
# @resources.each do |resource|
|
49
|
-
# User.create(resource)
|
50
|
-
# end
|
56
|
+
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Klepto::Bot do
|
4
|
+
describe 'Klepto::Bot.new' do
|
5
|
+
describe 'create a bot with a redirect' do
|
6
|
+
before(:each) do
|
7
|
+
@bot = Klepto::Bot.new("https://www.twitter.com/justinbieber"){
|
8
|
+
name 'h1.fullname'
|
9
|
+
config.on_http_status(:redirect){
|
10
|
+
StatusLog.create message: 'redirect'
|
11
|
+
}
|
12
|
+
config.on_http_status(200){
|
13
|
+
StatusLog.create message: '200'
|
14
|
+
}
|
15
|
+
}
|
16
|
+
@structure = @bot.resources
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should structure the data' do
|
20
|
+
@structure.first[:name].should match(/Justin/i)
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should have dispatched status handlers' do
|
24
|
+
statuses = StatusLog.all.map(&:message)
|
25
|
+
statuses.should include 'redirect'
|
26
|
+
statuses.should include '200'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe 'crawling multiple pages' do
|
31
|
+
before(:each) do
|
32
|
+
@bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
33
|
+
config.urls "https://twitter.com/ladygaga"
|
34
|
+
name 'h1.fullname'
|
35
|
+
}
|
36
|
+
@structure = @bot.resources
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'should have both pages data' do
|
40
|
+
@structure.first[:name].should match(/Justin/i)
|
41
|
+
@structure.last[:name].should match(/Lady/i)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
describe 'creating a bot' do
|
46
|
+
before(:each) do
|
47
|
+
@bot = Klepto::Bot.new("https://twitter.com/justinbieber"){
|
48
|
+
config.headers({
|
49
|
+
'Referer' => 'http://www.twitter.com',
|
50
|
+
'X-Sup-Dawg' => "Yo, What's up?"
|
51
|
+
})
|
52
|
+
|
53
|
+
# Structure that stuff
|
54
|
+
name 'h1.fullname'
|
55
|
+
username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
|
56
|
+
tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
|
57
|
+
links 'span.url a', :match => :all do |node|
|
58
|
+
node[:href]
|
59
|
+
end
|
60
|
+
|
61
|
+
last_tweet 'li.stream-item', :as => :resource do
|
62
|
+
twitter_id do |node|
|
63
|
+
node['data-item-id']
|
64
|
+
end
|
65
|
+
content '.content p'
|
66
|
+
timestamp '._timestamp', :attr => 'data-time'
|
67
|
+
permalink '.time a', :attr => :href
|
68
|
+
end
|
69
|
+
|
70
|
+
tweets 'li.stream-item', :as => :collection do
|
71
|
+
twitter_id do |node|
|
72
|
+
node['data-item-id']
|
73
|
+
end
|
74
|
+
tweet '.content p', :css
|
75
|
+
timestamp '._timestamp', :attr => 'data-time'
|
76
|
+
permalink '.time a', :css, :attr => :href
|
77
|
+
end
|
78
|
+
|
79
|
+
config.on_http_status('2xx'){
|
80
|
+
StatusLog.create message: '2xx'
|
81
|
+
}
|
82
|
+
|
83
|
+
config.on_http_status(:redirect){
|
84
|
+
StatusLog.create message: 'redirect'
|
85
|
+
}
|
86
|
+
|
87
|
+
config.on_http_status(200){
|
88
|
+
StatusLog.create message: '200'
|
89
|
+
}
|
90
|
+
|
91
|
+
config.after(:each) do |resource|
|
92
|
+
@user = User.new
|
93
|
+
@user.name = resource[:name]
|
94
|
+
@user.username = resource[:username]
|
95
|
+
@user.save
|
96
|
+
|
97
|
+
resource[:tweets].each do |tweet|
|
98
|
+
Tweet.create(tweet)
|
99
|
+
end
|
100
|
+
end
|
101
|
+
}
|
102
|
+
@structure = @bot.resources
|
103
|
+
end
|
104
|
+
|
105
|
+
it 'should structure the data' do
|
106
|
+
@structure.first[:name].should match(/Justin/i)
|
107
|
+
@structure.first[:links].should == ["http://www.youtube.com/justinbieber"]
|
108
|
+
@structure.first[:username].should eq '@justinbieber'
|
109
|
+
@structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
|
110
|
+
end
|
111
|
+
|
112
|
+
it 'should store the data' do
|
113
|
+
User.count.should be(1)
|
114
|
+
Tweet.count.should_not be(0)
|
115
|
+
end
|
116
|
+
|
117
|
+
it 'should have dispatched status handlers' do
|
118
|
+
statuses = StatusLog.all.map(&:message)
|
119
|
+
|
120
|
+
statuses.should_not include 'redirect'
|
121
|
+
statuses.should include '200'
|
122
|
+
statuses.should include '2xx'
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
@@ -5,26 +5,41 @@ describe Klepto::Config do
|
|
5
5
|
@config = Klepto::Config.new
|
6
6
|
@config.headers({'Referer' => 'http://example.com'})
|
7
7
|
@config.urls 'http://example.com', 'http://www.iana.org'
|
8
|
+
@config.on_http_status(200){
|
9
|
+
"Its 200"
|
10
|
+
}
|
11
|
+
@config.on_http_status('2xx'){
|
12
|
+
"Its 2xx"
|
13
|
+
}
|
14
|
+
@config.on_http_status('5xx','4xx'){
|
15
|
+
"Its crazy."
|
16
|
+
}
|
8
17
|
end
|
9
18
|
|
10
19
|
it 'should be able to set headers' do
|
11
20
|
@config.headers['Referer'].should eq('http://example.com')
|
12
21
|
end
|
13
22
|
|
23
|
+
it 'should have a 2xx status handler' do
|
24
|
+
@config.instance_variable_get("@status_handlers")['2xx'].first.call.should eq ('Its 2xx')
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should have a 200 status handler' do
|
28
|
+
@config.instance_variable_get("@status_handlers")[200].first.call.should eq ('Its 200')
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'should have a 4xx and 5xx status handler' do
|
32
|
+
@config.instance_variable_get("@status_handlers")['5xx'].first.call.should eq ('Its crazy.')
|
33
|
+
@config.instance_variable_get("@status_handlers")['4xx'].first.call.should eq ('Its crazy.')
|
34
|
+
end
|
35
|
+
|
14
36
|
it 'should be able to set URLs' do
|
15
37
|
@config.urls.should == ['http://example.com', 'http://www.iana.org']
|
16
38
|
end
|
17
39
|
|
40
|
+
pending 'should be able to set before handlers'
|
18
41
|
pending 'should be able to set cookies'
|
19
42
|
pending 'should be able to set steps'
|
20
43
|
pending 'should be able to set assertions'
|
21
|
-
pending 'should be able to set on_http_status handler'
|
22
44
|
pending 'should be able to set on_failed_assertion handler'
|
23
|
-
pending 'should be a sexier config' do
|
24
|
-
# Klepto::Structure.crawl("https://twitter.com/justinbieber"){
|
25
|
-
# config.headers({
|
26
|
-
# "Referer" => "http://example.com"
|
27
|
-
# })
|
28
|
-
# }
|
29
|
-
end
|
30
45
|
end
|
@@ -41,65 +41,4 @@ describe Klepto::Structure do
|
|
41
41
|
@structure[:last_tweet][:twitter_id].should == @structure[:tweets].first[:twitter_id]
|
42
42
|
end
|
43
43
|
end
|
44
|
-
|
45
|
-
describe 'Klepto::Structure.crawl' do
|
46
|
-
before(:each) do
|
47
|
-
config = {
|
48
|
-
:headers => {
|
49
|
-
'Referer' => 'http://www.twitter.com',
|
50
|
-
'X-Sup-Dawg' => "Yo, What's up?"
|
51
|
-
}
|
52
|
-
}
|
53
|
-
@structure = Klepto::Structure.crawl("https://twitter.com/justinbieber", config){
|
54
|
-
# Structure that stuff
|
55
|
-
name 'h1.fullname'
|
56
|
-
username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
|
57
|
-
tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
|
58
|
-
links 'span.url a', :match => :all do |node|
|
59
|
-
node[:href]
|
60
|
-
end
|
61
|
-
|
62
|
-
last_tweet 'li.stream-item', :as => :resource do
|
63
|
-
twitter_id do |node|
|
64
|
-
node['data-item-id']
|
65
|
-
end
|
66
|
-
content '.content p'
|
67
|
-
timestamp '._timestamp', :attr => 'data-time'
|
68
|
-
permalink '.time a', :attr => :href
|
69
|
-
end
|
70
|
-
|
71
|
-
tweets 'li.stream-item', :as => :collection do
|
72
|
-
twitter_id do |node|
|
73
|
-
node['data-item-id']
|
74
|
-
end
|
75
|
-
tweet '.content p', :css
|
76
|
-
timestamp '._timestamp', :attr => 'data-time'
|
77
|
-
permalink '.time a', :css, :attr => :href
|
78
|
-
end
|
79
|
-
|
80
|
-
after_crawl do |resource|
|
81
|
-
@user = User.new
|
82
|
-
@user.name = resource[:name]
|
83
|
-
@user.username = resource[:username]
|
84
|
-
@user.save
|
85
|
-
|
86
|
-
resource[:tweets].each do |tweet|
|
87
|
-
Tweet.create(tweet)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
}
|
91
|
-
end
|
92
|
-
|
93
|
-
it 'should structure the data' do
|
94
|
-
@structure.first[:name].should match(/Justin/i)
|
95
|
-
@structure.first[:links].should == ["http://www.youtube.com/justinbieber"]
|
96
|
-
@structure.first[:username].should eq '@justinbieber'
|
97
|
-
@structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
|
98
|
-
end
|
99
|
-
|
100
|
-
it 'should store the data' do
|
101
|
-
User.count.should be(1)
|
102
|
-
Tweet.count.should_not be(0)
|
103
|
-
end
|
104
|
-
end
|
105
44
|
end
|
data/spec/orm/active_record.rb
CHANGED
@@ -19,14 +19,24 @@ class TestMigration < ActiveRecord::Migration
|
|
19
19
|
t.string :name
|
20
20
|
t.string :username
|
21
21
|
end
|
22
|
+
|
23
|
+
create_table :status_logs, :force => true do |t|
|
24
|
+
t.string :message
|
25
|
+
end
|
22
26
|
end
|
23
27
|
|
24
28
|
def self.down
|
25
29
|
drop_table :tweets
|
30
|
+
drop_table :status_logs
|
26
31
|
drop_table :users
|
27
32
|
end
|
28
33
|
end
|
29
34
|
|
35
|
+
|
36
|
+
class StatusLog < ActiveRecord::Base
|
37
|
+
|
38
|
+
end
|
39
|
+
|
30
40
|
class Tweet < ActiveRecord::Base
|
31
41
|
validates_presence_of :timestamp, :twitter_id, :permalink, :tweet
|
32
42
|
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: klepto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: poltergeist
|
16
|
-
requirement: &
|
16
|
+
requirement: &70235243869100 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - =
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70235243869100
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: capybara
|
27
|
-
requirement: &
|
27
|
+
requirement: &70235243868100 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - =
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.0.2
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70235243868100
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70235243867320 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.5.6
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70235243867320
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: activesupport
|
49
|
-
requirement: &
|
49
|
+
requirement: &70235243866680 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70235243866680
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: multi_json
|
60
|
-
requirement: &
|
60
|
+
requirement: &70235243865240 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: '1.0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70235243865240
|
69
69
|
description: Tearing up web pages into ActiveRecord resources
|
70
70
|
email:
|
71
71
|
- github@coryodaniel.com
|
@@ -83,6 +83,7 @@ files:
|
|
83
83
|
- Rakefile
|
84
84
|
- klepto.gemspec
|
85
85
|
- lib/klepto.rb
|
86
|
+
- lib/klepto/bot.rb
|
86
87
|
- lib/klepto/browser.rb
|
87
88
|
- lib/klepto/config.rb
|
88
89
|
- lib/klepto/structure.rb
|
@@ -97,6 +98,7 @@ files:
|
|
97
98
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
|
98
99
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
|
99
100
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
|
101
|
+
- spec/lib/klepto/bot_spec.rb
|
100
102
|
- spec/lib/klepto/browser_spec.rb
|
101
103
|
- spec/lib/klepto/config_spec.rb
|
102
104
|
- spec/lib/klepto/structure_spec.rb
|
@@ -136,6 +138,7 @@ test_files:
|
|
136
138
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
|
137
139
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
|
138
140
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
|
141
|
+
- spec/lib/klepto/bot_spec.rb
|
139
142
|
- spec/lib/klepto/browser_spec.rb
|
140
143
|
- spec/lib/klepto/config_spec.rb
|
141
144
|
- spec/lib/klepto/structure_spec.rb
|