klepto 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +120 -89
- data/lib/klepto.rb +2 -2
- data/lib/klepto/config.rb +18 -0
- data/lib/klepto/structure.rb +88 -0
- data/lib/klepto/version.rb +1 -1
- data/samples/bieber.html +4859 -0
- data/samples/concept.rb +50 -0
- data/spec/lib/klepto/config_spec.rb +23 -0
- data/spec/lib/klepto/structure_spec.rb +105 -0
- data/spec/lib/klepto_spec.rb +1 -58
- data/spec/orm/active_record.rb +2 -2
- data/spec/spec_helper.rb +1 -1
- metadata +22 -23
- data/lib/klepto/bot.rb +0 -59
- data/lib/klepto/crawler.rb +0 -72
- data/samples/example.rb +0 -49
- data/spec/lib/klepto/bot_spec.rb +0 -40
- data/spec/lib/klepto/crawler_spec.rb +0 -88
- data/spec/lib/klepto/dsl_spec.rb +0 -6
data/samples/concept.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'bundler/setup'
|
3
|
+
require 'klepto'
|
4
|
+
|
5
|
+
@structure = Klepto::Structure.crawl('https://twitter.com/justinbieber')
|
6
|
+
config.headers 'Referer' => 'http://www.twitter.com'
|
7
|
+
|
8
|
+
config.steps [
|
9
|
+
[:GET, 'https://twitter.com/login'],
|
10
|
+
[:POST,'https://twitter.com/sessions',
|
11
|
+
{
|
12
|
+
session: {
|
13
|
+
username_or_email: 'example',
|
14
|
+
password:'123456'
|
15
|
+
}
|
16
|
+
}
|
17
|
+
]
|
18
|
+
]
|
19
|
+
config.urls 'https://twitter.com/justinbieber',
|
20
|
+
'https://twitter.com/ladygaga'
|
21
|
+
# config.cookies 'jsession' => 'abcdefg1234567890'
|
22
|
+
# config.on_http_status(500,404){}
|
23
|
+
# assertions do
|
24
|
+
# end
|
25
|
+
# config.on_failed_assertion(){}
|
26
|
+
|
27
|
+
|
28
|
+
# Structure the content
|
29
|
+
name 'h1.fullname'
|
30
|
+
username '.username span.screen-name'
|
31
|
+
links 'span.url a', :list, :attr => 'href'
|
32
|
+
|
33
|
+
tweets 'li.stream-item', :collection do |node|
|
34
|
+
# You can access the current parent node
|
35
|
+
twitter_id node['data-item-id']
|
36
|
+
|
37
|
+
# Defaults to innerText
|
38
|
+
content '.content p', :css
|
39
|
+
|
40
|
+
# get an attribute off an element
|
41
|
+
timestamp '._timestamp', :attr => 'data-time'
|
42
|
+
|
43
|
+
permalink '.time a', :css, :attr => :href
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# @resources = @structure.parse! #=> Array[Hash]
|
48
|
+
# @resources.each do |resource|
|
49
|
+
# User.create(resource)
|
50
|
+
# end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Klepto::Config do
|
4
|
+
before(:each) do
|
5
|
+
@config = Klepto::Config.new
|
6
|
+
@config.headers({'Referer' => 'http://example.com'})
|
7
|
+
@config.urls 'http://example.com', 'http://www.iana.org'
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'should be able to set headers' do
|
11
|
+
@config.headers['Referer'].should eq('http://example.com')
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should be able to set URLs' do
|
15
|
+
@config.urls.should == ['http://example.com', 'http://www.iana.org']
|
16
|
+
end
|
17
|
+
|
18
|
+
pending 'should be able to set cookies'
|
19
|
+
pending 'should be able to set steps'
|
20
|
+
pending 'should be able to set assertions'
|
21
|
+
pending 'should be able to set on_http_status handler'
|
22
|
+
pending 'should be able to set on_failed_assertion handler'
|
23
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'pp'
|
3
|
+
|
4
|
+
describe Klepto::Structure do
|
5
|
+
describe 'Klepto::Structure.build' do
|
6
|
+
before(:each) do
|
7
|
+
@page = Capybara::Node::Simple.new(open("./samples/bieber.html").read)
|
8
|
+
|
9
|
+
@structure = Klepto::Structure.build(@page){
|
10
|
+
name 'h1.fullname'
|
11
|
+
username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
|
12
|
+
tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
|
13
|
+
links 'span.url a', :match => :all do |node|
|
14
|
+
node[:href]
|
15
|
+
end
|
16
|
+
|
17
|
+
last_tweet 'li.stream-item', :as => :resource do
|
18
|
+
twitter_id do |node|
|
19
|
+
node['data-item-id']
|
20
|
+
end
|
21
|
+
content '.content p'
|
22
|
+
timestamp '._timestamp', :attr => 'data-time'
|
23
|
+
permalink '.time a', :attr => :href
|
24
|
+
end
|
25
|
+
|
26
|
+
tweets 'li.stream-item', :as => :collection do
|
27
|
+
twitter_id do |node|
|
28
|
+
node['data-item-id']
|
29
|
+
end
|
30
|
+
tweet '.content p', :css
|
31
|
+
timestamp '._timestamp', :attr => 'data-time'
|
32
|
+
permalink '.time a', :css, :attr => :href
|
33
|
+
end
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'should structure the data' do
|
38
|
+
@structure[:name].should match(/Justin/i)
|
39
|
+
@structure[:links].should == ["http://www.youtube.com/justinbieber"]
|
40
|
+
@structure[:username].should eq '@justinbieber'
|
41
|
+
@structure[:last_tweet][:twitter_id].should == @structure[:tweets].first[:twitter_id]
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
describe 'Klepto::Structure.crawl' do
|
46
|
+
before(:each) do
|
47
|
+
config = {
|
48
|
+
:headers => {
|
49
|
+
'Referer' => 'http://www.twitter.com',
|
50
|
+
'X-Sup-Dawg' => "Yo, What's up?"
|
51
|
+
}
|
52
|
+
}
|
53
|
+
@structure = Klepto::Structure.crawl("https://twitter.com/justinbieber", config){
|
54
|
+
# Structure that stuff
|
55
|
+
name 'h1.fullname'
|
56
|
+
username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
|
57
|
+
tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
|
58
|
+
links 'span.url a', :match => :all do |node|
|
59
|
+
node[:href]
|
60
|
+
end
|
61
|
+
|
62
|
+
last_tweet 'li.stream-item', :as => :resource do
|
63
|
+
twitter_id do |node|
|
64
|
+
node['data-item-id']
|
65
|
+
end
|
66
|
+
content '.content p'
|
67
|
+
timestamp '._timestamp', :attr => 'data-time'
|
68
|
+
permalink '.time a', :attr => :href
|
69
|
+
end
|
70
|
+
|
71
|
+
tweets 'li.stream-item', :as => :collection do
|
72
|
+
twitter_id do |node|
|
73
|
+
node['data-item-id']
|
74
|
+
end
|
75
|
+
tweet '.content p', :css
|
76
|
+
timestamp '._timestamp', :attr => 'data-time'
|
77
|
+
permalink '.time a', :css, :attr => :href
|
78
|
+
end
|
79
|
+
|
80
|
+
after_crawl do |resource|
|
81
|
+
@user = User.new
|
82
|
+
@user.name = resource[:name]
|
83
|
+
@user.username = resource[:username]
|
84
|
+
@user.save
|
85
|
+
|
86
|
+
resource[:tweets].each do |tweet|
|
87
|
+
Tweet.create(tweet)
|
88
|
+
end
|
89
|
+
end
|
90
|
+
}
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'should structure the data' do
|
94
|
+
@structure.first[:name].should match(/Justin/i)
|
95
|
+
@structure.first[:links].should == ["http://www.youtube.com/justinbieber"]
|
96
|
+
@structure.first[:username].should eq '@justinbieber'
|
97
|
+
@structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
|
98
|
+
end
|
99
|
+
|
100
|
+
it 'should store the data' do
|
101
|
+
User.count.should be(1)
|
102
|
+
Tweet.count.should_not be(0)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
data/spec/lib/klepto_spec.rb
CHANGED
@@ -4,61 +4,4 @@ describe Klepto do
|
|
4
4
|
it "should have a version" do
|
5
5
|
Klepto::VERSION.should_not be_nil
|
6
6
|
end
|
7
|
-
end
|
8
|
-
|
9
|
-
describe 'Scraping pages', :skip => false do
|
10
|
-
before(:each) do
|
11
|
-
@bot = Klepto::Bot.new do
|
12
|
-
syntax :css
|
13
|
-
|
14
|
-
headers({
|
15
|
-
'Referer' => 'https://twitter.com',
|
16
|
-
'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
|
17
|
-
})
|
18
|
-
|
19
|
-
urls 'https://twitter.com/justinbieber'
|
20
|
-
|
21
|
-
crawl 'body' do
|
22
|
-
scrape "h1.fullname", :name
|
23
|
-
scrape '.username span.screen-name', :username
|
24
|
-
save do |params|
|
25
|
-
user = User.find_by_name(params[:username]) || User.new
|
26
|
-
user.update_attributes params
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
crawl 'li.stream-item' do
|
31
|
-
scrape do |node|
|
32
|
-
{:twitter_id => node['data-item-id']}
|
33
|
-
end
|
34
|
-
|
35
|
-
scrape '.content p', :content
|
36
|
-
|
37
|
-
scrape '._timestamp' do |node|
|
38
|
-
{timestamp: node['data-time']}
|
39
|
-
end
|
40
|
-
|
41
|
-
scrape '.time a' do |node|
|
42
|
-
{permalink: node[:href]}
|
43
|
-
end
|
44
|
-
|
45
|
-
save do |params|
|
46
|
-
tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
|
47
|
-
tweet.update_attributes params
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
@bot.start!
|
53
|
-
end
|
54
|
-
|
55
|
-
it 'should have collected some resources' do
|
56
|
-
@bot.crawlers.should have(2).crawlers
|
57
|
-
@bot.crawlers.first.resources.should have(1).user
|
58
|
-
end
|
59
|
-
|
60
|
-
it 'should persist resources' do
|
61
|
-
User.count.should be(1)
|
62
|
-
Tweet.count.should_not be(0)
|
63
|
-
end
|
64
|
-
end
|
7
|
+
end
|
data/spec/orm/active_record.rb
CHANGED
@@ -9,7 +9,7 @@ ActiveRecord::Migration.verbose = false
|
|
9
9
|
class TestMigration < ActiveRecord::Migration
|
10
10
|
def self.up
|
11
11
|
create_table :tweets, :force => true do |t|
|
12
|
-
t.string :
|
12
|
+
t.string :tweet
|
13
13
|
t.string :twitter_id
|
14
14
|
t.integer :timestamp
|
15
15
|
t.string :permalink
|
@@ -28,7 +28,7 @@ class TestMigration < ActiveRecord::Migration
|
|
28
28
|
end
|
29
29
|
|
30
30
|
class Tweet < ActiveRecord::Base
|
31
|
-
validates_presence_of :timestamp, :twitter_id, :permalink, :
|
31
|
+
validates_presence_of :timestamp, :twitter_id, :permalink, :tweet
|
32
32
|
end
|
33
33
|
|
34
34
|
class User < ActiveRecord::Base
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: klepto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: docile
|
16
|
-
requirement: &
|
16
|
+
requirement: &70231930844560 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70231930844560
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: poltergeist
|
27
|
-
requirement: &
|
27
|
+
requirement: &70231930843200 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - =
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.1.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70231930843200
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: capybara
|
38
|
-
requirement: &
|
38
|
+
requirement: &70231930839840 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - =
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 2.0.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70231930839840
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: nokogiri
|
49
|
-
requirement: &
|
49
|
+
requirement: &70231930836140 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 1.5.6
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70231930836140
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: activesupport
|
60
|
-
requirement: &
|
60
|
+
requirement: &70231930835280 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70231930835280
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: multi_json
|
71
|
-
requirement: &
|
71
|
+
requirement: &70231930834460 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: '1.0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70231930834460
|
80
80
|
description: Tearing up web pages into ActiveRecord resources
|
81
81
|
email:
|
82
82
|
- github@coryodaniel.com
|
@@ -94,12 +94,13 @@ files:
|
|
94
94
|
- Rakefile
|
95
95
|
- klepto.gemspec
|
96
96
|
- lib/klepto.rb
|
97
|
-
- lib/klepto/bot.rb
|
98
97
|
- lib/klepto/browser.rb
|
99
|
-
- lib/klepto/
|
98
|
+
- lib/klepto/config.rb
|
99
|
+
- lib/klepto/structure.rb
|
100
100
|
- lib/klepto/tasks.rb
|
101
101
|
- lib/klepto/version.rb
|
102
|
-
- samples/
|
102
|
+
- samples/bieber.html
|
103
|
+
- samples/concept.rb
|
103
104
|
- spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml
|
104
105
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml
|
105
106
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml
|
@@ -107,10 +108,9 @@ files:
|
|
107
108
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
|
108
109
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
|
109
110
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
|
110
|
-
- spec/lib/klepto/bot_spec.rb
|
111
111
|
- spec/lib/klepto/browser_spec.rb
|
112
|
-
- spec/lib/klepto/
|
113
|
-
- spec/lib/klepto/
|
112
|
+
- spec/lib/klepto/config_spec.rb
|
113
|
+
- spec/lib/klepto/structure_spec.rb
|
114
114
|
- spec/lib/klepto_spec.rb
|
115
115
|
- spec/orm/active_record.rb
|
116
116
|
- spec/orm/database.example.yml
|
@@ -147,10 +147,9 @@ test_files:
|
|
147
147
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
|
148
148
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
|
149
149
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
|
150
|
-
- spec/lib/klepto/bot_spec.rb
|
151
150
|
- spec/lib/klepto/browser_spec.rb
|
152
|
-
- spec/lib/klepto/
|
153
|
-
- spec/lib/klepto/
|
151
|
+
- spec/lib/klepto/config_spec.rb
|
152
|
+
- spec/lib/klepto/structure_spec.rb
|
154
153
|
- spec/lib/klepto_spec.rb
|
155
154
|
- spec/orm/active_record.rb
|
156
155
|
- spec/orm/database.example.yml
|
data/lib/klepto/bot.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
module Klepto
|
2
|
-
class Bot
|
3
|
-
def initialize(*args, &block)
|
4
|
-
@syntax = :css
|
5
|
-
@is_dry_run = false
|
6
|
-
@urls = []
|
7
|
-
@crawlers = []
|
8
|
-
@browser = Klepto::Browser.new
|
9
|
-
Docile.dsl_eval(self, &block) if block_given?
|
10
|
-
end
|
11
|
-
|
12
|
-
attr_reader :browser, :crawlers
|
13
|
-
|
14
|
-
def dry_run!
|
15
|
-
@is_dry_run = true
|
16
|
-
end
|
17
|
-
|
18
|
-
def dry_run?
|
19
|
-
!!@is_dry_run
|
20
|
-
end
|
21
|
-
|
22
|
-
def syntax(kind=nil)
|
23
|
-
@syntax = kind unless kind.nil?
|
24
|
-
@syntax
|
25
|
-
end
|
26
|
-
|
27
|
-
def headers(_headers)
|
28
|
-
@browser.set_headers(_headers)
|
29
|
-
end
|
30
|
-
|
31
|
-
def url(*args)
|
32
|
-
@urls += args
|
33
|
-
end
|
34
|
-
alias :urls :url
|
35
|
-
|
36
|
-
def crawl(scope, options={}, &block)
|
37
|
-
options[:syntax] = @syntax
|
38
|
-
@crawlers << Klepto::Crawler.new(scope, options, &block)
|
39
|
-
end
|
40
|
-
|
41
|
-
def start!
|
42
|
-
@urls.each do |url|
|
43
|
-
browser.fetch!(url)
|
44
|
-
@crawlers.each do |crawler|
|
45
|
-
crawler.crawl browser.page
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
@crawlers.each do |crawler|
|
50
|
-
if dry_run?
|
51
|
-
pp crawler.resources
|
52
|
-
else
|
53
|
-
crawler.persist!
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
end
|
59
|
-
end
|