klepto 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +120 -89
- data/lib/klepto.rb +2 -2
- data/lib/klepto/config.rb +18 -0
- data/lib/klepto/structure.rb +88 -0
- data/lib/klepto/version.rb +1 -1
- data/samples/bieber.html +4859 -0
- data/samples/concept.rb +50 -0
- data/spec/lib/klepto/config_spec.rb +23 -0
- data/spec/lib/klepto/structure_spec.rb +105 -0
- data/spec/lib/klepto_spec.rb +1 -58
- data/spec/orm/active_record.rb +2 -2
- data/spec/spec_helper.rb +1 -1
- metadata +22 -23
- data/lib/klepto/bot.rb +0 -59
- data/lib/klepto/crawler.rb +0 -72
- data/samples/example.rb +0 -49
- data/spec/lib/klepto/bot_spec.rb +0 -40
- data/spec/lib/klepto/crawler_spec.rb +0 -88
- data/spec/lib/klepto/dsl_spec.rb +0 -6
data/samples/concept.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'bundler/setup'
|
3
|
+
require 'klepto'
|
4
|
+
|
5
|
+
@structure = Klepto::Structure.crawl('https://twitter.com/justinbieber')
|
6
|
+
config.headers 'Referer' => 'http://www.twitter.com'
|
7
|
+
|
8
|
+
config.steps [
|
9
|
+
[:GET, 'https://twitter.com/login'],
|
10
|
+
[:POST,'https://twitter.com/sessions',
|
11
|
+
{
|
12
|
+
session: {
|
13
|
+
username_or_email: 'example',
|
14
|
+
password:'123456'
|
15
|
+
}
|
16
|
+
}
|
17
|
+
]
|
18
|
+
]
|
19
|
+
config.urls 'https://twitter.com/justinbieber',
|
20
|
+
'https://twitter.com/ladygaga'
|
21
|
+
# config.cookies 'jsession' => 'abcdefg1234567890'
|
22
|
+
# config.on_http_status(500,404){}
|
23
|
+
# assertions do
|
24
|
+
# end
|
25
|
+
# config.on_failed_assertion(){}
|
26
|
+
|
27
|
+
|
28
|
+
# Structure the content
|
29
|
+
name 'h1.fullname'
|
30
|
+
username '.username span.screen-name'
|
31
|
+
links 'span.url a', :list, :attr => 'href'
|
32
|
+
|
33
|
+
tweets 'li.stream-item', :collection do |node|
|
34
|
+
# You can access the current parent node
|
35
|
+
twitter_id node['data-item-id']
|
36
|
+
|
37
|
+
# Defaults to innerText
|
38
|
+
content '.content p', :css
|
39
|
+
|
40
|
+
# get an attribute off an element
|
41
|
+
timestamp '._timestamp', :attr => 'data-time'
|
42
|
+
|
43
|
+
permalink '.time a', :css, :attr => :href
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# @resources = @structure.parse! #=> Array[Hash]
|
48
|
+
# @resources.each do |resource|
|
49
|
+
# User.create(resource)
|
50
|
+
# end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Klepto::Config do
|
4
|
+
before(:each) do
|
5
|
+
@config = Klepto::Config.new
|
6
|
+
@config.headers({'Referer' => 'http://example.com'})
|
7
|
+
@config.urls 'http://example.com', 'http://www.iana.org'
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'should be able to set headers' do
|
11
|
+
@config.headers['Referer'].should eq('http://example.com')
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should be able to set URLs' do
|
15
|
+
@config.urls.should == ['http://example.com', 'http://www.iana.org']
|
16
|
+
end
|
17
|
+
|
18
|
+
pending 'should be able to set cookies'
|
19
|
+
pending 'should be able to set steps'
|
20
|
+
pending 'should be able to set assertions'
|
21
|
+
pending 'should be able to set on_http_status handler'
|
22
|
+
pending 'should be able to set on_failed_assertion handler'
|
23
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'pp'
|
3
|
+
|
4
|
+
describe Klepto::Structure do
|
5
|
+
describe 'Klepto::Structure.build' do
|
6
|
+
before(:each) do
|
7
|
+
@page = Capybara::Node::Simple.new(open("./samples/bieber.html").read)
|
8
|
+
|
9
|
+
@structure = Klepto::Structure.build(@page){
|
10
|
+
name 'h1.fullname'
|
11
|
+
username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
|
12
|
+
tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
|
13
|
+
links 'span.url a', :match => :all do |node|
|
14
|
+
node[:href]
|
15
|
+
end
|
16
|
+
|
17
|
+
last_tweet 'li.stream-item', :as => :resource do
|
18
|
+
twitter_id do |node|
|
19
|
+
node['data-item-id']
|
20
|
+
end
|
21
|
+
content '.content p'
|
22
|
+
timestamp '._timestamp', :attr => 'data-time'
|
23
|
+
permalink '.time a', :attr => :href
|
24
|
+
end
|
25
|
+
|
26
|
+
tweets 'li.stream-item', :as => :collection do
|
27
|
+
twitter_id do |node|
|
28
|
+
node['data-item-id']
|
29
|
+
end
|
30
|
+
tweet '.content p', :css
|
31
|
+
timestamp '._timestamp', :attr => 'data-time'
|
32
|
+
permalink '.time a', :css, :attr => :href
|
33
|
+
end
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'should structure the data' do
|
38
|
+
@structure[:name].should match(/Justin/i)
|
39
|
+
@structure[:links].should == ["http://www.youtube.com/justinbieber"]
|
40
|
+
@structure[:username].should eq '@justinbieber'
|
41
|
+
@structure[:last_tweet][:twitter_id].should == @structure[:tweets].first[:twitter_id]
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
describe 'Klepto::Structure.crawl' do
|
46
|
+
before(:each) do
|
47
|
+
config = {
|
48
|
+
:headers => {
|
49
|
+
'Referer' => 'http://www.twitter.com',
|
50
|
+
'X-Sup-Dawg' => "Yo, What's up?"
|
51
|
+
}
|
52
|
+
}
|
53
|
+
@structure = Klepto::Structure.crawl("https://twitter.com/justinbieber", config){
|
54
|
+
# Structure that stuff
|
55
|
+
name 'h1.fullname'
|
56
|
+
username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
|
57
|
+
tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
|
58
|
+
links 'span.url a', :match => :all do |node|
|
59
|
+
node[:href]
|
60
|
+
end
|
61
|
+
|
62
|
+
last_tweet 'li.stream-item', :as => :resource do
|
63
|
+
twitter_id do |node|
|
64
|
+
node['data-item-id']
|
65
|
+
end
|
66
|
+
content '.content p'
|
67
|
+
timestamp '._timestamp', :attr => 'data-time'
|
68
|
+
permalink '.time a', :attr => :href
|
69
|
+
end
|
70
|
+
|
71
|
+
tweets 'li.stream-item', :as => :collection do
|
72
|
+
twitter_id do |node|
|
73
|
+
node['data-item-id']
|
74
|
+
end
|
75
|
+
tweet '.content p', :css
|
76
|
+
timestamp '._timestamp', :attr => 'data-time'
|
77
|
+
permalink '.time a', :css, :attr => :href
|
78
|
+
end
|
79
|
+
|
80
|
+
after_crawl do |resource|
|
81
|
+
@user = User.new
|
82
|
+
@user.name = resource[:name]
|
83
|
+
@user.username = resource[:username]
|
84
|
+
@user.save
|
85
|
+
|
86
|
+
resource[:tweets].each do |tweet|
|
87
|
+
Tweet.create(tweet)
|
88
|
+
end
|
89
|
+
end
|
90
|
+
}
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'should structure the data' do
|
94
|
+
@structure.first[:name].should match(/Justin/i)
|
95
|
+
@structure.first[:links].should == ["http://www.youtube.com/justinbieber"]
|
96
|
+
@structure.first[:username].should eq '@justinbieber'
|
97
|
+
@structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
|
98
|
+
end
|
99
|
+
|
100
|
+
it 'should store the data' do
|
101
|
+
User.count.should be(1)
|
102
|
+
Tweet.count.should_not be(0)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
data/spec/lib/klepto_spec.rb
CHANGED
@@ -4,61 +4,4 @@ describe Klepto do
|
|
4
4
|
it "should have a version" do
|
5
5
|
Klepto::VERSION.should_not be_nil
|
6
6
|
end
|
7
|
-
end
|
8
|
-
|
9
|
-
describe 'Scraping pages', :skip => false do
|
10
|
-
before(:each) do
|
11
|
-
@bot = Klepto::Bot.new do
|
12
|
-
syntax :css
|
13
|
-
|
14
|
-
headers({
|
15
|
-
'Referer' => 'https://twitter.com',
|
16
|
-
'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
|
17
|
-
})
|
18
|
-
|
19
|
-
urls 'https://twitter.com/justinbieber'
|
20
|
-
|
21
|
-
crawl 'body' do
|
22
|
-
scrape "h1.fullname", :name
|
23
|
-
scrape '.username span.screen-name', :username
|
24
|
-
save do |params|
|
25
|
-
user = User.find_by_name(params[:username]) || User.new
|
26
|
-
user.update_attributes params
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
crawl 'li.stream-item' do
|
31
|
-
scrape do |node|
|
32
|
-
{:twitter_id => node['data-item-id']}
|
33
|
-
end
|
34
|
-
|
35
|
-
scrape '.content p', :content
|
36
|
-
|
37
|
-
scrape '._timestamp' do |node|
|
38
|
-
{timestamp: node['data-time']}
|
39
|
-
end
|
40
|
-
|
41
|
-
scrape '.time a' do |node|
|
42
|
-
{permalink: node[:href]}
|
43
|
-
end
|
44
|
-
|
45
|
-
save do |params|
|
46
|
-
tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
|
47
|
-
tweet.update_attributes params
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
@bot.start!
|
53
|
-
end
|
54
|
-
|
55
|
-
it 'should have collected some resources' do
|
56
|
-
@bot.crawlers.should have(2).crawlers
|
57
|
-
@bot.crawlers.first.resources.should have(1).user
|
58
|
-
end
|
59
|
-
|
60
|
-
it 'should persist resources' do
|
61
|
-
User.count.should be(1)
|
62
|
-
Tweet.count.should_not be(0)
|
63
|
-
end
|
64
|
-
end
|
7
|
+
end
|
data/spec/orm/active_record.rb
CHANGED
@@ -9,7 +9,7 @@ ActiveRecord::Migration.verbose = false
|
|
9
9
|
class TestMigration < ActiveRecord::Migration
|
10
10
|
def self.up
|
11
11
|
create_table :tweets, :force => true do |t|
|
12
|
-
t.string :
|
12
|
+
t.string :tweet
|
13
13
|
t.string :twitter_id
|
14
14
|
t.integer :timestamp
|
15
15
|
t.string :permalink
|
@@ -28,7 +28,7 @@ class TestMigration < ActiveRecord::Migration
|
|
28
28
|
end
|
29
29
|
|
30
30
|
class Tweet < ActiveRecord::Base
|
31
|
-
validates_presence_of :timestamp, :twitter_id, :permalink, :
|
31
|
+
validates_presence_of :timestamp, :twitter_id, :permalink, :tweet
|
32
32
|
end
|
33
33
|
|
34
34
|
class User < ActiveRecord::Base
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: klepto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: docile
|
16
|
-
requirement: &
|
16
|
+
requirement: &70231930844560 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70231930844560
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: poltergeist
|
27
|
-
requirement: &
|
27
|
+
requirement: &70231930843200 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - =
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.1.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70231930843200
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: capybara
|
38
|
-
requirement: &
|
38
|
+
requirement: &70231930839840 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - =
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 2.0.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70231930839840
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: nokogiri
|
49
|
-
requirement: &
|
49
|
+
requirement: &70231930836140 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 1.5.6
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70231930836140
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: activesupport
|
60
|
-
requirement: &
|
60
|
+
requirement: &70231930835280 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70231930835280
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: multi_json
|
71
|
-
requirement: &
|
71
|
+
requirement: &70231930834460 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: '1.0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70231930834460
|
80
80
|
description: Tearing up web pages into ActiveRecord resources
|
81
81
|
email:
|
82
82
|
- github@coryodaniel.com
|
@@ -94,12 +94,13 @@ files:
|
|
94
94
|
- Rakefile
|
95
95
|
- klepto.gemspec
|
96
96
|
- lib/klepto.rb
|
97
|
-
- lib/klepto/bot.rb
|
98
97
|
- lib/klepto/browser.rb
|
99
|
-
- lib/klepto/crawler.rb
|
98
|
+
- lib/klepto/config.rb
|
99
|
+
- lib/klepto/structure.rb
|
100
100
|
- lib/klepto/tasks.rb
|
101
101
|
- lib/klepto/version.rb
|
102
|
-
- samples/example.rb
|
102
|
+
- samples/bieber.html
|
103
|
+
- samples/concept.rb
|
103
104
|
- spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml
|
104
105
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml
|
105
106
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml
|
@@ -107,10 +108,9 @@ files:
|
|
107
108
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
|
108
109
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
|
109
110
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
|
110
|
-
- spec/lib/klepto/bot_spec.rb
|
111
111
|
- spec/lib/klepto/browser_spec.rb
|
112
|
-
- spec/lib/klepto/crawler_spec.rb
|
113
|
-
- spec/lib/klepto/dsl_spec.rb
|
112
|
+
- spec/lib/klepto/config_spec.rb
|
113
|
+
- spec/lib/klepto/structure_spec.rb
|
114
114
|
- spec/lib/klepto_spec.rb
|
115
115
|
- spec/orm/active_record.rb
|
116
116
|
- spec/orm/database.example.yml
|
@@ -147,10 +147,9 @@ test_files:
|
|
147
147
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
|
148
148
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
|
149
149
|
- spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
|
150
|
-
- spec/lib/klepto/bot_spec.rb
|
151
150
|
- spec/lib/klepto/browser_spec.rb
|
152
|
-
- spec/lib/klepto/crawler_spec.rb
|
153
|
-
- spec/lib/klepto/dsl_spec.rb
|
151
|
+
- spec/lib/klepto/config_spec.rb
|
152
|
+
- spec/lib/klepto/structure_spec.rb
|
154
153
|
- spec/lib/klepto_spec.rb
|
155
154
|
- spec/orm/active_record.rb
|
156
155
|
- spec/orm/database.example.yml
|
data/lib/klepto/bot.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
module Klepto
|
2
|
-
class Bot
|
3
|
-
def initialize(*args, &block)
|
4
|
-
@syntax = :css
|
5
|
-
@is_dry_run = false
|
6
|
-
@urls = []
|
7
|
-
@crawlers = []
|
8
|
-
@browser = Klepto::Browser.new
|
9
|
-
Docile.dsl_eval(self, &block) if block_given?
|
10
|
-
end
|
11
|
-
|
12
|
-
attr_reader :browser, :crawlers
|
13
|
-
|
14
|
-
def dry_run!
|
15
|
-
@is_dry_run = true
|
16
|
-
end
|
17
|
-
|
18
|
-
def dry_run?
|
19
|
-
!!@is_dry_run
|
20
|
-
end
|
21
|
-
|
22
|
-
def syntax(kind=nil)
|
23
|
-
@syntax = kind unless kind.nil?
|
24
|
-
@syntax
|
25
|
-
end
|
26
|
-
|
27
|
-
def headers(_headers)
|
28
|
-
@browser.set_headers(_headers)
|
29
|
-
end
|
30
|
-
|
31
|
-
def url(*args)
|
32
|
-
@urls += args
|
33
|
-
end
|
34
|
-
alias :urls :url
|
35
|
-
|
36
|
-
def crawl(scope, options={}, &block)
|
37
|
-
options[:syntax] = @syntax
|
38
|
-
@crawlers << Klepto::Crawler.new(scope, options, &block)
|
39
|
-
end
|
40
|
-
|
41
|
-
def start!
|
42
|
-
@urls.each do |url|
|
43
|
-
browser.fetch!(url)
|
44
|
-
@crawlers.each do |crawler|
|
45
|
-
crawler.crawl browser.page
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
@crawlers.each do |crawler|
|
50
|
-
if dry_run?
|
51
|
-
pp crawler.resources
|
52
|
-
else
|
53
|
-
crawler.persist!
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
end
|
59
|
-
end
|