klepto 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
1
+ #! /usr/bin/env ruby
2
+ require 'bundler/setup'
3
+ require 'klepto'
4
+
5
+ @structure = Klepto::Structure.crawl('https://twitter.com/justinbieber')
6
+ config.headers 'Referer' => 'http://www.twitter.com'
7
+
8
+ config.steps [
9
+ [:GET, 'https://twitter.com/login'],
10
+ [:POST,'https://twitter.com/sessions',
11
+ {
12
+ session: {
13
+ username_or_email: 'example',
14
+ password:'123456'
15
+ }
16
+ }
17
+ ]
18
+ ]
19
+ config.urls 'https://twitter.com/justinbieber',
20
+ 'https://twitter.com/ladygaga'
21
+ # config.cookies 'jsession' => 'abcdefg1234567890'
22
+ # config.on_http_status(500,404){}
23
+ # assertions do
24
+ # end
25
+ # config.on_failed_assertion(){}
26
+
27
+
28
+ # Structure the content
29
+ name 'h1.fullname'
30
+ username '.username span.screen-name'
31
+ links 'span.url a', :list, :attr => 'href'
32
+
33
+ tweets 'li.stream-item', :collection do |node|
34
+ # You can access the current parent node
35
+ twitter_id node['data-item-id']
36
+
37
+ # Defaults to innerText
38
+ content '.content p', :css
39
+
40
+ # get an attribute off an element
41
+ timestamp '._timestamp', :attr => 'data-time'
42
+
43
+ permalink '.time a', :css, :attr => :href
44
+ end
45
+ end
46
+
47
+ # @resources = @structure.parse! #=> Array[Hash]
48
+ # @resources.each do |resource|
49
+ # User.create(resource)
50
+ # end
@@ -0,0 +1,23 @@
1
+ require 'spec_helper'
2
+
3
+ describe Klepto::Config do
4
+ before(:each) do
5
+ @config = Klepto::Config.new
6
+ @config.headers({'Referer' => 'http://example.com'})
7
+ @config.urls 'http://example.com', 'http://www.iana.org'
8
+ end
9
+
10
+ it 'should be able to set headers' do
11
+ @config.headers['Referer'].should eq('http://example.com')
12
+ end
13
+
14
+ it 'should be able to set URLs' do
15
+ @config.urls.should == ['http://example.com', 'http://www.iana.org']
16
+ end
17
+
18
+ pending 'should be able to set cookies'
19
+ pending 'should be able to set steps'
20
+ pending 'should be able to set assertions'
21
+ pending 'should be able to set on_http_status handler'
22
+ pending 'should be able to set on_failed_assertion handler'
23
+ end
@@ -0,0 +1,105 @@
1
+ require 'spec_helper'
2
+ require 'pp'
3
+
4
+ describe Klepto::Structure do
5
+ describe 'Klepto::Structure.build' do
6
+ before(:each) do
7
+ @page = Capybara::Node::Simple.new(open("./samples/bieber.html").read)
8
+
9
+ @structure = Klepto::Structure.build(@page){
10
+ name 'h1.fullname'
11
+ username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
12
+ tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
13
+ links 'span.url a', :match => :all do |node|
14
+ node[:href]
15
+ end
16
+
17
+ last_tweet 'li.stream-item', :as => :resource do
18
+ twitter_id do |node|
19
+ node['data-item-id']
20
+ end
21
+ content '.content p'
22
+ timestamp '._timestamp', :attr => 'data-time'
23
+ permalink '.time a', :attr => :href
24
+ end
25
+
26
+ tweets 'li.stream-item', :as => :collection do
27
+ twitter_id do |node|
28
+ node['data-item-id']
29
+ end
30
+ tweet '.content p', :css
31
+ timestamp '._timestamp', :attr => 'data-time'
32
+ permalink '.time a', :css, :attr => :href
33
+ end
34
+ }
35
+ end
36
+
37
+ it 'should structure the data' do
38
+ @structure[:name].should match(/Justin/i)
39
+ @structure[:links].should == ["http://www.youtube.com/justinbieber"]
40
+ @structure[:username].should eq '@justinbieber'
41
+ @structure[:last_tweet][:twitter_id].should == @structure[:tweets].first[:twitter_id]
42
+ end
43
+ end
44
+
45
+ describe 'Klepto::Structure.crawl' do
46
+ before(:each) do
47
+ config = {
48
+ :headers => {
49
+ 'Referer' => 'http://www.twitter.com',
50
+ 'X-Sup-Dawg' => "Yo, What's up?"
51
+ }
52
+ }
53
+ @structure = Klepto::Structure.crawl("https://twitter.com/justinbieber", config){
54
+ # Structure that stuff
55
+ name 'h1.fullname'
56
+ username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
57
+ tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
58
+ links 'span.url a', :match => :all do |node|
59
+ node[:href]
60
+ end
61
+
62
+ last_tweet 'li.stream-item', :as => :resource do
63
+ twitter_id do |node|
64
+ node['data-item-id']
65
+ end
66
+ content '.content p'
67
+ timestamp '._timestamp', :attr => 'data-time'
68
+ permalink '.time a', :attr => :href
69
+ end
70
+
71
+ tweets 'li.stream-item', :as => :collection do
72
+ twitter_id do |node|
73
+ node['data-item-id']
74
+ end
75
+ tweet '.content p', :css
76
+ timestamp '._timestamp', :attr => 'data-time'
77
+ permalink '.time a', :css, :attr => :href
78
+ end
79
+
80
+ after_crawl do |resource|
81
+ @user = User.new
82
+ @user.name = resource[:name]
83
+ @user.username = resource[:username]
84
+ @user.save
85
+
86
+ resource[:tweets].each do |tweet|
87
+ Tweet.create(tweet)
88
+ end
89
+ end
90
+ }
91
+ end
92
+
93
+ it 'should structure the data' do
94
+ @structure.first[:name].should match(/Justin/i)
95
+ @structure.first[:links].should == ["http://www.youtube.com/justinbieber"]
96
+ @structure.first[:username].should eq '@justinbieber'
97
+ @structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
98
+ end
99
+
100
+ it 'should store the data' do
101
+ User.count.should be(1)
102
+ Tweet.count.should_not be(0)
103
+ end
104
+ end
105
+ end
@@ -4,61 +4,4 @@ describe Klepto do
4
4
  it "should have a version" do
5
5
  Klepto::VERSION.should_not be_nil
6
6
  end
7
- end
8
-
9
- describe 'Scraping pages', :skip => false do
10
- before(:each) do
11
- @bot = Klepto::Bot.new do
12
- syntax :css
13
-
14
- headers({
15
- 'Referer' => 'https://twitter.com',
16
- 'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
17
- })
18
-
19
- urls 'https://twitter.com/justinbieber'
20
-
21
- crawl 'body' do
22
- scrape "h1.fullname", :name
23
- scrape '.username span.screen-name', :username
24
- save do |params|
25
- user = User.find_by_name(params[:username]) || User.new
26
- user.update_attributes params
27
- end
28
- end
29
-
30
- crawl 'li.stream-item' do
31
- scrape do |node|
32
- {:twitter_id => node['data-item-id']}
33
- end
34
-
35
- scrape '.content p', :content
36
-
37
- scrape '._timestamp' do |node|
38
- {timestamp: node['data-time']}
39
- end
40
-
41
- scrape '.time a' do |node|
42
- {permalink: node[:href]}
43
- end
44
-
45
- save do |params|
46
- tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
47
- tweet.update_attributes params
48
- end
49
- end
50
- end
51
-
52
- @bot.start!
53
- end
54
-
55
- it 'should have collected some resources' do
56
- @bot.crawlers.should have(2).crawlers
57
- @bot.crawlers.first.resources.should have(1).user
58
- end
59
-
60
- it 'should persist resources' do
61
- User.count.should be(1)
62
- Tweet.count.should_not be(0)
63
- end
64
- end
7
+ end
@@ -9,7 +9,7 @@ ActiveRecord::Migration.verbose = false
9
9
  class TestMigration < ActiveRecord::Migration
10
10
  def self.up
11
11
  create_table :tweets, :force => true do |t|
12
- t.string :content
12
+ t.string :tweet
13
13
  t.string :twitter_id
14
14
  t.integer :timestamp
15
15
  t.string :permalink
@@ -28,7 +28,7 @@ class TestMigration < ActiveRecord::Migration
28
28
  end
29
29
 
30
30
  class Tweet < ActiveRecord::Base
31
- validates_presence_of :timestamp, :twitter_id, :permalink, :content
31
+ validates_presence_of :timestamp, :twitter_id, :permalink, :tweet
32
32
  end
33
33
 
34
34
  class User < ActiveRecord::Base
@@ -28,5 +28,5 @@ RSpec.configure do |config|
28
28
  }
29
29
  config.after(:all) { TestMigration.down }
30
30
  config.treat_symbols_as_metadata_keys_with_true_values = true
31
- config.filter_run_excluding :skip => true
31
+ #config.filter_run_including :only => true
32
32
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: klepto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-10 00:00:00.000000000 Z
12
+ date: 2013-04-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: docile
16
- requirement: &70348289181900 !ruby/object:Gem::Requirement
16
+ requirement: &70231930844560 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70348289181900
24
+ version_requirements: *70231930844560
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: poltergeist
27
- requirement: &70348289181040 !ruby/object:Gem::Requirement
27
+ requirement: &70231930843200 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - =
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.1.0
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70348289181040
35
+ version_requirements: *70231930843200
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: capybara
38
- requirement: &70348289179880 !ruby/object:Gem::Requirement
38
+ requirement: &70231930839840 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - =
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 2.0.2
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70348289179880
46
+ version_requirements: *70231930839840
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &70348289179180 !ruby/object:Gem::Requirement
49
+ requirement: &70231930836140 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 1.5.6
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70348289179180
57
+ version_requirements: *70231930836140
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: activesupport
60
- requirement: &70348289178640 !ruby/object:Gem::Requirement
60
+ requirement: &70231930835280 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70348289178640
68
+ version_requirements: *70231930835280
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: multi_json
71
- requirement: &70348289177300 !ruby/object:Gem::Requirement
71
+ requirement: &70231930834460 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,7 +76,7 @@ dependencies:
76
76
  version: '1.0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70348289177300
79
+ version_requirements: *70231930834460
80
80
  description: Tearing up web pages into ActiveRecord resources
81
81
  email:
82
82
  - github@coryodaniel.com
@@ -94,12 +94,13 @@ files:
94
94
  - Rakefile
95
95
  - klepto.gemspec
96
96
  - lib/klepto.rb
97
- - lib/klepto/bot.rb
98
97
  - lib/klepto/browser.rb
99
- - lib/klepto/crawler.rb
98
+ - lib/klepto/config.rb
99
+ - lib/klepto/structure.rb
100
100
  - lib/klepto/tasks.rb
101
101
  - lib/klepto/version.rb
102
- - samples/example.rb
102
+ - samples/bieber.html
103
+ - samples/concept.rb
103
104
  - spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml
104
105
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml
105
106
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml
@@ -107,10 +108,9 @@ files:
107
108
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
108
109
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
109
110
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
110
- - spec/lib/klepto/bot_spec.rb
111
111
  - spec/lib/klepto/browser_spec.rb
112
- - spec/lib/klepto/crawler_spec.rb
113
- - spec/lib/klepto/dsl_spec.rb
112
+ - spec/lib/klepto/config_spec.rb
113
+ - spec/lib/klepto/structure_spec.rb
114
114
  - spec/lib/klepto_spec.rb
115
115
  - spec/orm/active_record.rb
116
116
  - spec/orm/database.example.yml
@@ -147,10 +147,9 @@ test_files:
147
147
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
148
148
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
149
149
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
150
- - spec/lib/klepto/bot_spec.rb
151
150
  - spec/lib/klepto/browser_spec.rb
152
- - spec/lib/klepto/crawler_spec.rb
153
- - spec/lib/klepto/dsl_spec.rb
151
+ - spec/lib/klepto/config_spec.rb
152
+ - spec/lib/klepto/structure_spec.rb
154
153
  - spec/lib/klepto_spec.rb
155
154
  - spec/orm/active_record.rb
156
155
  - spec/orm/database.example.yml
@@ -1,59 +0,0 @@
1
- module Klepto
2
- class Bot
3
- def initialize(*args, &block)
4
- @syntax = :css
5
- @is_dry_run = false
6
- @urls = []
7
- @crawlers = []
8
- @browser = Klepto::Browser.new
9
- Docile.dsl_eval(self, &block) if block_given?
10
- end
11
-
12
- attr_reader :browser, :crawlers
13
-
14
- def dry_run!
15
- @is_dry_run = true
16
- end
17
-
18
- def dry_run?
19
- !!@is_dry_run
20
- end
21
-
22
- def syntax(kind=nil)
23
- @syntax = kind unless kind.nil?
24
- @syntax
25
- end
26
-
27
- def headers(_headers)
28
- @browser.set_headers(_headers)
29
- end
30
-
31
- def url(*args)
32
- @urls += args
33
- end
34
- alias :urls :url
35
-
36
- def crawl(scope, options={}, &block)
37
- options[:syntax] = @syntax
38
- @crawlers << Klepto::Crawler.new(scope, options, &block)
39
- end
40
-
41
- def start!
42
- @urls.each do |url|
43
- browser.fetch!(url)
44
- @crawlers.each do |crawler|
45
- crawler.crawl browser.page
46
- end
47
- end
48
-
49
- @crawlers.each do |crawler|
50
- if dry_run?
51
- pp crawler.resources
52
- else
53
- crawler.persist!
54
- end
55
- end
56
- end
57
-
58
- end
59
- end