klepto 0.1.0 → 0.2.0

samples/concept.rb ADDED
@@ -0,0 +1,50 @@
+ #! /usr/bin/env ruby
+ require 'bundler/setup'
+ require 'klepto'
+
+ @structure = Klepto::Structure.crawl('https://twitter.com/justinbieber') do |config|
+   config.headers 'Referer' => 'http://www.twitter.com'
+
+   config.steps [
+     [:GET,  'https://twitter.com/login'],
+     [:POST, 'https://twitter.com/sessions',
+       {
+         session: {
+           username_or_email: 'example',
+           password: '123456'
+         }
+       }
+     ]
+   ]
+   config.urls 'https://twitter.com/justinbieber',
+               'https://twitter.com/ladygaga'
+   # config.cookies 'jsession' => 'abcdefg1234567890'
+   # config.on_http_status(500, 404){}
+   # assertions do
+   # end
+   # config.on_failed_assertion(){}
+
+
+   # Structure the content
+   name 'h1.fullname'
+   username '.username span.screen-name'
+   links 'span.url a', :list, :attr => 'href'
+
+   tweets 'li.stream-item', :collection do |node|
+     # You can access the current parent node
+     twitter_id node['data-item-id']
+
+     # Defaults to innerText
+     content '.content p', :css
+
+     # Get an attribute off an element
+     timestamp '._timestamp', :attr => 'data-time'
+
+     permalink '.time a', :css, :attr => :href
+   end
+ end
+
+ # @resources = @structure.parse! #=> Array[Hash]
+ # @resources.each do |resource|
+ #   User.create(resource)
+ # end
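A note on the concept script above: bare calls such as `name`, `username`, and `tweets` read as DSL methods collected into a structure, and the gem lists docile as a runtime dependency (see the metadata below). As a rough, hypothetical sketch only, not klepto's actual implementation (which this diff does not show), such a DSL can be captured with Docile and `method_missing`:

    # Hypothetical sketch of a Docile-backed structure DSL; klepto's real
    # implementation lives in lib/klepto/structure.rb, which this diff omits.
    require 'docile'

    class StructureSketch
      attr_reader :data

      def initialize
        @data = {}
      end

      # Any bare call like `name 'h1.fullname'` records a key in the hash.
      # The real gem would resolve the selector against a fetched page.
      def method_missing(key, selector = nil, *options, &block)
        @data[key] = block || selector
      end

      def respond_to_missing?(*)
        true
      end
    end

    sketch = StructureSketch.new
    Docile.dsl_eval(sketch) do
      name     'h1.fullname'
      username '.username span.screen-name'
    end
    sketch.data #=> {:name=>"h1.fullname", :username=>".username span.screen-name"}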
spec/lib/klepto/config_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require 'spec_helper'
+
+ describe Klepto::Config do
+   before(:each) do
+     @config = Klepto::Config.new
+     @config.headers({'Referer' => 'http://example.com'})
+     @config.urls 'http://example.com', 'http://www.iana.org'
+   end
+
+   it 'should be able to set headers' do
+     @config.headers['Referer'].should eq('http://example.com')
+   end
+
+   it 'should be able to set URLs' do
+     @config.urls.should == ['http://example.com', 'http://www.iana.org']
+   end
+
+   pending 'should be able to set cookies'
+   pending 'should be able to set steps'
+   pending 'should be able to set assertions'
+   pending 'should be able to set on_http_status handler'
+   pending 'should be able to set on_failed_assertion handler'
+ end
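The two passing examples above pin down the shape of `Klepto::Config`: `headers` takes an optional Hash and always returns the current headers, while `urls` appends its arguments and returns the accumulated list. A minimal sketch that would satisfy just these specs (hypothetical; the real lib/klepto/config.rb is not part of this diff):

    module Klepto
      class Config
        # headers doubles as getter and setter: merge a Hash in when given,
        # return the accumulated headers either way.
        def headers(hash = nil)
          @headers ||= {}
          @headers.merge!(hash) if hash
          @headers
        end

        # urls appends any number of URLs and returns the accumulated list.
        def urls(*list)
          @urls ||= []
          @urls.concat(list)
          @urls
        end
      end
    end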
spec/lib/klepto/structure_spec.rb ADDED
@@ -0,0 +1,105 @@
+ require 'spec_helper'
+ require 'pp'
+
+ describe Klepto::Structure do
+   describe 'Klepto::Structure.build' do
+     before(:each) do
+       @page = Capybara::Node::Simple.new(open("./samples/bieber.html").read)
+
+       @structure = Klepto::Structure.build(@page){
+         name 'h1.fullname'
+         username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
+         tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
+         links 'span.url a', :match => :all do |node|
+           node[:href]
+         end
+
+         last_tweet 'li.stream-item', :as => :resource do
+           twitter_id do |node|
+             node['data-item-id']
+           end
+           content '.content p'
+           timestamp '._timestamp', :attr => 'data-time'
+           permalink '.time a', :attr => :href
+         end
+
+         tweets 'li.stream-item', :as => :collection do
+           twitter_id do |node|
+             node['data-item-id']
+           end
+           tweet '.content p', :css
+           timestamp '._timestamp', :attr => 'data-time'
+           permalink '.time a', :css, :attr => :href
+         end
+       }
+     end
+
+     it 'should structure the data' do
+       @structure[:name].should match(/Justin/i)
+       @structure[:links].should == ["http://www.youtube.com/justinbieber"]
+       @structure[:username].should eq '@justinbieber'
+       @structure[:last_tweet][:twitter_id].should == @structure[:tweets].first[:twitter_id]
+     end
+   end
+
+   describe 'Klepto::Structure.crawl' do
+     before(:each) do
+       config = {
+         :headers => {
+           'Referer' => 'http://www.twitter.com',
+           'X-Sup-Dawg' => "Yo, What's up?"
+         }
+       }
+       @structure = Klepto::Structure.crawl("https://twitter.com/justinbieber", config){
+         # Structure that stuff
+         name 'h1.fullname'
+         username "//span[contains(concat(' ',normalize-space(@class),' '),' screen-name ')]", :syntax => :xpath
+         tweet_ids 'li.stream-item', :match => :all, :attr => 'data-item-id'
+         links 'span.url a', :match => :all do |node|
+           node[:href]
+         end
+
+         last_tweet 'li.stream-item', :as => :resource do
+           twitter_id do |node|
+             node['data-item-id']
+           end
+           content '.content p'
+           timestamp '._timestamp', :attr => 'data-time'
+           permalink '.time a', :attr => :href
+         end
+
+         tweets 'li.stream-item', :as => :collection do
+           twitter_id do |node|
+             node['data-item-id']
+           end
+           tweet '.content p', :css
+           timestamp '._timestamp', :attr => 'data-time'
+           permalink '.time a', :css, :attr => :href
+         end
+
+         after_crawl do |resource|
+           @user = User.new
+           @user.name = resource[:name]
+           @user.username = resource[:username]
+           @user.save
+
+           resource[:tweets].each do |tweet|
+             Tweet.create(tweet)
+           end
+         end
+       }
+     end
+
+     it 'should structure the data' do
+       @structure.first[:name].should match(/Justin/i)
+       @structure.first[:links].should == ["http://www.youtube.com/justinbieber"]
+       @structure.first[:username].should eq '@justinbieber'
+       @structure.first[:last_tweet][:twitter_id].should == @structure.first[:tweets].first[:twitter_id]
+     end
+
+     it 'should store the data' do
+       User.count.should be(1)
+       Tweet.count.should_not be(0)
+     end
+   end
+ end
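Both describe blocks above rely on the same node semantics: a selector alone yields a node's text, `:attr` pulls a single attribute, and inside a block the yielded node supports Hash-style attribute access (`node['data-item-id']`, `node[:href]`). These are Capybara node behaviors, which the build spec drives through `Capybara::Node::Simple`; a small illustration with made-up markup:

    require 'capybara'

    # Illustrative markup only; the spec reads the saved samples/bieber.html.
    html = '<li class="stream-item" data-item-id="42">' \
           '<a class="time" href="/status/42">1:23 PM</a></li>'
    page = Capybara::Node::Simple.new(html)

    node = page.find('li.stream-item')
    node['data-item-id']      #=> "42"          (attribute read, as in the twitter_id blocks)
    node.find('.time')[:href] #=> "/status/42"  (what :attr => :href extracts)
    node.find('.time').text   #=> "1:23 PM"     (the innerText default)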
spec/lib/klepto_spec.rb CHANGED
@@ -4,61 +4,4 @@ describe Klepto do
    it "should have a version" do
      Klepto::VERSION.should_not be_nil
    end
- end
-
- describe 'Scraping pages', :skip => false do
-   before(:each) do
-     @bot = Klepto::Bot.new do
-       syntax :css
-
-       headers({
-         'Referer' => 'https://twitter.com',
-         'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
-       })
-
-       urls 'https://twitter.com/justinbieber'
-
-       crawl 'body' do
-         scrape "h1.fullname", :name
-         scrape '.username span.screen-name', :username
-         save do |params|
-           user = User.find_by_name(params[:username]) || User.new
-           user.update_attributes params
-         end
-       end
-
-       crawl 'li.stream-item' do
-         scrape do |node|
-           {:twitter_id => node['data-item-id']}
-         end
-
-         scrape '.content p', :content
-
-         scrape '._timestamp' do |node|
-           {timestamp: node['data-time']}
-         end
-
-         scrape '.time a' do |node|
-           {permalink: node[:href]}
-         end
-
-         save do |params|
-           tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
-           tweet.update_attributes params
-         end
-       end
-     end
-
-     @bot.start!
-   end
-
-   it 'should have collected some resources' do
-     @bot.crawlers.should have(2).crawlers
-     @bot.crawlers.first.resources.should have(1).user
-   end
-
-   it 'should persist resources' do
-     User.count.should be(1)
-     Tweet.count.should_not be(0)
-   end
- end
+ end
spec/orm/active_record.rb CHANGED
@@ -9,7 +9,7 @@ ActiveRecord::Migration.verbose = false
  class TestMigration < ActiveRecord::Migration
    def self.up
      create_table :tweets, :force => true do |t|
-       t.string :content
+       t.string :tweet
        t.string :twitter_id
        t.integer :timestamp
        t.string :permalink
@@ -28,7 +28,7 @@ class TestMigration < ActiveRecord::Migration
  end

  class Tweet < ActiveRecord::Base
-   validates_presence_of :timestamp, :twitter_id, :permalink, :content
+   validates_presence_of :timestamp, :twitter_id, :permalink, :tweet
  end

  class User < ActiveRecord::Base
spec/spec_helper.rb CHANGED
@@ -28,5 +28,5 @@ RSpec.configure do |config|
    }
    config.after(:all) { TestMigration.down }
    config.treat_symbols_as_metadata_keys_with_true_values = true
-   config.filter_run_excluding :skip => true
+   #config.filter_run_including :only => true
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: klepto
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.2.0
    prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-04-10 00:00:00.000000000 Z
+ date: 2013-04-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: docile
-   requirement: &70348289181900 !ruby/object:Gem::Requirement
+   requirement: &70231930844560 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
        version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70348289181900
+   version_requirements: *70231930844560
  - !ruby/object:Gem::Dependency
    name: poltergeist
-   requirement: &70348289181040 !ruby/object:Gem::Requirement
+   requirement: &70231930843200 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - =
@@ -32,10 +32,10 @@ dependencies:
        version: 1.1.0
    type: :runtime
    prerelease: false
-   version_requirements: *70348289181040
+   version_requirements: *70231930843200
  - !ruby/object:Gem::Dependency
    name: capybara
-   requirement: &70348289179880 !ruby/object:Gem::Requirement
+   requirement: &70231930839840 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - =
@@ -43,10 +43,10 @@ dependencies:
        version: 2.0.2
    type: :runtime
    prerelease: false
-   version_requirements: *70348289179880
+   version_requirements: *70231930839840
  - !ruby/object:Gem::Dependency
    name: nokogiri
-   requirement: &70348289179180 !ruby/object:Gem::Requirement
+   requirement: &70231930836140 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -54,10 +54,10 @@ dependencies:
        version: 1.5.6
    type: :runtime
    prerelease: false
-   version_requirements: *70348289179180
+   version_requirements: *70231930836140
  - !ruby/object:Gem::Dependency
    name: activesupport
-   requirement: &70348289178640 !ruby/object:Gem::Requirement
+   requirement: &70231930835280 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
        version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70348289178640
+   version_requirements: *70231930835280
  - !ruby/object:Gem::Dependency
    name: multi_json
-   requirement: &70348289177300 !ruby/object:Gem::Requirement
+   requirement: &70231930834460 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -76,7 +76,7 @@ dependencies:
        version: '1.0'
    type: :runtime
    prerelease: false
-   version_requirements: *70348289177300
+   version_requirements: *70231930834460
  description: Tearing up web pages into ActiveRecord resources
  email:
  - github@coryodaniel.com
@@ -94,12 +94,13 @@ files:
  - Rakefile
  - klepto.gemspec
  - lib/klepto.rb
- - lib/klepto/bot.rb
  - lib/klepto/browser.rb
- - lib/klepto/crawler.rb
+ - lib/klepto/config.rb
+ - lib/klepto/structure.rb
  - lib/klepto/tasks.rb
  - lib/klepto/version.rb
- - samples/example.rb
+ - samples/bieber.html
+ - samples/concept.rb
  - spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml
@@ -107,10 +108,9 @@ files:
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
- - spec/lib/klepto/bot_spec.rb
  - spec/lib/klepto/browser_spec.rb
- - spec/lib/klepto/crawler_spec.rb
- - spec/lib/klepto/dsl_spec.rb
+ - spec/lib/klepto/config_spec.rb
+ - spec/lib/klepto/structure_spec.rb
  - spec/lib/klepto_spec.rb
  - spec/orm/active_record.rb
  - spec/orm/database.example.yml
@@ -147,10 +147,9 @@ test_files:
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml
  - spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml
- - spec/lib/klepto/bot_spec.rb
  - spec/lib/klepto/browser_spec.rb
- - spec/lib/klepto/crawler_spec.rb
- - spec/lib/klepto/dsl_spec.rb
+ - spec/lib/klepto/config_spec.rb
+ - spec/lib/klepto/structure_spec.rb
  - spec/lib/klepto_spec.rb
  - spec/orm/active_record.rb
  - spec/orm/database.example.yml
@@ -1,59 +0,0 @@
1
- module Klepto
2
- class Bot
3
- def initialize(*args, &block)
4
- @syntax = :css
5
- @is_dry_run = false
6
- @urls = []
7
- @crawlers = []
8
- @browser = Klepto::Browser.new
9
- Docile.dsl_eval(self, &block) if block_given?
10
- end
11
-
12
- attr_reader :browser, :crawlers
13
-
14
- def dry_run!
15
- @is_dry_run = true
16
- end
17
-
18
- def dry_run?
19
- !!@is_dry_run
20
- end
21
-
22
- def syntax(kind=nil)
23
- @syntax = kind unless kind.nil?
24
- @syntax
25
- end
26
-
27
- def headers(_headers)
28
- @browser.set_headers(_headers)
29
- end
30
-
31
- def url(*args)
32
- @urls += args
33
- end
34
- alias :urls :url
35
-
36
- def crawl(scope, options={}, &block)
37
- options[:syntax] = @syntax
38
- @crawlers << Klepto::Crawler.new(scope, options, &block)
39
- end
40
-
41
- def start!
42
- @urls.each do |url|
43
- browser.fetch!(url)
44
- @crawlers.each do |crawler|
45
- crawler.crawl browser.page
46
- end
47
- end
48
-
49
- @crawlers.each do |crawler|
50
- if dry_run?
51
- pp crawler.resources
52
- else
53
- crawler.persist!
54
- end
55
- end
56
- end
57
-
58
- end
59
- end