klepto 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,72 +0,0 @@
1
- require 'docile'
2
- module Klepto
3
- class Crawler
4
-
5
- def initialize(scope,options={},&block)
6
- @resources = []
7
- @limit = options[:limit]
8
- @skip = options[:skip]
9
- @syntax = options[:syntax]
10
- @scope = scope
11
- @designations = []
12
-
13
- Docile.dsl_eval(self, &block) if block_given?
14
- end
15
- attr_accessor :resources
16
- attr_reader :scope, :syntax
17
-
18
- def scrape(selector=nil, assignee=nil, &block)
19
- raise Exception if assignee.nil? && !block_given?
20
- raise Exception if !assignee.nil? && block_given?
21
- designate(:first, selector, assignee, &block)
22
- end
23
-
24
- def scrape_all(selector, assignee=nil, &block)
25
- raise Exception if assignee.nil? && !block_given?
26
- raise Exception if !assignee.nil? && block_given?
27
- designate(:all, selector, assignee, &block)
28
- end
29
-
30
- def save(&block)
31
- @resource_handler = block
32
- end
33
-
34
- def crawl(page)
35
- page.all(syntax, scope).each do |selection|
36
- params = {}
37
- @designations.each do |first_or_all, selector, assignee, handler|
38
- if selector.nil?
39
- attribs = handler.call selection
40
- params.merge!( attribs )
41
- elsif first_or_all == :first
42
- node = selection.first(syntax, selector)
43
- if assignee
44
- params[assignee] = node.try(:text)
45
- else
46
- attribs = handler.call node
47
- params.merge!( attribs )
48
- end
49
- else
50
- nodes = selection.all(syntax, selector)
51
- attribs = handler.call nodes
52
- params.merge!( attribs )
53
- end
54
- end
55
- @resources << params
56
- end
57
-
58
- @resources
59
- end
60
-
61
- def persist!
62
- if @resource_handler
63
- @resources.each {|resource| @resource_handler.call(resource)}
64
- end
65
- end
66
-
67
- protected
68
- def designate(count, selector, assignee, &block)
69
- @designations << [count, selector, assignee, block]
70
- end
71
- end
72
- end
@@ -1,49 +0,0 @@
1
- #! /usr/bin/env ruby
2
- require 'bundler/setup'
3
- require 'klepto'
4
-
5
- @bot = Klepto::Bot.new do
6
- syntax :css
7
- dry_run!
8
-
9
- headers({
10
- 'Referer' => 'http://www.twitter.com',
11
- 'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
12
-
13
- })
14
-
15
- # Lootin' them bieb tweets
16
- urls 'https://twitter.com/justinbieber'
17
-
18
- crawl 'body' do
19
- scrape "h1.fullname", :name
20
- scrape '.username span.screen-name', :username
21
- save do |params|
22
- user = User.find_by_name(params[:username]) || User.new
23
- user.update_attributes params
24
- end
25
- end
26
-
27
- crawl 'li.stream-item' do
28
- scrape do |node|
29
- {:twitter_id => node['data-item-id']}
30
- end
31
-
32
- scrape '.content p', :content
33
-
34
- scrape '._timestamp' do |node|
35
- {timestamp: node['data-time']}
36
- end
37
-
38
- scrape '.time a' do |node|
39
- {permalink: node[:href]}
40
- end
41
-
42
- save do |params|
43
- tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
44
- tweet.update_attributes params
45
- end
46
- end
47
- end
48
-
49
- @bot.start!
@@ -1,40 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Klepto::Bot, :vcr => {:record => :new_episodes} do
4
- before(:each) do
5
- @bot = Klepto::Bot.new
6
- end
7
-
8
- it 'should know if it is a dry run' do
9
- @bot.dry_run?.should be false
10
- @bot.dry_run!
11
- @bot.dry_run?.should be true
12
- end
13
-
14
- it 'should be able to set the selection syntax' do
15
- @bot.syntax(:xpath)
16
- @bot.syntax.should be(:xpath)
17
- end
18
-
19
- it 'should be able to read the selection syntax' do
20
- @bot.syntax.should be(:css)
21
- end
22
-
23
- it 'should be able to set request headers' do
24
- @bot.should respond_to(:headers)
25
- end
26
-
27
- it 'should be able to set a list of URLs to crawl' do
28
- @bot.url 'http://www.google.com'
29
- @bot.urls.should include('http://www.google.com')
30
- @bot.urls 'http://twitter.com', 'http://facebook.com'
31
- @bot.urls.should include('http://twitter.com')
32
- @bot.urls.should include('http://facebook.com')
33
- end
34
-
35
- it 'should be able to add crawlers' do
36
- @bot.crawl('div'){}
37
- @bot.instance_variable_get("@crawlers").should have(1).crawler
38
- @bot.instance_variable_get("@crawlers").first.should be_kind_of(Klepto::Crawler)
39
- end
40
- end
@@ -1,88 +0,0 @@
1
- require 'spec_helper'
2
- require 'open-uri'
3
-
4
- describe Klepto::Crawler, :vcr => {:record => :new_episodes} do
5
- describe 'dsl interaction' do
6
- before(:each) do
7
- @page = page("http://www.iana.org")
8
- @crawler = Klepto::Crawler.new('body',{:syntax => :css}) do
9
- scrape 'h1', :title
10
-
11
- scrape '#intro p' do |node|
12
- {description: node.text}
13
- end
14
-
15
- scrape_all '.home-panel h2' do |nodes|
16
- { sections: nodes.map{|n| n.text} }
17
- end
18
- end
19
- @resources = @crawler.crawl @page
20
- end #end before
21
-
22
- it 'should crawl the resource' do
23
- @resources.should have(1).resource
24
- @resources.first[:title].should match('Internet Assigned Numbers Authority')
25
- @resources.first[:description].should match(/^The Internet Assigned Numbers Authority/i)
26
- @resources.first[:sections].should have(3).sections
27
- end
28
- end
29
-
30
- describe 'standard interaction' do
31
- before(:each) do
32
- @page = page()
33
- @crawler = Klepto::Crawler.new 'body', {:syntax => :css}
34
- end
35
- it 'should have a CSS scope' do
36
- @crawler.scope.should eq 'body'
37
- end
38
- it 'should have a desired syntax' do
39
- @crawler.syntax.should == :css
40
- end
41
-
42
- it 'should be able to scrape the node that the crawler is scoped to' do
43
- @crawler.scrape do |node|
44
- {:name => node.native.name}
45
- end
46
- resources = @crawler.crawl( @page )
47
- resources.should have(1).resource
48
- resources.first[:name].should eq('body')
49
- end
50
-
51
- it 'should be able to designate scraping of a single node with a symbol' do
52
- @crawler.scrape 'h1', :title
53
- resources = @crawler.crawl( @page )
54
- resources.should have(1).resource
55
- resources.first[:title].should eq('Example Domain')
56
- end
57
-
58
- it 'should be able to designate scraping of a single node with a block' do
59
- @crawler.scrape 'h1' do |node|
60
- {title: node.text}
61
- end
62
-
63
- resources = @crawler.crawl( @page )
64
- resources.should have(1).resource
65
- resources.first[:title].should eq('Example Domain')
66
- end
67
-
68
- it 'should be able to designate scraping of a set of nodes' do
69
- @crawler.scrape_all 'p' do |nodes|
70
- {
71
- paragraphs: [
72
- nodes.first.text,
73
- nodes.last.text
74
- ]
75
- }
76
- end
77
- resources = @crawler.crawl( @page )
78
- resources.should have(1).resource
79
- resources.first[:paragraphs].should be_kind_of(Array)
80
- resources.first[:paragraphs].last.should eq("More information...")
81
- end
82
-
83
- pending 'should be able to save a set of resources'
84
- pending 'should be able to specify a limit'
85
- pending 'should be able to specify a skip'
86
- end
87
-
88
- end
@@ -1,6 +0,0 @@
1
- require 'spec_helper'
2
-
3
-
4
-
5
- describe 'Klepto DSL' do
6
- end