klepto 0.1.0 → 0.2.0

@@ -1,72 +0,0 @@
-require 'docile'
-module Klepto
-  class Crawler
-
-    def initialize(scope, options={}, &block)
-      @resources    = []
-      @limit        = options[:limit]
-      @skip         = options[:skip]
-      @syntax       = options[:syntax]
-      @scope        = scope
-      @designations = []
-
-      Docile.dsl_eval(self, &block) if block_given?
-    end
-    attr_accessor :resources
-    attr_reader :scope, :syntax
-
-    def scrape(selector=nil, assignee=nil, &block)
-      raise Exception if assignee.nil? && !block_given?
-      raise Exception if !assignee.nil? && block_given?
-      designate(:first, selector, assignee, &block)
-    end
-
-    def scrape_all(selector, assignee=nil, &block)
-      raise Exception if assignee.nil? && !block_given?
-      raise Exception if !assignee.nil? && block_given?
-      designate(:all, selector, assignee, &block)
-    end
-
-    def save(&block)
-      @resource_handler = block
-    end
-
-    def crawl(page)
-      page.all(syntax, scope).each do |selection|
-        params = {}
-        @designations.each do |first_or_all, selector, assignee, handler|
-          if selector.nil?
-            attribs = handler.call selection
-            params.merge!(attribs)
-          elsif first_or_all == :first
-            node = selection.first(syntax, selector)
-            if assignee
-              params[assignee] = node.try(:text)
-            else
-              attribs = handler.call node
-              params.merge!(attribs)
-            end
-          else
-            nodes = selection.all(syntax, selector)
-            attribs = handler.call nodes
-            params.merge!(attribs)
-          end
-        end
-        @resources << params
-      end
-
-      @resources
-    end
-
-    def persist!
-      if @resource_handler
-        @resources.each { |resource| @resource_handler.call(resource) }
-      end
-    end
-
-    protected
-    def designate(count, selector, assignee, &block)
-      @designations << [count, selector, assignee, block]
-    end
-  end
-end
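
For orientation, here is how the pieces of the deleted `Klepto::Crawler` fit together. This is a minimal, hypothetical usage sketch against the 0.1.0 code above and is not part of the diff: the Capybara session, driver, and URL are stand-in assumptions for whatever object `Crawler#crawl` is handed (anything answering `all(syntax, scope)` whose nodes answer `first`/`all`).

require 'klepto'
require 'capybara'
require 'active_support/core_ext/object/try' # Crawler#crawl calls node.try(:text)

# Hypothetical sketch: drive the removed 0.1.0 Crawler directly.
crawler = Klepto::Crawler.new('body', :syntax => :css) do
  scrape 'h1', :title                 # :first designation, assigned via a symbol
  scrape_all 'p' do |nodes|           # :all designation, handled by a block
    { paragraphs: nodes.map(&:text) }
  end
  save do |resource|                  # handler invoked later by #persist!
    puts resource.inspect
  end
end

session = Capybara::Session.new(:selenium) # assumed: any driver that can fetch remote pages
session.visit('http://example.com')        # illustrative URL only
crawler.crawl(session)  # => [{ :title => '...', :paragraphs => [...] }]
crawler.persist!        # runs the save block once per collected resource

Note that per designation, the symbol and block forms of `scrape` are mutually exclusive, which is what the paired `raise Exception` guards enforce.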

@@ -1,49 +0,0 @@
-#!/usr/bin/env ruby
-require 'bundler/setup'
-require 'klepto'
-
-@bot = Klepto::Bot.new do
-  syntax :css
-  dry_run!
-
-  headers({
-    'Referer'    => 'http://www.twitter.com',
-    'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
-
-  })
-
-  # Lootin' them bieb tweets
-  urls 'https://twitter.com/justinbieber'
-
-  crawl 'body' do
-    scrape 'h1.fullname', :name
-    scrape '.username span.screen-name', :username
-    save do |params|
-      user = User.find_by_name(params[:username]) || User.new
-      user.update_attributes params
-    end
-  end
-
-  crawl 'li.stream-item' do
-    scrape do |node|
-      { :twitter_id => node['data-item-id'] }
-    end
-
-    scrape '.content p', :content
-
-    scrape '._timestamp' do |node|
-      { timestamp: node['data-time'] }
-    end
-
-    scrape '.time a' do |node|
-      { permalink: node[:href] }
-    end
-
-    save do |params|
-      tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
-      tweet.update_attributes params
-    end
-  end
-end
-
-@bot.start!

@@ -1,40 +0,0 @@
-require 'spec_helper'
-
-describe Klepto::Bot, :vcr => { :record => :new_episodes } do
-  before(:each) do
-    @bot = Klepto::Bot.new
-  end
-
-  it 'should know if it is a dry run' do
-    @bot.dry_run?.should be false
-    @bot.dry_run!
-    @bot.dry_run?.should be true
-  end
-
-  it 'should be able to set the selection syntax' do
-    @bot.syntax(:xpath)
-    @bot.syntax.should be(:xpath)
-  end
-
-  it 'should be able to read the selection syntax' do
-    @bot.syntax.should be(:css)
-  end
-
-  it 'should be able to set request headers' do
-    @bot.should respond_to(:headers)
-  end
-
-  it 'should be able to set a list of URLs to crawl' do
-    @bot.url 'http://www.google.com'
-    @bot.urls.should include('http://www.google.com')
-    @bot.urls 'http://twitter.com', 'http://facebook.com'
-    @bot.urls.should include('http://twitter.com')
-    @bot.urls.should include('http://facebook.com')
-  end
-
-  it 'should be able to add crawlers' do
-    @bot.crawl('div') {}
-    @bot.instance_variable_get('@crawlers').should have(1).crawler
-    @bot.instance_variable_get('@crawlers').first.should be_kind_of(Klepto::Crawler)
-  end
-end

@@ -1,88 +0,0 @@
-require 'spec_helper'
-require 'open-uri'
-
-describe Klepto::Crawler, :vcr => { :record => :new_episodes } do
-  describe 'dsl interaction' do
-    before(:each) do
-      @page = page('http://www.iana.org')
-      @crawler = Klepto::Crawler.new('body', { :syntax => :css }) do
-        scrape 'h1', :title
-
-        scrape '#intro p' do |node|
-          { description: node.text }
-        end
-
-        scrape_all '.home-panel h2' do |nodes|
-          { sections: nodes.map { |n| n.text } }
-        end
-      end
-      @resources = @crawler.crawl @page
-    end # end before
-
-    it 'should crawl the resource' do
-      @resources.should have(1).resource
-      @resources.first[:title].should match('Internet Assigned Numbers Authority')
-      @resources.first[:description].should match(/^The Internet Assigned Numbers Authority/i)
-      @resources.first[:sections].should have(3).sections
-    end
-  end
-
-  describe 'standard interaction' do
-    before(:each) do
-      @page = page()
-      @crawler = Klepto::Crawler.new 'body', { :syntax => :css }
-    end
-    it 'should have a CSS scope' do
-      @crawler.scope.should eq 'body'
-    end
-    it 'should have a desired syntax' do
-      @crawler.syntax.should == :css
-    end
-
-    it 'should be able to scrape the node that the crawler is scoped to' do
-      @crawler.scrape do |node|
-        { :name => node.native.name }
-      end
-      resources = @crawler.crawl(@page)
-      resources.should have(1).resource
-      resources.first[:name].should eq('body')
-    end
-
-    it 'should be able to designate scraping of a single node with a symbol' do
-      @crawler.scrape 'h1', :title
-      resources = @crawler.crawl(@page)
-      resources.should have(1).resource
-      resources.first[:title].should eq('Example Domain')
-    end
-
-    it 'should be able to designate scraping of a single node with a block' do
-      @crawler.scrape 'h1' do |node|
-        { title: node.text }
-      end
-
-      resources = @crawler.crawl(@page)
-      resources.should have(1).resource
-      resources.first[:title].should eq('Example Domain')
-    end
-
-    it 'should be able to designate scraping of a set of nodes' do
-      @crawler.scrape_all 'p' do |nodes|
-        {
-          paragraphs: [
-            nodes.first.text,
-            nodes.last.text
-          ]
-        }
-      end
-      resources = @crawler.crawl(@page)
-      resources.should have(1).resource
-      resources.first[:paragraphs].should be_kind_of(Array)
-      resources.first[:paragraphs].last.should eq('More information...')
-    end
-
-    pending 'should be able to save a set of resources'
-    pending 'should be able to specify a limit'
-    pending 'should be able to specify a skip'
-  end
-
-end
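
A note for reading these specs: the `page` helper they call (`page('http://www.iana.org')`, `page()`) is not defined anywhere in this diff and presumably lived in `spec_helper.rb`. Judging by the 'Example Domain' and 'More information...' expectations, the default fixture was example.com; the following is a plausible, hypothetical reconstruction assuming Capybara.

# Hypothetical reconstruction of the specs' page() helper; not in this diff.
# Assumes Capybara and defaults to example.com, matching the expectations above.
def page(url = 'http://example.com')
  session = Capybara::Session.new(:selenium) # assumed driver
  session.visit(url)
  session
end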

@@ -1,6 +0,0 @@
-require 'spec_helper'
-
-
-
-describe 'Klepto DSL' do
-end