klepto 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +120 -89
- data/lib/klepto.rb +2 -2
- data/lib/klepto/config.rb +18 -0
- data/lib/klepto/structure.rb +88 -0
- data/lib/klepto/version.rb +1 -1
- data/samples/bieber.html +4859 -0
- data/samples/concept.rb +50 -0
- data/spec/lib/klepto/config_spec.rb +23 -0
- data/spec/lib/klepto/structure_spec.rb +105 -0
- data/spec/lib/klepto_spec.rb +1 -58
- data/spec/orm/active_record.rb +2 -2
- data/spec/spec_helper.rb +1 -1
- metadata +22 -23
- data/lib/klepto/bot.rb +0 -59
- data/lib/klepto/crawler.rb +0 -72
- data/samples/example.rb +0 -49
- data/spec/lib/klepto/bot_spec.rb +0 -40
- data/spec/lib/klepto/crawler_spec.rb +0 -88
- data/spec/lib/klepto/dsl_spec.rb +0 -6
data/lib/klepto/crawler.rb
DELETED
@@ -1,72 +0,0 @@
-require 'docile'
-module Klepto
-  class Crawler
-
-    def initialize(scope,options={},&block)
-      @resources = []
-      @limit = options[:limit]
-      @skip = options[:skip]
-      @syntax = options[:syntax]
-      @scope = scope
-      @designations = []
-
-      Docile.dsl_eval(self, &block) if block_given?
-    end
-    attr_accessor :resources
-    attr_reader :scope, :syntax
-
-    def scrape(selector=nil, assignee=nil, &block)
-      raise Exception if assignee.nil? && !block_given?
-      raise Exception if !assignee.nil? && block_given?
-      designate(:first, selector, assignee, &block)
-    end
-
-    def scrape_all(selector, assignee=nil, &block)
-      raise Exception if assignee.nil? && !block_given?
-      raise Exception if !assignee.nil? && block_given?
-      designate(:all, selector, assignee, &block)
-    end
-
-    def save(&block)
-      @resource_handler = block
-    end
-
-    def crawl(page)
-      page.all(syntax, scope).each do |selection|
-        params = {}
-        @designations.each do |first_or_all, selector, assignee, handler|
-          if selector.nil?
-            attribs = handler.call selection
-            params.merge!( attribs )
-          elsif first_or_all == :first
-            node = selection.first(syntax, selector)
-            if assignee
-              params[assignee] = node.try(:text)
-            else
-              attribs = handler.call node
-              params.merge!( attribs )
-            end
-          else
-            nodes = selection.all(syntax, selector)
-            attribs = handler.call nodes
-            params.merge!( attribs )
-          end
-        end
-        @resources << params
-      end
-
-      @resources
-    end
-
-    def persist!
-      if @resource_handler
-        @resources.each {|resource| @resource_handler.call(resource)}
-      end
-    end
-
-    protected
-    def designate(count, selector, assignee, &block)
-      @designations << [count, selector, assignee, block]
-    end
-  end
-end
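For context on the removed API: Crawler#crawl built one params hash per node matching the crawler's scope, and persist! then fed each hash to the save block. A minimal sketch of how it was driven, assuming a Capybara-style page object that answers the #all/#first calls above (the selectors are placeholders; the spec files below show the same pattern against live pages):

    require 'klepto'

    # Sketch only: scope the crawler to each matching list item.
    crawler = Klepto::Crawler.new 'li.item', :syntax => :css do
      scrape '.content p', :content       # first match, assigned via symbol
      scrape '.time a' do |node|          # first match, handled by a block
        { permalink: node[:href] }
      end
      save { |params| puts params.inspect }
    end

    crawler.crawl(page)   # => [{:content => '...', :permalink => '...'}, ...]
    crawler.persist!      # invokes the save block once per collected hash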
data/samples/example.rb
DELETED
@@ -1,49 +0,0 @@
-#! /usr/bin/env ruby
-require 'bundler/setup'
-require 'klepto'
-
-@bot = Klepto::Bot.new do
-  syntax :css
-  dry_run!
-
-  headers({
-    'Referer' => 'http://www.twitter.com',
-    'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
-
-  })
-
-  # Lootin' them bieb tweets
-  urls 'https://twitter.com/justinbieber'
-
-  crawl 'body' do
-    scrape "h1.fullname", :name
-    scrape '.username span.screen-name', :username
-    save do |params|
-      user = User.find_by_name(params[:username]) || User.new
-      user.update_attributes params
-    end
-  end
-
-  crawl 'li.stream-item' do
-    scrape do |node|
-      {:twitter_id => node['data-item-id']}
-    end
-
-    scrape '.content p', :content
-
-    scrape '._timestamp' do |node|
-      {timestamp: node['data-time']}
-    end
-
-    scrape '.time a' do |node|
-      {permalink: node[:href]}
-    end
-
-    save do |params|
-      tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
-      tweet.update_attributes params
-    end
-  end
-end
-
-@bot.start!
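The sample script assumes two ActiveRecord-style models that the gem never defines; roughly this shape (a hypothetical sketch — the dynamic finders and update_attributes calls above imply Rails 3-era ActiveRecord):

    require 'active_record'

    # Hypothetical models implied by the save blocks above; not part of klepto.
    class User < ActiveRecord::Base
      # expects name and username columns
    end

    class Tweet < ActiveRecord::Base
      # expects twitter_id, content, timestamp and permalink columns
    end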
data/spec/lib/klepto/bot_spec.rb
DELETED
@@ -1,40 +0,0 @@
-require 'spec_helper'
-
-describe Klepto::Bot, :vcr => {:record => :new_episodes} do
-  before(:each) do
-    @bot = Klepto::Bot.new
-  end
-
-  it 'should know if it is a dry run' do
-    @bot.dry_run?.should be false
-    @bot.dry_run!
-    @bot.dry_run?.should be true
-  end
-
-  it 'should be able to set the selection syntax' do
-    @bot.syntax(:xpath)
-    @bot.syntax.should be(:xpath)
-  end
-
-  it 'should be able to read the selection syntax' do
-    @bot.syntax.should be(:css)
-  end
-
-  it 'should be able to set request headers' do
-    @bot.should respond_to(:headers)
-  end
-
-  it 'should be able to set a list of URLs to crawl' do
-    @bot.url 'http://www.google.com'
-    @bot.urls.should include('http://www.google.com')
-    @bot.urls 'http://twitter.com', 'http://facebook.com'
-    @bot.urls.should include('http://twitter.com')
-    @bot.urls.should include('http://facebook.com')
-  end
-
-  it 'should be able to add crawlers' do
-    @bot.crawl('div'){}
-    @bot.instance_variable_get("@crawlers").should have(1).crawler
-    @bot.instance_variable_get("@crawlers").first.should be_kind_of(Klepto::Crawler)
-  end
-end
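The deleted lib/klepto/bot.rb is listed in the summary above (+0 -59) but its body is not shown here; the spec above does pin down its surface, though. A minimal sketch of that interface — an illustration of what the examples exercise, not the removed file's actual contents:

    require 'docile'
    require 'klepto/crawler'

    module Klepto
      class Bot
        def initialize(&block)
          @crawlers = []
          @urls     = []
          @syntax   = :css     # spec: default syntax is :css
          @dry_run  = false    # spec: not a dry run until dry_run! is called
          Docile.dsl_eval(self, &block) if block_given?
        end

        def dry_run!; @dry_run = true; end
        def dry_run?; @dry_run;        end

        def syntax(value = nil)
          @syntax = value if value
          @syntax
        end

        def headers(hash = {})
          @headers = hash
        end

        def urls(*list)
          @urls.concat(list) unless list.empty?
          @urls
        end
        alias_method :url, :urls

        def crawl(scope, &block)
          @crawlers << Klepto::Crawler.new(scope, {:syntax => @syntax}, &block)
        end
      end
    end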
data/spec/lib/klepto/crawler_spec.rb
DELETED
@@ -1,88 +0,0 @@
-require 'spec_helper'
-require 'open-uri'
-
-describe Klepto::Crawler, :vcr => {:record => :new_episodes} do
-  describe 'dsl interaction' do
-    before(:each) do
-      @page = page("http://www.iana.org")
-      @crawler = Klepto::Crawler.new('body',{:syntax => :css}) do
-        scrape 'h1', :title
-
-        scrape '#intro p' do |node|
-          {description: node.text}
-        end
-
-        scrape_all '.home-panel h2' do |nodes|
-          { sections: nodes.map{|n| n.text} }
-        end
-      end
-      @resources = @crawler.crawl @page
-    end #end before
-
-    it 'should crawl the resource' do
-      @resources.should have(1).resource
-      @resources.first[:title].should match('Internet Assigned Numbers Authority')
-      @resources.first[:description].should match(/^The Internet Assigned Numbers Authority/i)
-      @resources.first[:sections].should have(3).sections
-    end
-  end
-
-  describe 'standard interaction' do
-    before(:each) do
-      @page = page()
-      @crawler = Klepto::Crawler.new 'body', {:syntax => :css}
-    end
-    it 'should have a CSS scope' do
-      @crawler.scope.should eq 'body'
-    end
-    it 'should have a desired syntax' do
-      @crawler.syntax.should == :css
-    end
-
-    it 'should be able to scrape the node that the crawler is scoped to' do
-      @crawler.scrape do |node|
-        {:name => node.native.name}
-      end
-      resources = @crawler.crawl( @page )
-      resources.should have(1).resource
-      resources.first[:name].should eq('body')
-    end
-
-    it 'should be able to designate scraping of a single node with a symbol' do
-      @crawler.scrape 'h1', :title
-      resources = @crawler.crawl( @page )
-      resources.should have(1).resource
-      resources.first[:title].should eq('Example Domain')
-    end
-
-    it 'should be able to designate scraping of a single node with a block' do
-      @crawler.scrape 'h1' do |node|
-        {title: node.text}
-      end
-
-      resources = @crawler.crawl( @page )
-      resources.should have(1).resource
-      resources.first[:title].should eq('Example Domain')
-    end
-
-    it 'should be able to designate scraping of a set of nodes' do
-      @crawler.scrape_all 'p' do |nodes|
-        {
-          paragraphs: [
-            nodes.first.text,
-            nodes.last.text
-          ]
-        }
-      end
-      resources = @crawler.crawl( @page )
-      resources.should have(1).resource
-      resources.first[:paragraphs].should be_kind_of(Array)
-      resources.first[:paragraphs].last.should eq("More information...")
-    end
-
-    pending 'should be able to save a set of resources'
-    pending 'should be able to specify a limit'
-    pending 'should be able to specify a skip'
-  end
-
-end
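Both spec files call a page helper that lives in spec/spec_helper.rb (touched but not shown in this diff). Something along these lines would satisfy the calls above — an assumption, not the actual helper: Capybara.string wraps a fetched document in a static node that answers the #all/#first calls Crawler#crawl makes, and the http://example.com default lines up with the 'Example Domain' expectations:

    require 'capybara'
    require 'open-uri'

    # Assumed helper, not the real spec_helper.rb: returns a static
    # Capybara node so specs can crawl without driving a browser.
    def page(url = 'http://example.com')
      Capybara.string open(url).read
    end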