klepto 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +21 -0
- data/.rspec +2 -0
- data/.rvmrc +1 -0
- data/Gemfile +18 -0
- data/Guardfile +11 -0
- data/LICENSE.txt +22 -0
- data/README.md +129 -0
- data/Rakefile +7 -0
- data/klepto.gemspec +26 -0
- data/lib/klepto.rb +26 -0
- data/lib/klepto/bot.rb +59 -0
- data/lib/klepto/browser.rb +18 -0
- data/lib/klepto/crawler.rb +72 -0
- data/lib/klepto/tasks.rb +15 -0
- data/lib/klepto/version.rb +3 -0
- data/samples/example.rb +49 -0
- data/spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml +1960 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_symbol.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml +114 -0
- data/spec/lib/klepto/bot_spec.rb +40 -0
- data/spec/lib/klepto/browser_spec.rb +15 -0
- data/spec/lib/klepto/crawler_spec.rb +88 -0
- data/spec/lib/klepto/dsl_spec.rb +6 -0
- data/spec/lib/klepto_spec.rb +64 -0
- data/spec/orm/active_record.rb +36 -0
- data/spec/orm/database.example.yml +15 -0
- data/spec/spec_helper.rb +32 -0
- metadata +157 -0
@@ -0,0 +1,114 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://example.com/
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
accept:
|
11
|
+
- ! '*/*'
|
12
|
+
user-agent:
|
13
|
+
- Ruby
|
14
|
+
response:
|
15
|
+
status:
|
16
|
+
code: 302
|
17
|
+
message: Found
|
18
|
+
headers:
|
19
|
+
location:
|
20
|
+
- http://www.iana.org/domains/example/
|
21
|
+
server:
|
22
|
+
- BigIP
|
23
|
+
connection:
|
24
|
+
- Keep-Alive
|
25
|
+
content-length:
|
26
|
+
- '0'
|
27
|
+
body:
|
28
|
+
encoding: US-ASCII
|
29
|
+
string: ''
|
30
|
+
http_version: '1.0'
|
31
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
32
|
+
- request:
|
33
|
+
method: get
|
34
|
+
uri: http://www.iana.org/domains/example/
|
35
|
+
body:
|
36
|
+
encoding: US-ASCII
|
37
|
+
string: ''
|
38
|
+
headers:
|
39
|
+
accept:
|
40
|
+
- ! '*/*'
|
41
|
+
user-agent:
|
42
|
+
- Ruby
|
43
|
+
response:
|
44
|
+
status:
|
45
|
+
code: 302
|
46
|
+
message: FOUND
|
47
|
+
headers:
|
48
|
+
date:
|
49
|
+
- Wed, 10 Apr 2013 06:51:03 GMT
|
50
|
+
server:
|
51
|
+
- Apache/2.2.3 (CentOS)
|
52
|
+
location:
|
53
|
+
- http://www.iana.org/domains/example
|
54
|
+
content-length:
|
55
|
+
- '0'
|
56
|
+
connection:
|
57
|
+
- close
|
58
|
+
content-type:
|
59
|
+
- text/html; charset=utf-8
|
60
|
+
body:
|
61
|
+
encoding: US-ASCII
|
62
|
+
string: ''
|
63
|
+
http_version: '1.1'
|
64
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
65
|
+
- request:
|
66
|
+
method: get
|
67
|
+
uri: http://www.iana.org/domains/example
|
68
|
+
body:
|
69
|
+
encoding: US-ASCII
|
70
|
+
string: ''
|
71
|
+
headers:
|
72
|
+
accept:
|
73
|
+
- ! '*/*'
|
74
|
+
user-agent:
|
75
|
+
- Ruby
|
76
|
+
response:
|
77
|
+
status:
|
78
|
+
code: 200
|
79
|
+
message: OK
|
80
|
+
headers:
|
81
|
+
date:
|
82
|
+
- Wed, 10 Apr 2013 06:51:04 GMT
|
83
|
+
server:
|
84
|
+
- Apache/2.2.3 (CentOS)
|
85
|
+
last-modified:
|
86
|
+
- Fri, 04 Jan 2013 01:17:22 GMT
|
87
|
+
vary:
|
88
|
+
- Accept-Encoding
|
89
|
+
connection:
|
90
|
+
- close
|
91
|
+
transfer-encoding:
|
92
|
+
- chunked
|
93
|
+
content-type:
|
94
|
+
- text/html; charset=UTF-8
|
95
|
+
body:
|
96
|
+
encoding: US-ASCII
|
97
|
+
string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
|
98
|
+
charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
|
99
|
+
charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
|
100
|
+
initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
|
101
|
+
#f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
|
102
|
+
\"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
|
103
|
+
600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
|
104
|
+
1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
|
105
|
+
none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
|
106
|
+
#fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
|
107
|
+
0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
|
108
|
+
Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
|
109
|
+
in documents. You do not need to\n\t\tcoordinate or ask for permission to
|
110
|
+
use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
|
111
|
+
href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
|
112
|
+
http_version: '1.1'
|
113
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
114
|
+
recorded_with: VCR 2.4.0
|
@@ -0,0 +1,114 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://example.com/
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
accept:
|
11
|
+
- ! '*/*'
|
12
|
+
user-agent:
|
13
|
+
- Ruby
|
14
|
+
response:
|
15
|
+
status:
|
16
|
+
code: 302
|
17
|
+
message: Found
|
18
|
+
headers:
|
19
|
+
location:
|
20
|
+
- http://www.iana.org/domains/example/
|
21
|
+
server:
|
22
|
+
- BigIP
|
23
|
+
connection:
|
24
|
+
- Keep-Alive
|
25
|
+
content-length:
|
26
|
+
- '0'
|
27
|
+
body:
|
28
|
+
encoding: US-ASCII
|
29
|
+
string: ''
|
30
|
+
http_version: '1.0'
|
31
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
32
|
+
- request:
|
33
|
+
method: get
|
34
|
+
uri: http://www.iana.org/domains/example/
|
35
|
+
body:
|
36
|
+
encoding: US-ASCII
|
37
|
+
string: ''
|
38
|
+
headers:
|
39
|
+
accept:
|
40
|
+
- ! '*/*'
|
41
|
+
user-agent:
|
42
|
+
- Ruby
|
43
|
+
response:
|
44
|
+
status:
|
45
|
+
code: 302
|
46
|
+
message: FOUND
|
47
|
+
headers:
|
48
|
+
date:
|
49
|
+
- Wed, 10 Apr 2013 06:51:04 GMT
|
50
|
+
server:
|
51
|
+
- Apache/2.2.3 (CentOS)
|
52
|
+
location:
|
53
|
+
- http://www.iana.org/domains/example
|
54
|
+
content-length:
|
55
|
+
- '0'
|
56
|
+
connection:
|
57
|
+
- close
|
58
|
+
content-type:
|
59
|
+
- text/html; charset=utf-8
|
60
|
+
body:
|
61
|
+
encoding: US-ASCII
|
62
|
+
string: ''
|
63
|
+
http_version: '1.1'
|
64
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
65
|
+
- request:
|
66
|
+
method: get
|
67
|
+
uri: http://www.iana.org/domains/example
|
68
|
+
body:
|
69
|
+
encoding: US-ASCII
|
70
|
+
string: ''
|
71
|
+
headers:
|
72
|
+
accept:
|
73
|
+
- ! '*/*'
|
74
|
+
user-agent:
|
75
|
+
- Ruby
|
76
|
+
response:
|
77
|
+
status:
|
78
|
+
code: 200
|
79
|
+
message: OK
|
80
|
+
headers:
|
81
|
+
date:
|
82
|
+
- Wed, 10 Apr 2013 06:51:04 GMT
|
83
|
+
server:
|
84
|
+
- Apache/2.2.3 (CentOS)
|
85
|
+
last-modified:
|
86
|
+
- Fri, 04 Jan 2013 01:17:22 GMT
|
87
|
+
vary:
|
88
|
+
- Accept-Encoding
|
89
|
+
connection:
|
90
|
+
- close
|
91
|
+
transfer-encoding:
|
92
|
+
- chunked
|
93
|
+
content-type:
|
94
|
+
- text/html; charset=UTF-8
|
95
|
+
body:
|
96
|
+
encoding: US-ASCII
|
97
|
+
string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
|
98
|
+
charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
|
99
|
+
charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
|
100
|
+
initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
|
101
|
+
#f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
|
102
|
+
\"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
|
103
|
+
600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
|
104
|
+
1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
|
105
|
+
none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
|
106
|
+
#fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
|
107
|
+
0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
|
108
|
+
Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
|
109
|
+
in documents. You do not need to\n\t\tcoordinate or ask for permission to
|
110
|
+
use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
|
111
|
+
href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
|
112
|
+
http_version: '1.1'
|
113
|
+
recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
|
114
|
+
recorded_with: VCR 2.4.0
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Klepto::Bot, :vcr => {:record => :new_episodes} do
|
4
|
+
before(:each) do
|
5
|
+
@bot = Klepto::Bot.new
|
6
|
+
end
|
7
|
+
|
8
|
+
it 'should know if it is a dry run' do
|
9
|
+
@bot.dry_run?.should be false
|
10
|
+
@bot.dry_run!
|
11
|
+
@bot.dry_run?.should be true
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should be able to set the selection syntax' do
|
15
|
+
@bot.syntax(:xpath)
|
16
|
+
@bot.syntax.should be(:xpath)
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should be able to read the selection syntax' do
|
20
|
+
@bot.syntax.should be(:css)
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should be able to set request headers' do
|
24
|
+
@bot.should respond_to(:headers)
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should be able to set a list of URLs to crawl' do
|
28
|
+
@bot.url 'http://www.google.com'
|
29
|
+
@bot.urls.should include('http://www.google.com')
|
30
|
+
@bot.urls 'http://twitter.com', 'http://facebook.com'
|
31
|
+
@bot.urls.should include('http://twitter.com')
|
32
|
+
@bot.urls.should include('http://facebook.com')
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should be able to add crawlers' do
|
36
|
+
@bot.crawl('div'){}
|
37
|
+
@bot.instance_variable_get("@crawlers").should have(1).crawler
|
38
|
+
@bot.instance_variable_get("@crawlers").first.should be_kind_of(Klepto::Crawler)
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Klepto::Browser, :vcr => {:record => :new_episodes} do
|
4
|
+
before(:each) do
|
5
|
+
@browser = Klepto::Browser.new
|
6
|
+
@browser.set_headers({
|
7
|
+
'Referer' => 'http://www.example.com'
|
8
|
+
})
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should be able to fetch a page' do
|
12
|
+
@page = @browser.fetch! 'http://www.example.com'
|
13
|
+
@page.status_code.should be(200)
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
describe Klepto::Crawler, :vcr => {:record => :new_episodes} do
|
5
|
+
describe 'dsl interaction' do
|
6
|
+
before(:each) do
|
7
|
+
@page = page("http://www.iana.org")
|
8
|
+
@crawler = Klepto::Crawler.new('body',{:syntax => :css}) do
|
9
|
+
scrape 'h1', :title
|
10
|
+
|
11
|
+
scrape '#intro p' do |node|
|
12
|
+
{description: node.text}
|
13
|
+
end
|
14
|
+
|
15
|
+
scrape_all '.home-panel h2' do |nodes|
|
16
|
+
{ sections: nodes.map{|n| n.text} }
|
17
|
+
end
|
18
|
+
end
|
19
|
+
@resources = @crawler.crawl @page
|
20
|
+
end #end before
|
21
|
+
|
22
|
+
it 'should crawl the resource' do
|
23
|
+
@resources.should have(1).resource
|
24
|
+
@resources.first[:title].should match('Internet Assigned Numbers Authority')
|
25
|
+
@resources.first[:description].should match(/^The Internet Assigned Numbers Authority/i)
|
26
|
+
@resources.first[:sections].should have(3).sections
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe 'standard interaction' do
|
31
|
+
before(:each) do
|
32
|
+
@page = page()
|
33
|
+
@crawler = Klepto::Crawler.new 'body', {:syntax => :css}
|
34
|
+
end
|
35
|
+
it 'should have a CSS scope' do
|
36
|
+
@crawler.scope.should eq 'body'
|
37
|
+
end
|
38
|
+
it 'should have a desired syntax' do
|
39
|
+
@crawler.syntax.should == :css
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should be able to scrape the node that the crawler is scoped to' do
|
43
|
+
@crawler.scrape do |node|
|
44
|
+
{:name => node.native.name}
|
45
|
+
end
|
46
|
+
resources = @crawler.crawl( @page )
|
47
|
+
resources.should have(1).resource
|
48
|
+
resources.first[:name].should eq('body')
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'should be able to designate scraping of a single node with a symbol' do
|
52
|
+
@crawler.scrape 'h1', :title
|
53
|
+
resources = @crawler.crawl( @page )
|
54
|
+
resources.should have(1).resource
|
55
|
+
resources.first[:title].should eq('Example Domain')
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'should be able to designate scraping of a single node with a block' do
|
59
|
+
@crawler.scrape 'h1' do |node|
|
60
|
+
{title: node.text}
|
61
|
+
end
|
62
|
+
|
63
|
+
resources = @crawler.crawl( @page )
|
64
|
+
resources.should have(1).resource
|
65
|
+
resources.first[:title].should eq('Example Domain')
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'should be able to designate scraping of a set of nodes' do
|
69
|
+
@crawler.scrape_all 'p' do |nodes|
|
70
|
+
{
|
71
|
+
paragraphs: [
|
72
|
+
nodes.first.text,
|
73
|
+
nodes.last.text
|
74
|
+
]
|
75
|
+
}
|
76
|
+
end
|
77
|
+
resources = @crawler.crawl( @page )
|
78
|
+
resources.should have(1).resource
|
79
|
+
resources.first[:paragraphs].should be_kind_of(Array)
|
80
|
+
resources.first[:paragraphs].last.should eq("More information...")
|
81
|
+
end
|
82
|
+
|
83
|
+
pending 'should be able to save a set of resources'
|
84
|
+
pending 'should be able to specify a limit'
|
85
|
+
pending 'should be able to specify a skip'
|
86
|
+
end
|
87
|
+
|
88
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Klepto do
|
4
|
+
it "should have a version" do
|
5
|
+
Klepto::VERSION.should_not be_nil
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
describe 'Scraping pages', :skip => false do
|
10
|
+
before(:each) do
|
11
|
+
@bot = Klepto::Bot.new do
|
12
|
+
syntax :css
|
13
|
+
|
14
|
+
headers({
|
15
|
+
'Referer' => 'https://twitter.com',
|
16
|
+
'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
|
17
|
+
})
|
18
|
+
|
19
|
+
urls 'https://twitter.com/justinbieber'
|
20
|
+
|
21
|
+
crawl 'body' do
|
22
|
+
scrape "h1.fullname", :name
|
23
|
+
scrape '.username span.screen-name', :username
|
24
|
+
save do |params|
|
25
|
+
user = User.find_by_name(params[:username]) || User.new
|
26
|
+
user.update_attributes params
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
crawl 'li.stream-item' do
|
31
|
+
scrape do |node|
|
32
|
+
{:twitter_id => node['data-item-id']}
|
33
|
+
end
|
34
|
+
|
35
|
+
scrape '.content p', :content
|
36
|
+
|
37
|
+
scrape '._timestamp' do |node|
|
38
|
+
{timestamp: node['data-time']}
|
39
|
+
end
|
40
|
+
|
41
|
+
scrape '.time a' do |node|
|
42
|
+
{permalink: node[:href]}
|
43
|
+
end
|
44
|
+
|
45
|
+
save do |params|
|
46
|
+
tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
|
47
|
+
tweet.update_attributes params
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
@bot.start!
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should have collected some resources' do
|
56
|
+
@bot.crawlers.should have(2).crawlers
|
57
|
+
@bot.crawlers.first.resources.should have(1).user
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'should persist resources' do
|
61
|
+
User.count.should be(1)
|
62
|
+
Tweet.count.should_not be(0)
|
63
|
+
end
|
64
|
+
end
|