klepto 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +21 -0
- data/.rspec +2 -0
- data/.rvmrc +1 -0
- data/Gemfile +18 -0
- data/Guardfile +11 -0
- data/LICENSE.txt +22 -0
- data/README.md +129 -0
- data/Rakefile +7 -0
- data/klepto.gemspec +26 -0
- data/lib/klepto.rb +26 -0
- data/lib/klepto/bot.rb +59 -0
- data/lib/klepto/browser.rb +18 -0
- data/lib/klepto/crawler.rb +72 -0
- data/lib/klepto/tasks.rb +15 -0
- data/lib/klepto/version.rb +3 -0
- data/samples/example.rb +49 -0
- data/spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml +1960 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_symbol.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml +114 -0
- data/spec/lib/klepto/bot_spec.rb +40 -0
- data/spec/lib/klepto/browser_spec.rb +15 -0
- data/spec/lib/klepto/crawler_spec.rb +88 -0
- data/spec/lib/klepto/dsl_spec.rb +6 -0
- data/spec/lib/klepto_spec.rb +64 -0
- data/spec/orm/active_record.rb +36 -0
- data/spec/orm/database.example.yml +15 -0
- data/spec/spec_helper.rb +32 -0
- metadata +157 -0
@@ -0,0 +1,114 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://example.com/
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
accept:
|
11
|
+
- ! '*/*'
|
12
|
+
user-agent:
|
13
|
+
- Ruby
|
14
|
+
response:
|
15
|
+
status:
|
16
|
+
code: 302
|
17
|
+
message: Found
|
18
|
+
headers:
|
19
|
+
location:
|
20
|
+
- http://www.iana.org/domains/example/
|
21
|
+
server:
|
22
|
+
- BigIP
|
23
|
+
connection:
|
24
|
+
- Keep-Alive
|
25
|
+
content-length:
|
26
|
+
- '0'
|
27
|
+
body:
|
28
|
+
encoding: US-ASCII
|
29
|
+
string: ''
|
30
|
+
http_version: '1.0'
|
31
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
32
|
+
- request:
|
33
|
+
method: get
|
34
|
+
uri: http://www.iana.org/domains/example/
|
35
|
+
body:
|
36
|
+
encoding: US-ASCII
|
37
|
+
string: ''
|
38
|
+
headers:
|
39
|
+
accept:
|
40
|
+
- ! '*/*'
|
41
|
+
user-agent:
|
42
|
+
- Ruby
|
43
|
+
response:
|
44
|
+
status:
|
45
|
+
code: 302
|
46
|
+
message: FOUND
|
47
|
+
headers:
|
48
|
+
date:
|
49
|
+
- Wed, 10 Apr 2013 06:51:03 GMT
|
50
|
+
server:
|
51
|
+
- Apache/2.2.3 (CentOS)
|
52
|
+
location:
|
53
|
+
- http://www.iana.org/domains/example
|
54
|
+
content-length:
|
55
|
+
- '0'
|
56
|
+
connection:
|
57
|
+
- close
|
58
|
+
content-type:
|
59
|
+
- text/html; charset=utf-8
|
60
|
+
body:
|
61
|
+
encoding: US-ASCII
|
62
|
+
string: ''
|
63
|
+
http_version: '1.1'
|
64
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
65
|
+
- request:
|
66
|
+
method: get
|
67
|
+
uri: http://www.iana.org/domains/example
|
68
|
+
body:
|
69
|
+
encoding: US-ASCII
|
70
|
+
string: ''
|
71
|
+
headers:
|
72
|
+
accept:
|
73
|
+
- ! '*/*'
|
74
|
+
user-agent:
|
75
|
+
- Ruby
|
76
|
+
response:
|
77
|
+
status:
|
78
|
+
code: 200
|
79
|
+
message: OK
|
80
|
+
headers:
|
81
|
+
date:
|
82
|
+
- Wed, 10 Apr 2013 06:51:04 GMT
|
83
|
+
server:
|
84
|
+
- Apache/2.2.3 (CentOS)
|
85
|
+
last-modified:
|
86
|
+
- Fri, 04 Jan 2013 01:17:22 GMT
|
87
|
+
vary:
|
88
|
+
- Accept-Encoding
|
89
|
+
connection:
|
90
|
+
- close
|
91
|
+
transfer-encoding:
|
92
|
+
- chunked
|
93
|
+
content-type:
|
94
|
+
- text/html; charset=UTF-8
|
95
|
+
body:
|
96
|
+
encoding: US-ASCII
|
97
|
+
string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
|
98
|
+
charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
|
99
|
+
charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
|
100
|
+
initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
|
101
|
+
#f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
|
102
|
+
\"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
|
103
|
+
600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
|
104
|
+
1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
|
105
|
+
none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
|
106
|
+
#fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
|
107
|
+
0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
|
108
|
+
Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
|
109
|
+
in documents. You do not need to\n\t\tcoordinate or ask for permission to
|
110
|
+
use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
|
111
|
+
href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
|
112
|
+
http_version: '1.1'
|
113
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
114
|
+
recorded_with: VCR 2.4.0
|
@@ -0,0 +1,114 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://example.com/
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
accept:
|
11
|
+
- ! '*/*'
|
12
|
+
user-agent:
|
13
|
+
- Ruby
|
14
|
+
response:
|
15
|
+
status:
|
16
|
+
code: 302
|
17
|
+
message: Found
|
18
|
+
headers:
|
19
|
+
location:
|
20
|
+
- http://www.iana.org/domains/example/
|
21
|
+
server:
|
22
|
+
- BigIP
|
23
|
+
connection:
|
24
|
+
- Keep-Alive
|
25
|
+
content-length:
|
26
|
+
- '0'
|
27
|
+
body:
|
28
|
+
encoding: US-ASCII
|
29
|
+
string: ''
|
30
|
+
http_version: '1.0'
|
31
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
32
|
+
- request:
|
33
|
+
method: get
|
34
|
+
uri: http://www.iana.org/domains/example/
|
35
|
+
body:
|
36
|
+
encoding: US-ASCII
|
37
|
+
string: ''
|
38
|
+
headers:
|
39
|
+
accept:
|
40
|
+
- ! '*/*'
|
41
|
+
user-agent:
|
42
|
+
- Ruby
|
43
|
+
response:
|
44
|
+
status:
|
45
|
+
code: 302
|
46
|
+
message: FOUND
|
47
|
+
headers:
|
48
|
+
date:
|
49
|
+
- Wed, 10 Apr 2013 06:51:04 GMT
|
50
|
+
server:
|
51
|
+
- Apache/2.2.3 (CentOS)
|
52
|
+
location:
|
53
|
+
- http://www.iana.org/domains/example
|
54
|
+
content-length:
|
55
|
+
- '0'
|
56
|
+
connection:
|
57
|
+
- close
|
58
|
+
content-type:
|
59
|
+
- text/html; charset=utf-8
|
60
|
+
body:
|
61
|
+
encoding: US-ASCII
|
62
|
+
string: ''
|
63
|
+
http_version: '1.1'
|
64
|
+
recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
|
65
|
+
- request:
|
66
|
+
method: get
|
67
|
+
uri: http://www.iana.org/domains/example
|
68
|
+
body:
|
69
|
+
encoding: US-ASCII
|
70
|
+
string: ''
|
71
|
+
headers:
|
72
|
+
accept:
|
73
|
+
- ! '*/*'
|
74
|
+
user-agent:
|
75
|
+
- Ruby
|
76
|
+
response:
|
77
|
+
status:
|
78
|
+
code: 200
|
79
|
+
message: OK
|
80
|
+
headers:
|
81
|
+
date:
|
82
|
+
- Wed, 10 Apr 2013 06:51:04 GMT
|
83
|
+
server:
|
84
|
+
- Apache/2.2.3 (CentOS)
|
85
|
+
last-modified:
|
86
|
+
- Fri, 04 Jan 2013 01:17:22 GMT
|
87
|
+
vary:
|
88
|
+
- Accept-Encoding
|
89
|
+
connection:
|
90
|
+
- close
|
91
|
+
transfer-encoding:
|
92
|
+
- chunked
|
93
|
+
content-type:
|
94
|
+
- text/html; charset=UTF-8
|
95
|
+
body:
|
96
|
+
encoding: US-ASCII
|
97
|
+
string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
|
98
|
+
charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
|
99
|
+
charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
|
100
|
+
initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
|
101
|
+
#f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
|
102
|
+
\"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
|
103
|
+
600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
|
104
|
+
1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
|
105
|
+
none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
|
106
|
+
#fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
|
107
|
+
0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
|
108
|
+
Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
|
109
|
+
in documents. You do not need to\n\t\tcoordinate or ask for permission to
|
110
|
+
use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
|
111
|
+
href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
|
112
|
+
http_version: '1.1'
|
113
|
+
recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
|
114
|
+
recorded_with: VCR 2.4.0
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Klepto::Bot, :vcr => {:record => :new_episodes} do
|
4
|
+
before(:each) do
|
5
|
+
@bot = Klepto::Bot.new
|
6
|
+
end
|
7
|
+
|
8
|
+
it 'should know if it is a dry run' do
|
9
|
+
@bot.dry_run?.should be false
|
10
|
+
@bot.dry_run!
|
11
|
+
@bot.dry_run?.should be true
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should be able to set the selection syntax' do
|
15
|
+
@bot.syntax(:xpath)
|
16
|
+
@bot.syntax.should be(:xpath)
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should be able to read the selection syntax' do
|
20
|
+
@bot.syntax.should be(:css)
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should be able to set request headers' do
|
24
|
+
@bot.should respond_to(:headers)
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should be able to set a list of URLs to crawl' do
|
28
|
+
@bot.url 'http://www.google.com'
|
29
|
+
@bot.urls.should include('http://www.google.com')
|
30
|
+
@bot.urls 'http://twitter.com', 'http://facebook.com'
|
31
|
+
@bot.urls.should include('http://twitter.com')
|
32
|
+
@bot.urls.should include('http://facebook.com')
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should be able to add crawlers' do
|
36
|
+
@bot.crawl('div'){}
|
37
|
+
@bot.instance_variable_get("@crawlers").should have(1).crawler
|
38
|
+
@bot.instance_variable_get("@crawlers").first.should be_kind_of(Klepto::Crawler)
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Klepto::Browser, :vcr => {:record => :new_episodes} do
|
4
|
+
before(:each) do
|
5
|
+
@browser = Klepto::Browser.new
|
6
|
+
@browser.set_headers({
|
7
|
+
'Referer' => 'http://www.example.com'
|
8
|
+
})
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should be able to fetch a page' do
|
12
|
+
@page = @browser.fetch! 'http://www.example.com'
|
13
|
+
@page.status_code.should be(200)
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
describe Klepto::Crawler, :vcr => {:record => :new_episodes} do
|
5
|
+
describe 'dsl interaction' do
|
6
|
+
before(:each) do
|
7
|
+
@page = page("http://www.iana.org")
|
8
|
+
@crawler = Klepto::Crawler.new('body',{:syntax => :css}) do
|
9
|
+
scrape 'h1', :title
|
10
|
+
|
11
|
+
scrape '#intro p' do |node|
|
12
|
+
{description: node.text}
|
13
|
+
end
|
14
|
+
|
15
|
+
scrape_all '.home-panel h2' do |nodes|
|
16
|
+
{ sections: nodes.map{|n| n.text} }
|
17
|
+
end
|
18
|
+
end
|
19
|
+
@resources = @crawler.crawl @page
|
20
|
+
end #end before
|
21
|
+
|
22
|
+
it 'should crawl the resource' do
|
23
|
+
@resources.should have(1).resource
|
24
|
+
@resources.first[:title].should match('Internet Assigned Numbers Authority')
|
25
|
+
@resources.first[:description].should match(/^The Internet Assigned Numbers Authority/i)
|
26
|
+
@resources.first[:sections].should have(3).sections
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe 'standard interaction' do
|
31
|
+
before(:each) do
|
32
|
+
@page = page()
|
33
|
+
@crawler = Klepto::Crawler.new 'body', {:syntax => :css}
|
34
|
+
end
|
35
|
+
it 'should have a CSS scope' do
|
36
|
+
@crawler.scope.should eq 'body'
|
37
|
+
end
|
38
|
+
it 'should have a desired syntax' do
|
39
|
+
@crawler.syntax.should == :css
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should be able to scrape the node that the crawler is scoped to' do
|
43
|
+
@crawler.scrape do |node|
|
44
|
+
{:name => node.native.name}
|
45
|
+
end
|
46
|
+
resources = @crawler.crawl( @page )
|
47
|
+
resources.should have(1).resource
|
48
|
+
resources.first[:name].should eq('body')
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'should be able to designate scraping of a single node with a symbol' do
|
52
|
+
@crawler.scrape 'h1', :title
|
53
|
+
resources = @crawler.crawl( @page )
|
54
|
+
resources.should have(1).resource
|
55
|
+
resources.first[:title].should eq('Example Domain')
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'should be able to designate scraping of a single node with a block' do
|
59
|
+
@crawler.scrape 'h1' do |node|
|
60
|
+
{title: node.text}
|
61
|
+
end
|
62
|
+
|
63
|
+
resources = @crawler.crawl( @page )
|
64
|
+
resources.should have(1).resource
|
65
|
+
resources.first[:title].should eq('Example Domain')
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'should be able to designate scraping of a set of nodes' do
|
69
|
+
@crawler.scrape_all 'p' do |nodes|
|
70
|
+
{
|
71
|
+
paragraphs: [
|
72
|
+
nodes.first.text,
|
73
|
+
nodes.last.text
|
74
|
+
]
|
75
|
+
}
|
76
|
+
end
|
77
|
+
resources = @crawler.crawl( @page )
|
78
|
+
resources.should have(1).resource
|
79
|
+
resources.first[:paragraphs].should be_kind_of(Array)
|
80
|
+
resources.first[:paragraphs].last.should eq("More information...")
|
81
|
+
end
|
82
|
+
|
83
|
+
pending 'should be able to save a set of resources'
|
84
|
+
pending 'should be able to specify a limit'
|
85
|
+
pending 'should be able to specify a skip'
|
86
|
+
end
|
87
|
+
|
88
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Klepto do
|
4
|
+
it "should have a version" do
|
5
|
+
Klepto::VERSION.should_not be_nil
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
describe 'Scraping pages', :skip => false do
|
10
|
+
before(:each) do
|
11
|
+
@bot = Klepto::Bot.new do
|
12
|
+
syntax :css
|
13
|
+
|
14
|
+
headers({
|
15
|
+
'Referer' => 'https://twitter.com',
|
16
|
+
'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
|
17
|
+
})
|
18
|
+
|
19
|
+
urls 'https://twitter.com/justinbieber'
|
20
|
+
|
21
|
+
crawl 'body' do
|
22
|
+
scrape "h1.fullname", :name
|
23
|
+
scrape '.username span.screen-name', :username
|
24
|
+
save do |params|
|
25
|
+
user = User.find_by_name(params[:username]) || User.new
|
26
|
+
user.update_attributes params
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
crawl 'li.stream-item' do
|
31
|
+
scrape do |node|
|
32
|
+
{:twitter_id => node['data-item-id']}
|
33
|
+
end
|
34
|
+
|
35
|
+
scrape '.content p', :content
|
36
|
+
|
37
|
+
scrape '._timestamp' do |node|
|
38
|
+
{timestamp: node['data-time']}
|
39
|
+
end
|
40
|
+
|
41
|
+
scrape '.time a' do |node|
|
42
|
+
{permalink: node[:href]}
|
43
|
+
end
|
44
|
+
|
45
|
+
save do |params|
|
46
|
+
tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
|
47
|
+
tweet.update_attributes params
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
@bot.start!
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should have collected some resources' do
|
56
|
+
@bot.crawlers.should have(2).crawlers
|
57
|
+
@bot.crawlers.first.resources.should have(1).user
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'should persist resources' do
|
61
|
+
User.count.should be(1)
|
62
|
+
Tweet.count.should_not be(0)
|
63
|
+
end
|
64
|
+
end
|