klepto 0.6.9 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/klepto.gemspec +1 -0
- data/lib/klepto/browser.rb +3 -4
- data/lib/klepto/version.rb +1 -1
- data/phantom/test.js +100 -0
- metadata +5 -3
data/klepto.gemspec
CHANGED
@@ -16,6 +16,7 @@ Gem::Specification.new do |gem|
|
|
16
16
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ["lib"]
|
19
|
+
gem.license = 'MIT'
|
19
20
|
|
20
21
|
gem.add_dependency "poltergeist", '~> 1.3.0'
|
21
22
|
gem.add_dependency "capybara", '~> 2.1.0'
|
data/lib/klepto/browser.rb
CHANGED
@@ -57,10 +57,9 @@ module Klepto
|
|
57
57
|
@url_to_structure = _url
|
58
58
|
Klepto.logger.debug("Fetching #{@url_to_structure}")
|
59
59
|
|
60
|
-
Capybara.
|
61
|
-
|
62
|
-
|
63
|
-
end
|
60
|
+
Capybara.current_driver = Capybara.javascript_driver = use_driver
|
61
|
+
visit @url_to_structure
|
62
|
+
page
|
64
63
|
end
|
65
64
|
end
|
66
65
|
end
|
data/lib/klepto/version.rb
CHANGED
data/phantom/test.js
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
/*
|
2
|
+
CoffeeScript
|
3
|
+
JS Lint
|
4
|
+
PhantomJS
|
5
|
+
|
6
|
+
Ruby 'configuration' gem
|
7
|
+
Ruby blocks -> Javascript -> Ruby OR Javascript post processors
|
8
|
+
Ruby blocks -> Assertion? Auto generate cucumbers? OR callbacks on node not found?
|
9
|
+
https://github.com/ariya/phantomjs/wiki/API-Reference-WebPage
|
10
|
+
|
11
|
+
Config.defaults {
|
12
|
+
on(200,'2xx', :redirect){}
|
13
|
+
on('4xx'){}
|
14
|
+
on('5xx'){}
|
15
|
+
on(:timeout){}
|
16
|
+
on(:abort){}
|
17
|
+
headers({})
|
18
|
+
cookies({})
|
19
|
+
agent "Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) Klepto/#{Klepto::Version} Safari/534.34"
|
20
|
+
}
|
21
|
+
Bot.new("http://google.com")do
|
22
|
+
config{
|
23
|
+
# merges with Defaults, creates a Configuration
|
24
|
+
url "http://google.com"
|
25
|
+
auto_structure false # stops it from running structure (@bot.process! will run it)
|
26
|
+
abort_on_failure true
|
27
|
+
agent "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22";
|
28
|
+
|
29
|
+
headers({})
|
30
|
+
cookies({})
|
31
|
+
|
32
|
+
on(200,'2xx', :redirect){}
|
33
|
+
on('4xx'){}
|
34
|
+
on('5xx'){}
|
35
|
+
on(:timeout){}
|
36
|
+
on(:abort){}
|
37
|
+
|
38
|
+
before(:get){}
|
39
|
+
after(:get){}
|
40
|
+
before(:structure){}
|
41
|
+
after(:structure){}
|
42
|
+
}
|
43
|
+
|
44
|
+
structure{
|
45
|
+
# Should yield against Proxy so method_missing and queueing isn't in Bot
|
46
|
+
}
|
47
|
+
end
|
48
|
+
|
49
|
+
*/
|
50
|
+
var page = require('webpage').create(),
|
51
|
+
system = require('system'),
|
52
|
+
lt, pt, t, currentAddress, requestedAddress;
|
53
|
+
|
54
|
+
page.settings.userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22";
|
55
|
+
page.settings.loadImages = false;
|
56
|
+
|
57
|
+
page.onUrlChanged = function(targetUrl){
|
58
|
+
currentAddress = targetUrl;
|
59
|
+
console.log("Redirecting to: " + currentAddress);
|
60
|
+
}
|
61
|
+
|
62
|
+
page.onResourceReceived = function(resource) {
|
63
|
+
if (resource.stage === 'end' && resource.status == 200 && resource.url == currentAddress) {
|
64
|
+
lt = Date.now() - t;
|
65
|
+
console.log("Crawling: " + resource.url);
|
66
|
+
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
|
67
|
+
var title = page.evaluate(function(){
|
68
|
+
return $("title").text();
|
69
|
+
});
|
70
|
+
|
71
|
+
pt = Date.now() - t;
|
72
|
+
var structure = JSON.stringify({
|
73
|
+
title: title,
|
74
|
+
_meta: {
|
75
|
+
loadTime: lt,
|
76
|
+
parseTime: pt,
|
77
|
+
redirectOccurred: (requestedAddress != resource.url),
|
78
|
+
requestedAddress: requestedAddress,
|
79
|
+
currentAddress: resource.url,
|
80
|
+
httpCode: resource.status
|
81
|
+
}
|
82
|
+
});
|
83
|
+
system.stdout.write(structure);
|
84
|
+
|
85
|
+
phantom.exit();
|
86
|
+
});
|
87
|
+
} else if(resource.stage === 'end' && resource.status != 200 && resource.url == currentAddress){
|
88
|
+
console.log("Oops: " + resource.status);
|
89
|
+
phantom.exit();
|
90
|
+
} else {/* NOOP*/}
|
91
|
+
}
|
92
|
+
|
93
|
+
if (system.args.length === 1) {
|
94
|
+
console.log('Usage: test.js <some URL>');
|
95
|
+
phantom.exit(1);
|
96
|
+
} else {
|
97
|
+
t = Date.now();
|
98
|
+
currentAddress = requestedAddress = system.args[1];
|
99
|
+
page.open(requestedAddress);
|
100
|
+
}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: klepto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.9
|
4
|
+
version: 0.7.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: poltergeist
|
@@ -115,6 +115,7 @@ files:
|
|
115
115
|
- lib/klepto/structure.rb
|
116
116
|
- lib/klepto/tasks.rb
|
117
117
|
- lib/klepto/version.rb
|
118
|
+
- phantom/test.js
|
118
119
|
- samples/bieber.html
|
119
120
|
- samples/concept.rb
|
120
121
|
- spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml
|
@@ -133,7 +134,8 @@ files:
|
|
133
134
|
- spec/orm/database.example.yml
|
134
135
|
- spec/spec_helper.rb
|
135
136
|
homepage: http://github.com/coryodaniel/klepto
|
136
|
-
licenses:
|
137
|
+
licenses:
|
138
|
+
- MIT
|
137
139
|
post_install_message:
|
138
140
|
rdoc_options: []
|
139
141
|
require_paths:
|