klepto 0.6.9 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/klepto.gemspec +1 -0
- data/lib/klepto/browser.rb +3 -4
- data/lib/klepto/version.rb +1 -1
- data/phantom/test.js +100 -0
- metadata +5 -3
data/klepto.gemspec
CHANGED
@@ -16,6 +16,7 @@ Gem::Specification.new do |gem|
|
|
16
16
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ["lib"]
|
19
|
+
gem.license = 'MIT'
|
19
20
|
|
20
21
|
gem.add_dependency "poltergeist", '~> 1.3.0'
|
21
22
|
gem.add_dependency "capybara", '~> 2.1.0'
|
data/lib/klepto/browser.rb
CHANGED
@@ -57,10 +57,9 @@ module Klepto
|
|
57
57
|
@url_to_structure = _url
|
58
58
|
Klepto.logger.debug("Fetching #{@url_to_structure}")
|
59
59
|
|
60
|
-
Capybara.
|
61
|
-
|
62
|
-
|
63
|
-
end
|
60
|
+
Capybara.current_driver = Capybara.javascript_driver = use_driver
|
61
|
+
visit @url_to_structure
|
62
|
+
page
|
64
63
|
end
|
65
64
|
end
|
66
65
|
end
|
data/lib/klepto/version.rb
CHANGED
data/phantom/test.js
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
/*
|
2
|
+
CoffeeScript
|
3
|
+
JS Lint
|
4
|
+
PhantomJS
|
5
|
+
|
6
|
+
Ruby 'configuration' gem
|
7
|
+
Ruby blocks -> Javascript -> Ruby OR Javascript post processors
|
8
|
+
Ruby blocks -> Assertion? Auto generate cucumbers? OR callbacks on node not found?
|
9
|
+
https://github.com/ariya/phantomjs/wiki/API-Reference-WebPage
|
10
|
+
|
11
|
+
Config.defaults {
|
12
|
+
on(200,'2xx', :redirect){}
|
13
|
+
on('4xx'){}
|
14
|
+
on('5xx'){}
|
15
|
+
on(:timeout){}
|
16
|
+
on(:abort){}
|
17
|
+
headers({})
|
18
|
+
cookies({})
|
19
|
+
agent "Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) Klepto/#{Klepto::Version} Safari/534.34"
|
20
|
+
}
|
21
|
+
Bot.new("http://google.com")do
|
22
|
+
config{
|
23
|
+
# merges with Defaults, creates a Configuration
|
24
|
+
url "http://google.com"
|
25
|
+
auto_structure false # stops it from running structure (@bot.process! will run it)
|
26
|
+
abort_on_failure true
|
27
|
+
agent "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22";
|
28
|
+
|
29
|
+
headers({})
|
30
|
+
cookies({})
|
31
|
+
|
32
|
+
on(200,'2xx', :redirect){}
|
33
|
+
on('4xx'){}
|
34
|
+
on('5xx'){}
|
35
|
+
on(:timeout){}
|
36
|
+
on(:abort){}
|
37
|
+
|
38
|
+
before(:get){}
|
39
|
+
after(:get){}
|
40
|
+
before(:structure){}
|
41
|
+
after(:structure){}
|
42
|
+
}
|
43
|
+
|
44
|
+
structure{
|
45
|
+
# Should yield against Proxy so method_missing and queueing isn't in Bot
|
46
|
+
}
|
47
|
+
end
|
48
|
+
|
49
|
+
*/
|
50
|
+
var page = require('webpage').create(),
|
51
|
+
system = require('system'),
|
52
|
+
lt, pt, t, currentAddress, requestedAddress;
|
53
|
+
|
54
|
+
page.settings.userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22";
|
55
|
+
page.settings.loadImages = false;
|
56
|
+
|
57
|
+
page.onUrlChanged = function(targetUrl){
|
58
|
+
currentAddress = targetUrl;
|
59
|
+
console.log("Redirecting to: " + currentAddress);
|
60
|
+
}
|
61
|
+
|
62
|
+
page.onResourceReceived = function(resource) {
|
63
|
+
if (resource.stage === 'end' && resource.status == 200 && resource.url == currentAddress) {
|
64
|
+
lt = Date.now() - t;
|
65
|
+
console.log("Crawling: " + resource.url);
|
66
|
+
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
|
67
|
+
var title = page.evaluate(function(){
|
68
|
+
return $("title").text();
|
69
|
+
});
|
70
|
+
|
71
|
+
pt = Date.now() - t;
|
72
|
+
var structure = JSON.stringify({
|
73
|
+
title: title,
|
74
|
+
_meta: {
|
75
|
+
loadTime: lt,
|
76
|
+
parseTime: pt,
|
77
|
+
redirectOccurred: (requestedAddress != resource.url),
|
78
|
+
requestedAddress: requestedAddress,
|
79
|
+
currentAddress: resource.url,
|
80
|
+
httpCode: resource.status
|
81
|
+
}
|
82
|
+
});
|
83
|
+
system.stdout.write(structure);
|
84
|
+
|
85
|
+
phantom.exit();
|
86
|
+
});
|
87
|
+
} else if(resource.stage === 'end' && resource.status != 200 && resource.url == currentAddress){
|
88
|
+
console.log("Oops: " + resource.status);
|
89
|
+
phantom.exit();
|
90
|
+
} else {/* NOOP*/}
|
91
|
+
}
|
92
|
+
|
93
|
+
if (system.args.length === 1) {
|
94
|
+
console.log('Usage: test.js <some URL>');
|
95
|
+
phantom.exit(1);
|
96
|
+
} else {
|
97
|
+
t = Date.now();
|
98
|
+
currentAddress = requestedAddress = system.args[1];
|
99
|
+
page.open(requestedAddress);
|
100
|
+
}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: klepto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.9
|
4
|
+
version: 0.7.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: poltergeist
|
@@ -115,6 +115,7 @@ files:
|
|
115
115
|
- lib/klepto/structure.rb
|
116
116
|
- lib/klepto/tasks.rb
|
117
117
|
- lib/klepto/version.rb
|
118
|
+
- phantom/test.js
|
118
119
|
- samples/bieber.html
|
119
120
|
- samples/concept.rb
|
120
121
|
- spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml
|
@@ -133,7 +134,8 @@ files:
|
|
133
134
|
- spec/orm/database.example.yml
|
134
135
|
- spec/spec_helper.rb
|
135
136
|
homepage: http://github.com/coryodaniel/klepto
|
136
|
-
licenses:
|
137
|
+
licenses:
|
138
|
+
- MIT
|
137
139
|
post_install_message:
|
138
140
|
rdoc_options: []
|
139
141
|
require_paths:
|