crawlette 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +11 -0
- data/Rakefile +2 -0
- data/bin/crawlette +8 -0
- data/crawlette.gemspec +26 -0
- data/lib/crawlette.rb +2 -0
- data/lib/crawlette/crawler.rb +77 -0
- data/lib/crawlette/page.rb +43 -0
- data/lib/crawlette/version.rb +3 -0
- data/spec/files/page.html +209 -0
- data/spec/files/root.html +12 -0
- data/spec/files/section-1-1.html +11 -0
- data/spec/files/section-1.html +11 -0
- data/spec/files/section-2.html +12 -0
- data/spec/lib/crawler_spec.rb +40 -0
- data/spec/lib/page_spec.rb +49 -0
- data/spec/spec_helper.rb +10 -0
- metadata +143 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 88877d0289d6126cd7b340ce741db5da6c40ac3e
|
4
|
+
data.tar.gz: 08e58934ae52afcce2f5c7ab371f9ac55cfd5bfa
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 06571be5090c0e02d374fb01f89c07f377761c498ff72ff28b437d6a97da7440baa3cd891230281554f21391839848e7761fa236f9af0154b96813325a117b06
|
7
|
+
data.tar.gz: 3f03a72370cdce93f983a633f57229132ee55fc8ae254fbfa3f89ef6310b10b2b24cfda848088042dc5f8554b67163a873b3d1a209407da624690552c9fe5da1
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Miguel Camba
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
data/Rakefile
ADDED
data/bin/crawlette
ADDED
data/crawlette.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'crawlette/version'
|
5
|
+
|
6
|
+
# Gem packaging metadata for crawlette.
Gem::Specification.new do |gem|
  # Identity
  gem.name        = 'crawlette'
  gem.version     = Crawlette::VERSION
  gem.authors     = ['Miguel Camba']
  gem.email       = ['miguel.camba@gmail.com']
  gem.summary     = 'Very simple web crawler'
  gem.description = 'Crawls a page, with no limits and without visiting external domains'
  gem.homepage    = ''
  gem.license     = 'MIT'

  # Packaged files: everything tracked by git (NUL-separated listing).
  tracked_files     = `git ls-files -z`.split("\x0")
  gem.files         = tracked_files
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies
  gem.add_runtime_dependency 'nokogiri', '~> 1.6'
  gem.add_runtime_dependency 'awesome_print', '~> 1.2'

  # Development dependencies
  gem.add_development_dependency 'bundler', '~> 1.6'
  gem.add_development_dependency 'rake', '~> 10.0'
  gem.add_development_dependency 'rspec', '~> 3.1'
end
|
data/lib/crawlette.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'net/http'
|
3
|
+
require 'crawlette/page'
|
4
|
+
|
5
|
+
module Crawlette
  # Crawls a site starting from one URL and builds a sitemap describing, for
  # every page visited, the links it contains and the static assets it uses.
  # Pages are fetched in batches of up to MAX_THREADS concurrent threads.
  class Crawler
    # Maximum number of pages fetched concurrently in a single batch.
    MAX_THREADS = 8

    # Raised when the starting URL has no scheme or no host.
    BadUrlError = Class.new(ArgumentError)

    # @param url [String] fully qualified URL where the crawl starts
    # @param sitemap [Hash] sitemap to fill in; URLs that already have a
    #   truthy entry are not fetched again
    # @raise [BadUrlError] if +url+ lacks a scheme or a host
    def initialize(url, sitemap = {})
      @uri = URI.parse(url)
      # Validate before building any state so a bad URL leaves nothing behind.
      unless @uri.host && @uri.scheme
        raise BadUrlError, "Invalid url: You must provide a full qualified url"
      end

      @pending_uris = [@uri]
      @sitemap = sitemap
      # Worker threads share @pending_uris and @sitemap; this lock guards writes.
      @lock = Mutex.new
    end

    # Crawls the site and returns a sitemap that contains:
    #
    # * The links between pages.
    # * The static assets each page depends on.
    #
    # Example:
    #
    #   Crawlette::Crawler.new('http://example.com').crawl
    #   # => {
    #     'http://example.com' => {
    #       'assets' => ['http://example.com/image1.png', 'http://example.com/script1.js', 'http://example.com/stylesheet1.css'],
    #       'links'  => ['http://example.com/watch-a-demo', 'http://example.com/features'],
    #     },
    #     'http://example.com/watch-a-demo' => {
    #       'assets' => ['http://example.com/image2.png', 'http://example.com/script2.js', 'http://example.com/stylesheet2.css'],
    #       'links'  => ['http://example.com/whatever1', 'http://example.com/whatever2'],
    #     },
    #     'http://example.com/features' => {
    #       'assets' => ['http://example.com/image3.png', 'http://example.com/script3.js', 'http://example.com/stylesheet3.css'],
    #       'links'  => ['http://example.com/features/api', 'http://example.com/features/pricing'],
    #     },
    #     'http://example.com/features/api' => {
    #       ...
    #     },
    #     'http://example.com/features/pricing' => {
    #       ...
    #     },
    #   }
    #
    # @return [Hash] the sitemap, keyed by page URL
    def crawl
      until @pending_uris.empty?
        # Take up to MAX_THREADS URIs off the end of the queue and fetch them
        # concurrently; wait for the whole batch before starting the next one.
        batch = @pending_uris.pop(MAX_THREADS)
        batch.map { |uri| Thread.new { process_uri(uri) } }.each(&:join)
      end
      @sitemap
    end

    private

    # Fetches one page, records its links and assets in the sitemap and
    # queues its links for crawling. Pages already in the sitemap are skipped.
    # Any StandardError is reported on stdout and the page is left out.
    def process_uri(uri)
      key = uri.to_s
      return @sitemap[key] if @sitemap[key]

      puts "... Fetching #{key}"
      page = Page.new(Net::HTTP.get(uri), uri)
      links = page.links
      entry = { 'links' => links, 'assets' => page.assets }
      discovered = links.map { |url| URI.parse(url) }

      @lock.synchronize do
        @pending_uris.push(*discovered)
        @sitemap[key] ||= entry
      end
    rescue StandardError => e
      # Report the URI that actually failed, not the crawl's starting URI.
      puts "ERROR! Cannot fetch #{uri}: #{e.message}"
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Crawlette
  # Wraps the HTML of one fetched page and extracts the URLs it references,
  # resolved against the URI the page was fetched from.
  class Page
    # Matches mailto: links, which are never crawlable.
    MAILTO_REGEX = /\Amailto:/i

    # @return [URI] the URI this page was fetched from
    attr_reader :uri

    # @param html [String] raw HTML of the page
    # @param uri [URI] absolute URI the HTML was fetched from
    def initialize(html, uri)
      @html = html
      @uri = uri
    end

    # @return [Array<String>] unique absolute URLs of the <a href> links that
    #   point at this page's host or one of its subdomains
    def links
      @links ||= sanitize_urls(document.css('a[href]').map { |anchor| anchor['href'] })
    end

    # @return [Array<String>] unique absolute URLs of the assets the page
    #   uses: anything with a src attribute, stylesheets and og:image meta
    #   tags. Assets on other hosts are included.
    def assets
      @assets ||= begin
        urls = document.css('[src]').map { |node| node['src'] }
        urls += document.css('link[rel="stylesheet"][href]').map { |node| node['href'] }
        urls += document.css('meta[name^="og:image"]').map { |node| node['content'] }

        sanitize_urls(urls, external_links: true)
      end
    end

    private

    # Parsed HTML document, built lazily on first use.
    def document
      @document ||= Nokogiri::HTML.parse(@html)
    end

    # Normalizes raw attribute values into unique absolute URL strings.
    #
    # Drops nil values, mailto: links and anything that cannot be resolved to
    # a URL with a host, removes fragments and a trailing slash, and (unless
    # +external_links+ is true) keeps only URLs on this page's host or one of
    # its subdomains.
    def sanitize_urls(urls, external_links: false)
      urls.compact
          .reject { |url| url =~ MAILTO_REGEX }
          .map { |url| resolve(url) }
          .compact
          .select { |resolved| external_links || internal?(resolved) }
          .map { |resolved| resolved.to_s.chomp('/') }
          .uniq
    end

    # Resolves +url+ against the page URI, so relative paths,
    # protocol-relative URLs and non-default ports are all handled.
    # Returns nil for values that are not valid URIs or have no host
    # (javascript:, tel:, data:, ...), so one bad link cannot fail the page.
    def resolve(url)
      # URI.escape was removed in Ruby 3.0; the parser-level escape is the
      # equivalent that is still available.
      escaped = URI::DEFAULT_PARSER.escape(url.strip.sub(/#.*/m, ''))
      resolved = @uri.merge(URI.parse(escaped))
      resolved if resolved.host
    rescue URI::Error
      nil
    end

    # True when +candidate+ is on the page's host or on one of its subdomains.
    # A plain suffix match is not enough: "evilexample.com" must not count as
    # part of "example.com".
    def internal?(candidate)
      base_host = @uri.host.to_s.downcase
      host = candidate.host.downcase
      host == base_host || host.end_with?(".#{base_host}")
    end
  end
end
|
@@ -0,0 +1,209 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
|
3
|
+
<!--[if gt IE 8]><!--> <html lang="en"> <!--<![endif]-->
|
4
|
+
<head>
|
5
|
+
<meta charset="utf-8">
|
6
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
7
|
+
<title>The UK’s #1 for online Direct Debit - GoCardless</title>
|
8
|
+
<meta name="description" content="GoCardless is the UK's #1 for Direct Debit, serving more companies than any other provider. Perfect for recurring billing and B2B invoicing.">
|
9
|
+
<meta name="viewport" content="width=device-width">
|
10
|
+
<meta name="og:image" content="https://gocardless.com/images/logos/gocardless-square.png">
|
11
|
+
<meta name="og:image:secure_url" content="https://gocardless.com/images/logos/gocardless-square.png">
|
12
|
+
<meta name="google-site-verification" content="Y80kah87ghJhwiDqw-5ap234p9wCcGt6kMRxvnamtHU">
|
13
|
+
<link href="https://plus.google.com/+Gocardless" rel="publisher">
|
14
|
+
<!--[if lt IE 9]>
|
15
|
+
<script>
|
16
|
+
(function() {
|
17
|
+
var elems = 'article aside details figcaption figure footer header hgroup main nav section summary'.split(' ');
|
18
|
+
var length = elems.length;
|
19
|
+
var index = 0;
|
20
|
+
for(;index < length; index++){
|
21
|
+
document.createElement(elems[index]);
|
22
|
+
}
|
23
|
+
})();
|
24
|
+
</script>
|
25
|
+
<![endif]-->
|
26
|
+
<link rel="stylesheet" href="/css/main.css">
|
27
|
+
<link rel="stylesheet" href="/css/fonts.css">
|
28
|
+
</head>
|
29
|
+
<body>
|
30
|
+
|
31
|
+
<script src="//cdn.optimizely.com/js/125150657.js"></script>
|
32
|
+
|
33
|
+
|
34
|
+
<!--[if lt IE 9]>
|
35
|
+
<p class="browsehappy">You are using an <strong>outdated</strong> browser.
|
36
|
+
Please <a href="http://browsehappy.com/">upgrade your browser</a> to improve your experience.</p>
|
37
|
+
<![endif]-->
|
38
|
+
|
39
|
+
|
40
|
+
|
41
|
+
<div class="worry-container">
|
42
|
+
|
43
|
+
<div class="hero-worry hero-worry--girl u-center u-text-center">
|
44
|
+
<div class="site-container site-gutter u-cf align-btn-small u-padding-Txs u-padding-Bm">
|
45
|
+
<ul class="u-cf u-pull-end">
|
46
|
+
<li class="u-pull-start">
|
47
|
+
<a href="/users/sign_in/" class="btn btn--invert-hollow btn--small" id="login">Login</a>
|
48
|
+
</li>
|
49
|
+
<li class="u-pull-start u-margin-Ls">
|
50
|
+
<a href="/merchants/new" class="btn btn--invert-hollow btn--small" id="nav_sign_up">Sign up</a>
|
51
|
+
</li>
|
52
|
+
</ul>
|
53
|
+
</div>
|
54
|
+
|
55
|
+
<!-- Example video -->
|
56
|
+
<video src="https://pdlvimeocdn-a.akamaihd.net/45126/030/267925344.mp4"></video>
|
57
|
+
|
58
|
+
<div class="site-container">
|
59
|
+
<div class="grid site-gutter">
|
60
|
+
<div class="grid__cell u-size1of2 u-text-center u-text-heading hero-worry__heading">
|
61
|
+
<h1 class="u-color-invert hero-heading u-text-light">Stop worrying about online payments</h1>
|
62
|
+
<i class="icon-gc-logo-invert u-margin-Tm"></i>
|
63
|
+
</div>
|
64
|
+
<div class="grid__cell u-size1of2">
|
65
|
+
<div class="hero-worry__modal"></div>
|
66
|
+
</div>
|
67
|
+
</div>
|
68
|
+
</div>
|
69
|
+
|
70
|
+
</div>
|
71
|
+
|
72
|
+
<div class="u-background-white">
|
73
|
+
<div class="site-container u-text-center u-padding-Vl">
|
74
|
+
|
75
|
+
<div class="u-padding-Vl"><div class="u-padding-Vs">
|
76
|
+
<h2 class="u-text-heading u-color-meta">GoCardless makes it quick and easy to take Direct Debit</h2>
|
77
|
+
<div class="u-size5of12 u-center u-padding-Vm">
|
78
|
+
<div class="grid">
|
79
|
+
<div class="grid__cell u-size1of2">
|
80
|
+
<a href="/watch-a-demo" class="btn btn--block btn--info btn--large">Watch a demo</a>
|
81
|
+
</div>
|
82
|
+
<div class="grid__cell u-size1of2">
|
83
|
+
<a href="/features" class="btn btn--block btn--hollow btn--large">Learn more</a>
|
84
|
+
</div>
|
85
|
+
</div>
|
86
|
+
</div>
|
87
|
+
</div></div>
|
88
|
+
|
89
|
+
<hr class="horizontal-ruler-top-double">
|
90
|
+
<div class="u-padding-Vl"><div class="u-padding-Vl">
|
91
|
+
<i class="benefits-led-container__people-icon u-margin-Bs"></i>
|
92
|
+
<h1 class="u-text-heading u-text-hero u-color-primary">SIMPLE TO USE</h1>
|
93
|
+
<p>Sign up for free, set up in minutes. Manage everything online.</p>
|
94
|
+
</div></div>
|
95
|
+
|
96
|
+
<hr class="horizontal-ruler-top-double">
|
97
|
+
<div class="u-padding-Vl"><div class="u-padding-Vl">
|
98
|
+
<i class="benefits-led-container__money-icon u-margin-Bs"></i>
|
99
|
+
<h1 class="u-text-heading u-text-hero u-color-accent">AMAZING VALUE</h1>
|
100
|
+
<p>1% up to £2 with no other fees. Incredible, tailored rates for high volume.</p>
|
101
|
+
</div></div>
|
102
|
+
|
103
|
+
<hr class="horizontal-ruler-top-double">
|
104
|
+
<div class="u-padding-Vl"><div class="u-padding-Vl">
|
105
|
+
<i class="benefits-led-container__service-icon u-margin-Bs"></i>
|
106
|
+
<h1 class="u-text-heading u-text-hero u-color-secondary">LEGENDARY SERVICE</h1>
|
107
|
+
<p>Our personal support team are the best in the industry.</p>
|
108
|
+
</div></div>
|
109
|
+
|
110
|
+
<hr class="horizontal-ruler-top-double">
|
111
|
+
<div class="u-padding-Vl"><div class="u-padding-Vl">
|
112
|
+
<p class="u-margin-Bl">We've been featured in</p>
|
113
|
+
<div class="benefits-led-container__publications u-center"></div>
|
114
|
+
</div></div>
|
115
|
+
|
116
|
+
<hr class="horizontal-ruler-top-double u-margin-Vl">
|
117
|
+
<div class="u-padding-Vl"><div class="u-padding-Vs">
|
118
|
+
<p class="u-text-h2 u-text-heading u-color-meta">Get started today. <a href="/merchants/new" class="u-text-underline u-link-secondary">Sign up for free</a></p>
|
119
|
+
</div></div>
|
120
|
+
</div>
|
121
|
+
</div>
|
122
|
+
</div>
|
123
|
+
|
124
|
+
<div class="u-margin-Ts">
|
125
|
+
<div class="site-container site-gutter">
|
126
|
+
<div>
|
127
|
+
<div class="u-cf">
|
128
|
+
<ul class="nav u-padding-Vs u-pull-start">
|
129
|
+
<li class="nav__item">
|
130
|
+
<a href="https://help.gocardless.com#some-id">Help</a>
|
131
|
+
</li>
|
132
|
+
<li class="nav__item">
|
133
|
+
<a href="/contact-sales">Contact sales</a>
|
134
|
+
</li>
|
135
|
+
<li class="nav__item">
|
136
|
+
<a href="/faq/merchants" ng-gc-href-active>FAQ</a>
|
137
|
+
</li>
|
138
|
+
<li class="nav__item">
|
139
|
+
<a href="/direct-debit">Direct Debit</a>
|
140
|
+
</li>
|
141
|
+
<li class="nav__item">
|
142
|
+
<a href="/direct-debit/sepa/">SEPA</a>
|
143
|
+
</li>
|
144
|
+
<li class="nav__item">
|
145
|
+
<a href="/security" ng-gc-href-active>Security</a>
|
146
|
+
</li>
|
147
|
+
<li class="nav__item">
|
148
|
+
<a href="https://developer.gocardless.com">API</a>
|
149
|
+
</li>
|
150
|
+
<li class="nav__item">
|
151
|
+
<a href="/legal" ng-gc-href-active>Legal</a>
|
152
|
+
</li>
|
153
|
+
<li class="nav__item">
|
154
|
+
<a href="/about" ng-gc-href-active>About</a>
|
155
|
+
</li>
|
156
|
+
<li class="nav__item">
|
157
|
+
<a href="/jobs" ng-gc-href-active>Jobs</a>
|
158
|
+
</li>
|
159
|
+
<li class="nav__item">
|
160
|
+
<a href="/press" ng-gc-href-active>Press</a>
|
161
|
+
</li>
|
162
|
+
<li class="nav__item">
|
163
|
+
<a href="https://gocardless.com/blog">Blog</a>
|
164
|
+
</li>
|
165
|
+
</ul>
|
166
|
+
<div class="u-pull-end">
|
167
|
+
<a href="https://twitter.com/gocardless" class="u-margin-Ts twitter-follow-button" data-show-count="false" data-dnt="true">Follow @gocardless</a>
|
168
|
+
<script src="//platform.twitter.com/widgets.js" async id="twitter-wjs"></script>
|
169
|
+
</div>
|
170
|
+
</div>
|
171
|
+
<div class="grid u-margin-Ts">
|
172
|
+
<div class="u-text-h5 u-size3of12 u-padding-Vl grid__cell">
|
173
|
+
<b>GoCardless Ltd</b><br>
|
174
|
+
338-346 Goswell Road<br>London, EC1V 7LQ<br>
|
175
|
+
020 7183 8674<br>
|
176
|
+
<a href="mailto:help@gocardless.com">help@gocardless.com</a><br>
|
177
|
+
</div>
|
178
|
+
|
179
|
+
<div class="u-size3of12 grid__cell u-text-h5 u-padding-Vl ">
|
180
|
+
<p>
|
181
|
+
GoCardless is regulated by the <strong>Financial Conduct
|
182
|
+
Authority</strong> as an Authorised Payment Institution.
|
183
|
+
</p>
|
184
|
+
</div>
|
185
|
+
|
186
|
+
<div class="grid__cell u-size6of12 u-margin-Tm u-text-end">
|
187
|
+
<img alt="Footer logos" class="footer__logos" src="/images/footer/footer-logos@2x.png">
|
188
|
+
</div>
|
189
|
+
</div>
|
190
|
+
</div>
|
191
|
+
|
192
|
+
</div>
|
193
|
+
</div>
|
194
|
+
|
195
|
+
|
196
|
+
|
197
|
+
<script src="/js/vendor.js"></script>
|
198
|
+
<script src="/js/main.js"></script>
|
199
|
+
|
200
|
+
<!-- Google Tag Manager -->
|
201
|
+
<script>
|
202
|
+
dataLayer = [];
|
203
|
+
</script>
|
204
|
+
<script src="//www.googletagmanager.com/gtm.js?id=GTM-PRFKNC" async></script>
|
205
|
+
<script>
|
206
|
+
(function(w,l){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});})(window, 'dataLayer');
|
207
|
+
</script>
|
208
|
+
</body>
|
209
|
+
</html>
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<head>
|
3
|
+
<meta charset="utf-8">
|
4
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
5
|
+
<title>Fake root</title>
|
6
|
+
<link rel="stylesheet" href="/styles.css">
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<a href="http://example.com/section-1"></a>
|
10
|
+
<a href="http://example.com/section-2"></a>
|
11
|
+
</body>
|
12
|
+
</html>
|
@@ -0,0 +1,11 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<head>
|
3
|
+
<meta charset="utf-8">
|
4
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
5
|
+
<title>Section 1</title>
|
6
|
+
<link rel="stylesheet" href="/styles.css">
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<a href="http://example.com/section-1-1"></a>
|
10
|
+
</body>
|
11
|
+
</html>
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<head>
|
3
|
+
<meta charset="utf-8">
|
4
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
5
|
+
<title>Section 2</title>
|
6
|
+
<link rel="stylesheet" href="/styles.css">
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<!-- Also points to section 1.1 -->
|
10
|
+
<a href="http://example.com/section-1-1"></a>
|
11
|
+
</body>
|
12
|
+
</html>
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Exercises Crawler#crawl against four canned HTML fixtures, with every
# Net::HTTP.get call replaced by a message expectation.
describe Crawlette::Crawler do
  subject(:crawler) { described_class.new('http://example.com') }

  let(:root) { File.read('spec/files/root.html') }
  let(:s1)   { File.read('spec/files/section-1.html') }
  let(:s2)   { File.read('spec/files/section-2.html') }
  let(:s1_1) { File.read('spec/files/section-1-1.html') }

  describe '#crawl' do
    before do
      root_uri        = URI.parse('http://example.com')
      section_1_uri   = URI.parse('http://example.com/section-1')
      section_2_uri   = URI.parse('http://example.com/section-2')
      section_1_1_uri = URI.parse('http://example.com/section-1-1')

      expect(Net::HTTP).to receive(:get).with(root_uri) { root }
      expect(Net::HTTP).to receive(:get).with(section_1_uri) { s1 }
      expect(Net::HTTP).to receive(:get).with(section_2_uri) { s2 }
      # Both section pages link to section-1-1, so it is requested twice.
      expect(Net::HTTP).to receive(:get).with(section_1_1_uri).twice { s1_1 }
    end

    it "returns a hash with the crawled urls as keys and hashes with the links and assets of each one" do
      shared_assets = ["http://example.com/styles.css"]
      leaf_links    = ["http://example.com/section-1-1"]

      expected_sitemap = {
        "http://example.com" => {
          "links"  => ["http://example.com/section-1", "http://example.com/section-2"],
          "assets" => shared_assets
        },
        "http://example.com/section-2" => { "links" => leaf_links, "assets" => shared_assets },
        "http://example.com/section-1" => { "links" => leaf_links, "assets" => shared_assets },
        "http://example.com/section-1-1" => { "links" => [], "assets" => shared_assets }
      }

      expect(crawler.crawl).to eq(expected_sitemap)
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Checks link and asset extraction against a saved copy of a real home page.
describe Crawlette::Page do
  subject(:page) { described_class.new(html, URI.parse('https://gocardless.com')) }

  let(:html) { File.read('spec/files/page.html') }

  describe '#links' do
    it "returns a unique list of normalized non-external urls" do
      # Paths on the page's own host.
      same_host_paths = %w[
        /users/sign_in
        /merchants/new
        /watch-a-demo
        /features
        /contact-sales
        /faq/merchants
        /direct-debit
        /direct-debit/sepa
        /security
        /legal
        /about
        /jobs
        /press
        /blog
      ]
      # Subdomains of the page's host count as internal too.
      subdomain_urls = %w[
        https://help.gocardless.com
        https://developer.gocardless.com
      ]
      expected_links = same_host_paths.map { |path| "https://gocardless.com#{path}" } + subdomain_urls

      expect(page.links).to match_array(expected_links)
    end
  end

  describe '#assets' do
    it 'returns a unique list with the normalized urls of the static assets on this page' do
      third_party_assets = %w[
        https://cdn.optimizely.com/js/125150657.js
        https://pdlvimeocdn-a.akamaihd.net/45126/030/267925344.mp4
        https://platform.twitter.com/widgets.js
        https://www.googletagmanager.com/gtm.js?id=GTM-PRFKNC
      ]
      own_asset_paths = %w[
        /images/footer/footer-logos@2x.png
        /js/vendor.js
        /js/main.js
        /css/main.css
        /css/fonts.css
        /images/logos/gocardless-square.png
      ]
      expected_assets = third_party_assets + own_asset_paths.map { |path| "https://gocardless.com#{path}" }

      expect(page.assets).to match_array(expected_assets)
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: crawlette
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Miguel Camba
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-09-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: awesome_print
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.2'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.2'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.6'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.1'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.1'
|
83
|
+
description: Crawls a page, with no limits and without visiting external domains
|
84
|
+
email:
|
85
|
+
- miguel.camba@gmail.com
|
86
|
+
executables:
|
87
|
+
- crawlette
|
88
|
+
extensions: []
|
89
|
+
extra_rdoc_files: []
|
90
|
+
files:
|
91
|
+
- ".gitignore"
|
92
|
+
- ".rspec"
|
93
|
+
- Gemfile
|
94
|
+
- LICENSE.txt
|
95
|
+
- README.md
|
96
|
+
- Rakefile
|
97
|
+
- bin/crawlette
|
98
|
+
- crawlette.gemspec
|
99
|
+
- lib/crawlette.rb
|
100
|
+
- lib/crawlette/crawler.rb
|
101
|
+
- lib/crawlette/page.rb
|
102
|
+
- lib/crawlette/version.rb
|
103
|
+
- spec/files/page.html
|
104
|
+
- spec/files/root.html
|
105
|
+
- spec/files/section-1-1.html
|
106
|
+
- spec/files/section-1.html
|
107
|
+
- spec/files/section-2.html
|
108
|
+
- spec/lib/crawler_spec.rb
|
109
|
+
- spec/lib/page_spec.rb
|
110
|
+
- spec/spec_helper.rb
|
111
|
+
homepage: ''
|
112
|
+
licenses:
|
113
|
+
- MIT
|
114
|
+
metadata: {}
|
115
|
+
post_install_message:
|
116
|
+
rdoc_options: []
|
117
|
+
require_paths:
|
118
|
+
- lib
|
119
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - ">="
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '0'
|
124
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: '0'
|
129
|
+
requirements: []
|
130
|
+
rubyforge_project:
|
131
|
+
rubygems_version: 2.4.1
|
132
|
+
signing_key:
|
133
|
+
specification_version: 4
|
134
|
+
summary: Very simple web crawler
|
135
|
+
test_files:
|
136
|
+
- spec/files/page.html
|
137
|
+
- spec/files/root.html
|
138
|
+
- spec/files/section-1-1.html
|
139
|
+
- spec/files/section-1.html
|
140
|
+
- spec/files/section-2.html
|
141
|
+
- spec/lib/crawler_spec.rb
|
142
|
+
- spec/lib/page_spec.rb
|
143
|
+
- spec/spec_helper.rb
|