crawlette 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +11 -0
- data/Rakefile +2 -0
- data/bin/crawlette +8 -0
- data/crawlette.gemspec +26 -0
- data/lib/crawlette.rb +2 -0
- data/lib/crawlette/crawler.rb +77 -0
- data/lib/crawlette/page.rb +43 -0
- data/lib/crawlette/version.rb +3 -0
- data/spec/files/page.html +209 -0
- data/spec/files/root.html +12 -0
- data/spec/files/section-1-1.html +11 -0
- data/spec/files/section-1.html +11 -0
- data/spec/files/section-2.html +12 -0
- data/spec/lib/crawler_spec.rb +40 -0
- data/spec/lib/page_spec.rb +49 -0
- data/spec/spec_helper.rb +10 -0
- metadata +143 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 88877d0289d6126cd7b340ce741db5da6c40ac3e
|
|
4
|
+
data.tar.gz: 08e58934ae52afcce2f5c7ab371f9ac55cfd5bfa
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 06571be5090c0e02d374fb01f89c07f377761c498ff72ff28b437d6a97da7440baa3cd891230281554f21391839848e7761fa236f9af0154b96813325a117b06
|
|
7
|
+
data.tar.gz: 3f03a72370cdce93f983a633f57229132ee55fc8ae254fbfa3f89ef6310b10b2b24cfda848088042dc5f8554b67163a873b3d1a209407da624690552c9fe5da1
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Copyright (c) 2014 Miguel Camba
|
|
2
|
+
|
|
3
|
+
MIT License
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
a copy of this software and associated documentation files (the
|
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
11
|
+
the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be
|
|
14
|
+
included in all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
data/Rakefile
ADDED
data/bin/crawlette
ADDED
data/crawlette.gemspec
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# coding: utf-8
# Gem specification for crawlette. The gem's own lib directory is put on the
# load path first so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'crawlette/version'

Gem::Specification.new do |gem|
  # Identity and authorship.
  gem.name     = 'crawlette'
  gem.version  = Crawlette::VERSION
  gem.authors  = ['Miguel Camba']
  gem.email    = ['miguel.camba@gmail.com']
  gem.homepage = ''
  gem.license  = 'MIT'

  gem.summary     = 'Very simple web crawler'
  gem.description = 'Crawls a page, with no limits and without visiting external domains'

  # Packaged files are whatever git tracks; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Needed at runtime.
  gem.add_runtime_dependency 'nokogiri', '~> 1.6'
  gem.add_runtime_dependency 'awesome_print', '~> 1.2'

  # Needed only to develop and test the gem.
  gem.add_development_dependency 'bundler', '~> 1.6'
  gem.add_development_dependency 'rake', '~> 10.0'
  gem.add_development_dependency 'rspec', '~> 3.1'
end
|
data/lib/crawlette.rb
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
require 'uri'
|
|
2
|
+
require 'net/http'
|
|
3
|
+
require 'crawlette/page'
|
|
4
|
+
|
|
5
|
+
module Crawlette
  # Crawls a site starting from one URL, fetching pages in batches of threads,
  # and builds a sitemap describing the links and static assets of each page.
  class Crawler
    # Upper bound on the number of pages fetched concurrently in one batch.
    MAX_THREADS = 8
    # Raised when the start URL has no scheme or no host.
    BadUrlError = Class.new(ArgumentError)

    # @param url [String] fully qualified URL (scheme and host) to start from
    # @param sitemap [Hash] hash the results are written into; keys already
    #   present are treated as crawled and are not fetched again
    # @raise [BadUrlError] if +url+ lacks a scheme or a host
    def initialize(url, sitemap = {})
      @uri = URI.parse(url)
      @pending_uris = [@uri]
      @sitemap = sitemap
      return if @uri.host && @uri.scheme

      raise BadUrlError, "Invalid url: You must provide a full qualified url"
    end

    # Crawl the site and generate a sitemap that also contains:
    #
    # * Links between pages.
    # * The static assets each page depends on.
    #
    # Example:
    #
    #   Crawlette::Crawler.new('http://example.com/').crawl
    #   # => {
    #     'http://example.com/' => {
    #       'assets' => ['http://example.com/image1.png', 'http://example.com/script1.js', 'http://example.com/stylesheet1.css'],
    #       'links' => ['http://example.com/watch-a-demo', 'http://example.com/features'],
    #     },
    #     'http://example.com/watch-a-demo' => {
    #       'assets' => ['http://example.com/image2.png', 'http://example.com/script2.js', 'http://example.com/stylesheet2.css'],
    #       'links' => ['http://example.com/whatever1', 'http://example.com/whatever2'],
    #     },
    #     'http://example.com/features' => {
    #       ...
    #     },
    #   }
    #
    # @return [Hash] the sitemap hash given to (or created by) the constructor
    def crawl
      until @pending_uris.empty?
        # Take up to MAX_THREADS queued URIs and fetch them in parallel; links
        # discovered by this batch are queued for the next iteration.
        batch = @pending_uris.pop(MAX_THREADS)
        batch.map { |uri| Thread.new { process_uri(uri) } }.each(&:join)
      end
      @sitemap
    end

    private

    # Fetches +uri+ unless the sitemap already has an entry for it, stores the
    # page's links and assets, and queues the page's links for crawling.
    # Any error is reported on stdout and the URI is left out of the sitemap.
    #
    # NOTE: the same URI can be queued more than once; the `||=` guard only
    # skips it when an earlier fetch has already stored its entry.
    def process_uri(uri)
      @sitemap[uri.to_s] ||= begin
        puts "... Fetching #{uri.to_s}"
        page = Page.new(Net::HTTP.get(uri), uri)
        @pending_uris.push(*page.links.map { |link| URI.parse(link) })
        { 'links' => page.links, 'assets' => page.assets }
      end
    rescue => e
      # Report the URI that actually failed, not the crawl's start URL.
      puts "ERROR! Cannot fetch #{uri}: #{e.message}"
    end
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
|
|
3
|
+
module Crawlette
  # Wraps the HTML of one fetched page and extracts the URLs it refers to.
  class Page
    # Matches hrefs that address a mailbox instead of a page.
    MAILTO_REGEX = /^mailto:/
    attr_reader :uri

    # @param html [String] raw HTML of the page
    # @param uri [URI] address the page was fetched from; supplies the scheme
    #   and host for URLs that do not carry their own
    def initialize(html, uri)
      @html = html
      @uri = uri
    end

    # @return [Array<String>] unique, normalized URLs of the anchors on the
    #   page whose host is the page's host or one of its subdomains
    def links
      @links ||= sanitize_urls(document.css('a[href]').map { |a| a["href"] })
    end

    # @return [Array<String>] unique, normalized URLs of everything carrying a
    #   +src+ attribute, of stylesheets and of og:image meta tags, whatever
    #   host they live on
    def assets
      @assets ||= begin
        urls = document.css('[src]').map { |node| node["src"] }
        urls += document.css('link[rel="stylesheet"][href]').map { |node| node["href"] }
        urls += document.css('meta[name^="og:image"]').map { |node| node["content"] }

        sanitize_urls(urls, external_links: true)
      end
    end

    private

    def document
      @document ||= Nokogiri::HTML.parse(@html)
    end

    # Normalizes raw attribute values into absolute URL strings.
    #
    # Missing values, mailto links, opaque URIs (e.g. +javascript:+, +tel:+)
    # and unparsable URLs are skipped rather than aborting the whole list.
    #
    # @param urls [Array<String, nil>] raw attribute values
    # @param external_links [Boolean] keep URLs on foreign hosts when true
    # @return [Array<String>] unique normalized URLs in first-seen order
    def sanitize_urls(urls, external_links: false)
      urls.compact
          .reject { |url| url =~ MAILTO_REGEX }
          .map { |url| normalize_url(url, external_links) }
          .compact
          .uniq
    end

    # Drops the fragment, escapes the URL, fills in the page's host and scheme
    # where absent, and strips one trailing slash.
    #
    # @return [String, nil] nil when the URL is skipped
    def normalize_url(url, external_links)
      uri = URI.parse(url_escaper.escape(url.sub(/#.*$/, '')))
      return if uri.opaque # a host cannot be assigned to an opaque URI

      uri.host ||= @uri.host
      uri.scheme ||= @uri.scheme
      uri.to_s.sub(/\/$/, '') if external_links || internal_host?(uri.host)
    rescue URI::InvalidURIError
      nil
    end

    # True when +host+ is the page's host or a subdomain of it. Compared as
    # strings so that dots are literal and "notexample.com" does not pass for
    # "example.com".
    def internal_host?(host)
      base = @uri.host.to_s.downcase
      candidate = host.to_s.downcase
      candidate == base || candidate.end_with?(".#{base}")
    end

    # URI.escape was removed in Ruby 3.0; the RFC 2396 parser still provides
    # the same escaping.
    def url_escaper
      defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
    end
  end
end
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
|
|
3
|
+
<!--[if gt IE 8]><!--> <html lang="en"> <!--<![endif]-->
|
|
4
|
+
<head>
|
|
5
|
+
<meta charset="utf-8">
|
|
6
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
|
7
|
+
<title>The UK’s #1 for online Direct Debit - GoCardless</title>
|
|
8
|
+
<meta name="description" content="GoCardless is the UK's #1 for Direct Debit, serving more companies than any other provider. Perfect for recurring billing and B2B invoicing.">
|
|
9
|
+
<meta name="viewport" content="width=device-width">
|
|
10
|
+
<meta name="og:image" content="https://gocardless.com/images/logos/gocardless-square.png">
|
|
11
|
+
<meta name="og:image:secure_url" content="https://gocardless.com/images/logos/gocardless-square.png">
|
|
12
|
+
<meta name="google-site-verification" content="Y80kah87ghJhwiDqw-5ap234p9wCcGt6kMRxvnamtHU">
|
|
13
|
+
<link href="https://plus.google.com/+Gocardless" rel="publisher">
|
|
14
|
+
<!--[if lt IE 9]>
|
|
15
|
+
<script>
|
|
16
|
+
(function() {
|
|
17
|
+
var elems = 'article aside details figcaption figure footer header hgroup main nav section summary'.split(' ');
|
|
18
|
+
var length = elems.length;
|
|
19
|
+
var index = 0;
|
|
20
|
+
for(;index < length; index++){
|
|
21
|
+
document.createElement(elems[index]);
|
|
22
|
+
}
|
|
23
|
+
})();
|
|
24
|
+
</script>
|
|
25
|
+
<![endif]-->
|
|
26
|
+
<link rel="stylesheet" href="/css/main.css">
|
|
27
|
+
<link rel="stylesheet" href="/css/fonts.css">
|
|
28
|
+
</head>
|
|
29
|
+
<body>
|
|
30
|
+
|
|
31
|
+
<script src="//cdn.optimizely.com/js/125150657.js"></script>
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
<!--[if lt IE 9]>
|
|
35
|
+
<p class="browsehappy">You are using an <strong>outdated</strong> browser.
|
|
36
|
+
Please <a href="http://browsehappy.com/">upgrade your browser</a> to improve your experience.</p>
|
|
37
|
+
<![endif]-->
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
<div class="worry-container">
|
|
42
|
+
|
|
43
|
+
<div class="hero-worry hero-worry--girl u-center u-text-center">
|
|
44
|
+
<div class="site-container site-gutter u-cf align-btn-small u-padding-Txs u-padding-Bm">
|
|
45
|
+
<ul class="u-cf u-pull-end">
|
|
46
|
+
<li class="u-pull-start">
|
|
47
|
+
<a href="/users/sign_in/" class="btn btn--invert-hollow btn--small" id="login">Login</a>
|
|
48
|
+
</li>
|
|
49
|
+
<li class="u-pull-start u-margin-Ls">
|
|
50
|
+
<a href="/merchants/new" class="btn btn--invert-hollow btn--small" id="nav_sign_up">Sign up</a>
|
|
51
|
+
</li>
|
|
52
|
+
</ul>
|
|
53
|
+
</div>
|
|
54
|
+
|
|
55
|
+
<!-- Example video -->
|
|
56
|
+
<video src="https://pdlvimeocdn-a.akamaihd.net/45126/030/267925344.mp4"></video>
|
|
57
|
+
|
|
58
|
+
<div class="site-container">
|
|
59
|
+
<div class="grid site-gutter">
|
|
60
|
+
<div class="grid__cell u-size1of2 u-text-center u-text-heading hero-worry__heading">
|
|
61
|
+
<h1 class="u-color-invert hero-heading u-text-light">Stop worrying about online payments</h1>
|
|
62
|
+
<i class="icon-gc-logo-invert u-margin-Tm"></i>
|
|
63
|
+
</div>
|
|
64
|
+
<div class="grid__cell u-size1of2">
|
|
65
|
+
<div class="hero-worry__modal"></div>
|
|
66
|
+
</div>
|
|
67
|
+
</div>
|
|
68
|
+
</div>
|
|
69
|
+
|
|
70
|
+
</div>
|
|
71
|
+
|
|
72
|
+
<div class="u-background-white">
|
|
73
|
+
<div class="site-container u-text-center u-padding-Vl">
|
|
74
|
+
|
|
75
|
+
<div class="u-padding-Vl"><div class="u-padding-Vs">
|
|
76
|
+
<h2 class="u-text-heading u-color-meta">GoCardless makes it quick and easy to take Direct Debit</h2>
|
|
77
|
+
<div class="u-size5of12 u-center u-padding-Vm">
|
|
78
|
+
<div class="grid">
|
|
79
|
+
<div class="grid__cell u-size1of2">
|
|
80
|
+
<a href="/watch-a-demo" class="btn btn--block btn--info btn--large">Watch a demo</a>
|
|
81
|
+
</div>
|
|
82
|
+
<div class="grid__cell u-size1of2">
|
|
83
|
+
<a href="/features" class="btn btn--block btn--hollow btn--large">Learn more</a>
|
|
84
|
+
</div>
|
|
85
|
+
</div>
|
|
86
|
+
</div>
|
|
87
|
+
</div></div>
|
|
88
|
+
|
|
89
|
+
<hr class="horizontal-ruler-top-double">
|
|
90
|
+
<div class="u-padding-Vl"><div class="u-padding-Vl">
|
|
91
|
+
<i class="benefits-led-container__people-icon u-margin-Bs"></i>
|
|
92
|
+
<h1 class="u-text-heading u-text-hero u-color-primary">SIMPLE TO USE</h1>
|
|
93
|
+
<p>Sign up for free, set up in minutes. Manage everything online.</p>
|
|
94
|
+
</div></div>
|
|
95
|
+
|
|
96
|
+
<hr class="horizontal-ruler-top-double">
|
|
97
|
+
<div class="u-padding-Vl"><div class="u-padding-Vl">
|
|
98
|
+
<i class="benefits-led-container__money-icon u-margin-Bs"></i>
|
|
99
|
+
<h1 class="u-text-heading u-text-hero u-color-accent">AMAZING VALUE</h1>
|
|
100
|
+
<p>1% up to £2 with no other fees. Incredible, tailored rates for high volume.</p>
|
|
101
|
+
</div></div>
|
|
102
|
+
|
|
103
|
+
<hr class="horizontal-ruler-top-double">
|
|
104
|
+
<div class="u-padding-Vl"><div class="u-padding-Vl">
|
|
105
|
+
<i class="benefits-led-container__service-icon u-margin-Bs"></i>
|
|
106
|
+
<h1 class="u-text-heading u-text-hero u-color-secondary">LEGENDARY SERVICE</h1>
|
|
107
|
+
<p>Our personal support team are the best in the industry.</p>
|
|
108
|
+
</div></div>
|
|
109
|
+
|
|
110
|
+
<hr class="horizontal-ruler-top-double">
|
|
111
|
+
<div class="u-padding-Vl"><div class="u-padding-Vl">
|
|
112
|
+
<p class="u-margin-Bl">We've been featured in</p>
|
|
113
|
+
<div class="benefits-led-container__publications u-center"></div>
|
|
114
|
+
</div></div>
|
|
115
|
+
|
|
116
|
+
<hr class="horizontal-ruler-top-double u-margin-Vl">
|
|
117
|
+
<div class="u-padding-Vl"><div class="u-padding-Vs">
|
|
118
|
+
<p class="u-text-h2 u-text-heading u-color-meta">Get started today. <a href="/merchants/new" class="u-text-underline u-link-secondary">Sign up for free</a></p>
|
|
119
|
+
</div></div>
|
|
120
|
+
</div>
|
|
121
|
+
</div>
|
|
122
|
+
</div>
|
|
123
|
+
|
|
124
|
+
<div class="u-margin-Ts">
|
|
125
|
+
<div class="site-container site-gutter">
|
|
126
|
+
<div>
|
|
127
|
+
<div class="u-cf">
|
|
128
|
+
<ul class="nav u-padding-Vs u-pull-start">
|
|
129
|
+
<li class="nav__item">
|
|
130
|
+
<a href="https://help.gocardless.com#some-id">Help</a>
|
|
131
|
+
</li>
|
|
132
|
+
<li class="nav__item">
|
|
133
|
+
<a href="/contact-sales">Contact sales</a>
|
|
134
|
+
</li>
|
|
135
|
+
<li class="nav__item">
|
|
136
|
+
<a href="/faq/merchants" ng-gc-href-active>FAQ</a>
|
|
137
|
+
</li>
|
|
138
|
+
<li class="nav__item">
|
|
139
|
+
<a href="/direct-debit">Direct Debit</a>
|
|
140
|
+
</li>
|
|
141
|
+
<li class="nav__item">
|
|
142
|
+
<a href="/direct-debit/sepa/">SEPA</a>
|
|
143
|
+
</li>
|
|
144
|
+
<li class="nav__item">
|
|
145
|
+
<a href="/security" ng-gc-href-active>Security</a>
|
|
146
|
+
</li>
|
|
147
|
+
<li class="nav__item">
|
|
148
|
+
<a href="https://developer.gocardless.com">API</a>
|
|
149
|
+
</li>
|
|
150
|
+
<li class="nav__item">
|
|
151
|
+
<a href="/legal" ng-gc-href-active>Legal</a>
|
|
152
|
+
</li>
|
|
153
|
+
<li class="nav__item">
|
|
154
|
+
<a href="/about" ng-gc-href-active>About</a>
|
|
155
|
+
</li>
|
|
156
|
+
<li class="nav__item">
|
|
157
|
+
<a href="/jobs" ng-gc-href-active>Jobs</a>
|
|
158
|
+
</li>
|
|
159
|
+
<li class="nav__item">
|
|
160
|
+
<a href="/press" ng-gc-href-active>Press</a>
|
|
161
|
+
</li>
|
|
162
|
+
<li class="nav__item">
|
|
163
|
+
<a href="https://gocardless.com/blog">Blog</a>
|
|
164
|
+
</li>
|
|
165
|
+
</ul>
|
|
166
|
+
<div class="u-pull-end">
|
|
167
|
+
<a href="https://twitter.com/gocardless" class="u-margin-Ts twitter-follow-button" data-show-count="false" data-dnt="true">Follow @gocardless</a>
|
|
168
|
+
<script src="//platform.twitter.com/widgets.js" async id="twitter-wjs"></script>
|
|
169
|
+
</div>
|
|
170
|
+
</div>
|
|
171
|
+
<div class="grid u-margin-Ts">
|
|
172
|
+
<div class="u-text-h5 u-size3of12 u-padding-Vl grid__cell">
|
|
173
|
+
<b>GoCardless Ltd</b><br>
|
|
174
|
+
338-346 Goswell Road<br>London, EC1V 7LQ<br>
|
|
175
|
+
020 7183 8674<br>
|
|
176
|
+
<a href="mailto:help@gocardless.com">help@gocardless.com</a><br>
|
|
177
|
+
</div>
|
|
178
|
+
|
|
179
|
+
<div class="u-size3of12 grid__cell u-text-h5 u-padding-Vl ">
|
|
180
|
+
<p>
|
|
181
|
+
GoCardless is regulated by the <strong>Financial Conduct
|
|
182
|
+
Authority</strong> as an Authorised Payment Institution.
|
|
183
|
+
</p>
|
|
184
|
+
</div>
|
|
185
|
+
|
|
186
|
+
<div class="grid__cell u-size6of12 u-margin-Tm u-text-end">
|
|
187
|
+
<img alt="Footer logos" class="footer__logos" src="/images/footer/footer-logos@2x.png">
|
|
188
|
+
</div>
|
|
189
|
+
</div>
|
|
190
|
+
</div>
|
|
191
|
+
|
|
192
|
+
</div>
|
|
193
|
+
</div>
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
<script src="/js/vendor.js"></script>
|
|
198
|
+
<script src="/js/main.js"></script>
|
|
199
|
+
|
|
200
|
+
<!-- Google Tag Manager -->
|
|
201
|
+
<script>
|
|
202
|
+
dataLayer = [];
|
|
203
|
+
</script>
|
|
204
|
+
<script src="//www.googletagmanager.com/gtm.js?id=GTM-PRFKNC" async></script>
|
|
205
|
+
<script>
|
|
206
|
+
(function(w,l){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});})(window, 'dataLayer');
|
|
207
|
+
</script>
|
|
208
|
+
</body>
|
|
209
|
+
</html>
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<head>
|
|
3
|
+
<meta charset="utf-8">
|
|
4
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
|
5
|
+
<title>Fake root</title>
|
|
6
|
+
<link rel="stylesheet" href="/styles.css">
|
|
7
|
+
</head>
|
|
8
|
+
<body>
|
|
9
|
+
<a href="http://example.com/section-1"></a>
|
|
10
|
+
<a href="http://example.com/section-2"></a>
|
|
11
|
+
</body>
|
|
12
|
+
</html>
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<head>
|
|
3
|
+
<meta charset="utf-8">
|
|
4
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
|
5
|
+
<title>Section 1</title>
|
|
6
|
+
<link rel="stylesheet" href="/styles.css">
|
|
7
|
+
</head>
|
|
8
|
+
<body>
|
|
9
|
+
<a href="http://example.com/section-1-1"></a>
|
|
10
|
+
</body>
|
|
11
|
+
</html>
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<head>
|
|
3
|
+
<meta charset="utf-8">
|
|
4
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
|
5
|
+
<title>Section 2</title>
|
|
6
|
+
<link rel="stylesheet" href="/styles.css">
|
|
7
|
+
</head>
|
|
8
|
+
<body>
|
|
9
|
+
<!-- Also points to section 1.1 -->
|
|
10
|
+
<a href="http://example.com/section-1-1"></a>
|
|
11
|
+
</body>
|
|
12
|
+
</html>
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Exercises Crawler#crawl against four fixture pages served through a stubbed
# Net::HTTP.get: the root links to section-1 and section-2, and both of those
# link to section-1-1.
describe Crawlette::Crawler do
  subject(:crawler) { Crawlette::Crawler.new('http://example.com') }
  let(:root) { File.read('spec/files/root.html') }
  let(:s1)   { File.read('spec/files/section-1.html') }
  let(:s2)   { File.read('spec/files/section-2.html') }
  let(:s1_1) { File.read('spec/files/section-1-1.html') }

  describe '#crawl' do
    before do
      expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com')) { root }
      expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com/section-1')) { s1 }
      expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com/section-2')) { s2 }
      # section-1-1 is queued twice (once per linking page) and both entries run
      # in the same thread batch. Whether the second thread still fetches it
      # depends on thread scheduling, so only require at least one fetch.
      expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com/section-1-1')).at_least(:once) { s1_1 }
    end

    it "returns a hash with the crawled urls as keys and hashes with the links and assets of each one" do
      expect(crawler.crawl).to eq(
        "http://example.com" => {
          "links" => ["http://example.com/section-1", "http://example.com/section-2"],
          "assets" => ["http://example.com/styles.css"]
        },
        "http://example.com/section-2" => {
          "links" => ["http://example.com/section-1-1"],
          "assets" => ["http://example.com/styles.css"]
        },
        "http://example.com/section-1" => {
          "links" => ["http://example.com/section-1-1"],
          "assets" => ["http://example.com/styles.css"]
        },
        "http://example.com/section-1-1" => {
          "links" => [],
          "assets" => ["http://example.com/styles.css"]
        }
      )
    end
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Checks link and asset extraction against the saved page fixture in
# spec/files/page.html.
describe Crawlette::Page do
  subject(:page) { Crawlette::Page.new(html, URI.parse('https://gocardless.com')) }
  let(:html) { File.read('spec/files/page.html') }

  describe '#links' do
    it "returns a unique list of normalized non-external urls" do
      # Order is irrelevant; the set of urls must match exactly.
      expect(page.links).to contain_exactly(
        "https://gocardless.com/users/sign_in",
        "https://gocardless.com/merchants/new",
        "https://gocardless.com/watch-a-demo",
        "https://gocardless.com/features",
        "https://help.gocardless.com",
        "https://gocardless.com/contact-sales",
        "https://gocardless.com/faq/merchants",
        "https://gocardless.com/direct-debit",
        "https://gocardless.com/direct-debit/sepa",
        "https://gocardless.com/security",
        "https://developer.gocardless.com",
        "https://gocardless.com/legal",
        "https://gocardless.com/about",
        "https://gocardless.com/jobs",
        "https://gocardless.com/press",
        "https://gocardless.com/blog"
      )
    end
  end

  describe '#assets' do
    it 'returns a unique list with the normalized urls of the static assets on this page' do
      # Assets on foreign hosts are expected too, with the page's scheme applied
      # to scheme-relative urls.
      expect(page.assets).to contain_exactly(
        "https://cdn.optimizely.com/js/125150657.js",
        "https://pdlvimeocdn-a.akamaihd.net/45126/030/267925344.mp4",
        "https://platform.twitter.com/widgets.js",
        "https://gocardless.com/images/footer/footer-logos@2x.png",
        "https://gocardless.com/js/vendor.js",
        "https://gocardless.com/js/main.js",
        "https://www.googletagmanager.com/gtm.js?id=GTM-PRFKNC",
        "https://gocardless.com/css/main.css",
        "https://gocardless.com/css/fonts.css",
        "https://gocardless.com/images/logos/gocardless-square.png"
      )
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: crawlette
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Miguel Camba
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2014-09-14 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: nokogiri
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '1.6'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '1.6'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: awesome_print
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '1.2'
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '1.2'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: bundler
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - "~>"
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '1.6'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - "~>"
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '1.6'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: rake
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - "~>"
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '10.0'
|
|
62
|
+
type: :development
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - "~>"
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '10.0'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: rspec
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - "~>"
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '3.1'
|
|
76
|
+
type: :development
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - "~>"
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '3.1'
|
|
83
|
+
description: Crawls a page, with no limits and without visiting external domains
|
|
84
|
+
email:
|
|
85
|
+
- miguel.camba@gmail.com
|
|
86
|
+
executables:
|
|
87
|
+
- crawlette
|
|
88
|
+
extensions: []
|
|
89
|
+
extra_rdoc_files: []
|
|
90
|
+
files:
|
|
91
|
+
- ".gitignore"
|
|
92
|
+
- ".rspec"
|
|
93
|
+
- Gemfile
|
|
94
|
+
- LICENSE.txt
|
|
95
|
+
- README.md
|
|
96
|
+
- Rakefile
|
|
97
|
+
- bin/crawlette
|
|
98
|
+
- crawlette.gemspec
|
|
99
|
+
- lib/crawlette.rb
|
|
100
|
+
- lib/crawlette/crawler.rb
|
|
101
|
+
- lib/crawlette/page.rb
|
|
102
|
+
- lib/crawlette/version.rb
|
|
103
|
+
- spec/files/page.html
|
|
104
|
+
- spec/files/root.html
|
|
105
|
+
- spec/files/section-1-1.html
|
|
106
|
+
- spec/files/section-1.html
|
|
107
|
+
- spec/files/section-2.html
|
|
108
|
+
- spec/lib/crawler_spec.rb
|
|
109
|
+
- spec/lib/page_spec.rb
|
|
110
|
+
- spec/spec_helper.rb
|
|
111
|
+
homepage: ''
|
|
112
|
+
licenses:
|
|
113
|
+
- MIT
|
|
114
|
+
metadata: {}
|
|
115
|
+
post_install_message:
|
|
116
|
+
rdoc_options: []
|
|
117
|
+
require_paths:
|
|
118
|
+
- lib
|
|
119
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
120
|
+
requirements:
|
|
121
|
+
- - ">="
|
|
122
|
+
- !ruby/object:Gem::Version
|
|
123
|
+
version: '0'
|
|
124
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
125
|
+
requirements:
|
|
126
|
+
- - ">="
|
|
127
|
+
- !ruby/object:Gem::Version
|
|
128
|
+
version: '0'
|
|
129
|
+
requirements: []
|
|
130
|
+
rubyforge_project:
|
|
131
|
+
rubygems_version: 2.4.1
|
|
132
|
+
signing_key:
|
|
133
|
+
specification_version: 4
|
|
134
|
+
summary: Very simple web crawler
|
|
135
|
+
test_files:
|
|
136
|
+
- spec/files/page.html
|
|
137
|
+
- spec/files/root.html
|
|
138
|
+
- spec/files/section-1-1.html
|
|
139
|
+
- spec/files/section-1.html
|
|
140
|
+
- spec/files/section-2.html
|
|
141
|
+
- spec/lib/crawler_spec.rb
|
|
142
|
+
- spec/lib/page_spec.rb
|
|
143
|
+
- spec/spec_helper.rb
|