crawlette 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 88877d0289d6126cd7b340ce741db5da6c40ac3e
4
+ data.tar.gz: 08e58934ae52afcce2f5c7ab371f9ac55cfd5bfa
5
+ SHA512:
6
+ metadata.gz: 06571be5090c0e02d374fb01f89c07f377761c498ff72ff28b437d6a97da7440baa3cd891230281554f21391839848e7761fa236f9af0154b96813325a117b06
7
+ data.tar.gz: 3f03a72370cdce93f983a633f57229132ee55fc8ae254fbfa3f89ef6310b10b2b24cfda848088042dc5f8554b67163a873b3d1a209407da624690552c9fe5da1
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in crawlette.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Miguel Camba
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,11 @@
1
+ # Crawlette
2
+
3
+ Very simple command-line utility to crawl a URL and show the links and assets of each page.
4
+
5
+ ## Installation
6
+
7
+ $ gem install crawlette
8
+
9
+ ## Usage
10
+
11
+ $ crawlette http://miguelcamba.com
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/bin/crawlette ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env ruby

# Command-line entry point: crawls the URL given as the first argument and
# pretty-prints the resulting sitemap with awesome_print.

# When run from a source checkout, make the sibling lib/ directory loadable.
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)

require 'awesome_print'
require 'crawlette'

url = ARGV[0]

# Without this guard a missing argument reaches URI.parse(nil) and the user
# gets a backtrace instead of a hint.
abort 'Usage: crawlette URL (e.g. crawlette http://example.com)' if url.nil? || url.strip.empty?

begin
  ap Crawlette::Crawler.new(url).crawl
rescue Crawlette::Crawler::BadUrlError, URI::InvalidURIError => e
  # Report unusable URLs as a one-line message on stderr with a non-zero exit.
  abort e.message
end
data/crawlette.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'crawlette/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "crawlette"
8
+ spec.version = Crawlette::VERSION
9
+ spec.authors = ["Miguel Camba"]
10
+ spec.email = ["miguel.camba@gmail.com"]
11
+ spec.summary = %q{Very simple web crawler}
12
+ spec.description = %q{Crawls a page, with no limits and without visiting external domains}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_runtime_dependency "nokogiri", '~> 1.6'
22
+ spec.add_runtime_dependency "awesome_print", '~> 1.2'
23
+ spec.add_development_dependency "bundler", "~> 1.6"
24
+ spec.add_development_dependency "rake", "~> 10.0"
25
+ spec.add_development_dependency "rspec", "~> 3.1"
26
+ end
data/lib/crawlette.rb ADDED
@@ -0,0 +1,2 @@
1
+ require "crawlette/version"
2
+ require "crawlette/crawler"
@@ -0,0 +1,77 @@
1
+ require 'uri'
2
+ require 'net/http'
3
+ require 'crawlette/page'
4
+
5
module Crawlette
  # Crawls a site starting from one URL and builds a sitemap describing, for
  # every visited page, the links it contains and the static assets it uses.
  #
  # Pages are fetched in batches of up to MAX_THREADS concurrent threads.
  # Worker threads only fetch and parse; the calling thread alone updates the
  # sitemap and the queue of pending URIs, so no shared state is mutated
  # concurrently.
  class Crawler
    # Maximum number of pages fetched concurrently in one batch.
    MAX_THREADS = 8
    # Raised when the starting URL lacks a scheme or a host.
    BadUrlError = Class.new(ArgumentError)

    # url     - String with the absolute URL to start crawling from.
    # sitemap - Hash used to accumulate results; URLs already present in it
    #           (with a truthy value) are not fetched again.
    #
    # Raises BadUrlError if the URL has no host or no scheme.
    def initialize(url, sitemap = {})
      @uri = URI.parse(url)
      @pending_uris = [@uri]
      @sitemap = sitemap
      unless @uri.host && @uri.scheme
        fail BadUrlError, "Invalid url: You must provide a full qualified url"
      end
    end

    # Crawl a web page and generate a sitemap that must also contain:
    #
    # * Links between pages.
    # * On which static assets each page depends.
    #
    # Example:
    #
    #   Crawlette::Crawler.new('http://example.com').crawl
    #   # => {
    #     'http://example.com' => {
    #       'assets' => ['http://example.com/image1.png', 'http://example.com/script1.js'],
    #       'links'  => ['http://example.com/watch-a-demo', 'http://example.com/features'],
    #     },
    #     'http://example.com/watch-a-demo' => {
    #       'assets' => ['http://example.com/image2.png', 'http://example.com/script2.js'],
    #       'links'  => ['http://example.com/whatever1', 'http://example.com/whatever2'],
    #     },
    #     'http://example.com/features' => {
    #       ...
    #     },
    #   }
    #
    # Returns the sitemap Hash (the same object given to #initialize, if any).
    # Pages that cannot be fetched are reported on stdout and left out.
    def crawl
      until @pending_uris.empty?
        # NOTE(review): duplicates inside one batch are each fetched; the
        # bundled crawler spec expects that (`.twice`), so de-duplication is
        # left for a change that updates the spec as well.
        batch = @pending_uris.pop(MAX_THREADS)
        workers = batch.map { |uri| Thread.new { fetch_page(uri) } }

        # Thread#value joins the worker; merging happens only on this thread.
        batch.zip(workers).each do |uri, worker|
          entry, discovered = worker.value
          next unless entry

          key = uri.to_s
          next if @sitemap[key]

          @sitemap[key] = entry
          @pending_uris.push(*discovered)
        end
      end
      @sitemap
    end

    private

    # Fetches and parses a single page without touching shared state.
    #
    # Returns [entry, discovered_uris], where entry is the
    # { 'links' => [...], 'assets' => [...] } Hash for the page, or nil when
    # the page is already in the sitemap or could not be fetched.
    def fetch_page(uri)
      return if @sitemap[uri.to_s]

      puts "... Fetching #{uri.to_s}"
      page = Page.new(Net::HTTP.get(uri), uri)
      discovered = page.links.map { |url| URI.parse(url) }
      [{ 'links' => page.links, 'assets' => page.assets }, discovered]
    rescue => e
      # Report the URI that actually failed, not the crawl's root URI.
      puts "ERROR! Cannot fetch #{uri}: #{e.message}"
      nil
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require 'nokogiri'
2
+
3
module Crawlette
  # Wraps the HTML of one fetched page and extracts the URLs it references,
  # normalized to absolute form relative to the page's own URI.
  class Page
    # Matches mailto: links, which are never crawled or listed.
    MAILTO_REGEX = /\Amailto:/i
    # Characters that get percent-encoded before parsing. `%` is deliberately
    # treated as safe so sequences that are already encoded (e.g. "%20") are
    # not encoded a second time.
    UNSAFE_URL_CHARS = %r{[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]%]}

    attr_reader :uri

    # html - String with the page markup.
    # uri  - URI the page was fetched from; used as the base for relative
    #        references and to decide which hosts count as internal.
    def initialize(html, uri)
      @html = html
      @uri = uri
    end

    # Returns a unique Array of absolute URL Strings for the `a[href]` links
    # that point to the page's own host or one of its subdomains.
    def links
      @links ||= sanitize_urls(document.css('a[href]').map { |a| a["href"] })
    end

    # Returns a unique Array of absolute URL Strings for everything with a
    # `src` attribute, stylesheet links and og:image meta tags. External hosts
    # are included.
    def assets
      @assets ||= begin
        urls = document.css('[src]').map { |node| node["src"] }
        urls += document.css('link[rel="stylesheet"][href]').map { |node| node["href"] }
        urls += document.css('meta[name^="og:image"]').map { |node| node["content"] }

        sanitize_urls(urls, external_links: true)
      end
    end

    private

    def document
      @document ||= Nokogiri::HTML.parse(@html)
    end

    # Normalizes every URL and drops the unusable ones and the duplicates.
    def sanitize_urls(urls, external_links: false)
      urls.map { |url| normalize_url(url, external_links) }.compact.uniq
    end

    # Returns the absolute form of +url+ without fragment or trailing slash,
    # or nil when it must be ignored: missing attribute, mailto:, references
    # without a host (javascript:, data:, tel:), unparsable URLs, and external
    # hosts unless +external_links+ is true.
    def normalize_url(url, external_links)
      return if url.nil?

      candidate = url.strip
      return if candidate =~ MAILTO_REGEX

      resolved = resolve(candidate.sub(/#.*\z/m, ''))
      return unless resolved.host
      return unless external_links || internal_host?(resolved.host)

      resolved.to_s.sub(%r{/\z}, '')
    rescue URI::Error
      # A single malformed link must not discard the rest of the page.
      nil
    end

    # Resolves a reference against the page URI.
    def resolve(reference)
      target = URI.parse(escape(reference))
      # No host: path-only or opaque reference, resolved with URI#merge so the
      # base path and port are honored.
      return @uri.merge(target) unless target.host

      # Protocol-relative ("//host/path") references inherit the page scheme.
      target.scheme ||= @uri.scheme
      target
    end

    # Percent-encodes the characters URI.parse would reject (replacement for
    # URI.escape, which no longer exists in Ruby 3).
    def escape(url)
      url.gsub(UNSAFE_URL_CHARS) do |chars|
        chars.each_byte.map { |byte| format('%%%02X', byte) }.join
      end
    end

    # True when +host+ is the page's host or one of its subdomains.
    def internal_host?(host)
      base = @uri.host.to_s.downcase
      name = host.downcase
      name == base || name.end_with?(".#{base}")
    end
  end
end
@@ -0,0 +1,3 @@
1
module Crawlette
  # Gem version string; read by crawlette.gemspec. Frozen so the constant
  # cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,209 @@
1
+ <!doctype html>
2
+ <!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
3
+ <!--[if gt IE 8]><!--> <html lang="en"> <!--<![endif]-->
4
+ <head>
5
+ <meta charset="utf-8">
6
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
7
+ <title>The UK’s #1 for online Direct Debit - GoCardless</title>
8
+ <meta name="description" content="GoCardless is the UK's #1 for Direct Debit, serving more companies than any other provider. Perfect for recurring billing and B2B invoicing.">
9
+ <meta name="viewport" content="width=device-width">
10
+ <meta name="og:image" content="https://gocardless.com/images/logos/gocardless-square.png">
11
+ <meta name="og:image:secure_url" content="https://gocardless.com/images/logos/gocardless-square.png">
12
+ <meta name="google-site-verification" content="Y80kah87ghJhwiDqw-5ap234p9wCcGt6kMRxvnamtHU">
13
+ <link href="https://plus.google.com/+Gocardless" rel="publisher">
14
+ <!--[if lt IE 9]>
15
+ <script>
16
+ (function() {
17
+ var elems = 'article aside details figcaption figure footer header hgroup main nav section summary'.split(' ');
18
+ var length = elems.length;
19
+ var index = 0;
20
+ for(;index < length; index++){
21
+ document.createElement(elems[index]);
22
+ }
23
+ })();
24
+ </script>
25
+ <![endif]-->
26
+ <link rel="stylesheet" href="/css/main.css">
27
+ <link rel="stylesheet" href="/css/fonts.css">
28
+ </head>
29
+ <body>
30
+
31
+ <script src="//cdn.optimizely.com/js/125150657.js"></script>
32
+
33
+
34
+ <!--[if lt IE 9]>
35
+ <p class="browsehappy">You are using an <strong>outdated</strong> browser.
36
+ Please <a href="http://browsehappy.com/">upgrade your browser</a> to improve your experience.</p>
37
+ <![endif]-->
38
+
39
+
40
+
41
+ <div class="worry-container">
42
+
43
+ <div class="hero-worry hero-worry--girl u-center u-text-center">
44
+ <div class="site-container site-gutter u-cf align-btn-small u-padding-Txs u-padding-Bm">
45
+ <ul class="u-cf u-pull-end">
46
+ <li class="u-pull-start">
47
+ <a href="/users/sign_in/" class="btn btn--invert-hollow btn--small" id="login">Login</a>
48
+ </li>
49
+ <li class="u-pull-start u-margin-Ls">
50
+ <a href="/merchants/new" class="btn btn--invert-hollow btn--small" id="nav_sign_up">Sign up</a>
51
+ </li>
52
+ </ul>
53
+ </div>
54
+
55
+ <!-- Example video -->
56
+ <video src="https://pdlvimeocdn-a.akamaihd.net/45126/030/267925344.mp4"></video>
57
+
58
+ <div class="site-container">
59
+ <div class="grid site-gutter">
60
+ <div class="grid__cell u-size1of2 u-text-center u-text-heading hero-worry__heading">
61
+ <h1 class="u-color-invert hero-heading u-text-light">Stop worrying about online payments</h1>
62
+ <i class="icon-gc-logo-invert u-margin-Tm"></i>
63
+ </div>
64
+ <div class="grid__cell u-size1of2">
65
+ <div class="hero-worry__modal"></div>
66
+ </div>
67
+ </div>
68
+ </div>
69
+
70
+ </div>
71
+
72
+ <div class="u-background-white">
73
+ <div class="site-container u-text-center u-padding-Vl">
74
+
75
+ <div class="u-padding-Vl"><div class="u-padding-Vs">
76
+ <h2 class="u-text-heading u-color-meta">GoCardless makes it quick and easy to take Direct Debit</h2>
77
+ <div class="u-size5of12 u-center u-padding-Vm">
78
+ <div class="grid">
79
+ <div class="grid__cell u-size1of2">
80
+ <a href="/watch-a-demo" class="btn btn--block btn--info btn--large">Watch a demo</a>
81
+ </div>
82
+ <div class="grid__cell u-size1of2">
83
+ <a href="/features" class="btn btn--block btn--hollow btn--large">Learn more</a>
84
+ </div>
85
+ </div>
86
+ </div>
87
+ </div></div>
88
+
89
+ <hr class="horizontal-ruler-top-double">
90
+ <div class="u-padding-Vl"><div class="u-padding-Vl">
91
+ <i class="benefits-led-container__people-icon u-margin-Bs"></i>
92
+ <h1 class="u-text-heading u-text-hero u-color-primary">SIMPLE TO USE</h1>
93
+ <p>Sign up for free, set up in minutes. Manage everything online.</p>
94
+ </div></div>
95
+
96
+ <hr class="horizontal-ruler-top-double">
97
+ <div class="u-padding-Vl"><div class="u-padding-Vl">
98
+ <i class="benefits-led-container__money-icon u-margin-Bs"></i>
99
+ <h1 class="u-text-heading u-text-hero u-color-accent">AMAZING VALUE</h1>
100
+ <p>1% up to £2 with no other fees. Incredible, tailored rates for high volume.</p>
101
+ </div></div>
102
+
103
+ <hr class="horizontal-ruler-top-double">
104
+ <div class="u-padding-Vl"><div class="u-padding-Vl">
105
+ <i class="benefits-led-container__service-icon u-margin-Bs"></i>
106
+ <h1 class="u-text-heading u-text-hero u-color-secondary">LEGENDARY SERVICE</h1>
107
+ <p>Our personal support team are the best in the industry.</p>
108
+ </div></div>
109
+
110
+ <hr class="horizontal-ruler-top-double">
111
+ <div class="u-padding-Vl"><div class="u-padding-Vl">
112
+ <p class="u-margin-Bl">We've been featured in</p>
113
+ <div class="benefits-led-container__publications u-center"></div>
114
+ </div></div>
115
+
116
+ <hr class="horizontal-ruler-top-double u-margin-Vl">
117
+ <div class="u-padding-Vl"><div class="u-padding-Vs">
118
+ <p class="u-text-h2 u-text-heading u-color-meta">Get started today. <a href="/merchants/new" class="u-text-underline u-link-secondary">Sign up for free</a></p>
119
+ </div></div>
120
+ </div>
121
+ </div>
122
+ </div>
123
+
124
+ <div class="u-margin-Ts">
125
+ <div class="site-container site-gutter">
126
+ <div>
127
+ <div class="u-cf">
128
+ <ul class="nav u-padding-Vs u-pull-start">
129
+ <li class="nav__item">
130
+ <a href="https://help.gocardless.com#some-id">Help</a>
131
+ </li>
132
+ <li class="nav__item">
133
+ <a href="/contact-sales">Contact sales</a>
134
+ </li>
135
+ <li class="nav__item">
136
+ <a href="/faq/merchants" ng-gc-href-active>FAQ</a>
137
+ </li>
138
+ <li class="nav__item">
139
+ <a href="/direct-debit">Direct Debit</a>
140
+ </li>
141
+ <li class="nav__item">
142
+ <a href="/direct-debit/sepa/">SEPA</a>
143
+ </li>
144
+ <li class="nav__item">
145
+ <a href="/security" ng-gc-href-active>Security</a>
146
+ </li>
147
+ <li class="nav__item">
148
+ <a href="https://developer.gocardless.com">API</a>
149
+ </li>
150
+ <li class="nav__item">
151
+ <a href="/legal" ng-gc-href-active>Legal</a>
152
+ </li>
153
+ <li class="nav__item">
154
+ <a href="/about" ng-gc-href-active>About</a>
155
+ </li>
156
+ <li class="nav__item">
157
+ <a href="/jobs" ng-gc-href-active>Jobs</a>
158
+ </li>
159
+ <li class="nav__item">
160
+ <a href="/press" ng-gc-href-active>Press</a>
161
+ </li>
162
+ <li class="nav__item">
163
+ <a href="https://gocardless.com/blog">Blog</a>
164
+ </li>
165
+ </ul>
166
+ <div class="u-pull-end">
167
+ <a href="https://twitter.com/gocardless" class="u-margin-Ts twitter-follow-button" data-show-count="false" data-dnt="true">Follow @gocardless</a>
168
+ <script src="//platform.twitter.com/widgets.js" async id="twitter-wjs"></script>
169
+ </div>
170
+ </div>
171
+ <div class="grid u-margin-Ts">
172
+ <div class="u-text-h5 u-size3of12 u-padding-Vl grid__cell">
173
+ <b>GoCardless Ltd</b><br>
174
+ 338-346 Goswell Road<br>London, EC1V 7LQ<br>
175
+ 020 7183 8674<br>
176
+ <a href="mailto:help@gocardless.com">help@gocardless.com</a><br>
177
+ </div>
178
+
179
+ <div class="u-size3of12 grid__cell u-text-h5 u-padding-Vl ">
180
+ <p>
181
+ GoCardless is regulated by the <strong>Financial Conduct
182
+ Authority</strong> as an Authorised Payment Institution.
183
+ </p>
184
+ </div>
185
+
186
+ <div class="grid__cell u-size6of12 u-margin-Tm u-text-end">
187
+ <img alt="Footer logos" class="footer__logos" src="/images/footer/footer-logos@2x.png">
188
+ </div>
189
+ </div>
190
+ </div>
191
+
192
+ </div>
193
+ </div>
194
+
195
+
196
+
197
+ <script src="/js/vendor.js"></script>
198
+ <script src="/js/main.js"></script>
199
+
200
+ <!-- Google Tag Manager -->
201
+ <script>
202
+ dataLayer = [];
203
+ </script>
204
+ <script src="//www.googletagmanager.com/gtm.js?id=GTM-PRFKNC" async></script>
205
+ <script>
206
+ (function(w,l){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});})(window, 'dataLayer');
207
+ </script>
208
+ </body>
209
+ </html>
@@ -0,0 +1,12 @@
1
+ <!doctype html>
2
+ <head>
3
+ <meta charset="utf-8">
4
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
5
+ <title>Fake root</title>
6
+ <link rel="stylesheet" href="/styles.css">
7
+ </head>
8
+ <body>
9
+ <a href="http://example.com/section-1"></a>
10
+ <a href="http://example.com/section-2"></a>
11
+ </body>
12
+ </html>
@@ -0,0 +1,11 @@
1
+ <!doctype html>
2
+ <head>
3
+ <meta charset="utf-8">
4
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
5
+ <title>Section 1-1</title>
6
+ <link rel="stylesheet" href="/styles.css">
7
+ </head>
8
+ <body>
9
+ no more links
10
+ </body>
11
+ </html>
@@ -0,0 +1,11 @@
1
+ <!doctype html>
2
+ <head>
3
+ <meta charset="utf-8">
4
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
5
+ <title>Section 1</title>
6
+ <link rel="stylesheet" href="/styles.css">
7
+ </head>
8
+ <body>
9
+ <a href="http://example.com/section-1-1"></a>
10
+ </body>
11
+ </html>
@@ -0,0 +1,12 @@
1
+ <!doctype html>
2
+ <head>
3
+ <meta charset="utf-8">
4
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
5
+ <title>Section 2</title>
6
+ <link rel="stylesheet" href="/styles.css">
7
+ </head>
8
+ <body>
9
+ <!-- Also points to section 1.1 -->
10
+ <a href="http://example.com/section-1-1"></a>
11
+ </body>
12
+ </html>
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Crawlette::Crawler do
4
+ subject(:crawler) { Crawlette::Crawler.new('http://example.com') }
5
+ let(:root) { File.read('spec/files/root.html') }
6
+ let(:s1) { File.read('spec/files/section-1.html') }
7
+ let(:s2) { File.read('spec/files/section-2.html') }
8
+ let(:s1_1) { File.read('spec/files/section-1-1.html') }
9
+
10
+
11
+ describe '#crawl' do
12
+ before do
13
+ expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com')){ root }
14
+ expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com/section-1')){ s1 }
15
+ expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com/section-2')){ s2 }
16
+ expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com/section-1-1')).twice{ s1_1 }
17
+ end
18
+
19
+ it "returns a hash with the crawled urls as keys and hashes with the links and assets of each one" do
20
+ expect(crawler.crawl).to eq(
21
+ "http://example.com" => {
22
+ "links"=>["http://example.com/section-1", "http://example.com/section-2"],
23
+ "assets"=>["http://example.com/styles.css"]
24
+ },
25
+ "http://example.com/section-2"=>{
26
+ "links"=>["http://example.com/section-1-1"],
27
+ "assets"=>["http://example.com/styles.css"]
28
+ },
29
+ "http://example.com/section-1"=>{
30
+ "links"=>["http://example.com/section-1-1"],
31
+ "assets"=>["http://example.com/styles.css"]
32
+ },
33
+ "http://example.com/section-1-1"=>{
34
+ "links"=>[],
35
+ "assets"=>["http://example.com/styles.css"]
36
+ }
37
+ )
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,49 @@
1
+ require 'spec_helper'
2
+
3
+ describe Crawlette::Page do
4
+ subject(:page) { Crawlette::Page.new(html, URI.parse('https://gocardless.com')) }
5
+ let(:html) { File.read('spec/files/page.html') }
6
+
7
+ describe '#links' do
8
+ it "returns a unique list of normalized non-external urls" do
9
+ expected_links = [
10
+ "https://gocardless.com/users/sign_in",
11
+ "https://gocardless.com/merchants/new",
12
+ "https://gocardless.com/watch-a-demo",
13
+ "https://gocardless.com/features",
14
+ "https://help.gocardless.com",
15
+ "https://gocardless.com/contact-sales",
16
+ "https://gocardless.com/faq/merchants",
17
+ "https://gocardless.com/direct-debit",
18
+ "https://gocardless.com/direct-debit/sepa",
19
+ "https://gocardless.com/security",
20
+ "https://developer.gocardless.com",
21
+ "https://gocardless.com/legal",
22
+ "https://gocardless.com/about",
23
+ "https://gocardless.com/jobs",
24
+ "https://gocardless.com/press",
25
+ "https://gocardless.com/blog",
26
+ ]
27
+ expect(page.links).to match_array(expected_links)
28
+ end
29
+ end
30
+
31
+ describe '#assets' do
32
+ it 'returns a unique list with the normalized urls of the static assets on this page' do
33
+ expected_assets = [
34
+ "https://cdn.optimizely.com/js/125150657.js",
35
+ "https://pdlvimeocdn-a.akamaihd.net/45126/030/267925344.mp4",
36
+ "https://platform.twitter.com/widgets.js",
37
+ "https://gocardless.com/images/footer/footer-logos@2x.png",
38
+ "https://gocardless.com/js/vendor.js",
39
+ "https://gocardless.com/js/main.js",
40
+ "https://www.googletagmanager.com/gtm.js?id=GTM-PRFKNC",
41
+ "https://gocardless.com/css/main.css",
42
+ "https://gocardless.com/css/fonts.css",
43
+ "https://gocardless.com/images/logos/gocardless-square.png"
44
+ ]
45
+
46
+ expect(page.assets).to match_array(expected_assets)
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,10 @@
1
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
2
+
3
+ require 'rspec'
4
+ require 'crawlette'
5
+
6
+ RSpec.configure do |config|
7
+ config.filter_run(focus: true)
8
+ config.run_all_when_everything_filtered = true
9
+ config.order = 'random'
10
+ end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: crawlette
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Miguel Camba
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-09-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: awesome_print
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.2'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.1'
83
+ description: Crawls a page, with no limits and without visiting external domains
84
+ email:
85
+ - miguel.camba@gmail.com
86
+ executables:
87
+ - crawlette
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - ".gitignore"
92
+ - ".rspec"
93
+ - Gemfile
94
+ - LICENSE.txt
95
+ - README.md
96
+ - Rakefile
97
+ - bin/crawlette
98
+ - crawlette.gemspec
99
+ - lib/crawlette.rb
100
+ - lib/crawlette/crawler.rb
101
+ - lib/crawlette/page.rb
102
+ - lib/crawlette/version.rb
103
+ - spec/files/page.html
104
+ - spec/files/root.html
105
+ - spec/files/section-1-1.html
106
+ - spec/files/section-1.html
107
+ - spec/files/section-2.html
108
+ - spec/lib/crawler_spec.rb
109
+ - spec/lib/page_spec.rb
110
+ - spec/spec_helper.rb
111
+ homepage: ''
112
+ licenses:
113
+ - MIT
114
+ metadata: {}
115
+ post_install_message:
116
+ rdoc_options: []
117
+ require_paths:
118
+ - lib
119
+ required_ruby_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ required_rubygems_version: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ requirements: []
130
+ rubyforge_project:
131
+ rubygems_version: 2.4.1
132
+ signing_key:
133
+ specification_version: 4
134
+ summary: Very simple web crawler
135
+ test_files:
136
+ - spec/files/page.html
137
+ - spec/files/root.html
138
+ - spec/files/section-1-1.html
139
+ - spec/files/section-1.html
140
+ - spec/files/section-2.html
141
+ - spec/lib/crawler_spec.rb
142
+ - spec/lib/page_spec.rb
143
+ - spec/spec_helper.rb