crawlette 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 88877d0289d6126cd7b340ce741db5da6c40ac3e
4
+ data.tar.gz: 08e58934ae52afcce2f5c7ab371f9ac55cfd5bfa
5
+ SHA512:
6
+ metadata.gz: 06571be5090c0e02d374fb01f89c07f377761c498ff72ff28b437d6a97da7440baa3cd891230281554f21391839848e7761fa236f9af0154b96813325a117b06
7
+ data.tar.gz: 3f03a72370cdce93f983a633f57229132ee55fc8ae254fbfa3f89ef6310b10b2b24cfda848088042dc5f8554b67163a873b3d1a209407da624690552c9fe5da1
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in crawlette.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Miguel Camba
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,11 @@
1
+ # Crawlette
2
+
3
+ Very simple command-line utility that crawls a URL and shows the links and assets of each page.
4
+
5
+ ## Installation
6
+
7
+ $ gem install crawlette
8
+
9
+ ## Usage
10
+
11
+ $ crawlette http://miguelcamba.com
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/bin/crawlette ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env ruby

# Command-line entry point: crawls the URL given as the first argument and
# pretty-prints the resulting sitemap.

# Make the bundled lib/ directory loadable when running from a source checkout.
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)

require 'awesome_print'
require 'crawlette'

# Without an argument ARGV[0] is nil and URI.parse would fail with an
# unhelpful stack trace, so print a usage line instead.
abort "Usage: crawlette URL" if ARGV.empty?

begin
  ap Crawlette::Crawler.new(ARGV[0]).crawl
rescue Crawlette::Crawler::BadUrlError, URI::InvalidURIError => e
  # Report bad input as a one-line message and a non-zero exit status.
  abort "crawlette: #{e.message}"
end
data/crawlette.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'crawlette/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "crawlette"
8
+ spec.version = Crawlette::VERSION
9
+ spec.authors = ["Miguel Camba"]
10
+ spec.email = ["miguel.camba@gmail.com"]
11
+ spec.summary = %q{Very simple web crawler}
12
+ spec.description = %q{Crawls a page, with no limits and without visiting external domains}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_runtime_dependency "nokogiri", '~> 1.6'
22
+ spec.add_runtime_dependency "awesome_print", '~> 1.2'
23
+ spec.add_development_dependency "bundler", "~> 1.6"
24
+ spec.add_development_dependency "rake", "~> 10.0"
25
+ spec.add_development_dependency "rspec", "~> 3.1"
26
+ end
data/lib/crawlette.rb ADDED
@@ -0,0 +1,2 @@
1
+ require "crawlette/version"
2
+ require "crawlette/crawler"
@@ -0,0 +1,77 @@
1
+ require 'uri'
2
+ require 'net/http'
3
+ require 'crawlette/page'
4
+
5
module Crawlette
  # Crawls a site starting from a root URL and builds a sitemap that records,
  # for every page fetched, the links and the static assets reported by
  # Crawlette::Page for that page.
  class Crawler
    # Upper bound on the number of pages fetched concurrently in one batch.
    MAX_THREADS = 8
    # Raised when the root url lacks a scheme or a host.
    BadUrlError = Class.new(ArgumentError)

    # @param url [String] absolute URL the crawl starts from
    # @param sitemap [Hash] hash the results are written into; URLs already
    #   present as keys are not fetched again
    # @raise [BadUrlError] when +url+ has no scheme or no host
    def initialize(url, sitemap = {})
      @uri = URI.parse(url)
      unless @uri.host && @uri.scheme
        fail BadUrlError, "Invalid url: You must provide a full qualified url"
      end
      @pending_uris = [@uri]
      @sitemap = sitemap
    end

    # Crawls the site and returns the sitemap, which contains:
    #
    # * Links between pages.
    # * The static assets each page depends on.
    #
    # Pending URIs are processed in batches of at most MAX_THREADS, one thread
    # per URI; every batch is joined before the next one starts.
    #
    # Example:
    #
    #   Crawlette::Crawler.new('http://example.com').crawl
    #   # => {
    #     'http://example.com' => {
    #       'assets' => ['http://example.com/image1.png', 'http://example.com/script1.js'],
    #       'links' => ['http://example.com/watch-a-demo', 'http://example.com/features'],
    #     },
    #     'http://example.com/watch-a-demo' => {
    #       'assets' => ['http://example.com/image2.png', 'http://example.com/script2.js'],
    #       'links' => ['http://example.com/whatever1', 'http://example.com/whatever2'],
    #     },
    #     'http://example.com/features' => {
    #       ...
    #     },
    #   }
    #
    # @return [Hash{String => Hash}] page URL => { 'links' => [...], 'assets' => [...] }
    def crawl
      until @pending_uris.empty?
        batch = @pending_uris.pop(MAX_THREADS)
        batch.map { |uri| Thread.new { process_uri(uri) } }.each(&:join)
      end
      @sitemap
    end

    private

    # Fetches +uri+ unless it is already in the sitemap, records its links and
    # assets, and queues the links for crawling. Fetch or parse failures are
    # reported on stdout and leave the sitemap without an entry for that URI.
    #
    # NOTE(review): the same URI can be queued more than once, and the
    # sitemap check is not synchronized between threads, so a page may
    # occasionally be fetched twice within one batch.
    def process_uri(uri)
      @sitemap[uri.to_s] ||= begin
        puts "... Fetching #{uri.to_s}"
        page = Page.new(Net::HTTP.get(uri), uri)
        @pending_uris.push(*page.links.map { |link| URI.parse(link) })
        { 'links' => page.links, 'assets' => page.assets }
      end
    rescue => e
      # Name the URI that actually failed, not the crawl root.
      puts "ERROR! Cannot fetch #{uri}: #{e.message}"
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require 'nokogiri'
2
+
3
module Crawlette
  # Wraps the HTML of one page and extracts the URLs it references,
  # normalized to absolute form relative to the page's own URI.
  class Page
    # Matches mailto: hrefs, which are never returned.
    MAILTO_REGEX = /\Amailto:/i
    attr_reader :uri

    # @param html [String] raw HTML of the page
    # @param uri [URI] location the HTML was fetched from; relative
    #   references are resolved against it
    def initialize(html, uri)
      @html = html
      @uri = uri
    end

    # @return [Array<String>] unique, normalized URLs of the <a href> links
    #   that point to this page's host or one of its subdomains
    def links
      @links ||= sanitize_urls(document.css('a[href]').map { |a| a["href"] })
    end

    # @return [Array<String>] unique, normalized URLs of everything with a
    #   src attribute, stylesheet links and og:image meta tags, regardless of
    #   the host they live on
    def assets
      @assets ||= begin
        urls = document.css('[src]').map { |node| node["src"] }
        urls += document.css('link[rel="stylesheet"][href]').map { |node| node["href"] }
        urls += document.css('meta[name^="og:image"]').map { |node| node["content"] }

        sanitize_urls(urls, external_links: true)
      end
    end

    private

    def document
      @document ||= Nokogiri::HTML.parse(@html)
    end

    # Normalizes every url and drops the ones that cannot be used.
    #
    # @param urls [Array<String, nil>] raw attribute values
    # @param external_links [Boolean] keep URLs on other hosts when true
    # @return [Array<String>] unique absolute URLs without fragment or
    #   trailing slash
    def sanitize_urls(urls, external_links: false)
      urls.map { |url| normalize_url(url, external_links) }.compact.uniq
    end

    # Turns one raw attribute value into an absolute URL string.
    # Returns nil for values that must be skipped: missing attributes,
    # mailto: links, URIs without a host (data:, javascript:, tel:, ...),
    # malformed URIs and, unless +external_links+ is set, foreign hosts.
    def normalize_url(url, external_links)
      return if url.nil?

      candidate = url.strip
      return if candidate =~ MAILTO_REGEX

      reference = URI.parse(escaper.escape(candidate.sub(/#.*\z/m, '')))
      resolved = resolve(reference)
      host = resolved.host.to_s
      return if host.empty?
      return unless external_links || internal_host?(host)

      resolved.to_s.chomp('/')
    rescue URI::Error
      # A single malformed reference must not discard the rest of the page.
      nil
    end

    # Resolves +reference+ against the page URI. References that already name
    # a host (absolute or scheme-relative) only inherit the page's scheme;
    # everything else is merged following the usual relative-reference rules.
    def resolve(reference)
      if reference.host
        reference.scheme ||= @uri.scheme
        reference
      else
        @uri.merge(reference)
      end
    end

    # True when +host+ is the page's host or one of its subdomains.
    def internal_host?(host)
      base = @uri.host.to_s.downcase
      return false if base.empty?

      name = host.downcase
      name == base || name.end_with?(".#{base}")
    end

    # URI.escape was removed in Ruby 3.0; the RFC 2396 parser still provides
    # the same escaping rules.
    def escaper
      @escaper ||= URI::RFC2396_Parser.new
    end
  end
end
@@ -0,0 +1,3 @@
1
module Crawlette
  # Version of the gem; read by crawlette.gemspec. Frozen so the constant
  # cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,209 @@
1
+ <!doctype html>
2
+ <!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
3
+ <!--[if gt IE 8]><!--> <html lang="en"> <!--<![endif]-->
4
+ <head>
5
+ <meta charset="utf-8">
6
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
7
+ <title>The UK’s #1 for online Direct Debit - GoCardless</title>
8
+ <meta name="description" content="GoCardless is the UK's #1 for Direct Debit, serving more companies than any other provider. Perfect for recurring billing and B2B invoicing.">
9
+ <meta name="viewport" content="width=device-width">
10
+ <meta name="og:image" content="https://gocardless.com/images/logos/gocardless-square.png">
11
+ <meta name="og:image:secure_url" content="https://gocardless.com/images/logos/gocardless-square.png">
12
+ <meta name="google-site-verification" content="Y80kah87ghJhwiDqw-5ap234p9wCcGt6kMRxvnamtHU">
13
+ <link href="https://plus.google.com/+Gocardless" rel="publisher">
14
+ <!--[if lt IE 9]>
15
+ <script>
16
+ (function() {
17
+ var elems = 'article aside details figcaption figure footer header hgroup main nav section summary'.split(' ');
18
+ var length = elems.length;
19
+ var index = 0;
20
+ for(;index < length; index++){
21
+ document.createElement(elems[index]);
22
+ }
23
+ })();
24
+ </script>
25
+ <![endif]-->
26
+ <link rel="stylesheet" href="/css/main.css">
27
+ <link rel="stylesheet" href="/css/fonts.css">
28
+ </head>
29
+ <body>
30
+
31
+ <script src="//cdn.optimizely.com/js/125150657.js"></script>
32
+
33
+
34
+ <!--[if lt IE 9]>
35
+ <p class="browsehappy">You are using an <strong>outdated</strong> browser.
36
+ Please <a href="http://browsehappy.com/">upgrade your browser</a> to improve your experience.</p>
37
+ <![endif]-->
38
+
39
+
40
+
41
+ <div class="worry-container">
42
+
43
+ <div class="hero-worry hero-worry--girl u-center u-text-center">
44
+ <div class="site-container site-gutter u-cf align-btn-small u-padding-Txs u-padding-Bm">
45
+ <ul class="u-cf u-pull-end">
46
+ <li class="u-pull-start">
47
+ <a href="/users/sign_in/" class="btn btn--invert-hollow btn--small" id="login">Login</a>
48
+ </li>
49
+ <li class="u-pull-start u-margin-Ls">
50
+ <a href="/merchants/new" class="btn btn--invert-hollow btn--small" id="nav_sign_up">Sign up</a>
51
+ </li>
52
+ </ul>
53
+ </div>
54
+
55
+ <!-- Example video -->
56
+ <video src="https://pdlvimeocdn-a.akamaihd.net/45126/030/267925344.mp4"></video>
57
+
58
+ <div class="site-container">
59
+ <div class="grid site-gutter">
60
+ <div class="grid__cell u-size1of2 u-text-center u-text-heading hero-worry__heading">
61
+ <h1 class="u-color-invert hero-heading u-text-light">Stop worrying about online payments</h1>
62
+ <i class="icon-gc-logo-invert u-margin-Tm"></i>
63
+ </div>
64
+ <div class="grid__cell u-size1of2">
65
+ <div class="hero-worry__modal"></div>
66
+ </div>
67
+ </div>
68
+ </div>
69
+
70
+ </div>
71
+
72
+ <div class="u-background-white">
73
+ <div class="site-container u-text-center u-padding-Vl">
74
+
75
+ <div class="u-padding-Vl"><div class="u-padding-Vs">
76
+ <h2 class="u-text-heading u-color-meta">GoCardless makes it quick and easy to take Direct Debit</h2>
77
+ <div class="u-size5of12 u-center u-padding-Vm">
78
+ <div class="grid">
79
+ <div class="grid__cell u-size1of2">
80
+ <a href="/watch-a-demo" class="btn btn--block btn--info btn--large">Watch a demo</a>
81
+ </div>
82
+ <div class="grid__cell u-size1of2">
83
+ <a href="/features" class="btn btn--block btn--hollow btn--large">Learn more</a>
84
+ </div>
85
+ </div>
86
+ </div>
87
+ </div></div>
88
+
89
+ <hr class="horizontal-ruler-top-double">
90
+ <div class="u-padding-Vl"><div class="u-padding-Vl">
91
+ <i class="benefits-led-container__people-icon u-margin-Bs"></i>
92
+ <h1 class="u-text-heading u-text-hero u-color-primary">SIMPLE TO USE</h1>
93
+ <p>Sign up for free, set up in minutes. Manage everything online.</p>
94
+ </div></div>
95
+
96
+ <hr class="horizontal-ruler-top-double">
97
+ <div class="u-padding-Vl"><div class="u-padding-Vl">
98
+ <i class="benefits-led-container__money-icon u-margin-Bs"></i>
99
+ <h1 class="u-text-heading u-text-hero u-color-accent">AMAZING VALUE</h1>
100
+ <p>1% up to £2 with no other fees. Incredible, tailored rates for high volume.</p>
101
+ </div></div>
102
+
103
+ <hr class="horizontal-ruler-top-double">
104
+ <div class="u-padding-Vl"><div class="u-padding-Vl">
105
+ <i class="benefits-led-container__service-icon u-margin-Bs"></i>
106
+ <h1 class="u-text-heading u-text-hero u-color-secondary">LEGENDARY SERVICE</h1>
107
+ <p>Our personal support team are the best in the industry.</p>
108
+ </div></div>
109
+
110
+ <hr class="horizontal-ruler-top-double">
111
+ <div class="u-padding-Vl"><div class="u-padding-Vl">
112
+ <p class="u-margin-Bl">We've been featured in</p>
113
+ <div class="benefits-led-container__publications u-center"></div>
114
+ </div></div>
115
+
116
+ <hr class="horizontal-ruler-top-double u-margin-Vl">
117
+ <div class="u-padding-Vl"><div class="u-padding-Vs">
118
+ <p class="u-text-h2 u-text-heading u-color-meta">Get started today. <a href="/merchants/new" class="u-text-underline u-link-secondary">Sign up for free</a></p>
119
+ </div></div>
120
+ </div>
121
+ </div>
122
+ </div>
123
+
124
+ <div class="u-margin-Ts">
125
+ <div class="site-container site-gutter">
126
+ <div>
127
+ <div class="u-cf">
128
+ <ul class="nav u-padding-Vs u-pull-start">
129
+ <li class="nav__item">
130
+ <a href="https://help.gocardless.com#some-id">Help</a>
131
+ </li>
132
+ <li class="nav__item">
133
+ <a href="/contact-sales">Contact sales</a>
134
+ </li>
135
+ <li class="nav__item">
136
+ <a href="/faq/merchants" ng-gc-href-active>FAQ</a>
137
+ </li>
138
+ <li class="nav__item">
139
+ <a href="/direct-debit">Direct Debit</a>
140
+ </li>
141
+ <li class="nav__item">
142
+ <a href="/direct-debit/sepa/">SEPA</a>
143
+ </li>
144
+ <li class="nav__item">
145
+ <a href="/security" ng-gc-href-active>Security</a>
146
+ </li>
147
+ <li class="nav__item">
148
+ <a href="https://developer.gocardless.com">API</a>
149
+ </li>
150
+ <li class="nav__item">
151
+ <a href="/legal" ng-gc-href-active>Legal</a>
152
+ </li>
153
+ <li class="nav__item">
154
+ <a href="/about" ng-gc-href-active>About</a>
155
+ </li>
156
+ <li class="nav__item">
157
+ <a href="/jobs" ng-gc-href-active>Jobs</a>
158
+ </li>
159
+ <li class="nav__item">
160
+ <a href="/press" ng-gc-href-active>Press</a>
161
+ </li>
162
+ <li class="nav__item">
163
+ <a href="https://gocardless.com/blog">Blog</a>
164
+ </li>
165
+ </ul>
166
+ <div class="u-pull-end">
167
+ <a href="https://twitter.com/gocardless" class="u-margin-Ts twitter-follow-button" data-show-count="false" data-dnt="true">Follow @gocardless</a>
168
+ <script src="//platform.twitter.com/widgets.js" async id="twitter-wjs"></script>
169
+ </div>
170
+ </div>
171
+ <div class="grid u-margin-Ts">
172
+ <div class="u-text-h5 u-size3of12 u-padding-Vl grid__cell">
173
+ <b>GoCardless Ltd</b><br>
174
+ 338-346 Goswell Road<br>London, EC1V 7LQ<br>
175
+ 020 7183 8674<br>
176
+ <a href="mailto:help@gocardless.com">help@gocardless.com</a><br>
177
+ </div>
178
+
179
+ <div class="u-size3of12 grid__cell u-text-h5 u-padding-Vl ">
180
+ <p>
181
+ GoCardless is regulated by the <strong>Financial Conduct
182
+ Authority</strong> as an Authorised Payment Institution.
183
+ </p>
184
+ </div>
185
+
186
+ <div class="grid__cell u-size6of12 u-margin-Tm u-text-end">
187
+ <img alt="Footer logos" class="footer__logos" src="/images/footer/footer-logos@2x.png">
188
+ </div>
189
+ </div>
190
+ </div>
191
+
192
+ </div>
193
+ </div>
194
+
195
+
196
+
197
+ <script src="/js/vendor.js"></script>
198
+ <script src="/js/main.js"></script>
199
+
200
+ <!-- Google Tag Manager -->
201
+ <script>
202
+ dataLayer = [];
203
+ </script>
204
+ <script src="//www.googletagmanager.com/gtm.js?id=GTM-PRFKNC" async></script>
205
+ <script>
206
+ (function(w,l){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});})(window, 'dataLayer');
207
+ </script>
208
+ </body>
209
+ </html>
@@ -0,0 +1,12 @@
1
+ <!doctype html>
2
+ <head>
3
+ <meta charset="utf-8">
4
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
5
+ <title>Fake root</title>
6
+ <link rel="stylesheet" href="/styles.css">
7
+ </head>
8
+ <body>
9
+ <a href="http://example.com/section-1"></a>
10
+ <a href="http://example.com/section-2"></a>
11
+ </body>
12
+ </html>
@@ -0,0 +1,11 @@
1
+ <!doctype html>
2
+ <head>
3
+ <meta charset="utf-8">
4
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
5
+ <title>Section 1-1</title>
6
+ <link rel="stylesheet" href="/styles.css">
7
+ </head>
8
+ <body>
9
+ no more links
10
+ </body>
11
+ </html>
@@ -0,0 +1,11 @@
1
+ <!doctype html>
2
+ <head>
3
+ <meta charset="utf-8">
4
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
5
+ <title>Section 1</title>
6
+ <link rel="stylesheet" href="/styles.css">
7
+ </head>
8
+ <body>
9
+ <a href="http://example.com/section-1-1"></a>
10
+ </body>
11
+ </html>
@@ -0,0 +1,12 @@
1
+ <!doctype html>
2
+ <head>
3
+ <meta charset="utf-8">
4
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
5
+ <title>Section 2</title>
6
+ <link rel="stylesheet" href="/styles.css">
7
+ </head>
8
+ <body>
9
+ <!-- Also points to section 1.1 -->
10
+ <a href="http://example.com/section-1-1"></a>
11
+ </body>
12
+ </html>
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Crawlette::Crawler do
4
+ subject(:crawler) { Crawlette::Crawler.new('http://example.com') }
5
+ let(:root) { File.read('spec/files/root.html') }
6
+ let(:s1) { File.read('spec/files/section-1.html') }
7
+ let(:s2) { File.read('spec/files/section-2.html') }
8
+ let(:s1_1) { File.read('spec/files/section-1-1.html') }
9
+
10
+
11
+ describe '#crawl' do
12
+ before do
13
+ expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com')){ root }
14
+ expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com/section-1')){ s1 }
15
+ expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com/section-2')){ s2 }
16
+ expect(Net::HTTP).to receive(:get).with(URI.parse('http://example.com/section-1-1')).twice{ s1_1 }
17
+ end
18
+
19
+ it "returns a hash with the crawled urls as keys and hashes with the links and assets of each one" do
20
+ expect(crawler.crawl).to eq(
21
+ "http://example.com" => {
22
+ "links"=>["http://example.com/section-1", "http://example.com/section-2"],
23
+ "assets"=>["http://example.com/styles.css"]
24
+ },
25
+ "http://example.com/section-2"=>{
26
+ "links"=>["http://example.com/section-1-1"],
27
+ "assets"=>["http://example.com/styles.css"]
28
+ },
29
+ "http://example.com/section-1"=>{
30
+ "links"=>["http://example.com/section-1-1"],
31
+ "assets"=>["http://example.com/styles.css"]
32
+ },
33
+ "http://example.com/section-1-1"=>{
34
+ "links"=>[],
35
+ "assets"=>["http://example.com/styles.css"]
36
+ }
37
+ )
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,49 @@
1
+ require 'spec_helper'
2
+
3
+ describe Crawlette::Page do
4
+ subject(:page) { Crawlette::Page.new(html, URI.parse('https://gocardless.com')) }
5
+ let(:html) { File.read('spec/files/page.html') }
6
+
7
+ describe '#links' do
8
+ it "returns a unique list of normalized non-external urls" do
9
+ expected_links = [
10
+ "https://gocardless.com/users/sign_in",
11
+ "https://gocardless.com/merchants/new",
12
+ "https://gocardless.com/watch-a-demo",
13
+ "https://gocardless.com/features",
14
+ "https://help.gocardless.com",
15
+ "https://gocardless.com/contact-sales",
16
+ "https://gocardless.com/faq/merchants",
17
+ "https://gocardless.com/direct-debit",
18
+ "https://gocardless.com/direct-debit/sepa",
19
+ "https://gocardless.com/security",
20
+ "https://developer.gocardless.com",
21
+ "https://gocardless.com/legal",
22
+ "https://gocardless.com/about",
23
+ "https://gocardless.com/jobs",
24
+ "https://gocardless.com/press",
25
+ "https://gocardless.com/blog",
26
+ ]
27
+ expect(page.links).to match_array(expected_links)
28
+ end
29
+ end
30
+
31
+ describe '#assets' do
32
+ it 'returns a unique list with the normalized urls of the static assets on this page' do
33
+ expected_assets = [
34
+ "https://cdn.optimizely.com/js/125150657.js",
35
+ "https://pdlvimeocdn-a.akamaihd.net/45126/030/267925344.mp4",
36
+ "https://platform.twitter.com/widgets.js",
37
+ "https://gocardless.com/images/footer/footer-logos@2x.png",
38
+ "https://gocardless.com/js/vendor.js",
39
+ "https://gocardless.com/js/main.js",
40
+ "https://www.googletagmanager.com/gtm.js?id=GTM-PRFKNC",
41
+ "https://gocardless.com/css/main.css",
42
+ "https://gocardless.com/css/fonts.css",
43
+ "https://gocardless.com/images/logos/gocardless-square.png"
44
+ ]
45
+
46
+ expect(page.assets).to match_array(expected_assets)
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,10 @@
1
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
2
+
3
+ require 'rspec'
4
+ require 'crawlette'
5
+
6
+ RSpec.configure do |config|
7
+ config.filter_run(focus: true)
8
+ config.run_all_when_everything_filtered = true
9
+ config.order = 'random'
10
+ end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: crawlette
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Miguel Camba
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-09-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: awesome_print
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.2'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.1'
83
+ description: Crawls a page, with no limits and without visiting external domains
84
+ email:
85
+ - miguel.camba@gmail.com
86
+ executables:
87
+ - crawlette
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - ".gitignore"
92
+ - ".rspec"
93
+ - Gemfile
94
+ - LICENSE.txt
95
+ - README.md
96
+ - Rakefile
97
+ - bin/crawlette
98
+ - crawlette.gemspec
99
+ - lib/crawlette.rb
100
+ - lib/crawlette/crawler.rb
101
+ - lib/crawlette/page.rb
102
+ - lib/crawlette/version.rb
103
+ - spec/files/page.html
104
+ - spec/files/root.html
105
+ - spec/files/section-1-1.html
106
+ - spec/files/section-1.html
107
+ - spec/files/section-2.html
108
+ - spec/lib/crawler_spec.rb
109
+ - spec/lib/page_spec.rb
110
+ - spec/spec_helper.rb
111
+ homepage: ''
112
+ licenses:
113
+ - MIT
114
+ metadata: {}
115
+ post_install_message:
116
+ rdoc_options: []
117
+ require_paths:
118
+ - lib
119
+ required_ruby_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ required_rubygems_version: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ requirements: []
130
+ rubyforge_project:
131
+ rubygems_version: 2.4.1
132
+ signing_key:
133
+ specification_version: 4
134
+ summary: Very simple web crawler
135
+ test_files:
136
+ - spec/files/page.html
137
+ - spec/files/root.html
138
+ - spec/files/section-1-1.html
139
+ - spec/files/section-1.html
140
+ - spec/files/section-2.html
141
+ - spec/lib/crawler_spec.rb
142
+ - spec/lib/page_spec.rb
143
+ - spec/spec_helper.rb