crabbs 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/crabbs.rb +3 -3
- data/lib/crabbs/cli.rb +3 -2
- data/lib/crabbs/crawler.rb +13 -12
- data/lib/crabbs/link.rb +40 -0
- data/lib/crabbs/page.rb +9 -32
- data/lib/crabbs/version.rb +1 -1
- data/spec/crabbs/cli_spec.rb +20 -2
- data/spec/crabbs/crabbs_spec.rb +1 -1
- data/spec/crabbs/crawler_spec.rb +42 -1
- data/spec/crabbs/link_spec.rb +151 -0
- metadata +4 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c21a29135af8fd53a59e659fd1ccbff6f1642f6d
|
4
|
+
data.tar.gz: 9c144e1bc4381a328f01b22b36d7936e348713bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6573282576cb66e87d654a4d24f8361c6b53afe6361fb7656e26fa3df941577519f952a65a6f3ff7c6583229a1382b75e026c77bec14355ca51280b9592d4dcc
|
7
|
+
data.tar.gz: 3a8e8eb68e217614c406ce547e6a204463636d2457615f8432235c2f86fbba0c32bc2b5765f7a97dd71bcda71f9ab6d5d2101eb758ac6c965f9233ce5a58deeb
|
data/lib/crabbs.rb
CHANGED
@@ -8,9 +8,9 @@ module Crabbs
|
|
8
8
|
class << self
|
9
9
|
attr_reader :crawler
|
10
10
|
|
11
|
-
def start(url)
|
12
|
-
@crawler = Crabbs::Crawler.new
|
13
|
-
@crawler.crawl url
|
11
|
+
def start(options)
|
12
|
+
@crawler = Crabbs::Crawler.new options
|
13
|
+
@crawler.crawl options[:url]
|
14
14
|
@crawler.site_map
|
15
15
|
end
|
16
16
|
end
|
data/lib/crabbs/cli.rb
CHANGED
@@ -7,8 +7,8 @@ module Crabbs
|
|
7
7
|
def start
|
8
8
|
begin
|
9
9
|
opts = parse_options
|
10
|
-
result = Crabbs.start(opts[:url])
|
11
|
-
STDOUT.puts result.to_json
|
10
|
+
result = Crabbs.start({ url: opts[:url], verbose: opts[:verbose] })
|
11
|
+
STDOUT.puts "\nResult:\n#{result.to_json}"
|
12
12
|
rescue Slop::MissingOptionError => e
|
13
13
|
STDOUT.puts e.message
|
14
14
|
end
|
@@ -21,6 +21,7 @@ module Crabbs
|
|
21
21
|
banner 'Usage: crabbs [options]'
|
22
22
|
|
23
23
|
on 'u', 'url=', 'URL to start crawling', required: true
|
24
|
+
on 'v', 'verbose', 'Shows URLs being crawled', default: false
|
24
25
|
end
|
25
26
|
end
|
26
27
|
end
|
data/lib/crabbs/crawler.rb
CHANGED
@@ -4,18 +4,19 @@ module Crabbs
|
|
4
4
|
class Crawler
|
5
5
|
attr_reader :visited, :site_map
|
6
6
|
|
7
|
-
def initialize()
|
7
|
+
def initialize(options={})
|
8
8
|
@visited = []
|
9
9
|
@site_map = {}
|
10
|
+
@options = options
|
10
11
|
end
|
11
12
|
|
12
13
|
def crawl(uri_string)
|
13
|
-
|
14
|
+
recursively_crawl uri_string, @site_map
|
14
15
|
end
|
15
16
|
|
16
17
|
private
|
17
18
|
|
18
|
-
def
|
19
|
+
def recursively_crawl(uri_string, hash)
|
19
20
|
hash[uri_string] = Hash.new
|
20
21
|
|
21
22
|
return if (@visited.include? uri_string)
|
@@ -24,19 +25,19 @@ module Crabbs
|
|
24
25
|
@visited << uri_string
|
25
26
|
|
26
27
|
links.each do |link|
|
27
|
-
|
28
|
+
recursively_crawl(link, hash[uri_string])
|
28
29
|
end
|
29
30
|
end
|
30
31
|
|
31
32
|
def extract_links(uri_string)
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
33
|
+
uri = URI.parse(uri_string)
|
34
|
+
STDOUT.puts "Visiting: #{uri_string}" if @options[:verbose]
|
35
|
+
STDOUT.putc '.' unless @options[:verbose]
|
36
|
+
page = Crabbs::Page.new(Net::HTTP.get(uri), uri.to_s)
|
37
|
+
|
38
|
+
page.links
|
39
|
+
rescue URI::InvalidURIError
|
40
|
+
[]
|
40
41
|
end
|
41
42
|
end
|
42
43
|
end
|
data/lib/crabbs/link.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
module Crabbs
|
2
|
+
class Link
|
3
|
+
|
4
|
+
def initialize(href)
|
5
|
+
@href = href
|
6
|
+
@uri = URI.parse(href)
|
7
|
+
end
|
8
|
+
|
9
|
+
def same_host_as?(url)
|
10
|
+
@uri.host == URI.parse(url).host or @uri.host.nil?
|
11
|
+
end
|
12
|
+
|
13
|
+
def has_valid_fragment?
|
14
|
+
@uri.fragment.nil? or not @uri.fragment.empty?
|
15
|
+
end
|
16
|
+
|
17
|
+
def has_html_extension?
|
18
|
+
link = @href
|
19
|
+
if not @uri.host.nil?
|
20
|
+
link = @href.sub(@uri.host, '')
|
21
|
+
end
|
22
|
+
|
23
|
+
extension = File.extname(link)
|
24
|
+
extension.empty? or extension == '.html'
|
25
|
+
end
|
26
|
+
|
27
|
+
def join(url)
|
28
|
+
new_uri = @uri
|
29
|
+
|
30
|
+
if @uri.host.nil?
|
31
|
+
new_uri = URI.parse url
|
32
|
+
new_uri = URI.join(new_uri.to_s, @uri.path) unless @uri.path.nil?
|
33
|
+
new_uri = URI.join(new_uri.to_s, "?#{@uri.query}") unless @uri.query.nil?
|
34
|
+
new_uri = URI.join(new_uri.to_s, "##{@uri.fragment}") unless @uri.fragment.nil?
|
35
|
+
end
|
36
|
+
|
37
|
+
new_uri.to_s
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
data/lib/crabbs/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
+
require 'crabbs/link'
|
2
3
|
|
3
4
|
module Crabbs
|
4
5
|
class Page
|
@@ -8,38 +9,14 @@ module Crabbs
|
|
8
9
|
end
|
9
10
|
|
10
11
|
def links
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
.select { |
|
15
|
-
.select
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
links = links.select do |link|
|
20
|
-
host = URI.parse(link).host
|
21
|
-
extension = File.extname(link.sub(host, ''))
|
22
|
-
extension.empty? or extension == '.html'
|
23
|
-
end
|
24
|
-
|
25
|
-
links.uniq
|
26
|
-
end
|
27
|
-
|
28
|
-
private
|
29
|
-
|
30
|
-
def create_full_uri_links(uri_list)
|
31
|
-
uri_list.map do |uri|
|
32
|
-
new_uri = uri
|
33
|
-
|
34
|
-
if uri.host.nil?
|
35
|
-
new_uri = URI.parse @url
|
36
|
-
new_uri = URI.join(new_uri.to_s, uri.path) unless uri.path.nil?
|
37
|
-
new_uri = URI.join(new_uri.to_s, "?#{uri.query}") unless uri.query.nil?
|
38
|
-
new_uri = URI.join(new_uri.to_s, "##{uri.fragment}") unless uri.fragment.nil?
|
39
|
-
end
|
40
|
-
|
41
|
-
new_uri.to_s
|
42
|
-
end
|
12
|
+
links = @html.css('a[href]').map { |a| Crabbs::Link.new a['href'] }
|
13
|
+
|
14
|
+
links
|
15
|
+
.select { |link| link.same_host_as? @url }
|
16
|
+
.select(&:has_valid_fragment?)
|
17
|
+
.select(&:has_html_extension?)
|
18
|
+
.map { |link| link.join @url }
|
19
|
+
.uniq
|
43
20
|
end
|
44
21
|
end
|
45
22
|
end
|
data/lib/crabbs/version.rb
CHANGED
data/spec/crabbs/cli_spec.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'crabbs/cli'
|
2
|
+
require 'webmock/rspec'
|
2
3
|
|
3
4
|
describe Crabbs::CLI do
|
4
5
|
subject { Crabbs::CLI.new }
|
@@ -25,8 +26,25 @@ describe Crabbs::CLI do
|
|
25
26
|
|
26
27
|
subject.start
|
27
28
|
|
28
|
-
expect(Crabbs).to have_received(:start).with('https://example.com')
|
29
|
-
expect(STDOUT).to have_received(:puts).with('result'.to_json)
|
29
|
+
expect(Crabbs).to have_received(:start).with({ url: 'https://example.com', verbose: false })
|
30
|
+
expect(STDOUT).to have_received(:puts).with("\nResult:\n#{'result'.to_json}")
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
context 'integration with crabbs' do
|
35
|
+
before do
|
36
|
+
stub_request(:get, "http://example.com/").to_return(:body => "")
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'does not break contract' do
|
40
|
+
ARGV.replace ['--url=http://example.com']
|
41
|
+
allow(STDOUT).to receive(:puts)
|
42
|
+
|
43
|
+
subject.start
|
44
|
+
|
45
|
+
output = "\nResult:\n#{{ 'http://example.com' => {} }.to_json}"
|
46
|
+
|
47
|
+
expect(STDOUT).to have_received(:puts).with(output)
|
30
48
|
end
|
31
49
|
end
|
32
50
|
end
|
data/spec/crabbs/crabbs_spec.rb
CHANGED
data/spec/crabbs/crawler_spec.rb
CHANGED
@@ -4,7 +4,48 @@ require 'crabbs/crawler'
|
|
4
4
|
|
5
5
|
describe Crabbs::Crawler do
|
6
6
|
describe '#crawl' do
|
7
|
-
|
7
|
+
let(:options) { Hash.new }
|
8
|
+
subject { Crabbs::Crawler.new options }
|
9
|
+
|
10
|
+
context 'when verbose' do
|
11
|
+
let(:options) { { verbose: true } }
|
12
|
+
|
13
|
+
before do
|
14
|
+
@uri_string = 'http://example.com/'
|
15
|
+
stub_request(:get, @uri_string).to_return(body: %Q{<a href="/path"></a><a href="/local"></a><a href="http://fb.com/"></a>})
|
16
|
+
stub_request(:get, "http://example.com/path").to_return(body: "")
|
17
|
+
stub_request(:get, "http://example.com/local").to_return(body: "")
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'logs the url been visited' do
|
21
|
+
allow(STDOUT).to receive(:puts)
|
22
|
+
|
23
|
+
subject.crawl @uri_string
|
24
|
+
|
25
|
+
expect(STDOUT).to have_received(:puts).with('Visiting: http://example.com/')
|
26
|
+
expect(STDOUT).to have_received(:puts).with('Visiting: http://example.com/path')
|
27
|
+
expect(STDOUT).to have_received(:puts).with('Visiting: http://example.com/local')
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
context 'when not verbose' do
|
32
|
+
let(:options) { { verbose: false } }
|
33
|
+
|
34
|
+
before do
|
35
|
+
@uri_string = 'http://example.com/'
|
36
|
+
stub_request(:get, @uri_string).to_return(body: %Q{<a href="/path"></a><a href="/local"></a><a href="http://fb.com/"></a>})
|
37
|
+
stub_request(:get, "http://example.com/path").to_return(body: "")
|
38
|
+
stub_request(:get, "http://example.com/local").to_return(body: "")
|
39
|
+
end
|
40
|
+
|
41
|
+
it 'logs the url been visited' do
|
42
|
+
allow(STDOUT).to receive(:putc)
|
43
|
+
|
44
|
+
subject.crawl @uri_string
|
45
|
+
|
46
|
+
expect(STDOUT).to have_received(:putc).with('.').exactly(3).times
|
47
|
+
end
|
48
|
+
end
|
8
49
|
|
9
50
|
context 'an invalid URI' do
|
10
51
|
it 'stores single entry site map' do
|
@@ -0,0 +1,151 @@
|
|
1
|
+
require 'crabbs/link'
|
2
|
+
|
3
|
+
describe Crabbs::Link do
|
4
|
+
subject { Crabbs::Link.new href }
|
5
|
+
|
6
|
+
describe '#same_host_as?' do
|
7
|
+
context 'when it is the same host' do
|
8
|
+
let(:host) { 'http://example.com/path' }
|
9
|
+
let(:href) { 'http://example.com/path' }
|
10
|
+
|
11
|
+
it 'is the same host' do
|
12
|
+
subject.same_host_as?(host).should be_true
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
context 'when href have no host' do
|
17
|
+
let(:host) { 'http://example.com/path' }
|
18
|
+
let(:href) { '/path' }
|
19
|
+
|
20
|
+
it 'assumes its the same host' do
|
21
|
+
subject.same_host_as?(host).should be_true
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
context 'when host is a subdomain' do
|
26
|
+
let(:host) { 'http://example.com/path' }
|
27
|
+
let(:href) { 'http://subdomain.example.com/path' }
|
28
|
+
|
29
|
+
it 'is not the same host' do
|
30
|
+
subject.same_host_as?(host).should be_false
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
context 'when host is completely different' do
|
35
|
+
let(:host) { 'http://example.com/path' }
|
36
|
+
let(:href) { 'http://facebook.com/path' }
|
37
|
+
|
38
|
+
it 'is not the same host' do
|
39
|
+
subject.same_host_as?(host).should be_false
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe '#has_valid_fragment?' do
|
45
|
+
context 'when the fragment is empty' do
|
46
|
+
let(:href) { 'http://example.com/path#' }
|
47
|
+
|
48
|
+
it 'is not valid' do
|
49
|
+
subject.has_valid_fragment?.should be_false
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
context 'when there is no fragment' do
|
54
|
+
let(:href) { 'http://example.com/path' }
|
55
|
+
|
56
|
+
it 'is valid' do
|
57
|
+
subject.has_valid_fragment?.should be_true
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
context 'when there is a fragment' do
|
62
|
+
let(:href) { 'http://example.com/path#fragment' }
|
63
|
+
|
64
|
+
it 'is valid' do
|
65
|
+
subject.has_valid_fragment?.should be_true
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
describe '#has_html_extension?' do
|
71
|
+
context 'when the extension is not html' do
|
72
|
+
let(:href) { 'http://example.com/path.zip' }
|
73
|
+
|
74
|
+
it 'has no html extension' do
|
75
|
+
subject.has_html_extension?.should be_false
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
context 'when the extension html' do
|
80
|
+
let(:href) { 'http://example.com/path.html' }
|
81
|
+
|
82
|
+
it 'has html extension' do
|
83
|
+
subject.has_html_extension?.should be_true
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
context 'when there is no extension' do
|
88
|
+
let(:href) { 'http://example.com/path' }
|
89
|
+
|
90
|
+
it 'assumes to be a html link' do
|
91
|
+
subject.has_html_extension?.should be_true
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
describe '#join' do
|
97
|
+
context 'when the href is a root path' do
|
98
|
+
let(:href) { '/path' }
|
99
|
+
let(:url) { 'http://example.com/' }
|
100
|
+
|
101
|
+
it 'joins with the url' do
|
102
|
+
subject.join(url).should == 'http://example.com/path'
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
context 'when the href is a direct path' do
|
107
|
+
let(:href) { 'path' }
|
108
|
+
let(:url) { 'http://example.com/test/' }
|
109
|
+
|
110
|
+
it 'appends to the url previous path' do
|
111
|
+
subject.join(url).should == 'http://example.com/test/path'
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
context 'when the href is a fragment (hash)' do
|
116
|
+
let(:href) { '#fragment' }
|
117
|
+
let(:url) { 'http://example.com/test/' }
|
118
|
+
|
119
|
+
it 'appends to the url previous path' do
|
120
|
+
subject.join(url).should == 'http://example.com/test/#fragment'
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
context 'when the href is a query parameter' do
|
125
|
+
let(:href) { '?parameter' }
|
126
|
+
let(:url) { 'http://example.com/test/' }
|
127
|
+
|
128
|
+
it 'appends to the url previous path' do
|
129
|
+
subject.join(url).should == 'http://example.com/test/?parameter'
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
context 'when the href is both (fragment + query)' do
|
134
|
+
let(:href) { '#fragment?parameter' }
|
135
|
+
let(:url) { 'http://example.com/test/' }
|
136
|
+
|
137
|
+
it 'appends to the url previous path' do
|
138
|
+
subject.join(url).should == 'http://example.com/test/#fragment?parameter'
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
context 'when the href is a full uri' do
|
143
|
+
let(:href) { 'http://example.com/test/#fragment?parameter' }
|
144
|
+
let(:url) { 'http://example.com/test/' }
|
145
|
+
|
146
|
+
it 'leaves it untouched' do
|
147
|
+
subject.join(url).should == 'http://example.com/test/#fragment?parameter'
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crabbs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bruno Trecenti
|
@@ -157,11 +157,13 @@ files:
|
|
157
157
|
- lib/crabbs.rb
|
158
158
|
- lib/crabbs/cli.rb
|
159
159
|
- lib/crabbs/crawler.rb
|
160
|
+
- lib/crabbs/link.rb
|
160
161
|
- lib/crabbs/page.rb
|
161
162
|
- lib/crabbs/version.rb
|
162
163
|
- spec/crabbs/cli_spec.rb
|
163
164
|
- spec/crabbs/crabbs_spec.rb
|
164
165
|
- spec/crabbs/crawler_spec.rb
|
166
|
+
- spec/crabbs/link_spec.rb
|
165
167
|
- spec/crabbs/page_spec.rb
|
166
168
|
homepage: http://github.com/Trecenti/crabbs
|
167
169
|
licenses:
|
@@ -191,4 +193,5 @@ test_files:
|
|
191
193
|
- spec/crabbs/cli_spec.rb
|
192
194
|
- spec/crabbs/crabbs_spec.rb
|
193
195
|
- spec/crabbs/crawler_spec.rb
|
196
|
+
- spec/crabbs/link_spec.rb
|
194
197
|
- spec/crabbs/page_spec.rb
|