crabbs 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9fc5a904a6d962205d1a075c95ef79a98ccba4e7
4
- data.tar.gz: e30f07bf9305c2e4a6aad06de1c335d2f2976d0d
3
+ metadata.gz: c21a29135af8fd53a59e659fd1ccbff6f1642f6d
4
+ data.tar.gz: 9c144e1bc4381a328f01b22b36d7936e348713bf
5
5
  SHA512:
6
- metadata.gz: 82c05e3734be530a5f8e9059fdd69959598f76ad66cbd3ce2cb9af7120e3d24cc9839eb79a7168417273404e7dd20a74a1cad01d05563f5ce4743f59e6efaaba
7
- data.tar.gz: db18939b31a5cb899cdc8ca5872ba355e21ef3f597106c9996debc44697fa08ab8c7d1834bbbdff0c3373939c034334e5897a514ab0a45a2b9c91c8410d584fa
6
+ metadata.gz: 6573282576cb66e87d654a4d24f8361c6b53afe6361fb7656e26fa3df941577519f952a65a6f3ff7c6583229a1382b75e026c77bec14355ca51280b9592d4dcc
7
+ data.tar.gz: 3a8e8eb68e217614c406ce547e6a204463636d2457615f8432235c2f86fbba0c32bc2b5765f7a97dd71bcda71f9ab6d5d2101eb758ac6c965f9233ce5a58deeb
data/lib/crabbs.rb CHANGED
@@ -8,9 +8,9 @@ module Crabbs
8
8
  class << self
9
9
  attr_reader :crawler
10
10
 
11
- def start(url)
12
- @crawler = Crabbs::Crawler.new
13
- @crawler.crawl url
11
+ def start(options)
12
+ @crawler = Crabbs::Crawler.new options
13
+ @crawler.crawl options[:url]
14
14
  @crawler.site_map
15
15
  end
16
16
  end
data/lib/crabbs/cli.rb CHANGED
@@ -7,8 +7,8 @@ module Crabbs
7
7
  def start
8
8
  begin
9
9
  opts = parse_options
10
- result = Crabbs.start(opts[:url])
11
- STDOUT.puts result.to_json
10
+ result = Crabbs.start({ url: opts[:url], verbose: opts[:verbose] })
11
+ STDOUT.puts "\nResult:\n#{result.to_json}"
12
12
  rescue Slop::MissingOptionError => e
13
13
  STDOUT.puts e.message
14
14
  end
@@ -21,6 +21,7 @@ module Crabbs
21
21
  banner 'Usage: crabbs [options]'
22
22
 
23
23
  on 'u', 'url=', 'URL to start crawling', required: true
24
+ on 'v', 'verbose', 'Shows URLs being crawled', default: false
24
25
  end
25
26
  end
26
27
  end
@@ -4,18 +4,19 @@ module Crabbs
4
4
  class Crawler
5
5
  attr_reader :visited, :site_map
6
6
 
7
- def initialize()
7
+ def initialize(options={})
8
8
  @visited = []
9
9
  @site_map = {}
10
+ @options = options
10
11
  end
11
12
 
12
13
  def crawl(uri_string)
13
- recurse uri_string, @site_map
14
+ recursively_crawl uri_string, @site_map
14
15
  end
15
16
 
16
17
  private
17
18
 
18
- def recurse(uri_string, hash)
19
+ def recursively_crawl(uri_string, hash)
19
20
  hash[uri_string] = Hash.new
20
21
 
21
22
  return if (@visited.include? uri_string)
@@ -24,19 +25,19 @@ module Crabbs
24
25
  @visited << uri_string
25
26
 
26
27
  links.each do |link|
27
- recurse(link, hash[uri_string])
28
+ recursively_crawl(link, hash[uri_string])
28
29
  end
29
30
  end
30
31
 
31
32
  def extract_links(uri_string)
32
- begin
33
- uri = URI.parse(uri_string)
34
- page = Crabbs::Page.new(Net::HTTP.get(uri), uri.to_s)
35
-
36
- page.links
37
- rescue URI::InvalidURIError
38
- []
39
- end
33
+ uri = URI.parse(uri_string)
34
+ STDOUT.puts "Visiting: #{uri_string}" if @options[:verbose]
35
+ STDOUT.putc '.' unless @options[:verbose]
36
+ page = Crabbs::Page.new(Net::HTTP.get(uri), uri.to_s)
37
+
38
+ page.links
39
+ rescue URI::InvalidURIError
40
+ []
40
41
  end
41
42
  end
42
43
  end
@@ -0,0 +1,40 @@
1
+ module Crabbs
2
+ class Link
3
+
4
+ def initialize(href)
5
+ @href = href
6
+ @uri = URI.parse(href)
7
+ end
8
+
9
+ def same_host_as?(url)
10
+ @uri.host == URI.parse(url).host or @uri.host.nil?
11
+ end
12
+
13
+ def has_valid_fragment?
14
+ @uri.fragment.nil? or not @uri.fragment.empty?
15
+ end
16
+
17
+ def has_html_extension?
18
+ link = @href
19
+ if not @uri.host.nil?
20
+ link = @href.sub(@uri.host, '')
21
+ end
22
+
23
+ extension = File.extname(link)
24
+ extension.empty? or extension == '.html'
25
+ end
26
+
27
+ def join(url)
28
+ new_uri = @uri
29
+
30
+ if @uri.host.nil?
31
+ new_uri = URI.parse url
32
+ new_uri = URI.join(new_uri.to_s, @uri.path) unless @uri.path.nil?
33
+ new_uri = URI.join(new_uri.to_s, "?#{@uri.query}") unless @uri.query.nil?
34
+ new_uri = URI.join(new_uri.to_s, "##{@uri.fragment}") unless @uri.fragment.nil?
35
+ end
36
+
37
+ new_uri.to_s
38
+ end
39
+ end
40
+ end
data/lib/crabbs/page.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'nokogiri'
2
+ require 'crabbs/link'
2
3
 
3
4
  module Crabbs
4
5
  class Page
@@ -8,38 +9,14 @@ module Crabbs
8
9
  end
9
10
 
10
11
  def links
11
- uri_list = @html.css('a[href]').map { |a| URI.parse(a['href']) }
12
-
13
- valid_uris = uri_list
14
- .select { |uri| uri.host == URI.parse(@url).host or uri.host.nil? }
15
- .select { |uri| uri.fragment.nil? or not uri.fragment.empty? }
16
-
17
- links = create_full_uri_links(valid_uris)
18
-
19
- links = links.select do |link|
20
- host = URI.parse(link).host
21
- extension = File.extname(link.sub(host, ''))
22
- extension.empty? or extension == '.html'
23
- end
24
-
25
- links.uniq
26
- end
27
-
28
- private
29
-
30
- def create_full_uri_links(uri_list)
31
- uri_list.map do |uri|
32
- new_uri = uri
33
-
34
- if uri.host.nil?
35
- new_uri = URI.parse @url
36
- new_uri = URI.join(new_uri.to_s, uri.path) unless uri.path.nil?
37
- new_uri = URI.join(new_uri.to_s, "?#{uri.query}") unless uri.query.nil?
38
- new_uri = URI.join(new_uri.to_s, "##{uri.fragment}") unless uri.fragment.nil?
39
- end
40
-
41
- new_uri.to_s
42
- end
12
+ links = @html.css('a[href]').map { |a| Crabbs::Link.new a['href'] }
13
+
14
+ links
15
+ .select { |link| link.same_host_as? @url }
16
+ .select(&:has_valid_fragment?)
17
+ .select(&:has_html_extension?)
18
+ .map { |link| link.join @url }
19
+ .uniq
43
20
  end
44
21
  end
45
22
  end
@@ -1,3 +1,3 @@
1
1
  module Crabbs
2
- VERSION = "0.0.2"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -1,4 +1,5 @@
1
1
  require 'crabbs/cli'
2
+ require 'webmock/rspec'
2
3
 
3
4
  describe Crabbs::CLI do
4
5
  subject { Crabbs::CLI.new }
@@ -25,8 +26,25 @@ describe Crabbs::CLI do
25
26
 
26
27
  subject.start
27
28
 
28
- expect(Crabbs).to have_received(:start).with('https://example.com')
29
- expect(STDOUT).to have_received(:puts).with('result'.to_json)
29
+ expect(Crabbs).to have_received(:start).with({ url: 'https://example.com', verbose: false })
30
+ expect(STDOUT).to have_received(:puts).with("\nResult:\n#{'result'.to_json}")
31
+ end
32
+ end
33
+
34
+ context 'integration with crabbs' do
35
+ before do
36
+ stub_request(:get, "http://example.com/").to_return(:body => "")
37
+ end
38
+
39
+ it 'does not break contract' do
40
+ ARGV.replace ['--url=http://example.com']
41
+ allow(STDOUT).to receive(:puts)
42
+
43
+ subject.start
44
+
45
+ output = "\nResult:\n#{{ 'http://example.com' => {} }.to_json}"
46
+
47
+ expect(STDOUT).to have_received(:puts).with(output)
30
48
  end
31
49
  end
32
50
  end
@@ -11,7 +11,7 @@ describe Crabbs do
11
11
  end
12
12
 
13
13
  it 'starts crawling' do
14
- subject.start('http://example.com').should == { 'http://example.com' => {} }
14
+ subject.start({ url: 'http://example.com' }).should == { 'http://example.com' => {} }
15
15
  end
16
16
  end
17
17
  end
@@ -4,7 +4,48 @@ require 'crabbs/crawler'
4
4
 
5
5
  describe Crabbs::Crawler do
6
6
  describe '#crawl' do
7
- subject { Crabbs::Crawler.new }
7
+ let(:options) { Hash.new }
8
+ subject { Crabbs::Crawler.new options }
9
+
10
+ context 'when verbose' do
11
+ let(:options) { { verbose: true } }
12
+
13
+ before do
14
+ @uri_string = 'http://example.com/'
15
+ stub_request(:get, @uri_string).to_return(body: %Q{<a href="/path"></a><a href="/local"></a><a href="http://fb.com/"></a>})
16
+ stub_request(:get, "http://example.com/path").to_return(body: "")
17
+ stub_request(:get, "http://example.com/local").to_return(body: "")
18
+ end
19
+
20
+ it 'logs the url been visited' do
21
+ allow(STDOUT).to receive(:puts)
22
+
23
+ subject.crawl @uri_string
24
+
25
+ expect(STDOUT).to have_received(:puts).with('Visiting: http://example.com/')
26
+ expect(STDOUT).to have_received(:puts).with('Visiting: http://example.com/path')
27
+ expect(STDOUT).to have_received(:puts).with('Visiting: http://example.com/local')
28
+ end
29
+ end
30
+
31
+ context 'when not verbose' do
32
+ let(:options) { { verbose: false } }
33
+
34
+ before do
35
+ @uri_string = 'http://example.com/'
36
+ stub_request(:get, @uri_string).to_return(body: %Q{<a href="/path"></a><a href="/local"></a><a href="http://fb.com/"></a>})
37
+ stub_request(:get, "http://example.com/path").to_return(body: "")
38
+ stub_request(:get, "http://example.com/local").to_return(body: "")
39
+ end
40
+
41
+ it 'logs the url been visited' do
42
+ allow(STDOUT).to receive(:putc)
43
+
44
+ subject.crawl @uri_string
45
+
46
+ expect(STDOUT).to have_received(:putc).with('.').exactly(3).times
47
+ end
48
+ end
8
49
 
9
50
  context 'an invalid URI' do
10
51
  it 'stores single entry site map' do
@@ -0,0 +1,151 @@
1
+ require 'crabbs/link'
2
+
3
+ describe Crabbs::Link do
4
+ subject { Crabbs::Link.new href }
5
+
6
+ describe '#same_host_as?' do
7
+ context 'when it is the same host' do
8
+ let(:host) { 'http://example.com/path' }
9
+ let(:href) { 'http://example.com/path' }
10
+
11
+ it 'is the same host' do
12
+ subject.same_host_as?(host).should be_true
13
+ end
14
+ end
15
+
16
+ context 'when href have no host' do
17
+ let(:host) { 'http://example.com/path' }
18
+ let(:href) { '/path' }
19
+
20
+ it 'assumes its the same host' do
21
+ subject.same_host_as?(host).should be_true
22
+ end
23
+ end
24
+
25
+ context 'when host is a subdomain' do
26
+ let(:host) { 'http://example.com/path' }
27
+ let(:href) { 'http://subdomain.example.com/path' }
28
+
29
+ it 'is not the same host' do
30
+ subject.same_host_as?(host).should be_false
31
+ end
32
+ end
33
+
34
+ context 'when host is completely different' do
35
+ let(:host) { 'http://example.com/path' }
36
+ let(:href) { 'http://facebook.com/path' }
37
+
38
+ it 'is not the same host' do
39
+ subject.same_host_as?(host).should be_false
40
+ end
41
+ end
42
+ end
43
+
44
+ describe '#has_valid_fragment?' do
45
+ context 'when the fragment is empty' do
46
+ let(:href) { 'http://example.com/path#' }
47
+
48
+ it 'is not valid' do
49
+ subject.has_valid_fragment?.should be_false
50
+ end
51
+ end
52
+
53
+ context 'when there is no fragment' do
54
+ let(:href) { 'http://example.com/path' }
55
+
56
+ it 'is valid' do
57
+ subject.has_valid_fragment?.should be_true
58
+ end
59
+ end
60
+
61
+ context 'when there is a fragment' do
62
+ let(:href) { 'http://example.com/path#fragment' }
63
+
64
+ it 'is valid' do
65
+ subject.has_valid_fragment?.should be_true
66
+ end
67
+ end
68
+ end
69
+
70
+ describe '#has_html_extension?' do
71
+ context 'when the extension is not html' do
72
+ let(:href) { 'http://example.com/path.zip' }
73
+
74
+ it 'has no html extension' do
75
+ subject.has_html_extension?.should be_false
76
+ end
77
+ end
78
+
79
+ context 'when the extension html' do
80
+ let(:href) { 'http://example.com/path.html' }
81
+
82
+ it 'has html extension' do
83
+ subject.has_html_extension?.should be_true
84
+ end
85
+ end
86
+
87
+ context 'when there is no extension' do
88
+ let(:href) { 'http://example.com/path' }
89
+
90
+ it 'assumes to be a html link' do
91
+ subject.has_html_extension?.should be_true
92
+ end
93
+ end
94
+ end
95
+
96
+ describe '#join' do
97
+ context 'when the href is a root path' do
98
+ let(:href) { '/path' }
99
+ let(:url) { 'http://example.com/' }
100
+
101
+ it 'joins with the url' do
102
+ subject.join(url).should == 'http://example.com/path'
103
+ end
104
+ end
105
+
106
+ context 'when the href is a direct path' do
107
+ let(:href) { 'path' }
108
+ let(:url) { 'http://example.com/test/' }
109
+
110
+ it 'appends to the url previous path' do
111
+ subject.join(url).should == 'http://example.com/test/path'
112
+ end
113
+ end
114
+
115
+ context 'when the href is a fragment (hash)' do
116
+ let(:href) { '#fragment' }
117
+ let(:url) { 'http://example.com/test/' }
118
+
119
+ it 'appends to the url previous path' do
120
+ subject.join(url).should == 'http://example.com/test/#fragment'
121
+ end
122
+ end
123
+
124
+ context 'when the href is a query parameter' do
125
+ let(:href) { '?parameter' }
126
+ let(:url) { 'http://example.com/test/' }
127
+
128
+ it 'appends to the url previous path' do
129
+ subject.join(url).should == 'http://example.com/test/?parameter'
130
+ end
131
+ end
132
+
133
+ context 'when the href is both (fragment + query)' do
134
+ let(:href) { '#fragment?parameter' }
135
+ let(:url) { 'http://example.com/test/' }
136
+
137
+ it 'appends to the url previous path' do
138
+ subject.join(url).should == 'http://example.com/test/#fragment?parameter'
139
+ end
140
+ end
141
+
142
+ context 'when the href is a full uri' do
143
+ let(:href) { 'http://example.com/test/#fragment?parameter' }
144
+ let(:url) { 'http://example.com/test/' }
145
+
146
+ it 'leaves it untouched' do
147
+ subject.join(url).should == 'http://example.com/test/#fragment?parameter'
148
+ end
149
+ end
150
+ end
151
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crabbs
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Bruno Trecenti
@@ -157,11 +157,13 @@ files:
157
157
  - lib/crabbs.rb
158
158
  - lib/crabbs/cli.rb
159
159
  - lib/crabbs/crawler.rb
160
+ - lib/crabbs/link.rb
160
161
  - lib/crabbs/page.rb
161
162
  - lib/crabbs/version.rb
162
163
  - spec/crabbs/cli_spec.rb
163
164
  - spec/crabbs/crabbs_spec.rb
164
165
  - spec/crabbs/crawler_spec.rb
166
+ - spec/crabbs/link_spec.rb
165
167
  - spec/crabbs/page_spec.rb
166
168
  homepage: http://github.com/Trecenti/crabbs
167
169
  licenses:
@@ -191,4 +193,5 @@ test_files:
191
193
  - spec/crabbs/cli_spec.rb
192
194
  - spec/crabbs/crabbs_spec.rb
193
195
  - spec/crabbs/crawler_spec.rb
196
+ - spec/crabbs/link_spec.rb
194
197
  - spec/crabbs/page_spec.rb