webspinne 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.csv
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in webspinne.gemspec
4
+ gemspec
@@ -0,0 +1,24 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
# Watch rules for the rspec guard. Each `watch` takes a path or a regexp; when
# a block is given it receives the match data `m` and returns the spec path(s)
# to run. Rules without a block presumably run the changed file itself -
# confirm against the guard-rspec documentation.
guard 'rspec' do
  watch(%r{^spec/.+_spec\.rb$})
  # lib/foo/bar.rb -> spec/lib/foo/bar_spec.rb (matches this gem's spec layout)
  watch(%r{^lib/(.+)\.rb$})     { |m| "spec/lib/#{m[1]}_spec.rb" }
  watch('spec/spec_helper.rb')  { "spec" }

  # Rails example
  # NOTE(review): the Rails, Capybara and Turnip rules below come from the
  # sample template; the gem's file list has no app/ or spec/acceptance
  # directory, so they likely never match here.
  watch(%r{^app/(.+)\.rb$})                           { |m| "spec/#{m[1]}_spec.rb" }
  watch(%r{^app/(.*)(\.erb|\.haml)$})                 { |m| "spec/#{m[1]}#{m[2]}_spec.rb" }
  watch(%r{^app/controllers/(.+)_(controller)\.rb$})  { |m| ["spec/routing/#{m[1]}_routing_spec.rb", "spec/#{m[2]}s/#{m[1]}_#{m[2]}_spec.rb", "spec/acceptance/#{m[1]}_spec.rb"] }
  watch(%r{^spec/support/(.+)\.rb$})                  { "spec" }
  watch('config/routes.rb')                           { "spec/routing" }
  watch('app/controllers/application_controller.rb')  { "spec/controllers" }

  # Capybara features specs
  watch(%r{^app/views/(.+)/.*\.(erb|haml)$})          { |m| "spec/features/#{m[1]}_spec.rb" }

  # Turnip features and steps
  watch(%r{^spec/acceptance/(.+)\.feature$})
  # Falls back to the whole acceptance directory when no matching .feature file exists.
  watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$})   { |m| Dir[File.join("**/#{m[1]}.feature")][0] || 'spec/acceptance' }
end
24
+
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Jens Bissinger
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,39 @@
1
+ # Webspinne
2
+
3
+ Webspinne analyzes websites by counting their pages.
4
+
5
+ Named after [http://de.wikipedia.org/wiki/Webspinnen](http://de.wikipedia.org/wiki/Webspinnen).
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'webspinne'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install webspinne
20
+
21
+ ## Usage
22
+
23
+ From the command line:
24
+
25
+ $ webspinne http://www.wikipedia.org 1000
26
+
27
+ This makes webspinne visit the Wikipedia website and issue up to 1,000 web requests, following the links and sublinks found on its pages. The links found are automatically written to a file called `www.wikipedia.org.csv`.
28
+
29
+ ## Contributing
30
+
31
+ 1. Fork it
32
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
33
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
34
+ 4. Push to the branch (`git push origin my-new-feature`)
35
+ 5. Create new Pull Request
36
+
37
+ ## License
38
+
39
+ Copyright (c) 2012 Jens Bissinger. See [LICENSE.txt](LICENSE.txt)
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Command line entry point: crawls a site, prints link statistics and writes
# a CSV report named after the site's hostname.
#
# Usage: webspinne [uri] [max_follows]
lib = File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'webspinne'

# parameters
uri = ARGV[0] || 'http://www.google.de'
begin
  # Integer() rejects non-numeric input, which String#to_i would silently
  # turn into 0 (i.e. "crawl nothing"); base 10 avoids octal surprises.
  max_follows = Integer(ARGV[1] || '10', 10)
rescue ArgumentError
  abort "max_follows must be an integer, got #{ARGV[1].inspect}"
end

# visit
visit = Webspinne.visit uri, max_follows

puts "=== FINISHED ==="

# stats
visit.index.tap do |i|
  puts "links (total) = #{i.size}"
  puts "visited / unvisited = #{i.visited_size} / #{i.unvisited_size}"
  puts "onsite / offsite = #{i.onsite_size} / #{i.offsite_size}"
end

# csv report
report = Webspinne.csv_report(visit.index)
# The hostname is nil when the start URI has no host part (e.g. the scheme
# was omitted); fall back to a fixed name instead of crashing on nil + String.
report.to_file("#{visit.site.hostname || 'webspinne'}.csv")
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'sinatra'
4
+
5
# Wraps the markup returned by the given block in a minimal HTML5 page and
# returns the page as a string.
#
# NOTE(review): the whitespace inside the literal is part of the response
# body. Its indentation (4/6 spaces) was reconstructed from the Content-Length
# values recorded in spec/fixtures/vcr/localhost.yml - confirm against the
# original file before re-recording the cassette.
def html
  %Q{<!DOCTYPE html>
    <html>
      <head><title>dummy</title></head>
      <body>#{yield}</body>
    </html>}
end
12
+
13
# Renders +uri+ as an anchor whose label is the URI itself, followed by a
# line break. The value is inserted verbatim (no HTML escaping).
def link_to(uri)
  format("<a href='%<target>s'>%<target>s</a><br/>", target: uri)
end
16
+
17
# Entry page: a single link into the site.
get '/' do
  html { link_to('/foo') }
end

# Links to itself, an existing page, a path with no route (/baz) and an
# external host.
get '/foo' do
  html do
    %w[/foo /bar /baz http://foo.bar.baz].map { |target| link_to(target) }.join
  end
end

# Links back to /foo, to a subdomain and to the numbered pages /0 .. /14.
# NOTE(review): only /0 .. /9 are routed below, so /10 .. /14 have no handler -
# presumably intentional to exercise failed visits; confirm.
get '/bar' do
  html do
    targets = ['/foo', 'http://subdomain.localhost:4567/']
    targets.concat((0...15).map { |n| "/#{n}" })
    targets.map { |target| link_to(target) }.join
  end
end

# Numbered leaf pages, each linking back to /foo and /bar.
(0...10).each do |n|
  get "/#{n}" do
    html { link_to('/foo') + link_to('/bar') }
  end
end
@@ -0,0 +1,20 @@
1
+ require "webspinne/version"
2
+ require "mechanize"
3
+
4
# Top-level namespace of the gem plus its two convenience entry points.
module Webspinne
  # Raised when a link that is already marked as visited is visited again.
  DoubleVisitError = Class.new(StandardError)

  # Components are registered for lazy loading on first reference.
  autoload :Visit,     "webspinne/visit"
  autoload :Index,     "webspinne/index"
  autoload :Site,      "webspinne/site"
  autoload :Link,      "webspinne/link"
  autoload :CSVReport, "webspinne/csv_report"

  class << self
    # Crawls the site at +uri+, following at most +max_follows+ links.
    # Returns whatever Visit#run returns.
    def visit(uri, max_follows)
      crawl = Visit.new(uri)
      crawl.run(max_follows)
    end

    # Builds a CSV report object for the given link index.
    def csv_report(index)
      CSVReport.new(index)
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'csv'
2
+
3
module Webspinne
  # Writes a link index to disk as CSV, one row per link.
  class CSVReport < Struct.new(:index)
    # Creates (or overwrites) the file at +path+ with a header row followed by
    # the URI, onsite flag and visited flag of every link in the index.
    def to_file(path)
      CSV.open(path, "wb") do |out|
        out << %w[uri onsite visited]
        index.each { |entry| out << [entry.uri, entry.onsite?, entry.visited?] }
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
module Webspinne
  # Collection of links keyed by URI. Every URI is stored at most once and
  # links are enumerated in the order in which they were first added.
  class Index
    include Enumerable

    def initialize
      @list = {}
    end

    # Yields every stored link in insertion order and returns the index.
    # Without a block an Enumerator is returned, as Enumerable callers expect.
    def each(&block)
      return enum_for(:each) unless block

      @list.each_value(&block)
      self
    end

    # Defensive set: adds +link+ unless a link with the same URI is already
    # stored. Returns the stored link (the existing one for duplicates).
    def <<(link)
      @list[link.uri] ||= link
    end

    # Number of distinct URIs in the index.
    def size
      @list.size
    end

    # First link that is onsite and not yet visited, or nil if there is none.
    def next_unvisited_onsite_link
      find { |link| !link.visited? && link.onsite? }
    end

    # Number of links already visited.
    def visited_size
      count(&:visited?)
    end

    # Number of links not yet visited.
    def unvisited_size
      size - visited_size
    end

    # Number of links flagged as onsite.
    def onsite_size
      count(&:onsite?)
    end

    # Number of links not flagged as onsite.
    def offsite_size
      size - onsite_size
    end
  end
end
@@ -0,0 +1,31 @@
1
module Webspinne
  # A discovered URI together with two flags: whether it belongs to the
  # crawled site and whether it has already been visited.
  class Link
    attr_reader :uri, :onsite

    # uri    - the link target as found on the page
    # onsite - truthy when the target belongs to the crawled site; stored as a
    #          strict boolean so that #onsite and #onsite? always agree
    def initialize(uri, onsite = false)
      @uri = uri
      @onsite = onsite ? true : false
      @visited = false
    end

    # Marks the link as visited.
    # Raises DoubleVisitError (with the URI as message) if it already was.
    def visited!
      raise DoubleVisitError, uri if visited?

      @visited = true
    end

    # True once #visited! has been called.
    def visited?
      @visited
    end

    # True when the link belongs to the crawled site.
    def onsite?
      onsite
    end

    # Negation of #onsite?.
    def offsite?
      !onsite?
    end
  end
end
@@ -0,0 +1,15 @@
1
require 'uri'

module Webspinne
  # The site being crawled, identified by its start URI. Decides whether other
  # URIs point to the same host.
  class Site < Struct.new(:uri)
    # True when +test_uri+ belongs to this site, i.e. it is a relative
    # reference (no scheme, no host) or its host equals the site's host
    # (compared case-insensitively, since host names are case-insensitive).
    #
    # Hostless absolute URIs such as mailto: or javascript: links and URIs
    # that cannot be parsed are reported as offsite so they never get crawled.
    def onsite?(test_uri)
      candidate = parse_or_nil(test_uri)
      return false unless candidate

      host = candidate.hostname
      # No host: onsite only for relative references, not for e.g. mailto:.
      return candidate.scheme.nil? unless host

      own = hostname
      !own.nil? && own.downcase == host.downcase
    end

    # Negation of #onsite?.
    def offsite?(test_uri)
      !onsite?(test_uri)
    end

    # Host part of +test_uri+ (defaults to the site's own URI), or nil when
    # the URI has none. Raises URI::InvalidURIError for unparsable input.
    def hostname(test_uri = uri)
      URI(test_uri).hostname
    end

    private

    # Parses +test_uri+, returning nil instead of raising when it is malformed.
    def parse_or_nil(test_uri)
      URI(test_uri)
    rescue URI::InvalidURIError
      nil
    end
  end
end
@@ -0,0 +1,3 @@
1
module Webspinne
  # Release version of the gem; frozen so it cannot be mutated at runtime.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,58 @@
1
module Webspinne
  # One crawl of a site: starting from the site's URI it fetches unvisited
  # onsite links in the order they were discovered and records every link
  # found along the way in an Index.
  class Visit
    attr_reader :site, :agent, :index

    # uri - start URI of the site to crawl
    def initialize(uri)
      @site = Site.new(uri)
    end

    # Crawls the site, fetching at most +max_follows+ pages. Progress is
    # printed to stdout ('.' per fetched page, 'f' per failure).
    # Returns self so callers can inspect #index and #site afterwards.
    def run(max_follows)
      @agent = Mechanize.new
      @index = Index.new

      puts "visiting #{site.uri} - following max. #{max_follows} links"

      plan_visit site.uri
      max_follows.times do
        link = index.next_unvisited_onsite_link
        break unless link # no more unvisited onsite links

        print(exec_visit(link) ? '.' : 'f')
      end

      puts

      self
    end

    # Fetches +link+ and plans a visit for every link on the fetched page.
    # The link is marked as visited before the request, so a failing page is
    # not picked again. Returns true when the page exposed links, false when
    # it did not or when the fetch raised (the error class is printed).
    #
    # NOTE(review): relative hrefs are indexed as-is, so '/foo' and its
    # absolute form count as two links, and the agent appears to resolve them
    # relative to whatever it fetched last - confirm whether resolving against
    # the page's own URI is wanted.
    def exec_visit(link)
      link.visited!
      page = agent.get(link.uri)

      # e.g. images have no links
      return false unless page.respond_to?(:links)

      page.links.each do |found|
        begin
          href = found.uri.to_s
          plan_visit(href) unless href.empty?
        rescue URI::InvalidURIError
          # A single malformed href must not discard the page's other links.
          next
        end
      end
      true
    rescue => e
      puts e.class
      false
    end

    # Adds +uri+ to the index (no-op for already known URIs), flagged with
    # whether it belongs to the crawled site.
    def plan_visit(uri)
      index << Link.new(uri, site.onsite?(uri))
    end
  end
end
@@ -0,0 +1,297 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://localhost:4567/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ Accept:
11
+ - ! '*/*'
12
+ User-Agent:
13
+ - Mechanize/2.5.1 Ruby/1.9.3p327 (http://github.com/tenderlove/mechanize/)
14
+ Accept-Encoding:
15
+ - gzip,deflate,identity
16
+ Accept-Charset:
17
+ - ISO-8859-1,utf-8;q=0.7,*;q=0.7
18
+ Accept-Language:
19
+ - en-us,en;q=0.5
20
+ Host:
21
+ - localhost:4567
22
+ Connection:
23
+ - keep-alive
24
+ Keep-Alive:
25
+ - 300
26
+ response:
27
+ status:
28
+ code: 200
29
+ message: ! 'OK '
30
+ headers:
31
+ X-Frame-Options:
32
+ - sameorigin
33
+ X-Xss-Protection:
34
+ - 1; mode=block
35
+ Content-Type:
36
+ - text/html;charset=utf-8
37
+ Content-Length:
38
+ - '126'
39
+ Server:
40
+ - WEBrick/1.3.1 (Ruby/1.9.3/2012-11-10)
41
+ Date:
42
+ - Thu, 06 Dec 2012 18:10:20 GMT
43
+ Connection:
44
+ - Keep-Alive
45
+ body:
46
+ encoding: US-ASCII
47
+ string: ! "<!DOCTYPE html>\n <html>\n <head><title>dummy</title></head>\n
48
+ \ <body><a href='/foo'>/foo</a><br/></body>\n </html>"
49
+ http_version:
50
+ recorded_at: Thu, 06 Dec 2012 18:10:20 GMT
51
+ - request:
52
+ method: get
53
+ uri: http://localhost:4567/foo
54
+ body:
55
+ encoding: US-ASCII
56
+ string: ''
57
+ headers:
58
+ Accept:
59
+ - ! '*/*'
60
+ User-Agent:
61
+ - Mechanize/2.5.1 Ruby/1.9.3p327 (http://github.com/tenderlove/mechanize/)
62
+ Accept-Encoding:
63
+ - gzip,deflate,identity
64
+ Accept-Charset:
65
+ - ISO-8859-1,utf-8;q=0.7,*;q=0.7
66
+ Accept-Language:
67
+ - en-us,en;q=0.5
68
+ Host:
69
+ - localhost:4567
70
+ Referer:
71
+ - !ruby/object:URI::HTTP
72
+ scheme: http
73
+ user:
74
+ password:
75
+ host: localhost
76
+ port: 4567
77
+ path: /
78
+ query:
79
+ opaque:
80
+ registry:
81
+ fragment:
82
+ parser:
83
+ Connection:
84
+ - keep-alive
85
+ Keep-Alive:
86
+ - 300
87
+ response:
88
+ status:
89
+ code: 200
90
+ message: ! 'OK '
91
+ headers:
92
+ X-Frame-Options:
93
+ - sameorigin
94
+ X-Xss-Protection:
95
+ - 1; mode=block
96
+ Content-Type:
97
+ - text/html;charset=utf-8
98
+ Content-Length:
99
+ - '238'
100
+ Server:
101
+ - WEBrick/1.3.1 (Ruby/1.9.3/2012-11-10)
102
+ Date:
103
+ - Thu, 06 Dec 2012 18:10:20 GMT
104
+ Connection:
105
+ - Keep-Alive
106
+ body:
107
+ encoding: US-ASCII
108
+ string: ! "<!DOCTYPE html>\n <html>\n <head><title>dummy</title></head>\n
109
+ \ <body><a href='/foo'>/foo</a><br/><a href='/bar'>/bar</a><br/><a href='/baz'>/baz</a><br/><a
110
+ href='http://foo.bar.baz'>http://foo.bar.baz</a><br/></body>\n </html>"
111
+ http_version:
112
+ recorded_at: Thu, 06 Dec 2012 18:10:20 GMT
113
+ - request:
114
+ method: get
115
+ uri: http://localhost:4567/bar
116
+ body:
117
+ encoding: US-ASCII
118
+ string: ''
119
+ headers:
120
+ Accept:
121
+ - ! '*/*'
122
+ User-Agent:
123
+ - Mechanize/2.5.1 Ruby/1.9.3p327 (http://github.com/tenderlove/mechanize/)
124
+ Accept-Encoding:
125
+ - gzip,deflate,identity
126
+ Accept-Charset:
127
+ - ISO-8859-1,utf-8;q=0.7,*;q=0.7
128
+ Accept-Language:
129
+ - en-us,en;q=0.5
130
+ Host:
131
+ - localhost:4567
132
+ Referer:
133
+ - !ruby/object:URI::HTTP
134
+ scheme: http
135
+ user:
136
+ password:
137
+ host: localhost
138
+ port: 4567
139
+ path: /foo
140
+ query:
141
+ opaque:
142
+ registry:
143
+ fragment:
144
+ parser:
145
+ Connection:
146
+ - keep-alive
147
+ Keep-Alive:
148
+ - 300
149
+ response:
150
+ status:
151
+ code: 200
152
+ message: ! 'OK '
153
+ headers:
154
+ X-Frame-Options:
155
+ - sameorigin
156
+ X-Xss-Protection:
157
+ - 1; mode=block
158
+ Content-Type:
159
+ - text/html;charset=utf-8
160
+ Content-Length:
161
+ - '580'
162
+ Server:
163
+ - WEBrick/1.3.1 (Ruby/1.9.3/2012-11-10)
164
+ Date:
165
+ - Thu, 06 Dec 2012 18:10:20 GMT
166
+ Connection:
167
+ - Keep-Alive
168
+ body:
169
+ encoding: US-ASCII
170
+ string: ! "<!DOCTYPE html>\n <html>\n <head><title>dummy</title></head>\n
171
+ \ <body><a href='/foo'>/foo</a><br/><a href='http://subdomain.localhost:4567/'>http://subdomain.localhost:4567/</a><br/><a
172
+ href='/0'>/0</a><br/><a href='/1'>/1</a><br/><a href='/2'>/2</a><br/><a href='/3'>/3</a><br/><a
173
+ href='/4'>/4</a><br/><a href='/5'>/5</a><br/><a href='/6'>/6</a><br/><a href='/7'>/7</a><br/><a
174
+ href='/8'>/8</a><br/><a href='/9'>/9</a><br/><a href='/10'>/10</a><br/><a
175
+ href='/11'>/11</a><br/><a href='/12'>/12</a><br/><a href='/13'>/13</a><br/><a
176
+ href='/14'>/14</a><br/></body>\n </html>"
177
+ http_version:
178
+ recorded_at: Thu, 06 Dec 2012 18:10:20 GMT
179
+ - request:
180
+ method: get
181
+ uri: http://localhost:4567/baz
182
+ body:
183
+ encoding: US-ASCII
184
+ string: ''
185
+ headers:
186
+ Accept:
187
+ - ! '*/*'
188
+ User-Agent:
189
+ - Mechanize/2.5.1 Ruby/1.9.3p327 (http://github.com/tenderlove/mechanize/)
190
+ Accept-Encoding:
191
+ - gzip,deflate,identity
192
+ Accept-Charset:
193
+ - ISO-8859-1,utf-8;q=0.7,*;q=0.7
194
+ Accept-Language:
195
+ - en-us,en;q=0.5
196
+ Host:
197
+ - localhost:4567
198
+ Referer:
199
+ - &70162820747780 !ruby/object:URI::HTTP
200
+ scheme: http
201
+ user:
202
+ password:
203
+ host: localhost
204
+ port: 4567
205
+ path: /bar
206
+ query:
207
+ opaque:
208
+ registry:
209
+ fragment:
210
+ parser:
211
+ Connection:
212
+ - keep-alive
213
+ Keep-Alive:
214
+ - 300
215
+ response:
216
+ status:
217
+ code: 404
218
+ message: ! 'Not Found '
219
+ headers:
220
+ X-Frame-Options:
221
+ - sameorigin
222
+ X-Xss-Protection:
223
+ - 1; mode=block
224
+ Content-Type:
225
+ - text/html;charset=utf-8
226
+ X-Cascade:
227
+ - pass
228
+ Content-Length:
229
+ - '439'
230
+ Server:
231
+ - WEBrick/1.3.1 (Ruby/1.9.3/2012-11-10)
232
+ Date:
233
+ - Thu, 06 Dec 2012 18:10:20 GMT
234
+ Connection:
235
+ - Keep-Alive
236
+ body:
237
+ encoding: US-ASCII
238
+ string: ! "<!DOCTYPE html>\n<html>\n<head>\n <style type=\"text/css\">\n body
239
+ { text-align:center;font-family:helvetica,arial;font-size:22px;\n color:#888;margin:20px}\n
240
+ \ #c {margin:0 auto;width:500px;text-align:left}\n </style>\n</head>\n<body>\n
241
+ \ <h2>Sinatra doesn&rsquo;t know this ditty.</h2>\n <img src='http://localhost:4567/__sinatra__/404.png'>\n
242
+ \ <div id=\"c\">\n Try this:\n <pre>get '/baz' do\n \"Hello World\"\nend</pre>\n
243
+ \ </div>\n</body>\n</html>\n"
244
+ http_version:
245
+ recorded_at: Thu, 06 Dec 2012 18:10:20 GMT
246
+ - request:
247
+ method: get
248
+ uri: http://localhost:4567/0
249
+ body:
250
+ encoding: US-ASCII
251
+ string: ''
252
+ headers:
253
+ Accept:
254
+ - ! '*/*'
255
+ User-Agent:
256
+ - Mechanize/2.5.1 Ruby/1.9.3p327 (http://github.com/tenderlove/mechanize/)
257
+ Accept-Encoding:
258
+ - gzip,deflate,identity
259
+ Accept-Charset:
260
+ - ISO-8859-1,utf-8;q=0.7,*;q=0.7
261
+ Accept-Language:
262
+ - en-us,en;q=0.5
263
+ Host:
264
+ - localhost:4567
265
+ Referer:
266
+ - *70162820747780
267
+ Connection:
268
+ - keep-alive
269
+ Keep-Alive:
270
+ - 300
271
+ response:
272
+ status:
273
+ code: 200
274
+ message: ! 'OK '
275
+ headers:
276
+ X-Frame-Options:
277
+ - sameorigin
278
+ X-Xss-Protection:
279
+ - 1; mode=block
280
+ Content-Type:
281
+ - text/html;charset=utf-8
282
+ Content-Length:
283
+ - '154'
284
+ Server:
285
+ - WEBrick/1.3.1 (Ruby/1.9.3/2012-11-10)
286
+ Date:
287
+ - Thu, 06 Dec 2012 18:10:20 GMT
288
+ Connection:
289
+ - Keep-Alive
290
+ body:
291
+ encoding: US-ASCII
292
+ string: ! "<!DOCTYPE html>\n <html>\n <head><title>dummy</title></head>\n
293
+ \ <body><a href='/foo'>/foo</a><br/><a href='/bar'>/bar</a><br/></body>\n
294
+ \ </html>"
295
+ http_version:
296
+ recorded_at: Thu, 06 Dec 2012 18:10:20 GMT
297
+ recorded_with: VCR 2.3.0
@@ -0,0 +1,38 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Webspinne::Index: size bookkeeping, de-duplication by URI and
# selection of the next link to crawl.
describe Webspinne::Index do
  subject { described_class.new }

  # A fresh index is empty.
  its(:size) { should == 0}
  its(:visited_size) { should == 0}

  context 'with onsite-link' do
    before { subject << Webspinne::Link.new('foo', true) }

    its(:size) { should == 1 }
    its(:next_unvisited_onsite_link) { should be_instance_of(Webspinne::Link) }

    context 'and the same link again' do
      # Same URI as the link added by the outer `before`, so the size stays 1.
      before { subject << Webspinne::Link.new('foo', true) }
      its(:size) { should == 1 }
    end

    context 'and another link' do
      before { subject << Webspinne::Link.new('bar', true) }
      its(:size) { should == 2 }
    end

    context 'visited' do
      # Marks the only stored link as visited.
      before { subject.next_unvisited_onsite_link.visited! }
      its(:visited_size) { should == 1 }
    end
  end

  context 'with offsite-link' do
    before { subject << Webspinne::Link.new('foo', false) }

    # Offsite links are stored but never offered for crawling.
    its(:size) { should == 1 }
    its(:next_unvisited_onsite_link) { should be_nil }
  end

end
@@ -0,0 +1,39 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Webspinne::Site: hostname extraction and onsite/offsite
# classification relative to http://foobar.com.
describe Webspinne::Site do
  subject { described_class.new 'http://foobar.com' }

  it 'returns hostname' do
    # Without an argument the site's own URI is used.
    subject.hostname.should == 'foobar.com'
    subject.hostname('http://www.foobar.com').should == 'www.foobar.com'
    subject.hostname('http://baccio.com').should == 'baccio.com'
    # A relative reference has no host.
    subject.hostname('lalal').should == nil
  end

  # Both predicates are checked so that they stay exact opposites.
  shared_examples_for 'onsite' do |uri|
    specify { subject.onsite?(uri).should be_true }
    specify { subject.offsite?(uri).should be_false }
  end

  shared_examples_for 'offsite' do |uri|
    specify { subject.onsite?(uri).should be_false }
    specify { subject.offsite?(uri).should be_true }
  end

  # onsite
  %w[ http://foobar.com
      http://foobar.com/
      http://foobar.com/x ].each do |uri|
    context uri do
      it_behaves_like 'onsite', uri
    end
  end

  # offsite (a subdomain counts as a different site)
  %w[ http://baccio.com
      http://subdomain.foobar.com ].each do |uri|
    context uri do
      it_behaves_like 'offsite', uri
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'spec_helper'
2
+
3
# End-to-end spec: replays the recorded crawl of examples/dummy_site.rb from
# the 'localhost' VCR cassette.
describe Webspinne do
  it 'visits a site' do
    VCR.use_cassette('localhost') do
      visit = described_class.visit 'http://localhost:4567', 5
      # 21 = start URI + /foo, /bar, /baz + /0 .. /14 + the two external hosts
      visit.index.size.should == 21 # links
      visit.index.visited_size.should == 5 # visited links
      visit.index.unvisited_size.should == 16 # unvisited links
      visit.index.onsite_size.should == 19 # onsite links
      visit.index.offsite_size.should == 2 # offsite links
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
RSpec.configure do |config|
  config.treat_symbols_as_metadata_keys_with_true_values = true
  config.run_all_when_everything_filtered = true
  config.filter_run :focus

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = 'random'
end

# HTTP traffic is replayed from the recorded cassettes instead of hitting the network.
require 'vcr'
require 'webmock'
VCR.configure do |c|
  c.cassette_library_dir = File.expand_path('../../spec/fixtures/vcr', __FILE__)
  c.hook_into :webmock # or :fakeweb
end

# expand_path treats __FILE__ (spec/spec_helper.rb) as a directory component,
# so the gem's lib directory is two levels up; '../lib' resolved to spec/lib.
lib = File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'webspinne'
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'webspinne/version'
5
+
6
# Gem specification for webspinne. The packaged files are taken from git, so
# the gem must be built from a git checkout.
Gem::Specification.new do |gem|
  gem.name          = "webspinne"
  gem.version       = Webspinne::VERSION
  gem.authors       = ["Jens Bissinger"]
  gem.email         = ["mail@jens-bissinger.de"]
  gem.description   = %q{Webspinne analyzes websites by counting their pages.}
  gem.summary       = %q{Uses the mechanize gem to open a website and collects all links and sublinks.}
  gem.homepage      = ""
  # LICENSE.txt is the MIT license; declare it so it shows up in the gem metadata.
  gem.licenses      = ["MIT"]

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "mechanize"
  gem.add_development_dependency "rspec"
  gem.add_development_dependency "guard-rspec"
  gem.add_development_dependency "vcr"
  gem.add_development_dependency "webmock"
  gem.add_development_dependency "sinatra"
end
metadata ADDED
@@ -0,0 +1,171 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webspinne
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jens Bissinger
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-06 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: guard-rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: vcr
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: webmock
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: sinatra
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Webspinne analyzes websites by counting their pages.
111
+ email:
112
+ - mail@jens-bissinger.de
113
+ executables:
114
+ - webspinne
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - .gitignore
119
+ - .rspec
120
+ - Gemfile
121
+ - Guardfile
122
+ - LICENSE.txt
123
+ - README.md
124
+ - Rakefile
125
+ - bin/webspinne
126
+ - examples/dummy_site.rb
127
+ - fonic.de.csv
128
+ - lib/webspinne.rb
129
+ - lib/webspinne/csv_report.rb
130
+ - lib/webspinne/index.rb
131
+ - lib/webspinne/link.rb
132
+ - lib/webspinne/site.rb
133
+ - lib/webspinne/version.rb
134
+ - lib/webspinne/visit.rb
135
+ - localhost.csv
136
+ - spec/fixtures/vcr/localhost.yml
137
+ - spec/lib/webspinne/index_spec.rb
138
+ - spec/lib/webspinne/site_spec.rb
139
+ - spec/lib/webspinne_spec.rb
140
+ - spec/spec_helper.rb
141
+ - webspinne.gemspec
142
+ homepage: ''
143
+ licenses: []
144
+ post_install_message:
145
+ rdoc_options: []
146
+ require_paths:
147
+ - lib
148
+ required_ruby_version: !ruby/object:Gem::Requirement
149
+ none: false
150
+ requirements:
151
+ - - ! '>='
152
+ - !ruby/object:Gem::Version
153
+ version: '0'
154
+ required_rubygems_version: !ruby/object:Gem::Requirement
155
+ none: false
156
+ requirements:
157
+ - - ! '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ requirements: []
161
+ rubyforge_project:
162
+ rubygems_version: 1.8.23
163
+ signing_key:
164
+ specification_version: 3
165
+ summary: Uses the mechanize gem to open a website and collects all links and sublinks.
166
+ test_files:
167
+ - spec/fixtures/vcr/localhost.yml
168
+ - spec/lib/webspinne/index_spec.rb
169
+ - spec/lib/webspinne/site_spec.rb
170
+ - spec/lib/webspinne_spec.rb
171
+ - spec/spec_helper.rb