clownfish 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +80 -0
- data/Rakefile +6 -0
- data/clownfish.gemspec +22 -0
- data/ideas.txt +8 -0
- data/lib/clownfish.rb +10 -0
- data/lib/clownfish/adapter.rb +62 -0
- data/lib/clownfish/anemone_ext.rb +16 -0
- data/lib/clownfish/fish/count.rb +11 -0
- data/lib/clownfish/fish/links_by_page.rb +60 -0
- data/lib/clownfish/fish/response_times.rb +20 -0
- data/lib/clownfish/helpers/status_group.rb +66 -0
- data/lib/clownfish/helpers/url_statuses.rb +39 -0
- data/lib/clownfish/version.rb +3 -0
- data/spec/adapter_spec.rb +144 -0
- data/spec/links_by_page_spec.rb +80 -0
- data/spec/spec_helper.rb +70 -0
- data/spec/status_group_spec.rb +58 -0
- data/spec/url_statuses_spec.rb +91 -0
- metadata +99 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MGYzYmQ4YjNlYmEzNDAxODEwM2M5ODMyZjE2NzQ2N2MwOTczODIwZQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NDBiYjYzMzcwZTQ2MzExYWRkY2ZhMzA2NzFkMWNiODU1ODQwY2MxOQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NTk3N2U5OTEwNmI0MGY2ZDVhYWY4OGZiNDRmMDk5OGNhMjg3NTFiMjMxOTlh
|
10
|
+
YjYxZmZlNjQxMGViZWJhZGQzZjk3NjBlZTM4NGRjNDk2YzQxZmJlZTAzMTU2
|
11
|
+
YTBmNTYxNGY1MDJjNzc3OGI3NmJmOGYwN2ZhZTc2MDQwOTQwM2M=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZmFkMTJhODdmMmY5ODllZGY2MGMyOTUxNTRhNWE0MzQ2OGZhZTcxZmQyMTA0
|
14
|
+
YjY5ZjlmNWZiNGJmYTZhYWJjMWRkOWI2MDMwZjg5MzVkODA3NWFjMmQ3ZTQ4
|
15
|
+
NjEzOTUxYjIzMWQwZmFhMTQxYjA5ZmNhNmYxMjY5YzI4OGUyZjg=
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Paul Salaets
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
# Clownfish
|
2
|
+
|
3
|
+
Helper for [Anemone](http://anemone.rubyforge.org/). Makes common crawls easier to repeat.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'clownfish'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install clownfish
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
require 'clownfish'
|
23
|
+
|
24
|
+
clownfish = MyClownfish.new
|
25
|
+
|
26
|
+
Anemone.crawl_with_clownfish(start_url, clownfish)
|
27
|
+
|
28
|
+
# query clownfish for data from crawl
|
29
|
+
```
|
30
|
+
|
31
|
+
## Clownfish Spec
|
32
|
+
|
33
|
+
A clownfish is an object that has one or more of the following instance methods:
|
34
|
+
|
35
|
+
Reference: [Anemone RDocs](http://anemone.rubyforge.org/doc/index.html)
|
36
|
+
|
37
|
+
### anemone_options
|
38
|
+
|
39
|
+
Returns a `Hash` of `Symbol` to values. See [Anemone::Core::DEFAULT_OPTS](http://git.io/wFmCfA) for available options.
|
40
|
+
This is forwarded as the second argument to `Anemone.crawl`. Invoked once before crawl.
|
41
|
+
|
42
|
+
### skip_links_like
|
43
|
+
|
44
|
+
Returns a single `Regexp` or `Array` of `Regexp`. Urls matching any of these will not be crawled. Invoked once before crawl.
|
45
|
+
|
46
|
+
### on_every_page
|
47
|
+
|
48
|
+
Takes one argument, an `Anemone::Page`. Invoked once per page during crawl.
|
49
|
+
|
50
|
+
### focus_crawl
|
51
|
+
|
52
|
+
Takes one argument, an `Anemone::Page`. Returns the links on that page that should be crawled. Invoked once per page during crawl.
|
53
|
+
|
54
|
+
### after_crawl
|
55
|
+
|
56
|
+
Takes one argument, an `Anemone::PageStore`. Invoked once after crawl is done.
|
57
|
+
|
58
|
+
## What's Included
|
59
|
+
|
60
|
+
See [wiki](https://github.com/psalaets/clownfish/wiki) for examples.
|
61
|
+
|
62
|
+
### Clownfish::LinksByPage
|
63
|
+
|
64
|
+
Lists every page that has links, the links and the status code when following those links.
|
65
|
+
|
66
|
+
### Clownfish::ResponseTimes
|
67
|
+
|
68
|
+
Records every URL and its response time.
|
69
|
+
|
70
|
+
### Clownfish::Count
|
71
|
+
|
72
|
+
Count pages.
|
73
|
+
|
74
|
+
## Contributing
|
75
|
+
|
76
|
+
1. Fork it
|
77
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
78
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
79
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
80
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/clownfish.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('lib', File.dirname(__FILE__))
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'clownfish/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name        = "clownfish"
  spec.version     = Clownfish::VERSION
  spec.authors     = ["Paul Salaets"]
  spec.email       = ["psalaets@gmail.com"]

  # Description shown by the registry
  spec.summary     = "Anemone helper"
  spec.description = "Anemone helper making common crawls easier to repeat."
  spec.homepage    = "https://github.com/psalaets/clownfish"
  spec.license     = "MIT"

  # Package contents: everything tracked by git; specs double as test files.
  spec.files         = `git ls-files`.split($/)
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Dependencies
  spec.add_dependency('anemone', '~> 0.7.2')
  spec.add_development_dependency('rspec', '~> 2.12')
end
|
data/ideas.txt
ADDED
data/lib/clownfish.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require "clownfish/version"
|
2
|
+
require "clownfish/adapter"
|
3
|
+
require "clownfish/anemone_ext"
|
4
|
+
|
5
|
+
require "clownfish/helpers/status_group"
|
6
|
+
require "clownfish/helpers/url_statuses"
|
7
|
+
|
8
|
+
require "clownfish/fish/links_by_page"
|
9
|
+
require "clownfish/fish/response_times"
|
10
|
+
require "clownfish/fish/count"
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Clownfish
  # Adapter between Anemone and clownfish objects.
  #
  # Every clownfish hook is optional: the adapter asks the wrapped object via
  # respond_to? and only wires up the hooks that object implements.
  class Adapter
    # Internal: Create an Adapter that wraps a clownfish.
    #
    # clownfish - Object that conforms to clownfish spec. See README.md.
    #
    # Raises ArgumentError if clownfish is nil.
    def initialize(clownfish)
      raise ArgumentError, "clownfish cannot be nil" if clownfish.nil?

      @delegate = clownfish
    end

    # Internal: Forwards Anemone options from clownfish.
    #
    # Returns Hash of Anemone options, never nil. An empty Hash is returned
    # when the clownfish has no anemone_options method or it returns nil/false.
    def anemone_options
      (@delegate.respond_to?(:anemone_options) && @delegate.anemone_options) || {}
    end

    # Internal: Connects clownfish to Anemone.
    #
    # anemone - Instance of Anemone::Core.
    #
    # Returns nothing.
    def hook_into_anemone(anemone)
      wire_up_after_crawl(anemone)
      wire_up_on_every_page(anemone)
      wire_up_focus_crawl(anemone)
      relay_skip_links_like(anemone)
    end

    private

    # Connects delegate's after_crawl to Anemone.
    def wire_up_after_crawl(anemone)
      return unless @delegate.respond_to?(:after_crawl)

      anemone.after_crawl { |page_store| @delegate.after_crawl(page_store) }
    end

    # Connects delegate's on_every_page to Anemone.
    def wire_up_on_every_page(anemone)
      return unless @delegate.respond_to?(:on_every_page)

      anemone.on_every_page { |page| @delegate.on_every_page(page) }
    end

    # Connects delegate's focus_crawl to Anemone. A delegate that returns
    # nil/false is treated as focusing on no links.
    def wire_up_focus_crawl(anemone)
      return unless @delegate.respond_to?(:focus_crawl)

      anemone.focus_crawl { |page| @delegate.focus_crawl(page) || [] }
    end

    # Passes delegate's skip_links_like to Anemone.
    #
    # The delegate may return a single Regexp, an Array of Regexps or nil.
    # The value is normalized to a flat Array and nils are dropped, so a
    # delegate with nothing to skip relays an empty Array rather than [nil].
    def relay_skip_links_like(anemone)
      return unless @delegate.respond_to?(:skip_links_like)

      patterns = [@delegate.skip_links_like].flatten.compact
      anemone.skip_links_like(patterns)
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require "anemone"
|
2
|
+
|
3
|
+
module Anemone
  # Public: Run an Anemone crawl that is driven by a clownfish.
  #
  # The clownfish is wrapped in a Clownfish::Adapter, which supplies the
  # crawl options up front and attaches the clownfish's hooks once Anemone
  # yields its crawler.
  #
  # urls      - String or Array of Strings telling where to start crawl from.
  # clownfish - Object that conforms to clownfish spec. See README.md.
  #
  # Returns nothing meaningful (the value of Anemone.crawl is passed through).
  def self.crawl_with_clownfish(urls, clownfish)
    bridge = Clownfish::Adapter.new(clownfish)
    options = bridge.anemone_options

    crawl(urls, options) { |core| bridge.hook_into_anemone(core) }
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Clownfish
  # Clownfish that counts number of pages on a site. Taken from Anemone.
  class Count
    # Number of pages found. Only meaningful after a crawl (nil before).
    attr_reader :count

    # De-duplicates the page store in place and records how many entries
    # remain.
    #
    # page_store - Object responding to uniq! and size (an
    #              Anemone::PageStore during a real crawl).
    #
    # Returns the Integer count.
    def after_crawl(page_store)
      # uniq! is called only for its side effect. Bang methods may return nil
      # when nothing changed (Array#uniq! does), so its result is not chained.
      page_store.uniq!
      @count = page_store.size
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Clownfish
  # Clownfish that records every link on a page and the response status codes
  # when the links are followed.
  class LinksByPage
    # Hash of url String to UrlStatuses. The values are all links found on page
    # at the key. Pages reached without a referer are grouped under the key
    # '[starting point]'.
    attr_reader :links_by_page

    def initialize
      @links_by_page = {}
    end

    # Options handed to Anemone for this crawl.
    #
    # Returns Hash of Anemone options.
    def anemone_options
      # Not looking at page bodies so don't keep them around
      {:discard_page_bodies => true}
    end

    # Records the crawled page as a link of the page that referred to it.
    #
    # page - Object responding to url, referer and code (an Anemone::Page
    #        during a real crawl).
    def on_every_page(page)
      # First url in crawl has no referer
      referer = page.referer ? page.referer.to_s : '[starting point]'

      links = (@links_by_page[referer] ||= UrlStatuses.new)
      links.add_url(page.url.to_s, page.code)
    end

    # Print links by page.
    #
    # For every page that has at least one matching link this prints the page
    # url, one "<status> <link>" line per matching link and a blank line.
    #
    # options - Hash specifying what and how to report.
    #           :to     - IO to print report to. Defaults to STDOUT.
    #           :status - One or Array of status specifiers. Defaults to :all.
    #                     Only links with these statuses will be reported. See
    #                     Clownfish::StatusGroup for accepted status specifiers.
    def report(options = {})
      options = report_options(options)
      out = options[:to]
      specifiers = options[:status]

      @links_by_page.each do |page, link_statuses|
        link_status_pairs = link_statuses.query(specifiers)

        # Omit pages none of whose links match the requested statuses
        next if link_status_pairs.empty?

        out.puts page
        link_status_pairs.each { |link, status| out.puts "#{status} #{link}" }
        out.puts
      end
    end

    private

    # Fills in defaults for any report option the caller left out.
    def report_options(options)
      defaults = {:to => STDOUT, :status => :all}
      defaults.merge(options)
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Clownfish
  # Clownfish that remembers how long each crawled url took to respond.
  class ResponseTimes
    # Hash of url String to that page's response_time. The original notes
    # give the unit as milliseconds; that comes from Anemone and is not
    # checked here.
    attr_reader :times_by_url

    def initialize
      @times_by_url = Hash.new
    end

    # Anemone options for this crawl: page bodies are never inspected, so
    # Anemone is told to discard them.
    def anemone_options
      { :discard_page_bodies => true }
    end

    # Stores the page's response time under its url.
    #
    # page - Object responding to url and response_time.
    def on_every_page(page)
      address = page.url.to_s
      @times_by_url.store(address, page.response_time)
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Clownfish
  # One or more response status codes. StatusGroups are filled with status
  # specifiers to determine what is in the group.
  #
  # Status specifiers can be Integer status codes like 200, Integer Ranges like
  # 400..404 or any of the following Symbols:
  #   :all          - any HTTP status code (1xx through 5xx)
  #   :success      - 2xx
  #   :redirect     - 3xx
  #   :non_error    - 2xx through 3xx
  #   :client_error - 4xx
  #   :server_error - 5xx
  #   :error        - 4xx through 5xx
  #
  # A Symbol that is not one of the aliases above is stored as-is and never
  # matches an Integer status code.
  class StatusGroup
    # Alias Symbol to the Range of status codes it stands for. Frozen so the
    # shared table cannot be altered at runtime.
    ALIASES = {
      :all          => 100..599, # 1xx included so :all really is "any status"
      :success      => 200..299,
      :redirect     => 300..399,
      :non_error    => 200..399,
      :client_error => 400..499,
      :server_error => 500..599,
      :error        => 400..599
    }.freeze

    # Public: Create a new group.
    #
    # specifiers - One or more status specifiers or an Array of status
    #              specifiers. Nested Arrays are flattened.
    def initialize(*specifiers)
      @members = []

      specifiers.flatten.each do |status|
        self << status
      end
    end

    # Public: Add a status specifier to this group.
    #
    # specifier - A status specifier
    #
    # Note that << binds tighter than .., so a Range literal must be
    # parenthesized: group << (200..204).
    #
    # Returns self for chaining purposes.
    def <<(specifier)
      @members << (resolve_alias(specifier) || specifier)
      self
    end

    # Public: Tells if this group includes a given status code.
    #
    # status - Integer status code
    #
    # Returns true if status is included, false otherwise.
    def include?(status)
      # === lets Integers match by equality and Ranges match by membership
      @members.any? {|m| m === status}
    end

    private

    # Resolves a group alias to its Range.
    #
    # group_alias - Symbol representing a set of status codes.
    #
    # Returns Range specified by group_alias or nil if there is none.
    def resolve_alias(group_alias)
      ALIASES[group_alias]
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Clownfish
  # Collection that pairs each url with the status code it responded with.
  class UrlStatuses
    # Hash of url to status code, in the order urls were added.
    attr_reader :status_codes_by_url

    def initialize
      @status_codes_by_url = Hash.new
    end

    # Remembers status_code for url, replacing any code stored earlier.
    def add_url(url, status_code)
      @status_codes_by_url.store(url, status_code)
    end

    # Yields every url/status code pair to the given block.
    def each(&block)
      @status_codes_by_url.each(&block)
    end

    # Number of urls recorded.
    def size
      @status_codes_by_url.length
    end

    # True when no url has been recorded yet.
    def empty?
      size.zero?
    end

    # Public: Selects the url/status code pairs whose status code belongs to
    # the given status group.
    #
    # status_group_specifiers - One, many or an Array of status group specifiers
    #                           as accepted by StatusGroup.new.
    #
    # Returns Array of [url, status code] pairs that match status specifiers.
    def query(*status_group_specifiers)
      wanted = StatusGroup.new(status_group_specifiers)
      all_pairs = @status_codes_by_url.to_a

      all_pairs.select { |_url, code| wanted.include?(code) }
    end
  end
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Clownfish
  describe Adapter do
    # Mock clownfish; each example stubs only the hooks it cares about.
    let(:delegate) { double('delegate') }
    # Plain object that implements none of the clownfish hooks.
    let(:hookless) { Object.new }

    context ".new" do
      it "doesn't accept nil delegate" do
        expect { Adapter.new(nil) }.to raise_error(ArgumentError)
      end
    end

    context "#anemone_options" do
      it "forwards anemone_options from delegate" do
        delegate.stub(:anemone_options).and_return({:name => 'bob'})

        Adapter.new(delegate).anemone_options.should eq({:name => 'bob'})
      end

      it "returns empty Hash if delegate has no options" do
        delegate.stub(:anemone_options).and_return(nil)

        Adapter.new(delegate).anemone_options.should eq({})
      end

      it "returns empty Hash if delegate doesn't support anemone_options" do
        Adapter.new(hookless).anemone_options.should eq({})
      end
    end

    context "hooking into Anemone" do
      let(:page_store)  { Object.new }
      let(:first_page)  { Object.new }
      let(:second_page) { Object.new }
      let(:anemone)     { FakeAnemone.new(page_store, first_page, second_page) }

      # The "ignores ..." examples have no expectation of their own: they pass
      # as long as hooking a hookless clownfish does not raise.

      it "wires up after_crawl when delegate supports it" do
        delegate.should_receive(:after_crawl).with(page_store).once

        Adapter.new(delegate).hook_into_anemone(anemone)
      end

      it "ignores after_crawl when not supported" do
        Adapter.new(hookless).hook_into_anemone(anemone)
      end

      it "wires up on_every_page when delegate supports it" do
        delegate.should_receive(:on_every_page).with(first_page).once
        delegate.should_receive(:on_every_page).with(second_page).once

        Adapter.new(delegate).hook_into_anemone(anemone)
      end

      it "ignores on_every_page when not supported" do
        Adapter.new(hookless).hook_into_anemone(anemone)
      end

      it "wires up focus_crawl when delegate supports it" do
        delegate.should_receive(:focus_crawl).with(first_page).once.and_return(['url1'])

        Adapter.new(delegate).hook_into_anemone(anemone)

        anemone.last_focus_crawl_links.should eq(['url1'])
      end

      it "focuses on no links when delegate doesn't focus on any" do
        delegate.should_receive(:focus_crawl).with(first_page).and_return(nil)

        Adapter.new(delegate).hook_into_anemone(anemone)

        anemone.last_focus_crawl_links.should eq([])
      end

      it "ignores focus_crawl when delegate doesn't support it" do
        Adapter.new(hookless).hook_into_anemone(anemone)
      end

      it "relays skip_links_like regex when delegate returns one" do
        delegate.stub(:skip_links_like).and_return(/a/)

        Adapter.new(delegate).hook_into_anemone(anemone)

        anemone.last_skip_links_like_regexes.should eq([/a/])
      end

      it "relays skip_links_like regexes when delegate returns many" do
        delegate.stub(:skip_links_like).and_return([/a/, /b/])

        Adapter.new(delegate).hook_into_anemone(anemone)

        anemone.last_skip_links_like_regexes.should eq([/a/, /b/])
      end

      it "ignores skip_links_like when not supported" do
        Adapter.new(hookless).hook_into_anemone(anemone)
      end
    end # end of hooking into Anemone
  end # end of describe Adapter
end # end of Clownfish module
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'stringio'
|
3
|
+
|
4
|
+
module Clownfish
  describe LinksByPage do
    describe "#report" do
      # home -> links -> (client, server); home itself has no referer.
      let(:home)   { FakePage.new('home.com', 200) }
      let(:links)  { FakePage.new('links.com', 200, 'home.com') }
      let(:client) { FakePage.new('client.com', 404, 'links.com') }
      let(:server) { FakePage.new('server.com', 500, 'links.com') }

      let(:fish) { LinksByPage.new }
      let(:out)  { StringIO.new }

      # Feeds the given pages to the fish in order, as a crawl would.
      def visit(*pages)
        pages.each { |page| fish.on_every_page(page) }
      end

      it "reports all statuses by default" do
        visit(client, server)

        fish.report(:to => out)

        out.string.should match(%r{404 http://client.com\n500 http://server.com})
      end

      it "reports specified status when specifier given" do
        visit(client, server)

        fish.report(:to => out, :status => :server_error)

        out.string.should match(%r{500 http://server.com})
        out.string.should_not match(/404/)
      end

      it "reports specified statuses when many specified" do
        visit(links, client, server)

        fish.report(:to => out, :status => [500, 200..204])

        out.string.should match(%r{200 http://links.com})
        out.string.should match(%r{500 http://server.com})
        out.string.should_not match(/404/)
      end

      it "omits page if none of its links will be shown" do
        visit(links, client, server)

        fish.report(:to => out, :status => [304])

        out.string.should_not match(%r{http://home.com})
        out.string.should_not match(%r{http://links.com})
      end

      it "shows referer of links with no referer as [starting point]" do
        visit(home)

        fish.report(:to => out)

        out.string.should match(/\[starting point\]/)
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'clownfish'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
RSpec.configure do |rspec|
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus

  # Examples run in random order so hidden order dependencies show up.
  # To replay a particular ordering, pass the seed printed at the end of
  # the run:
  #     --seed 1234
  rspec.order = 'random'
end

# Matcher that compares two Arrays while ignoring element order
# (both sides are sorted before comparison).
RSpec::Matchers.define :have_same_elements_as do |expected|
  match { |actual| expected.sort == actual.sort }
end
|
22
|
+
|
23
|
+
module Clownfish
  # Fake Anemone::Core to help with tests.
  #
  # Each hook method immediately yields canned objects to the given block so
  # specs can observe what an Adapter wired up.
  class FakeAnemone
    # Value returned by the last block given to focus_crawl.
    attr_reader :last_focus_crawl_links
    # Argument of the last call to skip_links_like.
    attr_reader :last_skip_links_like_regexes

    def initialize(page_store, page1, page2)
      @page_store = page_store
      @page1 = page1
      @page2 = page2
    end

    # Yields the page store once.
    def after_crawl
      yield(@page_store)
    end

    # Yields page1, then page2.
    def on_every_page
      yield(@page1)
      yield(@page2)
    end

    # Yields page1 and remembers what the block returned.
    def focus_crawl
      @last_focus_crawl_links = yield(@page1)
    end

    # Remembers the regexes it was given.
    def skip_links_like(regexes)
      @last_skip_links_like_regexes = regexes
    end
  end

  # Fake and minimal Anemone::Page to help with tests.
  class FakePage
    attr_reader :url, :referer, :code

    # url     - String or URI of the page. A String without a scheme gets
    #           "http://" prepended.
    # code    - Integer status code. Defaults to 200.
    # referer - String or URI of the referring page, or nil for none.
    def initialize(url, code = 200, referer = nil)
      @url = urlify(url)
      @referer = urlify(referer)
      @code = code
    end

    # Converts a String to a URI, defaulting the scheme to http.
    #
    # str - String, URI or nil.
    #
    # Returns a URI, or str itself when it is nil or already a URI.
    def urlify(str)
      # URI is a module, so URI objects are recognized through their common
      # base class URI::Generic rather than by comparing classes with URI.
      return str if str.nil? || str.is_a?(URI::Generic)

      # Require a real scheme prefix; hosts such as "httpbin.org" merely
      # start with "http" and still need one.
      str = "http://#{str}" unless str =~ %r{\Ahttps?://}
      URI(str)
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Clownfish
  describe StatusGroup do
    context "#<<" do
      it "takes alias Symbols, Integers and Integer Ranges" do
        group = StatusGroup.new

        group << :client_error
        group << 500
        # Parentheses are required: << binds tighter than .., so without them
        # this parses as (group << 200)..204 and the Range is never added.
        group << (200..204)

        group.include?(404).should be_true
        group.include?(500).should be_true
        group.include?(200).should be_true
        group.include?(304).should be_false
      end

      it "can be chained" do
        group = StatusGroup.new

        group << :client_error << 304 << :server_error

        group.include?(404).should be_true
        group.include?(500).should be_true
        group.include?(304).should be_true
        group.include?(200).should be_false
      end
    end

    context ".new" do
      it "can take a single status specifier" do
        group = StatusGroup.new(:server_error)

        group.include?(500).should be_true
        group.include?(200).should be_false
      end

      it "can take multiple status specifiers" do
        group = StatusGroup.new(200, :redirect, 400..406)

        group.include?(200).should be_true
        group.include?(301).should be_true
        group.include?(401).should be_true
        group.include?(204).should be_false
      end

      it "can take Array of status specifiers" do
        group = StatusGroup.new([:success, 500, 300..304])

        group.include?(500).should be_true
        group.include?(200).should be_true
        group.include?(302).should be_true
        group.include?(404).should be_false
      end
    end
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Clownfish
  describe UrlStatuses do
    let(:statuses) { UrlStatuses.new }

    # Canned url/status code pairs used throughout the examples.
    let(:ok)   { ['http://ok.com', 200] }
    let(:huh)  { ['http://huh.com', 404] }
    let(:ohno) { ['http://ohno.com', 500] }

    # Adds each [url, code] pair to the collection under test.
    def record(*pairs)
      pairs.each { |url, code| statuses.add_url(url, code) }
    end

    it "accumulates urls with status codes" do
      record(ok, huh)

      statuses.status_codes_by_url.should eq({'http://ok.com' => 200, 'http://huh.com' => 404})
    end

    it "starts off empty" do
      statuses.should be_empty
    end

    it "knows how many urls it has" do
      record(ok, huh)

      statuses.size.should eq(2)
    end

    context '#each' do
      it "yields url/code pairs to 2-arg block" do
        record(ok, huh)

        yielded = []
        statuses.each { |url, code| yielded << [url, code] }

        yielded.should have_same_elements_as([['http://ok.com', 200], ['http://huh.com', 404]])
      end

      it "yields url/code Array to 1-arg block" do
        record(ok, huh)

        yielded = []
        statuses.each { |pair| yielded << pair }

        yielded.should have_same_elements_as([['http://ok.com', 200], ['http://huh.com', 404]])
      end
    end

    context '#query' do
      it "returns url/status pairs that match a specifier" do
        record(ok, huh)

        statuses.query(200).should have_same_elements_as([['http://ok.com', 200]])
      end

      it "returns url/status pairs that match any specifier" do
        record(ok, huh, ohno)

        matches = statuses.query(200, :server_error)

        matches.should have_same_elements_as([['http://ok.com', 200], ['http://ohno.com', 500]])
      end

      it "returns empty Array if no pairs match a specifier" do
        record(ok, huh)

        statuses.query(500, :redirect).should be_empty
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: clownfish
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Paul Salaets
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-02-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: anemone
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.7.2
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.7.2
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.12'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.12'
|
41
|
+
description: Anemone helper making common crawls easier to repeat.
|
42
|
+
email:
|
43
|
+
- psalaets@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- .gitignore
|
49
|
+
- .rspec
|
50
|
+
- Gemfile
|
51
|
+
- LICENSE.txt
|
52
|
+
- README.md
|
53
|
+
- Rakefile
|
54
|
+
- clownfish.gemspec
|
55
|
+
- ideas.txt
|
56
|
+
- lib/clownfish.rb
|
57
|
+
- lib/clownfish/adapter.rb
|
58
|
+
- lib/clownfish/anemone_ext.rb
|
59
|
+
- lib/clownfish/fish/count.rb
|
60
|
+
- lib/clownfish/fish/links_by_page.rb
|
61
|
+
- lib/clownfish/fish/response_times.rb
|
62
|
+
- lib/clownfish/helpers/status_group.rb
|
63
|
+
- lib/clownfish/helpers/url_statuses.rb
|
64
|
+
- lib/clownfish/version.rb
|
65
|
+
- spec/adapter_spec.rb
|
66
|
+
- spec/links_by_page_spec.rb
|
67
|
+
- spec/spec_helper.rb
|
68
|
+
- spec/status_group_spec.rb
|
69
|
+
- spec/url_statuses_spec.rb
|
70
|
+
homepage: https://github.com/psalaets/clownfish
|
71
|
+
licenses:
|
72
|
+
- MIT
|
73
|
+
metadata: {}
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 2.0.0
|
91
|
+
signing_key:
|
92
|
+
specification_version: 4
|
93
|
+
summary: Anemone helper
|
94
|
+
test_files:
|
95
|
+
- spec/adapter_spec.rb
|
96
|
+
- spec/links_by_page_spec.rb
|
97
|
+
- spec/spec_helper.rb
|
98
|
+
- spec/status_group_spec.rb
|
99
|
+
- spec/url_statuses_spec.rb
|