clownfish 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +80 -0
- data/Rakefile +6 -0
- data/clownfish.gemspec +22 -0
- data/ideas.txt +8 -0
- data/lib/clownfish.rb +10 -0
- data/lib/clownfish/adapter.rb +62 -0
- data/lib/clownfish/anemone_ext.rb +16 -0
- data/lib/clownfish/fish/count.rb +11 -0
- data/lib/clownfish/fish/links_by_page.rb +60 -0
- data/lib/clownfish/fish/response_times.rb +20 -0
- data/lib/clownfish/helpers/status_group.rb +66 -0
- data/lib/clownfish/helpers/url_statuses.rb +39 -0
- data/lib/clownfish/version.rb +3 -0
- data/spec/adapter_spec.rb +144 -0
- data/spec/links_by_page_spec.rb +80 -0
- data/spec/spec_helper.rb +70 -0
- data/spec/status_group_spec.rb +58 -0
- data/spec/url_statuses_spec.rb +91 -0
- metadata +99 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MGYzYmQ4YjNlYmEzNDAxODEwM2M5ODMyZjE2NzQ2N2MwOTczODIwZQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NDBiYjYzMzcwZTQ2MzExYWRkY2ZhMzA2NzFkMWNiODU1ODQwY2MxOQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NTk3N2U5OTEwNmI0MGY2ZDVhYWY4OGZiNDRmMDk5OGNhMjg3NTFiMjMxOTlh
|
10
|
+
YjYxZmZlNjQxMGViZWJhZGQzZjk3NjBlZTM4NGRjNDk2YzQxZmJlZTAzMTU2
|
11
|
+
YTBmNTYxNGY1MDJjNzc3OGI3NmJmOGYwN2ZhZTc2MDQwOTQwM2M=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZmFkMTJhODdmMmY5ODllZGY2MGMyOTUxNTRhNWE0MzQ2OGZhZTcxZmQyMTA0
|
14
|
+
YjY5ZjlmNWZiNGJmYTZhYWJjMWRkOWI2MDMwZjg5MzVkODA3NWFjMmQ3ZTQ4
|
15
|
+
NjEzOTUxYjIzMWQwZmFhMTQxYjA5ZmNhNmYxMjY5YzI4OGUyZjg=
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Paul Salaets
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
# Clownfish
|
2
|
+
|
3
|
+
Helper for [Anemone](http://anemone.rubyforge.org/). Makes common crawls easier to repeat.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'clownfish'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install clownfish
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
require 'clownfish'
|
23
|
+
|
24
|
+
clownfish = MyClownfish.new
|
25
|
+
|
26
|
+
Anemone.crawl_with_clownfish(start_url, clownfish)
|
27
|
+
|
28
|
+
# query clownfish for data from crawl
|
29
|
+
```
|
30
|
+
|
31
|
+
## Clownfish Spec
|
32
|
+
|
33
|
+
A clownfish is an object that has one or more of the following instance methods:
|
34
|
+
|
35
|
+
Reference: [Anemone RDocs](http://anemone.rubyforge.org/doc/index.html)
|
36
|
+
|
37
|
+
### anemone_options
|
38
|
+
|
39
|
+
Returns a `Hash` of `Symbol` to values. See [Anemone::Core::DEFAULT_OPTS](http://git.io/wFmCfA) for available options.
|
40
|
+
This is forwarded as the second argument to `Anemone.crawl`. Invoked once before crawl.
|
41
|
+
|
42
|
+
### skip_links_like
|
43
|
+
|
44
|
+
Returns a single `Regexp` or `Array` of `Regexp`. Urls matching any of these will not be crawled. Invoked once before crawl.
|
45
|
+
|
46
|
+
### on_every_page
|
47
|
+
|
48
|
+
Takes one argument, an `Anemone::Page`. Invoked once per page during crawl.
|
49
|
+
|
50
|
+
### focus_crawl
|
51
|
+
|
52
|
+
Takes one argument, an `Anemone::Page`. Returns the links on that page that should be crawled. Invoked once per page during crawl.
|
53
|
+
|
54
|
+
### after_crawl
|
55
|
+
|
56
|
+
Takes one argument, an `Anemone::PageStore`. Invoked once after crawl is done.
|
57
|
+
|
58
|
+
## What's Included
|
59
|
+
|
60
|
+
See [wiki](https://github.com/psalaets/clownfish/wiki) for examples.
|
61
|
+
|
62
|
+
### Clownfish::LinksByPage
|
63
|
+
|
64
|
+
Lists every page that has links, the links and the status code when following those links.
|
65
|
+
|
66
|
+
### Clownfish::ResponseTimes
|
67
|
+
|
68
|
+
Records every url and its response time.
|
69
|
+
|
70
|
+
### Clownfish::Count
|
71
|
+
|
72
|
+
Count pages.
|
73
|
+
|
74
|
+
## Contributing
|
75
|
+
|
76
|
+
1. Fork it
|
77
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
78
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
79
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
80
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/clownfish.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('lib', File.dirname(__FILE__))
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'clownfish/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "clownfish"
|
8
|
+
gem.version = Clownfish::VERSION
|
9
|
+
gem.authors = ["Paul Salaets"]
|
10
|
+
gem.email = ["psalaets@gmail.com"]
|
11
|
+
gem.summary = "Anemone helper"
|
12
|
+
gem.description = "Anemone helper making common crawls easier to repeat."
|
13
|
+
gem.homepage = "https://github.com/psalaets/clownfish"
|
14
|
+
gem.license = "MIT"
|
15
|
+
|
16
|
+
gem.files = `git ls-files`.split($/)
|
17
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
|
+
gem.require_paths = ["lib"]
|
19
|
+
|
20
|
+
gem.add_dependency('anemone', '~> 0.7.2')
|
21
|
+
gem.add_development_dependency('rspec', '~> 2.12')
|
22
|
+
end
|
data/ideas.txt
ADDED
data/lib/clownfish.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require "clownfish/version"
|
2
|
+
require "clownfish/adapter"
|
3
|
+
require "clownfish/anemone_ext"
|
4
|
+
|
5
|
+
require "clownfish/helpers/status_group"
|
6
|
+
require "clownfish/helpers/url_statuses"
|
7
|
+
|
8
|
+
require "clownfish/fish/links_by_page"
|
9
|
+
require "clownfish/fish/response_times"
|
10
|
+
require "clownfish/fish/count"
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Clownfish
|
2
|
+
# Adapter between Anemone and clownfish objects.
|
3
|
+
class Adapter
|
4
|
+
# Internal: Create an Adapter that wraps a clownfish.
|
5
|
+
#
|
6
|
+
# clownfish - Object that conforms to clownfish spec. See README.md.
|
7
|
+
def initialize(clownfish)
|
8
|
+
raise ArgumentError, "clownfish cannot be nil" if clownfish.nil?
|
9
|
+
@delegate = clownfish
|
10
|
+
end
|
11
|
+
|
12
|
+
# Internal: Forwards Anemone options from clownfish.
|
13
|
+
#
|
14
|
+
# Returns Hash of Anemone options, never nil.
|
15
|
+
def anemone_options
|
16
|
+
(@delegate.respond_to?(:anemone_options) && @delegate.anemone_options) || {}
|
17
|
+
end
|
18
|
+
|
19
|
+
# Internal: Connects clownfish to Anemone.
|
20
|
+
#
|
21
|
+
# anemone - Instance of Anemone::Core.
|
22
|
+
#
|
23
|
+
# Returns nothing.
|
24
|
+
def hook_into_anemone(anemone)
|
25
|
+
wire_up_after_crawl(anemone)
|
26
|
+
wire_up_on_every_page(anemone)
|
27
|
+
wire_up_focus_crawl(anemone)
|
28
|
+
relay_skip_links_like(anemone)
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
# Connects delegate's after_crawl to Anemone.
|
34
|
+
def wire_up_after_crawl(anemone)
|
35
|
+
anemone.after_crawl do |page_store|
|
36
|
+
@delegate.after_crawl(page_store)
|
37
|
+
end if @delegate.respond_to?(:after_crawl)
|
38
|
+
end
|
39
|
+
|
40
|
+
# Connects delegate's on_every_page to Anemone.
|
41
|
+
def wire_up_on_every_page(anemone)
|
42
|
+
anemone.on_every_page do |page|
|
43
|
+
@delegate.on_every_page(page)
|
44
|
+
end if @delegate.respond_to?(:on_every_page)
|
45
|
+
end
|
46
|
+
|
47
|
+
# Connects delegate's focus_crawl to Anemone.
|
48
|
+
def wire_up_focus_crawl(anemone)
|
49
|
+
anemone.focus_crawl do |page|
|
50
|
+
@delegate.focus_crawl(page) || []
|
51
|
+
end if @delegate.respond_to?(:focus_crawl)
|
52
|
+
end
|
53
|
+
|
54
|
+
# Passes delegate's skip_links_like to Anemone.
|
55
|
+
def relay_skip_links_like(anemone)
|
56
|
+
if @delegate.respond_to?(:skip_links_like)
|
57
|
+
regexes = @delegate.skip_links_like
|
58
|
+
anemone.skip_links_like([regexes].flatten)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require "anemone"
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
# Public: Starts an Anemone crawl with a clownfish.
|
5
|
+
#
|
6
|
+
# urls - String or Array of Strings telling where to start crawl from.
|
7
|
+
# clownfish - Object that conforms to clownfish spec. See README.md.
|
8
|
+
#
|
9
|
+
# Returns nothing.
|
10
|
+
def self.crawl_with_clownfish(urls, clownfish)
|
11
|
+
adapter = Clownfish::Adapter.new(clownfish)
|
12
|
+
self.crawl(urls, adapter.anemone_options) do |anemone|
|
13
|
+
adapter.hook_into_anemone(anemone)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Clownfish
|
2
|
+
# Clownfish that counts number of pages on a site. Taken from Anemone.
|
3
|
+
class Count
|
4
|
+
# Number of pages found. Only meaningful after a crawl.
|
5
|
+
attr_reader :count
|
6
|
+
|
7
|
+
def after_crawl(page_store)
|
8
|
+
@count = page_store.uniq!.size
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Clownfish
|
2
|
+
# Clownfish that records every link on a page and the response status codes
|
3
|
+
# when the links are followed.
|
4
|
+
class LinksByPage
|
5
|
+
# Hash of url String to UrlStatuses. The values are all links found on page
|
6
|
+
# at the key.
|
7
|
+
attr_reader :links_by_page
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
@links_by_page = {}
|
11
|
+
end
|
12
|
+
|
13
|
+
def anemone_options
|
14
|
+
# Not looking at page bodies so don't keep them around
|
15
|
+
{:discard_page_bodies => true}
|
16
|
+
end
|
17
|
+
|
18
|
+
def on_every_page(page)
|
19
|
+
# First url in crawl has no referer
|
20
|
+
referer = page.referer ? page.referer.to_s : '[starting point]'
|
21
|
+
|
22
|
+
@links_by_page[referer] = UrlStatuses.new unless @links_by_page.include? referer
|
23
|
+
|
24
|
+
links = @links_by_page[referer]
|
25
|
+
links.add_url(page.url.to_s, page.code)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Print links by page to stdout.
|
29
|
+
#
|
30
|
+
# options - Hash specifying what and how to report.
|
31
|
+
# :to - IO to print report to. Defaults to STDOUT.
|
32
|
+
# :status - One or Array of status specifiers. Defaults to :all.
|
33
|
+
# Only links with these statuses will be reported. See
|
34
|
+
# Clownfish::StatusGroup for accepted status specifiers.
|
35
|
+
def report(options = {})
|
36
|
+
options = report_options(options)
|
37
|
+
out = options[:to]
|
38
|
+
specifiers = options[:status]
|
39
|
+
|
40
|
+
@links_by_page.each do |page, link_statuses|
|
41
|
+
link_status_pairs = link_statuses.query(specifiers)
|
42
|
+
|
43
|
+
unless link_status_pairs.empty?
|
44
|
+
out.puts "#{page}"
|
45
|
+
link_status_pairs.each do |link, status|
|
46
|
+
out.puts "#{status} #{link}"
|
47
|
+
end
|
48
|
+
out.puts
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
def report_options(options)
|
56
|
+
defaults = {:to => STDOUT, :status => :all}
|
57
|
+
defaults.merge(options)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Clownfish
|
2
|
+
# Clownfish that records the response time of every url.
|
3
|
+
class ResponseTimes
|
4
|
+
# Hash where key is url String and value is number (milliseconds).
|
5
|
+
attr_reader :times_by_url
|
6
|
+
|
7
|
+
def initialize
|
8
|
+
@times_by_url = {}
|
9
|
+
end
|
10
|
+
|
11
|
+
def anemone_options
|
12
|
+
# Not looking at page bodies so don't keep them around
|
13
|
+
{:discard_page_bodies => true}
|
14
|
+
end
|
15
|
+
|
16
|
+
def on_every_page(page)
|
17
|
+
@times_by_url[page.url.to_s] = page.response_time
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Clownfish
|
2
|
+
# One or more response status codes. StatusGroups are filled with status
|
3
|
+
# specifiers to determine what is in the group.
|
4
|
+
#
|
5
|
+
# Status specifiers can be Integer status codes like 200, Integer Ranges like
|
6
|
+
# 400..404 or any of the following Symbols:
|
7
|
+
# :all - any status code
|
8
|
+
# :success - 2xx
|
9
|
+
# :redirect - 3xx
|
10
|
+
# :non_error - 2xx through 3xx
|
11
|
+
# :client_error - 4xx
|
12
|
+
# :server_error - 5xx
|
13
|
+
# :error - 4xx through 5xx
|
14
|
+
class StatusGroup
|
15
|
+
ALIASES = {
|
16
|
+
:all => 200..599,
|
17
|
+
:success => 200..299,
|
18
|
+
:redirect => 300..399,
|
19
|
+
:non_error => 200..399,
|
20
|
+
:client_error => 400..499,
|
21
|
+
:server_error => 500..599,
|
22
|
+
:error => 400..599
|
23
|
+
}
|
24
|
+
|
25
|
+
# Public: Create a new group.
|
26
|
+
#
|
27
|
+
# specifiers - One or more status specifiers or an Array of status specifiers.
|
28
|
+
def initialize(*specifiers)
|
29
|
+
@members = []
|
30
|
+
|
31
|
+
specifiers.flatten.each do |status|
|
32
|
+
self << status
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Public: Add a status specifier to this group.
|
37
|
+
#
|
38
|
+
# specifier - A status specifier
|
39
|
+
#
|
40
|
+
# Returns self for chaining purposes.
|
41
|
+
def <<(specifier)
|
42
|
+
@members << (resolve_alias(specifier) || specifier)
|
43
|
+
self
|
44
|
+
end
|
45
|
+
|
46
|
+
# Public: Tells if this group includes a given status code.
|
47
|
+
#
|
48
|
+
# status - Integer status code
|
49
|
+
#
|
50
|
+
# Returns true if status is included, false otherwise.
|
51
|
+
def include?(status)
|
52
|
+
@members.any? {|m| m === status}
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
# Resolves a group alias to its Range.
|
58
|
+
#
|
59
|
+
# group_alias - Symbol representing a set of status codes.
|
60
|
+
#
|
61
|
+
# Returns Range specified by group_alias or nil if there is none.
|
62
|
+
def resolve_alias(group_alias)
|
63
|
+
ALIASES[group_alias]
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Clownfish
|
2
|
+
# Helper class for pairing urls with status codes.
|
3
|
+
class UrlStatuses
|
4
|
+
attr_reader :status_codes_by_url
|
5
|
+
|
6
|
+
def initialize
|
7
|
+
@status_codes_by_url = {}
|
8
|
+
end
|
9
|
+
|
10
|
+
def add_url(url, status_code)
|
11
|
+
@status_codes_by_url[url] = status_code
|
12
|
+
end
|
13
|
+
|
14
|
+
def each(&block)
|
15
|
+
@status_codes_by_url.each(&block)
|
16
|
+
end
|
17
|
+
|
18
|
+
def size
|
19
|
+
@status_codes_by_url.size
|
20
|
+
end
|
21
|
+
|
22
|
+
def empty?
|
23
|
+
size == 0
|
24
|
+
end
|
25
|
+
|
26
|
+
# Public: Gets url/status code pairs that match one of the specified status
|
27
|
+
# codes.
|
28
|
+
#
|
29
|
+
# status_group_specifiers - One, many or an Array of status group specifiers
|
30
|
+
# as accepted by StatusGroup.new.
|
31
|
+
#
|
32
|
+
# Returns url/status pairs that match status specifiers.
|
33
|
+
def query(*status_group_specifiers)
|
34
|
+
group = StatusGroup.new(status_group_specifiers)
|
35
|
+
|
36
|
+
@status_codes_by_url.find_all { |url, code| group.include? code }
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Clownfish
|
4
|
+
describe Adapter do
|
5
|
+
context ".new" do
|
6
|
+
it "doesn't accept nil delegate" do
|
7
|
+
expect { Adapter.new(nil) }.to raise_error(ArgumentError)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
context "#anemone_options" do
|
12
|
+
it "forwards anemone_options from delegate" do
|
13
|
+
delegate = double('delegate')
|
14
|
+
delegate.stub(:anemone_options) {{:name => 'bob'}}
|
15
|
+
|
16
|
+
adapter = Adapter.new(delegate)
|
17
|
+
|
18
|
+
adapter.anemone_options.should eq({:name => 'bob'})
|
19
|
+
end
|
20
|
+
|
21
|
+
it "returns empty Hash if delegate has no options" do
|
22
|
+
delegate = double('delegate')
|
23
|
+
delegate.stub(:anemone_options) {nil}
|
24
|
+
|
25
|
+
adapter = Adapter.new(delegate)
|
26
|
+
|
27
|
+
adapter.anemone_options.should eq({})
|
28
|
+
end
|
29
|
+
|
30
|
+
it "returns empty Hash if delegate doesn't support anemone_options" do
|
31
|
+
# Has no anemone_options method
|
32
|
+
delegate = Object.new
|
33
|
+
|
34
|
+
adapter = Adapter.new(delegate)
|
35
|
+
|
36
|
+
adapter.anemone_options.should eq({})
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
context "hooking into Anemone" do
|
41
|
+
before :each do
|
42
|
+
@page_store = Object.new
|
43
|
+
@page1, @page2 = Object.new, Object.new
|
44
|
+
|
45
|
+
@anemone = FakeAnemone.new(@page_store, @page1, @page2)
|
46
|
+
end
|
47
|
+
|
48
|
+
it "wires up after_crawl when delegate supports it" do
|
49
|
+
delegate = double('delegate')
|
50
|
+
delegate.should_receive(:after_crawl).with(@page_store).once
|
51
|
+
|
52
|
+
adapter = Adapter.new(delegate)
|
53
|
+
|
54
|
+
adapter.hook_into_anemone(@anemone)
|
55
|
+
end
|
56
|
+
|
57
|
+
it "ignores after_crawl when not supported" do
|
58
|
+
delegate = Object.new
|
59
|
+
|
60
|
+
adapter = Adapter.new(delegate)
|
61
|
+
|
62
|
+
adapter.hook_into_anemone(@anemone)
|
63
|
+
end
|
64
|
+
|
65
|
+
it "wires up on_every_page when delegate supports it" do
|
66
|
+
delegate = double('delegate')
|
67
|
+
delegate.should_receive(:on_every_page).with(@page1).once
|
68
|
+
delegate.should_receive(:on_every_page).with(@page2).once
|
69
|
+
|
70
|
+
adapter = Adapter.new(delegate)
|
71
|
+
|
72
|
+
adapter.hook_into_anemone(@anemone)
|
73
|
+
end
|
74
|
+
|
75
|
+
it "ignores on_every_page when not supported" do
|
76
|
+
delegate = Object.new
|
77
|
+
|
78
|
+
adapter = Adapter.new(delegate)
|
79
|
+
|
80
|
+
adapter.hook_into_anemone(@anemone)
|
81
|
+
end
|
82
|
+
|
83
|
+
it "wires up focus_crawl when delegate supports it" do
|
84
|
+
delegate = double('delegate')
|
85
|
+
delegate.should_receive(:focus_crawl).with(@page1) {['url1']}.once
|
86
|
+
|
87
|
+
adapter = Adapter.new(delegate)
|
88
|
+
|
89
|
+
adapter.hook_into_anemone(@anemone)
|
90
|
+
|
91
|
+
@anemone.last_focus_crawl_links.should eq(['url1'])
|
92
|
+
end
|
93
|
+
|
94
|
+
it "focuses on no links when delegate doesn't focus on any" do
|
95
|
+
delegate = double('delegate')
|
96
|
+
delegate.should_receive(:focus_crawl).with(@page1) {nil}
|
97
|
+
|
98
|
+
adapter = Adapter.new(delegate)
|
99
|
+
|
100
|
+
adapter.hook_into_anemone(@anemone)
|
101
|
+
|
102
|
+
@anemone.last_focus_crawl_links.should eq([])
|
103
|
+
end
|
104
|
+
|
105
|
+
it "ignores focus_crawl when delegate doesn't support it" do
|
106
|
+
delegate = Object.new
|
107
|
+
|
108
|
+
adapter = Adapter.new(delegate)
|
109
|
+
|
110
|
+
adapter.hook_into_anemone(@anemone)
|
111
|
+
end
|
112
|
+
|
113
|
+
it "relays skip_links_like regex when delegate returns one" do
|
114
|
+
delegate = double('delegate')
|
115
|
+
delegate.stub(:skip_links_like) {/a/}
|
116
|
+
|
117
|
+
adapter = Adapter.new(delegate)
|
118
|
+
|
119
|
+
adapter.hook_into_anemone(@anemone)
|
120
|
+
|
121
|
+
@anemone.last_skip_links_like_regexes.should eq([/a/])
|
122
|
+
end
|
123
|
+
|
124
|
+
it "relays skip_links_like regexes when delegate returns many" do
|
125
|
+
delegate = double('delegate')
|
126
|
+
delegate.stub(:skip_links_like) {[/a/, /b/]}
|
127
|
+
|
128
|
+
adapter = Adapter.new(delegate)
|
129
|
+
|
130
|
+
adapter.hook_into_anemone(@anemone)
|
131
|
+
|
132
|
+
@anemone.last_skip_links_like_regexes.should eq([/a/, /b/])
|
133
|
+
end
|
134
|
+
|
135
|
+
it "ignores skip_links_like when not supported" do
|
136
|
+
delegate = Object.new
|
137
|
+
|
138
|
+
adapter = Adapter.new(delegate)
|
139
|
+
|
140
|
+
adapter.hook_into_anemone(@anemone)
|
141
|
+
end
|
142
|
+
end # end of hooking into Anemone
|
143
|
+
end # end of describe Adapter
|
144
|
+
end # end of Clownfish module
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'stringio'
|
3
|
+
|
4
|
+
module Clownfish
|
5
|
+
describe LinksByPage do
|
6
|
+
describe "#report" do
|
7
|
+
before :each do
|
8
|
+
@home = FakePage.new('home.com', 200)
|
9
|
+
@links = FakePage.new('links.com', 200, 'home.com')
|
10
|
+
@client = FakePage.new('client.com', 404, 'links.com')
|
11
|
+
@server = FakePage.new('server.com', 500, 'links.com')
|
12
|
+
end
|
13
|
+
|
14
|
+
it "reports all statuses by default" do
|
15
|
+
fish = LinksByPage.new
|
16
|
+
|
17
|
+
fish.on_every_page(@client)
|
18
|
+
fish.on_every_page(@server)
|
19
|
+
|
20
|
+
out = StringIO.new
|
21
|
+
fish.report(:to => out)
|
22
|
+
|
23
|
+
out.string.should =~ %r{404 http://client.com\n500 http://server.com}
|
24
|
+
end
|
25
|
+
|
26
|
+
it "reports specified status when specifier given" do
|
27
|
+
fish = LinksByPage.new
|
28
|
+
|
29
|
+
fish.on_every_page(@client)
|
30
|
+
fish.on_every_page(@server)
|
31
|
+
|
32
|
+
out = StringIO.new
|
33
|
+
fish.report(:to => out, :status => :server_error)
|
34
|
+
|
35
|
+
out.string.should =~ %r{500 http://server.com}
|
36
|
+
out.string.should_not =~ /404/
|
37
|
+
end
|
38
|
+
|
39
|
+
it "reports specified statuses when many specified" do
|
40
|
+
fish = LinksByPage.new
|
41
|
+
|
42
|
+
fish.on_every_page(@links)
|
43
|
+
fish.on_every_page(@client)
|
44
|
+
fish.on_every_page(@server)
|
45
|
+
|
46
|
+
out = StringIO.new
|
47
|
+
fish.report(:to => out, :status => [500, 200..204])
|
48
|
+
|
49
|
+
out.string.should =~ %r{200 http://links.com}
|
50
|
+
out.string.should =~ %r{500 http://server.com}
|
51
|
+
out.string.should_not =~ /404/
|
52
|
+
end
|
53
|
+
|
54
|
+
it "omits page if none of its links will be shown" do
|
55
|
+
fish = LinksByPage.new
|
56
|
+
|
57
|
+
fish.on_every_page(@links)
|
58
|
+
fish.on_every_page(@client)
|
59
|
+
fish.on_every_page(@server)
|
60
|
+
|
61
|
+
out = StringIO.new
|
62
|
+
fish.report(:to => out, :status => [304])
|
63
|
+
|
64
|
+
out.string.should_not =~ %r{http://home.com}
|
65
|
+
out.string.should_not =~ %r{http://links.com}
|
66
|
+
end
|
67
|
+
|
68
|
+
it "shows referer of links with no referer as [starting point]" do
|
69
|
+
fish = LinksByPage.new
|
70
|
+
|
71
|
+
fish.on_every_page(@home)
|
72
|
+
|
73
|
+
out = StringIO.new
|
74
|
+
fish.report(:to => out)
|
75
|
+
|
76
|
+
out.string.should =~ /\[starting point\]/
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'clownfish'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
RSpec.configure do |config|
|
5
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
6
|
+
config.run_all_when_everything_filtered = true
|
7
|
+
config.filter_run :focus
|
8
|
+
|
9
|
+
# Run specs in random order to surface order dependencies. If you find an
|
10
|
+
# order dependency and want to debug it, you can fix the order by providing
|
11
|
+
# the seed, which is printed after each run.
|
12
|
+
# --seed 1234
|
13
|
+
config.order = 'random'
|
14
|
+
end
|
15
|
+
|
16
|
+
# Matcher for order-independent Array equality
|
17
|
+
RSpec::Matchers.define :have_same_elements_as do |expected|
|
18
|
+
match do |actual|
|
19
|
+
expected.sort == actual.sort
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
module Clownfish
|
24
|
+
# Fake Anemone::Core to help with tests.
|
25
|
+
class FakeAnemone
|
26
|
+
attr_reader :last_focus_crawl_links
|
27
|
+
attr_reader :last_skip_links_like_regexes
|
28
|
+
|
29
|
+
def initialize(page_store, page1, page2)
|
30
|
+
@page_store = page_store
|
31
|
+
@page1 = page1
|
32
|
+
@page2 = page2
|
33
|
+
end
|
34
|
+
|
35
|
+
def after_crawl
|
36
|
+
yield(@page_store)
|
37
|
+
end
|
38
|
+
|
39
|
+
def on_every_page
|
40
|
+
yield(@page1)
|
41
|
+
yield(@page2)
|
42
|
+
end
|
43
|
+
|
44
|
+
def focus_crawl
|
45
|
+
@last_focus_crawl_links = yield(@page1)
|
46
|
+
end
|
47
|
+
|
48
|
+
def skip_links_like(regexes)
|
49
|
+
@last_skip_links_like_regexes = regexes
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# Fake and minimal Anemone::Page to help with tests.
|
54
|
+
class FakePage
|
55
|
+
attr_reader :url, :referer, :code
|
56
|
+
|
57
|
+
def initialize(url, code = 200, referer = nil)
|
58
|
+
@url = urlify(url)
|
59
|
+
@referer = urlify(referer)
|
60
|
+
@code = code
|
61
|
+
end
|
62
|
+
|
63
|
+
# Coerces a url-ish value into a URI object.
#
# str - nil, a URI object, or a String. Strings that do not start with
#       'http' get an 'http://' prefix before being parsed.
#
# Returns nil when given nil, the same object when given a URI, otherwise
# the URI parsed from the (possibly prefixed) String.
def urlify(str)
  # URI is a module, so `str.class == URI` can never be true; parsed urls
  # are instances of URI::Generic subclasses (URI::HTTP, URI::HTTPS, ...).
  return str if str.nil? || str.is_a?(URI::Generic)

  str = "http://#{str}" unless str.start_with? 'http'
  URI(str)
end
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Clownfish
|
4
|
+
describe StatusGroup do
|
5
|
+
context "#<<" do
|
6
|
+
# Exercises each kind of status specifier accepted by StatusGroup#<<:
# an alias Symbol, a single Integer code and an Integer Range.
it "takes alias Symbols, Integers and Integer Ranges" do
  group = StatusGroup.new

  group << :client_error
  group << 500
  # Parentheses are required: `<<` binds tighter than `..`, so
  # `group << 200..204` parses as `(group << 200)..204` and the Range
  # itself would never be added to the group.
  group << (200..204)

  group.include?(404).should be_true
  group.include?(500).should be_true
  group.include?(200).should be_true
  # 204 is covered only by the Range, so this fails if the Range was not added.
  group.include?(204).should be_true
  group.include?(304).should be_false
end
|
18
|
+
|
19
|
+
it "can be chained" do
|
20
|
+
group = StatusGroup.new
|
21
|
+
|
22
|
+
group << :client_error << 304 << :server_error
|
23
|
+
|
24
|
+
group.include?(404).should be_true
|
25
|
+
group.include?(500).should be_true
|
26
|
+
group.include?(304).should be_true
|
27
|
+
group.include?(200).should be_false
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
context ".new" do
|
32
|
+
it "can take a single status specifier" do
|
33
|
+
group = StatusGroup.new(:server_error)
|
34
|
+
|
35
|
+
group.include?(500).should be_true
|
36
|
+
group.include?(200).should be_false
|
37
|
+
end
|
38
|
+
|
39
|
+
it "can take multiple status specifiers" do
|
40
|
+
group = StatusGroup.new(200, :redirect, 400..406)
|
41
|
+
|
42
|
+
group.include?(200).should be_true
|
43
|
+
group.include?(301).should be_true
|
44
|
+
group.include?(401).should be_true
|
45
|
+
group.include?(204).should be_false
|
46
|
+
end
|
47
|
+
|
48
|
+
it "can take Array of status specifiers" do
|
49
|
+
group = StatusGroup.new([:success, 500, 300..304])
|
50
|
+
|
51
|
+
group.include?(500).should be_true
|
52
|
+
group.include?(200).should be_true
|
53
|
+
group.include?(302).should be_true
|
54
|
+
group.include?(404).should be_false
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Clownfish
|
4
|
+
describe UrlStatuses do
|
5
|
+
it "accumulates urls with status codes" do
|
6
|
+
statuses = UrlStatuses.new
|
7
|
+
|
8
|
+
statuses.add_url('http://ok.com', 200)
|
9
|
+
statuses.add_url('http://huh.com', 404)
|
10
|
+
|
11
|
+
statuses.status_codes_by_url.should eq({'http://ok.com' => 200, 'http://huh.com' => 404})
|
12
|
+
end
|
13
|
+
|
14
|
+
it "starts off empty" do
|
15
|
+
statuses = UrlStatuses.new
|
16
|
+
|
17
|
+
statuses.empty?.should be_true
|
18
|
+
end
|
19
|
+
|
20
|
+
it "knows how many urls it has" do
|
21
|
+
statuses = UrlStatuses.new
|
22
|
+
|
23
|
+
statuses.add_url('http://ok.com', 200)
|
24
|
+
statuses.add_url('http://huh.com', 404)
|
25
|
+
|
26
|
+
statuses.size.should eq(2)
|
27
|
+
end
|
28
|
+
|
29
|
+
context '#each' do
|
30
|
+
it "yields url/code pairs to 2-arg block" do
|
31
|
+
statuses = UrlStatuses.new
|
32
|
+
|
33
|
+
statuses.add_url('http://ok.com', 200)
|
34
|
+
statuses.add_url('http://huh.com', 404)
|
35
|
+
|
36
|
+
pairs = []
|
37
|
+
statuses.each { |k, v| pairs << [k, v] }
|
38
|
+
|
39
|
+
pairs.should have_same_elements_as([['http://ok.com', 200], ['http://huh.com', 404]])
|
40
|
+
end
|
41
|
+
|
42
|
+
it "yields url/code Array to 1-arg block" do
|
43
|
+
statuses = UrlStatuses.new
|
44
|
+
|
45
|
+
statuses.add_url('http://ok.com', 200)
|
46
|
+
statuses.add_url('http://huh.com', 404)
|
47
|
+
|
48
|
+
pairs = []
|
49
|
+
statuses.each { |p| pairs << p }
|
50
|
+
|
51
|
+
pairs.should have_same_elements_as([['http://ok.com', 200], ['http://huh.com', 404]])
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
context '#query' do
|
56
|
+
it "returns url/status pairs that match a specifier" do
|
57
|
+
statuses = UrlStatuses.new
|
58
|
+
|
59
|
+
statuses.add_url('http://ok.com', 200)
|
60
|
+
statuses.add_url('http://huh.com', 404)
|
61
|
+
|
62
|
+
pairs = statuses.query(200)
|
63
|
+
|
64
|
+
pairs.should have_same_elements_as([['http://ok.com', 200]])
|
65
|
+
end
|
66
|
+
|
67
|
+
it "returns url/status pairs that match any specifier" do
|
68
|
+
statuses = UrlStatuses.new
|
69
|
+
|
70
|
+
statuses.add_url('http://ok.com', 200)
|
71
|
+
statuses.add_url('http://huh.com', 404)
|
72
|
+
statuses.add_url('http://ohno.com', 500)
|
73
|
+
|
74
|
+
pairs = statuses.query(200, :server_error)
|
75
|
+
|
76
|
+
pairs.should have_same_elements_as([['http://ok.com', 200], ['http://ohno.com', 500]])
|
77
|
+
end
|
78
|
+
|
79
|
+
it "returns empty Array if no pairs match a specifier" do
|
80
|
+
statuses = UrlStatuses.new
|
81
|
+
|
82
|
+
statuses.add_url('http://ok.com', 200)
|
83
|
+
statuses.add_url('http://huh.com', 404)
|
84
|
+
|
85
|
+
pairs = statuses.query(500, :redirect)
|
86
|
+
|
87
|
+
pairs.empty?.should be_true
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: clownfish
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Paul Salaets
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-02-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: anemone
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.7.2
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.7.2
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.12'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.12'
|
41
|
+
description: Anemone helper making common crawls easier to repeat.
|
42
|
+
email:
|
43
|
+
- psalaets@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- .gitignore
|
49
|
+
- .rspec
|
50
|
+
- Gemfile
|
51
|
+
- LICENSE.txt
|
52
|
+
- README.md
|
53
|
+
- Rakefile
|
54
|
+
- clownfish.gemspec
|
55
|
+
- ideas.txt
|
56
|
+
- lib/clownfish.rb
|
57
|
+
- lib/clownfish/adapter.rb
|
58
|
+
- lib/clownfish/anemone_ext.rb
|
59
|
+
- lib/clownfish/fish/count.rb
|
60
|
+
- lib/clownfish/fish/links_by_page.rb
|
61
|
+
- lib/clownfish/fish/response_times.rb
|
62
|
+
- lib/clownfish/helpers/status_group.rb
|
63
|
+
- lib/clownfish/helpers/url_statuses.rb
|
64
|
+
- lib/clownfish/version.rb
|
65
|
+
- spec/adapter_spec.rb
|
66
|
+
- spec/links_by_page_spec.rb
|
67
|
+
- spec/spec_helper.rb
|
68
|
+
- spec/status_group_spec.rb
|
69
|
+
- spec/url_statuses_spec.rb
|
70
|
+
homepage: https://github.com/psalaets/clownfish
|
71
|
+
licenses:
|
72
|
+
- MIT
|
73
|
+
metadata: {}
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 2.0.0
|
91
|
+
signing_key:
|
92
|
+
specification_version: 4
|
93
|
+
summary: Anemone helper
|
94
|
+
test_files:
|
95
|
+
- spec/adapter_spec.rb
|
96
|
+
- spec/links_by_page_spec.rb
|
97
|
+
- spec/spec_helper.rb
|
98
|
+
- spec/status_group_spec.rb
|
99
|
+
- spec/url_statuses_spec.rb
|