clownfish 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MGYzYmQ4YjNlYmEzNDAxODEwM2M5ODMyZjE2NzQ2N2MwOTczODIwZQ==
5
+ data.tar.gz: !binary |-
6
+ NDBiYjYzMzcwZTQ2MzExYWRkY2ZhMzA2NzFkMWNiODU1ODQwY2MxOQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ NTk3N2U5OTEwNmI0MGY2ZDVhYWY4OGZiNDRmMDk5OGNhMjg3NTFiMjMxOTlh
10
+ YjYxZmZlNjQxMGViZWJhZGQzZjk3NjBlZTM4NGRjNDk2YzQxZmJlZTAzMTU2
11
+ YTBmNTYxNGY1MDJjNzc3OGI3NmJmOGYwN2ZhZTc2MDQwOTQwM2M=
12
+ data.tar.gz: !binary |-
13
+ ZmFkMTJhODdmMmY5ODllZGY2MGMyOTUxNTRhNWE0MzQ2OGZhZTcxZmQyMTA0
14
+ YjY5ZjlmNWZiNGJmYTZhYWJjMWRkOWI2MDMwZjg5MzVkODA3NWFjMmQ3ZTQ4
15
+ NjEzOTUxYjIzMWQwZmFhMTQxYjA5ZmNhNmYxMjY5YzI4OGUyZjg=
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Paul Salaets
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,80 @@
1
+ # Clownfish
2
+
3
+ Helper for [Anemone](http://anemone.rubyforge.org/). Makes common crawls easier to repeat.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'clownfish'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install clownfish
18
+
19
+ ## Usage
20
+
21
+ ```ruby
22
+ require 'clownfish'
23
+
24
+ clownfish = MyClownfish.new
25
+
26
+ Anemone.crawl_with_clownfish(start_url, clownfish)
27
+
28
+ # query clownfish for data from crawl
29
+ ```
30
+
31
+ ## Clownfish Spec
32
+
33
+ A clownfish is an object that has one or more of the following instance methods:
34
+
35
+ Reference: [Anemone RDocs](http://anemone.rubyforge.org/doc/index.html)
36
+
37
+ ### anemone_options
38
+
39
+ Returns a `Hash` of `Symbol` to values. See [Anemone::Core::DEFAULT_OPTS](http://git.io/wFmCfA) for available options.
40
+ This is forwarded as the second argument to `Anemone.crawl`. Invoked once before crawl.
41
+
42
+ ### skip_links_like
43
+
44
+ Returns a single `Regexp` or `Array` of `Regexp`. Urls matching any of these will not be crawled. Invoked once before crawl.
45
+
46
+ ### on_every_page
47
+
48
+ Takes one argument, an `Anemone::Page`. Invoked once per page during crawl.
49
+
50
+ ### focus_crawl
51
+
52
+ Takes one argument, an `Anemone::Page`. Returns the links on that page that should be crawled. Invoked once per page during crawl.
53
+
54
+ ### after_crawl
55
+
56
+ Takes one argument, an `Anemone::PageStore`. Invoked once after crawl is done.
57
+
58
+ ## What's Included
59
+
60
+ See [wiki](https://github.com/psalaets/clownfish/wiki) for examples.
61
+
62
+ ### Clownfish::LinksByPage
63
+
64
+ Lists every page that has links, the links and the status code when following those links.
65
+
66
+ ### Clownfish::ResponseTimes
67
+
68
+ Records every url and its response time.
69
+
70
+ ### Clownfish::Count
71
+
72
+ Count pages.
73
+
74
+ ## Contributing
75
+
76
+ 1. Fork it
77
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
78
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
79
+ 4. Push to the branch (`git push origin my-new-feature`)
80
+ 5. Create new Pull Request
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
# Define the task that runs the RSpec suite (named :spec, RakeTask's default).
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` runs the specs.
task :default => [:spec]
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('lib', File.dirname(__FILE__))
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'clownfish/version'
5
+
6
# Gem specification for clownfish. The packaged file list is whatever git
# tracks at build time, so this must be evaluated inside a git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name     = "clownfish"
  spec.version  = Clownfish::VERSION
  spec.license  = "MIT"
  spec.homepage = "https://github.com/psalaets/clownfish"

  # Authorship
  spec.authors = ["Paul Salaets"]
  spec.email   = ["psalaets@gmail.com"]

  # Descriptions
  spec.summary     = "Anemone helper"
  spec.description = "Anemone helper making common crawls easier to repeat."

  # Contents: every tracked file, with anything under test/, spec/ or
  # features/ also registered as a test file.
  spec.files         = `git ls-files`.split($/)
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Dependencies
  spec.add_dependency('anemone', '~> 0.7.2')
  spec.add_development_dependency('rspec', '~> 2.12')
end
@@ -0,0 +1,8 @@
1
+ syntax ideas:
2
+
3
+ Anemone.crawl('url', :with => clownfish)
4
+
5
+ Anemone.crawl('url').with(clownfish)
6
+
7
+ Anemone.crawl_with_clownfish('url', clownfish)
8
+
@@ -0,0 +1,10 @@
1
+ require "clownfish/version"
2
+ require "clownfish/adapter"
3
+ require "clownfish/anemone_ext"
4
+
5
+ require "clownfish/helpers/status_group"
6
+ require "clownfish/helpers/url_statuses"
7
+
8
+ require "clownfish/fish/links_by_page"
9
+ require "clownfish/fish/response_times"
10
+ require "clownfish/fish/count"
@@ -0,0 +1,62 @@
1
module Clownfish
  # Adapter between Anemone and clownfish objects.
  #
  # A clownfish may implement any subset of the hooks described in README.md;
  # each hook is probed with respond_to? before it is used, and nil results
  # from the clownfish are replaced with empty values.
  class Adapter
    # Internal: Create an Adapter that wraps a clownfish.
    #
    # clownfish - Object that conforms to clownfish spec. See README.md.
    #
    # Raises ArgumentError if clownfish is nil.
    def initialize(clownfish)
      raise ArgumentError, "clownfish cannot be nil" if clownfish.nil?
      @delegate = clownfish
    end

    # Internal: Forwards Anemone options from clownfish.
    #
    # Returns Hash of Anemone options, never nil. An empty Hash is returned
    # when the clownfish has no anemone_options method or it returns nil.
    def anemone_options
      return {} unless @delegate.respond_to?(:anemone_options)

      @delegate.anemone_options || {}
    end

    # Internal: Connects clownfish to Anemone.
    #
    # anemone - Instance of Anemone::Core.
    #
    # Callers should not rely on the return value.
    def hook_into_anemone(anemone)
      wire_up_after_crawl(anemone)
      wire_up_on_every_page(anemone)
      wire_up_focus_crawl(anemone)
      relay_skip_links_like(anemone)
    end

    private

    # Connects delegate's after_crawl to Anemone.
    def wire_up_after_crawl(anemone)
      return unless @delegate.respond_to?(:after_crawl)

      anemone.after_crawl { |page_store| @delegate.after_crawl(page_store) }
    end

    # Connects delegate's on_every_page to Anemone.
    def wire_up_on_every_page(anemone)
      return unless @delegate.respond_to?(:on_every_page)

      anemone.on_every_page { |page| @delegate.on_every_page(page) }
    end

    # Connects delegate's focus_crawl to Anemone. A nil result from the
    # delegate is treated as "follow no links".
    def wire_up_focus_crawl(anemone)
      return unless @delegate.respond_to?(:focus_crawl)

      anemone.focus_crawl { |page| @delegate.focus_crawl(page) || [] }
    end

    # Passes delegate's skip_links_like to Anemone.
    #
    # The delegate may return one Regexp, an Array of Regexps or nil. Anemone
    # always receives a flat Array with nils removed, so a nil result becomes
    # an empty Array instead of [nil].
    def relay_skip_links_like(anemone)
      return unless @delegate.respond_to?(:skip_links_like)

      patterns = [@delegate.skip_links_like].flatten.compact
      anemone.skip_links_like(patterns)
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require "anemone"
2
+
3
module Anemone
  # Public: Run an Anemone crawl that is driven by a clownfish.
  #
  # The clownfish is wrapped in a Clownfish::Adapter. The adapter's options are
  # passed as the second argument to Anemone.crawl and its hooks are attached
  # inside the crawl block.
  #
  # urls      - String or Array of Strings telling where to start crawl from.
  # clownfish - Object that conforms to clownfish spec. See README.md.
  #
  # Returns whatever Anemone.crawl returns.
  def self.crawl_with_clownfish(urls, clownfish)
    adapter = Clownfish::Adapter.new(clownfish)
    options = adapter.anemone_options

    crawl(urls, options) { |anemone| adapter.hook_into_anemone(anemone) }
  end
end
@@ -0,0 +1,11 @@
1
module Clownfish
  # Clownfish that tallies the pages of a site. Approach taken from Anemone.
  class Count
    # Number of pages found. nil until after_crawl has run.
    attr_reader :count

    # Hook invoked once the crawl is over.
    #
    # page_store - The crawl's page store. It is deduplicated in place with
    #              uniq!, whose result must respond to size.
    #
    # Returns the page count that was recorded.
    def after_crawl(page_store)
      unique_pages = page_store.uniq!
      @count = unique_pages.size
    end
  end
end
@@ -0,0 +1,60 @@
1
module Clownfish
  # Clownfish that groups every crawled link under the page it was found on,
  # together with the response status code seen when the link was followed.
  class LinksByPage
    # Hash of url String to UrlStatuses. Each value holds the links found on
    # the page named by its key.
    attr_reader :links_by_page

    def initialize
      @links_by_page = {}
    end

    # Page bodies are never inspected here, so ask Anemone to drop them.
    def anemone_options
      {:discard_page_bodies => true}
    end

    # Files the page's url and status code under the page that referred to it.
    #
    # page - Object responding to url, referer and code (an Anemone::Page).
    def on_every_page(page)
      # A page without a referer is where the crawl began.
      source = page.referer ? page.referer.to_s : '[starting point]'

      statuses = @links_by_page.fetch(source) do
        @links_by_page[source] = UrlStatuses.new
      end

      statuses.add_url(page.url.to_s, page.code)
    end

    # Print links by page.
    #
    # options - Hash specifying what and how to report.
    #           :to     - IO to print report to. Defaults to STDOUT.
    #           :status - One or Array of status specifiers. Defaults to :all.
    #                     Only links with these statuses will be reported. See
    #                     Clownfish::StatusGroup for accepted status specifiers.
    #
    # Pages none of whose links match the specifiers are left out entirely.
    def report(options = {})
      settings = report_options(options)
      out, specifiers = settings[:to], settings[:status]

      @links_by_page.each do |page, link_statuses|
        matches = link_statuses.query(specifiers)
        next if matches.empty?

        out.puts "#{page}"
        matches.each { |link, status| out.puts "#{status} #{link}" }
        out.puts
      end
    end

    private

    # Fills in the default destination and status filter for report.
    def report_options(options)
      {:to => STDOUT, :status => :all}.merge(options)
    end
  end
end
@@ -0,0 +1,20 @@
1
module Clownfish
  # Clownfish that remembers how long every url took to respond.
  class ResponseTimes
    # Hash of url String to the response time reported by the page.
    # NOTE(review): the original docs state the unit is milliseconds; confirm
    # against Anemone::Page#response_time.
    attr_reader :times_by_url

    def initialize
      @times_by_url = {}
    end

    # Page bodies are never inspected here, so ask Anemone to drop them.
    def anemone_options
      {:discard_page_bodies => true}
    end

    # Stores the page's response time under its url. A later page with the
    # same url overwrites the earlier entry.
    def on_every_page(page)
      url = page.url.to_s
      @times_by_url[url] = page.response_time
    end
  end
end
@@ -0,0 +1,66 @@
1
module Clownfish
  # One or more response status codes. StatusGroups are filled with status
  # specifiers to determine what is in the group.
  #
  # Status specifiers can be Integer status codes like 200, Integer Ranges like
  # 400..404 or any of the following Symbols:
  #   :all          - 2xx through 5xx
  #   :success      - 2xx
  #   :redirect     - 3xx
  #   :non_error    - 2xx through 3xx
  #   :client_error - 4xx
  #   :server_error - 5xx
  #   :error        - 4xx through 5xx
  #
  # A Symbol that is not one of the aliases above is stored as-is and will
  # never match an Integer status code.
  class StatusGroup
    # Alias Symbol => Range of status codes. Frozen so that the shared table
    # cannot be modified at runtime.
    ALIASES = {
      :all          => 200..599,
      :success      => 200..299,
      :redirect     => 300..399,
      :non_error    => 200..399,
      :client_error => 400..499,
      :server_error => 500..599,
      :error        => 400..599
    }.freeze

    # Public: Create a new group.
    #
    # specifiers - One or more status specifiers or an Array of status
    #              specifiers. May be omitted to start with an empty group.
    def initialize(*specifiers)
      @members = []

      specifiers.flatten.each { |specifier| self << specifier }
    end

    # Public: Add a status specifier to this group.
    #
    # specifier - A status specifier
    #
    # Returns self for chaining purposes.
    def <<(specifier)
      @members << (resolve_alias(specifier) || specifier)
      self
    end

    # Public: Tells if this group includes a given status code.
    #
    # status - Integer status code
    #
    # Returns true if status is included, false otherwise.
    def include?(status)
      # === lets Integers match by equality and Ranges match by coverage.
      @members.any? { |member| member === status }
    end

    private

    # Resolves a group alias to its Range.
    #
    # group_alias - Symbol representing a set of status codes.
    #
    # Returns Range specified by group_alias or nil if there is none.
    def resolve_alias(group_alias)
      ALIASES[group_alias]
    end
  end
end
@@ -0,0 +1,39 @@
1
module Clownfish
  # Helper that pairs urls with the status code each one returned.
  class UrlStatuses
    # Hash of url to status code, in insertion order.
    attr_reader :status_codes_by_url

    def initialize
      @status_codes_by_url = {}
    end

    # Records status_code for url, replacing any code stored for it earlier.
    def add_url(url, status_code)
      @status_codes_by_url.store(url, status_code)
    end

    # Yields each url/status code pair to the block.
    def each(&block)
      @status_codes_by_url.each(&block)
    end

    # Number of urls recorded.
    def size
      @status_codes_by_url.length
    end

    # True when no url has been recorded.
    def empty?
      size.zero?
    end

    # Public: Gets url/status code pairs that match one of the specified status
    # codes.
    #
    # status_group_specifiers - One, many or an Array of status group specifiers
    #                           as accepted by StatusGroup.new.
    #
    # Returns an Array of [url, status code] pairs whose code is in the group.
    def query(*status_group_specifiers)
      wanted = StatusGroup.new(status_group_specifiers)

      @status_codes_by_url.each_with_object([]) do |(url, code), matches|
        matches << [url, code] if wanted.include?(code)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Clownfish
  # Release number of the clownfish gem; clownfish.gemspec reads it.
  VERSION = '0.3.0'
end
@@ -0,0 +1,144 @@
1
+ require 'spec_helper'
2
+
3
module Clownfish
  describe Adapter do
    # Delegate that implements none of the clownfish hooks.
    let(:hookless) { Object.new }

    context ".new" do
      it "doesn't accept nil delegate" do
        expect { Adapter.new(nil) }.to raise_error(ArgumentError)
      end
    end

    context "#anemone_options" do
      it "forwards anemone_options from delegate" do
        delegate = double('delegate', :anemone_options => {:name => 'bob'})

        Adapter.new(delegate).anemone_options.should eq({:name => 'bob'})
      end

      it "returns empty Hash if delegate has no options" do
        delegate = double('delegate', :anemone_options => nil)

        Adapter.new(delegate).anemone_options.should eq({})
      end

      it "returns empty Hash if delegate doesn't support anemone_options" do
        Adapter.new(hookless).anemone_options.should eq({})
      end
    end

    context "hooking into Anemone" do
      let(:page_store)  { Object.new }
      let(:first_page)  { Object.new }
      let(:second_page) { Object.new }
      let(:anemone)     { FakeAnemone.new(page_store, first_page, second_page) }
      let(:delegate)    { double('delegate') }

      # Wraps fish in an Adapter and connects it to the fake Anemone.
      def attach_to_anemone(fish)
        Adapter.new(fish).hook_into_anemone(anemone)
      end

      it "wires up after_crawl when delegate supports it" do
        delegate.should_receive(:after_crawl).with(page_store).once

        attach_to_anemone(delegate)
      end

      it "ignores after_crawl when not supported" do
        attach_to_anemone(hookless)
      end

      it "wires up on_every_page when delegate supports it" do
        delegate.should_receive(:on_every_page).with(first_page).once
        delegate.should_receive(:on_every_page).with(second_page).once

        attach_to_anemone(delegate)
      end

      it "ignores on_every_page when not supported" do
        attach_to_anemone(hookless)
      end

      it "wires up focus_crawl when delegate supports it" do
        delegate.should_receive(:focus_crawl).with(first_page).once.and_return(['url1'])

        attach_to_anemone(delegate)

        anemone.last_focus_crawl_links.should eq(['url1'])
      end

      it "focuses on no links when delegate doesn't focus on any" do
        delegate.should_receive(:focus_crawl).with(first_page).and_return(nil)

        attach_to_anemone(delegate)

        anemone.last_focus_crawl_links.should eq([])
      end

      it "ignores focus_crawl when delegate doesn't support it" do
        attach_to_anemone(hookless)
      end

      it "relays skip_links_like regex when delegate returns one" do
        delegate.stub(:skip_links_like => /a/)

        attach_to_anemone(delegate)

        anemone.last_skip_links_like_regexes.should eq([/a/])
      end

      it "relays skip_links_like regexes when delegate returns many" do
        delegate.stub(:skip_links_like => [/a/, /b/])

        attach_to_anemone(delegate)

        anemone.last_skip_links_like_regexes.should eq([/a/, /b/])
      end

      it "ignores skip_links_like when not supported" do
        attach_to_anemone(hookless)
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require 'spec_helper'
2
+ require 'stringio'
3
+
4
module Clownfish
  describe LinksByPage do
    describe "#report" do
      # home is the crawl's starting point, links was found on home, and the
      # client/server error pages were both found on links.
      let(:home)   { FakePage.new('home.com', 200) }
      let(:links)  { FakePage.new('links.com', 200, 'home.com') }
      let(:client) { FakePage.new('client.com', 404, 'links.com') }
      let(:server) { FakePage.new('server.com', 500, 'links.com') }

      let(:fish) { LinksByPage.new }

      # Feeds pages to the fish, then returns the text that report writes when
      # called with options (plus a StringIO as :to).
      def report_for(pages, options = {})
        pages.each { |page| fish.on_every_page(page) }

        out = StringIO.new
        fish.report(options.merge(:to => out))
        out.string
      end

      it "reports all statuses by default" do
        text = report_for([client, server])

        text.should =~ %r{404 http://client.com\n500 http://server.com}
      end

      it "reports specified status when specifier given" do
        text = report_for([client, server], :status => :server_error)

        text.should =~ %r{500 http://server.com}
        text.should_not =~ /404/
      end

      it "reports specified statuses when many specified" do
        text = report_for([links, client, server], :status => [500, 200..204])

        text.should =~ %r{200 http://links.com}
        text.should =~ %r{500 http://server.com}
        text.should_not =~ /404/
      end

      it "omits page if none of its links will be shown" do
        text = report_for([links, client, server], :status => [304])

        text.should_not =~ %r{http://home.com}
        text.should_not =~ %r{http://links.com}
      end

      it "shows referer of links with no referer as [starting point]" do
        text = report_for([home])

        text.should =~ /\[starting point\]/
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ require 'clownfish'
2
+ require 'uri'
3
+
4
RSpec.configure do |rspec|
  # Let a bare Symbol such as :focus stand for the metadata `:focus => true`.
  rspec.treat_symbols_as_metadata_keys_with_true_values = true

  # Run only the examples tagged :focus; if nothing is tagged, run them all.
  rspec.filter_run :focus
  rspec.run_all_when_everything_filtered = true

  # Randomize example order so order dependencies surface. To reproduce a
  # failing order, pass the seed printed after the run:
  #     --seed 1234
  rspec.order = 'random'
end
15
+
16
# Matcher for Array equality that ignores element order: both Arrays are
# sorted and then compared, so their elements must be mutually comparable.
RSpec::Matchers.define :have_same_elements_as do |expected|
  match { |actual| expected.sort == actual.sort }
end
22
+
23
+ module Clownfish
24
+ # Fake Anemone::Core to help with tests.
25
+ class FakeAnemone
26
+ attr_reader :last_focus_crawl_links
27
+ attr_reader :last_skip_links_like_regexes
28
+
29
+ def initialize(page_store, page1, page2)
30
+ @page_store = page_store
31
+ @page1 = page1
32
+ @page2 = page2
33
+ end
34
+
35
+ def after_crawl
36
+ yield(@page_store)
37
+ end
38
+
39
+ def on_every_page
40
+ yield(@page1)
41
+ yield(@page2)
42
+ end
43
+
44
+ def focus_crawl
45
+ @last_focus_crawl_links = yield(@page1)
46
+ end
47
+
48
+ def skip_links_like(regexes)
49
+ @last_skip_links_like_regexes = regexes
50
+ end
51
+ end
52
+
53
+ # Fake and minimal Anemone::Page to help with tests.
54
+ class FakePage
55
+ attr_reader :url, :referer, :code
56
+
57
+ def initialize(url, code = 200, referer = nil)
58
+ @url = urlify(url)
59
+ @referer = urlify(referer)
60
+ @code = code
61
+ end
62
+
63
+ def urlify(str)
64
+ return str if str.class == URI || str.nil?
65
+
66
+ str = "http://#{str}" unless str.start_with? 'http'
67
+ URI(str)
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,58 @@
1
+ require 'spec_helper'
2
+
3
module Clownfish
  describe StatusGroup do
    context "#<<" do
      it "takes alias Symbols, Integers and Integer Ranges" do
        group = StatusGroup.new

        group << :client_error
        group << 500
        # The parentheses are required: << binds tighter than .., so
        # `group << 200..204` parses as `(group << 200)..204` and only 200
        # would be added to the group.
        group << (200..204)

        group.include?(404).should be_true
        group.include?(500).should be_true
        group.include?(200).should be_true
        # Only passes if the whole Range was added.
        group.include?(204).should be_true
        group.include?(304).should be_false
      end

      it "can be chained" do
        group = StatusGroup.new

        group << :client_error << 304 << :server_error

        group.include?(404).should be_true
        group.include?(500).should be_true
        group.include?(304).should be_true
        group.include?(200).should be_false
      end
    end

    context ".new" do
      it "can take a single status specifier" do
        group = StatusGroup.new(:server_error)

        group.include?(500).should be_true
        group.include?(200).should be_false
      end

      it "can take multiple status specifiers" do
        group = StatusGroup.new(200, :redirect, 400..406)

        group.include?(200).should be_true
        group.include?(301).should be_true
        group.include?(401).should be_true
        group.include?(204).should be_false
      end

      it "can take Array of status specifiers" do
        group = StatusGroup.new([:success, 500, 300..304])

        group.include?(500).should be_true
        group.include?(200).should be_true
        group.include?(302).should be_true
        group.include?(404).should be_false
      end
    end
  end
end
@@ -0,0 +1,91 @@
1
+ require 'spec_helper'
2
+
3
module Clownfish
  describe UrlStatuses do
    # Sample [url, status code] pairs shared by the examples.
    let(:ok)   { ['http://ok.com', 200] }
    let(:huh)  { ['http://huh.com', 404] }
    let(:ohno) { ['http://ohno.com', 500] }

    # Builds a UrlStatuses holding the given [url, code] pairs.
    def statuses_with(*pairs)
      pairs.each_with_object(UrlStatuses.new) do |(url, code), statuses|
        statuses.add_url(url, code)
      end
    end

    it "accumulates urls with status codes" do
      statuses = statuses_with(ok, huh)

      statuses.status_codes_by_url.should eq({'http://ok.com' => 200, 'http://huh.com' => 404})
    end

    it "starts off empty" do
      UrlStatuses.new.empty?.should be_true
    end

    it "knows how many urls it has" do
      statuses_with(ok, huh).size.should eq(2)
    end

    context '#each' do
      it "yields url/code pairs to 2-arg block" do
        pairs = []
        statuses_with(ok, huh).each { |k, v| pairs << [k, v] }

        pairs.should have_same_elements_as([['http://ok.com', 200], ['http://huh.com', 404]])
      end

      it "yields url/code Array to 1-arg block" do
        pairs = []
        statuses_with(ok, huh).each { |p| pairs << p }

        pairs.should have_same_elements_as([['http://ok.com', 200], ['http://huh.com', 404]])
      end
    end

    context '#query' do
      it "returns url/status pairs that match a specifier" do
        pairs = statuses_with(ok, huh).query(200)

        pairs.should have_same_elements_as([['http://ok.com', 200]])
      end

      it "returns url/status pairs that match any specifier" do
        pairs = statuses_with(ok, huh, ohno).query(200, :server_error)

        pairs.should have_same_elements_as([['http://ok.com', 200], ['http://ohno.com', 500]])
      end

      it "returns empty Array if no pairs match a specifier" do
        pairs = statuses_with(ok, huh).query(500, :redirect)

        pairs.empty?.should be_true
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: clownfish
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Paul Salaets
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-02-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: anemone
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 0.7.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 0.7.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '2.12'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '2.12'
41
+ description: Anemone helper making common crawls easier to repeat.
42
+ email:
43
+ - psalaets@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - .rspec
50
+ - Gemfile
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - clownfish.gemspec
55
+ - ideas.txt
56
+ - lib/clownfish.rb
57
+ - lib/clownfish/adapter.rb
58
+ - lib/clownfish/anemone_ext.rb
59
+ - lib/clownfish/fish/count.rb
60
+ - lib/clownfish/fish/links_by_page.rb
61
+ - lib/clownfish/fish/response_times.rb
62
+ - lib/clownfish/helpers/status_group.rb
63
+ - lib/clownfish/helpers/url_statuses.rb
64
+ - lib/clownfish/version.rb
65
+ - spec/adapter_spec.rb
66
+ - spec/links_by_page_spec.rb
67
+ - spec/spec_helper.rb
68
+ - spec/status_group_spec.rb
69
+ - spec/url_statuses_spec.rb
70
+ homepage: https://github.com/psalaets/clownfish
71
+ licenses:
72
+ - MIT
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubyforge_project:
90
+ rubygems_version: 2.0.0
91
+ signing_key:
92
+ specification_version: 4
93
+ summary: Anemone helper
94
+ test_files:
95
+ - spec/adapter_spec.rb
96
+ - spec/links_by_page_spec.rb
97
+ - spec/spec_helper.rb
98
+ - spec/status_group_spec.rb
99
+ - spec/url_statuses_spec.rb