clownfish 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MGYzYmQ4YjNlYmEzNDAxODEwM2M5ODMyZjE2NzQ2N2MwOTczODIwZQ==
5
+ data.tar.gz: !binary |-
6
+ NDBiYjYzMzcwZTQ2MzExYWRkY2ZhMzA2NzFkMWNiODU1ODQwY2MxOQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ NTk3N2U5OTEwNmI0MGY2ZDVhYWY4OGZiNDRmMDk5OGNhMjg3NTFiMjMxOTlh
10
+ YjYxZmZlNjQxMGViZWJhZGQzZjk3NjBlZTM4NGRjNDk2YzQxZmJlZTAzMTU2
11
+ YTBmNTYxNGY1MDJjNzc3OGI3NmJmOGYwN2ZhZTc2MDQwOTQwM2M=
12
+ data.tar.gz: !binary |-
13
+ ZmFkMTJhODdmMmY5ODllZGY2MGMyOTUxNTRhNWE0MzQ2OGZhZTcxZmQyMTA0
14
+ YjY5ZjlmNWZiNGJmYTZhYWJjMWRkOWI2MDMwZjg5MzVkODA3NWFjMmQ3ZTQ4
15
+ NjEzOTUxYjIzMWQwZmFhMTQxYjA5ZmNhNmYxMjY5YzI4OGUyZjg=
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Paul Salaets
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,80 @@
1
+ # Clownfish
2
+
3
+ Helper for [Anemone](http://anemone.rubyforge.org/). Makes common crawls easier to repeat.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'clownfish'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install clownfish
18
+
19
+ ## Usage
20
+
21
+ ```ruby
22
+ require 'clownfish'
23
+
24
+ clownfish = MyClownfish.new
25
+
26
+ Anemone.crawl_with_clownfish(start_url, clownfish)
27
+
28
+ # query clownfish for data from crawl
29
+ ```
30
+
31
+ ## Clownfish Spec
32
+
33
+ A clownfish is an object that has one or more of the following instance methods:
34
+
35
+ Reference: [Anemone RDocs](http://anemone.rubyforge.org/doc/index.html)
36
+
37
+ ### anemone_options
38
+
39
+ Returns a `Hash` of `Symbol` to values. See [Anemone::Core::DEFAULT_OPTS](http://git.io/wFmCfA) for available options.
40
+ This is forwarded as the second argument to `Anemone.crawl`. Invoked once before crawl.
41
+
42
+ ### skip_links_like
43
+
44
+ Returns a single `Regexp` or `Array` of `Regexp`. Urls matching any of these will not be crawled. Invoked once before crawl.
45
+
46
+ ### on_every_page
47
+
48
+ Takes one argument, an `Anemone::Page`. Invoked once per page during crawl.
49
+
50
+ ### focus_crawl
51
+
52
+ Takes one argument, an `Anemone::Page`. Returns the links on that page that should be crawled. Invoked once per page during crawl.
53
+
54
+ ### after_crawl
55
+
56
+ Takes one argument, an `Anemone::PageStore`. Invoked once after crawl is done.
57
+
58
+ ## What's Included
59
+
60
+ See [wiki](https://github.com/psalaets/clownfish/wiki) for examples.
61
+
62
+ ### Clownfish::LinksByPage
63
+
64
+ Lists every page that has links, the links and the status code when following those links.
65
+
66
+ ### Clownfish::ResponseTimes
67
+
68
+ Records every URL and its response time.
69
+
70
+ ### Clownfish::Count
71
+
72
+ Count pages.
73
+
74
+ ## Contributing
75
+
76
+ 1. Fork it
77
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
78
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
79
+ 4. Push to the branch (`git push origin my-new-feature`)
80
+ 5. Create new Pull Request
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
# Define the RSpec rake task under its conventional name, `spec`.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` runs the specs.
task :default => [:spec]
@@ -0,0 +1,22 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for clownfish.

# Put this checkout's lib/ directory on the load path so the version constant
# can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'clownfish/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "clownfish"
  spec.version     = Clownfish::VERSION
  spec.authors     = ["Paul Salaets"]
  spec.email       = ["psalaets@gmail.com"]
  spec.summary     = "Anemone helper"
  spec.description = "Anemone helper making common crawls easier to repeat."
  spec.homepage    = "https://github.com/psalaets/clownfish"
  spec.license     = "MIT"

  # Contents: every file git tracks; test files are those under test/, spec/
  # or features/.
  spec.files         = `git ls-files`.split($/)
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Dependencies
  spec.add_dependency('anemone', '~> 0.7.2')
  spec.add_development_dependency('rspec', '~> 2.12')
end
@@ -0,0 +1,8 @@
1
+ syntax ideas:
2
+
3
+ Anemone.crawl('url', :with => clownfish)
4
+
5
+ Anemone.crawl('url').with(clownfish)
6
+
7
+ Anemone.crawl_with_clownfish('url', clownfish)
8
+
@@ -0,0 +1,10 @@
1
+ require "clownfish/version"
2
+ require "clownfish/adapter"
3
+ require "clownfish/anemone_ext"
4
+
5
+ require "clownfish/helpers/status_group"
6
+ require "clownfish/helpers/url_statuses"
7
+
8
+ require "clownfish/fish/links_by_page"
9
+ require "clownfish/fish/response_times"
10
+ require "clownfish/fish/count"
@@ -0,0 +1,62 @@
1
module Clownfish
  # Bridge between Anemone and a clownfish object. Each optional clownfish
  # method is connected to the Anemone hook of the same name, but only when
  # the clownfish actually responds to it.
  class Adapter
    # Internal: Wrap a clownfish in a new Adapter.
    #
    # clownfish - Object that conforms to clownfish spec. See README.md.
    #
    # Raises ArgumentError if clownfish is nil.
    def initialize(clownfish)
      if clownfish.nil?
        raise ArgumentError, "clownfish cannot be nil"
      end

      @delegate = clownfish
    end

    # Internal: Anemone options supplied by the clownfish.
    #
    # Returns the clownfish's options, or an empty Hash when the clownfish
    # has no anemone_options method or that method returns nil/false.
    def anemone_options
      return {} unless supports?(:anemone_options)

      @delegate.anemone_options || {}
    end

    # Internal: Attach every hook the clownfish supports to Anemone.
    #
    # anemone - Instance of Anemone::Core.
    #
    # Returns nothing.
    def hook_into_anemone(anemone)
      wire_up_after_crawl(anemone)
      wire_up_on_every_page(anemone)
      wire_up_focus_crawl(anemone)
      relay_skip_links_like(anemone)
    end

    private

    # Tells whether the wrapped clownfish implements the given method.
    def supports?(hook_name)
      @delegate.respond_to?(hook_name)
    end

    # Forwards Anemone's after_crawl callback to the delegate.
    def wire_up_after_crawl(anemone)
      return unless supports?(:after_crawl)

      anemone.after_crawl { |page_store| @delegate.after_crawl(page_store) }
    end

    # Forwards Anemone's on_every_page callback to the delegate.
    def wire_up_on_every_page(anemone)
      return unless supports?(:on_every_page)

      anemone.on_every_page { |page| @delegate.on_every_page(page) }
    end

    # Forwards Anemone's focus_crawl callback to the delegate. A nil/false
    # answer from the delegate is turned into an empty Array of links.
    def wire_up_focus_crawl(anemone)
      return unless supports?(:focus_crawl)

      anemone.focus_crawl { |page| @delegate.focus_crawl(page) || [] }
    end

    # Hands the delegate's skip patterns to Anemone, always as a flat Array
    # whether the delegate returned one pattern or many.
    def relay_skip_links_like(anemone)
      return unless supports?(:skip_links_like)

      anemone.skip_links_like([@delegate.skip_links_like].flatten)
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require "anemone"
2
+
3
module Anemone
  # Public: Run an Anemone crawl that is driven by a clownfish.
  #
  # urls      - String or Array of Strings telling where to start crawl from.
  # clownfish - Object that conforms to clownfish spec. See README.md.
  #
  # Returns nothing meaningful (the value of Anemone.crawl is passed through).
  def self.crawl_with_clownfish(urls, clownfish)
    bridge = Clownfish::Adapter.new(clownfish)
    options = bridge.anemone_options

    # The clownfish's hooks are attached inside the block Anemone.crawl yields.
    crawl(urls, options) { |core| bridge.hook_into_anemone(core) }
  end
end
@@ -0,0 +1,11 @@
1
module Clownfish
  # Clownfish that counts number of pages on a site. Taken from Anemone.
  class Count
    # Number of pages found. Only meaningful after a crawl (nil before).
    attr_reader :count

    # Records how many unique pages the crawl produced.
    #
    # page_store - The store of crawled pages (an Anemone::PageStore per the
    #              README). It is de-duplicated in place via uniq!.
    #
    # Returns the Integer page count.
    def after_crawl(page_store)
      # Array#uniq! (and similar bang methods) return nil when nothing was
      # removed, so chaining .size directly would raise NoMethodError for such
      # stores. Fall back to the store itself in that case.
      # NOTE(review): the original chained .size onto uniq!, so presumably
      # Anemone::PageStore#uniq! returns the store - confirm.
      deduped = page_store.uniq! || page_store
      @count = deduped.size
    end
  end
end
@@ -0,0 +1,60 @@
1
module Clownfish
  # Clownfish that records every link on a page and the response status codes
  # seen when those links are followed.
  class LinksByPage
    # Hash of url String to UrlStatuses. Each value holds all links found on
    # the page at the key.
    attr_reader :links_by_page

    def initialize
      @links_by_page = {}
    end

    # Page bodies are never inspected, so ask Anemone not to keep them.
    def anemone_options
      { :discard_page_bodies => true }
    end

    # Files the page's url and status code under the page that linked to it.
    #
    # page - Crawled page; its referer, url and code are read.
    def on_every_page(page)
      # The first url in a crawl has no referer.
      key = if page.referer
              page.referer.to_s
            else
              '[starting point]'
            end

      @links_by_page[key] ||= UrlStatuses.new
      @links_by_page[key].add_url(page.url.to_s, page.code)
    end

    # Print links by page.
    #
    # options - Hash specifying what and how to report.
    #           :to     - IO to print report to. Defaults to STDOUT.
    #           :status - One or Array of status specifiers. Defaults to :all.
    #                     Only links with these statuses will be reported. See
    #                     Clownfish::StatusGroup for accepted status specifiers.
    def report(options = {})
      settings = report_options(options)
      io       = settings[:to]
      wanted   = settings[:status]

      @links_by_page.each do |page, link_statuses|
        matches = link_statuses.query(wanted)

        # Pages with no matching links are left out of the report entirely.
        next if matches.empty?

        io.puts "#{page}"
        matches.each { |link, status| io.puts "#{status} #{link}" }
        io.puts
      end
    end

    private

    # Fills in the report defaults for any option the caller left out.
    def report_options(options)
      { :to => STDOUT, :status => :all }.merge(options)
    end
  end
end
@@ -0,0 +1,20 @@
1
module Clownfish
  # Clownfish that remembers how long every url took to respond.
  class ResponseTimes
    # Hash of url String to the page's response_time value (a number of
    # milliseconds, according to the original author's note).
    attr_reader :times_by_url

    def initialize
      @times_by_url = {}
    end

    # Page bodies are never inspected, so ask Anemone not to keep them.
    def anemone_options
      { :discard_page_bodies => true }
    end

    # Stores the page's response time under its url.
    #
    # page - Crawled page; its url and response_time are read.
    def on_every_page(page)
      @times_by_url.store(page.url.to_s, page.response_time)
    end
  end
end
@@ -0,0 +1,66 @@
1
module Clownfish
  # One or more response status codes. StatusGroups are filled with status
  # specifiers to determine what is in the group.
  #
  # Status specifiers can be Integer status codes like 200, Integer Ranges like
  # 400..404 or any of the following Symbols:
  #   :all          - 200 through 599
  #   :success      - 2xx
  #   :redirect     - 3xx
  #   :non_error    - 2xx through 3xx
  #   :client_error - 4xx
  #   :server_error - 5xx
  #   :error        - 4xx through 5xx
  class StatusGroup
    # Symbol alias to the Range of status codes it stands for. Frozen so the
    # shared table cannot be altered at runtime.
    ALIASES = {
      :all          => 200..599,
      :success      => 200..299,
      :redirect     => 300..399,
      :non_error    => 200..399,
      :client_error => 400..499,
      :server_error => 500..599,
      :error        => 400..599
    }.freeze

    # Public: Create a new group.
    #
    # specifiers - One or more status specifiers or an Array of status
    #              specifiers. May be omitted to start with an empty group.
    def initialize(*specifiers)
      @members = []

      specifiers.flatten.each { |specifier| self << specifier }
    end

    # Public: Add a status specifier to this group.
    #
    # specifier - A status specifier. Known alias Symbols are stored as their
    #             Range; anything else is stored as given, so an unknown
    #             Symbol is kept but never matches an Integer status code.
    #
    # Returns self for chaining purposes.
    def <<(specifier)
      @members << (resolve_alias(specifier) || specifier)
      self
    end

    # Public: Tells if this group includes a given status code.
    #
    # status - Integer status code
    #
    # Returns true if status is included, false otherwise.
    def include?(status)
      # === lets Integer members match by equality and Range members match by
      # inclusion.
      @members.any? { |member| member === status }
    end

    private

    # Resolves a group alias to its Range.
    #
    # group_alias - Symbol representing a set of status codes.
    #
    # Returns Range specified by group_alias or nil if there is none.
    def resolve_alias(group_alias)
      ALIASES[group_alias]
    end
  end
end
@@ -0,0 +1,39 @@
1
module Clownfish
  # Helper class for pairing urls with status codes.
  #
  # Mixes in Enumerable on top of #each, so the usual collection methods
  # (map, select, to_a, ...) work over url/status pairs.
  class UrlStatuses
    include Enumerable

    # Hash of url to status code.
    attr_reader :status_codes_by_url

    def initialize
      @status_codes_by_url = {}
    end

    # Public: Record the status code seen for a url. Adding the same url again
    # replaces its earlier status code.
    #
    # url         - The url, used as the key.
    # status_code - Status code seen for that url.
    #
    # Returns the status_code.
    def add_url(url, status_code)
      @status_codes_by_url[url] = status_code
    end

    # Public: Iterate over url/status code pairs. A two-argument block gets
    # url and code separately; a one-argument block gets a [url, code] Array.
    def each(&block)
      @status_codes_by_url.each(&block)
    end

    # Public: Number of urls recorded.
    def size
      @status_codes_by_url.size
    end

    # Public: Tells whether no urls have been recorded yet.
    def empty?
      @status_codes_by_url.empty?
    end

    # Public: Gets url/status code pairs that match one of the specified status
    # codes.
    #
    # status_group_specifiers - One, many or an Array of status group specifiers
    #                           as accepted by StatusGroup.new.
    #
    # Returns Array of [url, status] pairs that match status specifiers.
    def query(*status_group_specifiers)
      group = StatusGroup.new(status_group_specifiers)

      # find_all (not Hash#select) is deliberate: it returns an Array of
      # pairs, whereas Hash#select would return a Hash.
      @status_codes_by_url.find_all { |_url, code| group.include?(code) }
    end
  end
end
@@ -0,0 +1,3 @@
1
module Clownfish
  # Release number of the clownfish gem; clownfish.gemspec reads it.
  VERSION = '0.3.0'
end
@@ -0,0 +1,144 @@
1
+ require 'spec_helper'
2
+
3
module Clownfish
  # Specs for Clownfish::Adapter, run against the FakeAnemone stand-in that
  # spec_helper defines.
  describe Adapter do
    # Test double that only responds to what an example stubs or expects.
    let(:delegate) { double('delegate') }

    # A delegate implementing none of the clownfish methods.
    let(:bare_delegate) { Object.new }

    context ".new" do
      it "doesn't accept nil delegate" do
        expect { Adapter.new(nil) }.to raise_error(ArgumentError)
      end
    end

    context "#anemone_options" do
      it "forwards anemone_options from delegate" do
        delegate.stub(:anemone_options) { {:name => 'bob'} }

        Adapter.new(delegate).anemone_options.should eq({:name => 'bob'})
      end

      it "returns empty Hash if delegate has no options" do
        delegate.stub(:anemone_options) { nil }

        Adapter.new(delegate).anemone_options.should eq({})
      end

      it "returns empty Hash if delegate doesn't support anemone_options" do
        Adapter.new(bare_delegate).anemone_options.should eq({})
      end
    end

    context "hooking into Anemone" do
      let(:page_store) { Object.new }
      let(:page1)      { Object.new }
      let(:page2)      { Object.new }
      let(:anemone)    { FakeAnemone.new(page_store, page1, page2) }

      # Wraps target in an Adapter and hooks it into the fake Anemone.
      def hook_up(target)
        Adapter.new(target).hook_into_anemone(anemone)
      end

      it "wires up after_crawl when delegate supports it" do
        delegate.should_receive(:after_crawl).with(page_store).once

        hook_up(delegate)
      end

      it "ignores after_crawl when not supported" do
        hook_up(bare_delegate)
      end

      it "wires up on_every_page when delegate supports it" do
        delegate.should_receive(:on_every_page).with(page1).once
        delegate.should_receive(:on_every_page).with(page2).once

        hook_up(delegate)
      end

      it "ignores on_every_page when not supported" do
        hook_up(bare_delegate)
      end

      it "wires up focus_crawl when delegate supports it" do
        delegate.should_receive(:focus_crawl).with(page1) { ['url1'] }.once

        hook_up(delegate)

        anemone.last_focus_crawl_links.should eq(['url1'])
      end

      it "focuses on no links when delegate doesn't focus on any" do
        delegate.should_receive(:focus_crawl).with(page1) { nil }

        hook_up(delegate)

        anemone.last_focus_crawl_links.should eq([])
      end

      it "ignores focus_crawl when delegate doesn't support it" do
        hook_up(bare_delegate)
      end

      it "relays skip_links_like regex when delegate returns one" do
        delegate.stub(:skip_links_like) { /a/ }

        hook_up(delegate)

        anemone.last_skip_links_like_regexes.should eq([/a/])
      end

      it "relays skip_links_like regexes when delegate returns many" do
        delegate.stub(:skip_links_like) { [/a/, /b/] }

        hook_up(delegate)

        anemone.last_skip_links_like_regexes.should eq([/a/, /b/])
      end

      it "ignores skip_links_like when not supported" do
        hook_up(bare_delegate)
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require 'spec_helper'
2
+ require 'stringio'
3
+
4
module Clownfish
  # Specs for LinksByPage#report, fed with FakePage objects from spec_helper.
  describe LinksByPage do
    describe "#report" do
      let(:fish) { LinksByPage.new }
      let(:out)  { StringIO.new }

      # home links to links.com, which in turn links to the two error pages.
      let(:home)   { FakePage.new('home.com', 200) }
      let(:links)  { FakePage.new('links.com', 200, 'home.com') }
      let(:client) { FakePage.new('client.com', 404, 'links.com') }
      let(:server) { FakePage.new('server.com', 500, 'links.com') }

      # Feeds the pages to the fish in the given order.
      def visit(*pages)
        pages.each { |page| fish.on_every_page(page) }
      end

      # Runs the report into `out` and returns the text it produced.
      def report_text(options = {})
        fish.report({:to => out}.merge(options))
        out.string
      end

      it "reports all statuses by default" do
        visit(client, server)

        report_text.should =~ %r{404 http://client.com\n500 http://server.com}
      end

      it "reports specified status when specifier given" do
        visit(client, server)

        text = report_text(:status => :server_error)

        text.should =~ %r{500 http://server.com}
        text.should_not =~ /404/
      end

      it "reports specified statuses when many specified" do
        visit(links, client, server)

        text = report_text(:status => [500, 200..204])

        text.should =~ %r{200 http://links.com}
        text.should =~ %r{500 http://server.com}
        text.should_not =~ /404/
      end

      it "omits page if none of its links will be shown" do
        visit(links, client, server)

        text = report_text(:status => [304])

        text.should_not =~ %r{http://home.com}
        text.should_not =~ %r{http://links.com}
      end

      it "shows referer of links with no referer as [starting point]" do
        visit(home)

        report_text.should =~ /\[starting point\]/
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ require 'clownfish'
2
+ require 'uri'
3
+
4
RSpec.configure do |rspec|
  rspec.treat_symbols_as_metadata_keys_with_true_values = true

  # Filter to examples tagged :focus, but run everything when the filter
  # leaves nothing to run.
  rspec.filter_run :focus
  rspec.run_all_when_everything_filtered = true

  # Run specs in random order to surface order dependencies. To debug an
  # order dependency, fix the order by passing the seed that is printed
  # after each run:
  #     --seed 1234
  rspec.order = 'random'
end

# Matcher for Array equality that ignores element order.
RSpec::Matchers.define :have_same_elements_as do |expected|
  match { |actual| expected.sort == actual.sort }
end
22
+
23
+ module Clownfish
24
+ # Fake Anemone::Core to help with tests.
25
+ class FakeAnemone
26
+ attr_reader :last_focus_crawl_links
27
+ attr_reader :last_skip_links_like_regexes
28
+
29
+ def initialize(page_store, page1, page2)
30
+ @page_store = page_store
31
+ @page1 = page1
32
+ @page2 = page2
33
+ end
34
+
35
+ def after_crawl
36
+ yield(@page_store)
37
+ end
38
+
39
+ def on_every_page
40
+ yield(@page1)
41
+ yield(@page2)
42
+ end
43
+
44
+ def focus_crawl
45
+ @last_focus_crawl_links = yield(@page1)
46
+ end
47
+
48
+ def skip_links_like(regexes)
49
+ @last_skip_links_like_regexes = regexes
50
+ end
51
+ end
52
+
53
+ # Fake and minimal Anemone::Page to help with tests.
54
+ class FakePage
55
+ attr_reader :url, :referer, :code
56
+
57
+ def initialize(url, code = 200, referer = nil)
58
+ @url = urlify(url)
59
+ @referer = urlify(referer)
60
+ @code = code
61
+ end
62
+
63
+ def urlify(str)
64
+ return str if str.class == URI || str.nil?
65
+
66
+ str = "http://#{str}" unless str.start_with? 'http'
67
+ URI(str)
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,58 @@
1
+ require 'spec_helper'
2
+
3
module Clownfish
  # Specs for StatusGroup: adding specifiers with << and via the constructor.
  describe StatusGroup do
    context "#<<" do
      it "takes alias Symbols, Integers and Integer Ranges" do
        group = StatusGroup.new

        group << :client_error
        group << 500
        # Parentheses are required: << binds tighter than .., so
        # `group << 200..204` would add only 200 and discard the Range.
        group << (200..204)

        group.include?(404).should be_true
        group.include?(500).should be_true
        group.include?(200).should be_true
        # 204 is only in the group if the whole Range was added.
        group.include?(204).should be_true
        group.include?(304).should be_false
      end

      it "can be chained" do
        group = StatusGroup.new

        group << :client_error << 304 << :server_error

        group.include?(404).should be_true
        group.include?(500).should be_true
        group.include?(304).should be_true
        group.include?(200).should be_false
      end
    end

    context ".new" do
      it "can take a single status specifier" do
        group = StatusGroup.new(:server_error)

        group.include?(500).should be_true
        group.include?(200).should be_false
      end

      it "can take multiple status specifiers" do
        group = StatusGroup.new(200, :redirect, 400..406)

        group.include?(200).should be_true
        group.include?(301).should be_true
        group.include?(401).should be_true
        group.include?(204).should be_false
      end

      it "can take Array of status specifiers" do
        group = StatusGroup.new([:success, 500, 300..304])

        group.include?(500).should be_true
        group.include?(200).should be_true
        group.include?(302).should be_true
        group.include?(404).should be_false
      end
    end
  end
end
@@ -0,0 +1,91 @@
1
+ require 'spec_helper'
2
+
3
module Clownfish
  # Specs for UrlStatuses: accumulating, iterating and querying url/status
  # pairs.
  describe UrlStatuses do
    let(:statuses) { UrlStatuses.new }

    # Adds the two urls most examples work with: one 200 and one 404.
    def add_ok_and_huh
      statuses.add_url('http://ok.com', 200)
      statuses.add_url('http://huh.com', 404)
    end

    it "accumulates urls with status codes" do
      add_ok_and_huh

      statuses.status_codes_by_url.should eq({'http://ok.com' => 200, 'http://huh.com' => 404})
    end

    it "starts off empty" do
      statuses.empty?.should be_true
    end

    it "knows how many urls it has" do
      add_ok_and_huh

      statuses.size.should eq(2)
    end

    context '#each' do
      before(:each) { add_ok_and_huh }

      it "yields url/code pairs to 2-arg block" do
        collected = []
        statuses.each { |url, code| collected << [url, code] }

        collected.should have_same_elements_as([['http://ok.com', 200], ['http://huh.com', 404]])
      end

      it "yields url/code Array to 1-arg block" do
        collected = []
        statuses.each { |pair| collected << pair }

        collected.should have_same_elements_as([['http://ok.com', 200], ['http://huh.com', 404]])
      end
    end

    context '#query' do
      before(:each) { add_ok_and_huh }

      it "returns url/status pairs that match a specifier" do
        matches = statuses.query(200)

        matches.should have_same_elements_as([['http://ok.com', 200]])
      end

      it "returns url/status pairs that match any specifier" do
        statuses.add_url('http://ohno.com', 500)

        matches = statuses.query(200, :server_error)

        matches.should have_same_elements_as([['http://ok.com', 200], ['http://ohno.com', 500]])
      end

      it "returns empty Array if no pairs match a specifier" do
        matches = statuses.query(500, :redirect)

        matches.empty?.should be_true
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: clownfish
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Paul Salaets
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-02-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: anemone
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 0.7.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 0.7.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '2.12'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '2.12'
41
+ description: Anemone helper making common crawls easier to repeat.
42
+ email:
43
+ - psalaets@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - .rspec
50
+ - Gemfile
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - clownfish.gemspec
55
+ - ideas.txt
56
+ - lib/clownfish.rb
57
+ - lib/clownfish/adapter.rb
58
+ - lib/clownfish/anemone_ext.rb
59
+ - lib/clownfish/fish/count.rb
60
+ - lib/clownfish/fish/links_by_page.rb
61
+ - lib/clownfish/fish/response_times.rb
62
+ - lib/clownfish/helpers/status_group.rb
63
+ - lib/clownfish/helpers/url_statuses.rb
64
+ - lib/clownfish/version.rb
65
+ - spec/adapter_spec.rb
66
+ - spec/links_by_page_spec.rb
67
+ - spec/spec_helper.rb
68
+ - spec/status_group_spec.rb
69
+ - spec/url_statuses_spec.rb
70
+ homepage: https://github.com/psalaets/clownfish
71
+ licenses:
72
+ - MIT
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubyforge_project:
90
+ rubygems_version: 2.0.0
91
+ signing_key:
92
+ specification_version: 4
93
+ summary: Anemone helper
94
+ test_files:
95
+ - spec/adapter_spec.rb
96
+ - spec/links_by_page_spec.rb
97
+ - spec/spec_helper.rb
98
+ - spec/status_group_spec.rb
99
+ - spec/url_statuses_spec.rb