spk-anemone 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +10 -0
- data/lib/anemone/cli/count.rb +2 -2
- data/lib/anemone/cli/pagedepth.rb +4 -4
- data/lib/anemone/core.rb +2 -2
- data/lib/anemone/page.rb +12 -0
- data/spec/page_spec.rb +6 -0
- metadata +2 -2
data/CHANGELOG.rdoc
CHANGED
data/lib/anemone/cli/count.rb
CHANGED
@@ -7,7 +7,7 @@ rescue
|
|
7
7
|
puts <<-INFO
|
8
8
|
Usage:
|
9
9
|
anemone count <url>
|
10
|
-
|
10
|
+
|
11
11
|
Synopsis:
|
12
12
|
Crawls a site starting at the given URL and outputs the total number
|
13
13
|
of unique pages on the site.
|
@@ -17,6 +17,6 @@ end
|
|
17
17
|
|
18
18
|
Anemone.crawl(url) do |anemone|
|
19
19
|
anemone.after_crawl do |pages|
|
20
|
-
puts pages.uniq
|
20
|
+
puts pages.uniq!.size
|
21
21
|
end
|
22
22
|
end
|
@@ -17,16 +17,16 @@ end
|
|
17
17
|
|
18
18
|
Anemone.crawl(root) do |anemone|
|
19
19
|
anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
|
20
|
-
|
20
|
+
|
21
21
|
anemone.after_crawl do |pages|
|
22
|
-
pages = pages.shortest_paths!(root).uniq
|
23
|
-
|
22
|
+
pages = pages.shortest_paths!(root).uniq!
|
23
|
+
|
24
24
|
depths = pages.values.inject({}) do |depths, page|
|
25
25
|
depths[page.depth] ||= 0
|
26
26
|
depths[page.depth] += 1
|
27
27
|
depths
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
|
31
31
|
end
|
32
32
|
end
|
data/lib/anemone/core.rb
CHANGED
@@ -7,7 +7,7 @@ require 'anemone/storage'
|
|
7
7
|
|
8
8
|
module Anemone
|
9
9
|
|
10
|
-
VERSION = '0.3.0';
|
10
|
+
VERSION = '0.3.1';
|
11
11
|
|
12
12
|
#
|
13
13
|
# Convenience method to start a crawl
|
@@ -193,7 +193,7 @@ module Anemone
|
|
193
193
|
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
|
194
194
|
end
|
195
195
|
|
196
|
-
# Generate Authorization string
|
196
|
+
# Generate Authorization string and set authorization opts
|
197
197
|
def authorization(auth=nil)
|
198
198
|
require 'base64'
|
199
199
|
if auth.is_a?(String) && auth.include?(':')
|
data/lib/anemone/page.rb
CHANGED
@@ -6,6 +6,8 @@ module Anemone
|
|
6
6
|
|
7
7
|
# The URL of the page
|
8
8
|
attr_reader :url
|
9
|
+
# The raw HTTP response body of the page
|
10
|
+
attr_reader :body
|
9
11
|
# Headers of the HTTP response
|
10
12
|
attr_reader :headers
|
11
13
|
# URL of the page this one redirected to, if any
|
@@ -50,7 +52,9 @@ module Anemone
|
|
50
52
|
@fetched = !params[:code].nil?
|
51
53
|
end
|
52
54
|
|
55
|
+
#
|
53
56
|
# Array of distinct A tag HREFs from the page
|
57
|
+
#
|
54
58
|
def links
|
55
59
|
return @links unless @links.nil?
|
56
60
|
@links = []
|
@@ -66,18 +70,26 @@ module Anemone
|
|
66
70
|
@links
|
67
71
|
end
|
68
72
|
|
73
|
+
#
|
69
74
|
# Nokogiri document for the HTML body
|
75
|
+
#
|
70
76
|
def doc
|
71
77
|
return @doc if @doc
|
72
78
|
@doc = Nokogiri::HTML(@body) if @body && html? rescue nil
|
73
79
|
end
|
74
80
|
|
81
|
+
#
|
75
82
|
# Delete the Nokogiri document and response body to conserve memory
|
83
|
+
#
|
76
84
|
def discard_doc!
|
77
85
|
links # force parsing of page links before we trash the document
|
78
86
|
@doc = @body = nil
|
79
87
|
end
|
80
88
|
|
89
|
+
#
|
90
|
+
# Was the page successfully fetched?
|
91
|
+
# +true+ if the page was fetched with no error, +false+ otherwise.
|
92
|
+
#
|
81
93
|
def fetched?
|
82
94
|
@fetched
|
83
95
|
end
|
data/spec/page_spec.rb
CHANGED
@@ -17,6 +17,12 @@ module Anemone
|
|
17
17
|
fail_page.fetched?.should == false
|
18
18
|
end
|
19
19
|
|
20
|
+
it "should store and expose the response body of the HTTP request" do
|
21
|
+
body = 'test'
|
22
|
+
page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
|
23
|
+
page.body.should == body
|
24
|
+
end
|
25
|
+
|
20
26
|
it "should record any error that occurs during fetch_page" do
|
21
27
|
@page.should respond_to(:error)
|
22
28
|
@page.error.should be_nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spk-anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-01-
|
12
|
+
date: 2010-01-29 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|