spk-anemone 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +10 -0
- data/lib/anemone/cli/count.rb +2 -2
- data/lib/anemone/cli/pagedepth.rb +4 -4
- data/lib/anemone/core.rb +2 -2
- data/lib/anemone/page.rb +12 -0
- data/spec/page_spec.rb +6 -0
- metadata +2 -2
data/CHANGELOG.rdoc
CHANGED
data/lib/anemone/cli/count.rb
CHANGED
@@ -7,7 +7,7 @@ rescue
|
|
7
7
|
puts <<-INFO
|
8
8
|
Usage:
|
9
9
|
anemone count <url>
|
10
|
-
|
10
|
+
|
11
11
|
Synopsis:
|
12
12
|
Crawls a site starting at the given URL and outputs the total number
|
13
13
|
of unique pages on the site.
|
@@ -17,6 +17,6 @@ end
|
|
17
17
|
|
18
18
|
Anemone.crawl(url) do |anemone|
|
19
19
|
anemone.after_crawl do |pages|
|
20
|
-
puts pages.uniq.size
|
20
|
+
puts pages.uniq!.size
|
21
21
|
end
|
22
22
|
end
|
@@ -17,16 +17,16 @@ end
|
|
17
17
|
|
18
18
|
Anemone.crawl(root) do |anemone|
|
19
19
|
anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
|
20
|
-
|
20
|
+
|
21
21
|
anemone.after_crawl do |pages|
|
22
|
-
pages = pages.shortest_paths!(root).uniq
|
23
|
-
|
22
|
+
pages = pages.shortest_paths!(root).uniq!
|
23
|
+
|
24
24
|
depths = pages.values.inject({}) do |depths, page|
|
25
25
|
depths[page.depth] ||= 0
|
26
26
|
depths[page.depth] += 1
|
27
27
|
depths
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
|
31
31
|
end
|
32
32
|
end
|
data/lib/anemone/core.rb
CHANGED
@@ -7,7 +7,7 @@ require 'anemone/storage'
|
|
7
7
|
|
8
8
|
module Anemone
|
9
9
|
|
10
|
-
VERSION = '0.3.0';
|
10
|
+
VERSION = '0.3.1';
|
11
11
|
|
12
12
|
#
|
13
13
|
# Convenience method to start a crawl
|
@@ -193,7 +193,7 @@ module Anemone
|
|
193
193
|
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
|
194
194
|
end
|
195
195
|
|
196
|
-
# Generate Authorization string
|
196
|
+
# Generate Authorization string and set authorization opts
|
197
197
|
def authorization(auth=nil)
|
198
198
|
require 'base64'
|
199
199
|
if auth.is_a?(String) && auth.include?(':')
|
data/lib/anemone/page.rb
CHANGED
@@ -6,6 +6,8 @@ module Anemone
|
|
6
6
|
|
7
7
|
# The URL of the page
|
8
8
|
attr_reader :url
|
9
|
+
# The raw HTTP response body of the page
|
10
|
+
attr_reader :body
|
9
11
|
# Headers of the HTTP response
|
10
12
|
attr_reader :headers
|
11
13
|
# URL of the page this one redirected to, if any
|
@@ -50,7 +52,9 @@ module Anemone
|
|
50
52
|
@fetched = !params[:code].nil?
|
51
53
|
end
|
52
54
|
|
55
|
+
#
|
53
56
|
# Array of distinct A tag HREFs from the page
|
57
|
+
#
|
54
58
|
def links
|
55
59
|
return @links unless @links.nil?
|
56
60
|
@links = []
|
@@ -66,18 +70,26 @@ module Anemone
|
|
66
70
|
@links
|
67
71
|
end
|
68
72
|
|
73
|
+
#
|
69
74
|
# Nokogiri document for the HTML body
|
75
|
+
#
|
70
76
|
def doc
|
71
77
|
return @doc if @doc
|
72
78
|
@doc = Nokogiri::HTML(@body) if @body && html? rescue nil
|
73
79
|
end
|
74
80
|
|
81
|
+
#
|
75
82
|
# Delete the Nokogiri document and response body to conserve memory
|
83
|
+
#
|
76
84
|
def discard_doc!
|
77
85
|
links # force parsing of page links before we trash the document
|
78
86
|
@doc = @body = nil
|
79
87
|
end
|
80
88
|
|
89
|
+
#
|
90
|
+
# Was the page successfully fetched?
|
91
|
+
# +true+ if the page was fetched with no error, +false+ otherwise.
|
92
|
+
#
|
81
93
|
def fetched?
|
82
94
|
@fetched
|
83
95
|
end
|
data/spec/page_spec.rb
CHANGED
@@ -17,6 +17,12 @@ module Anemone
|
|
17
17
|
fail_page.fetched?.should == false
|
18
18
|
end
|
19
19
|
|
20
|
+
it "should store and expose the response body of the HTTP request" do
|
21
|
+
body = 'test'
|
22
|
+
page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
|
23
|
+
page.body.should == body
|
24
|
+
end
|
25
|
+
|
20
26
|
it "should record any error that occurs during fetch_page" do
|
21
27
|
@page.should respond_to(:error)
|
22
28
|
@page.error.should be_nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spk-anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-01-
|
12
|
+
date: 2010-01-29 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|