anemone 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/anemone/cli/count.rb +2 -2
- data/lib/anemone/cli/pagedepth.rb +4 -4
- data/lib/anemone/core.rb +1 -1
- data/lib/anemone/page.rb +12 -0
- data/spec/page_spec.rb +6 -0
- metadata +2 -2
data/lib/anemone/cli/count.rb
CHANGED
@@ -7,7 +7,7 @@ rescue
|
|
7
7
|
puts <<-INFO
|
8
8
|
Usage:
|
9
9
|
anemone count <url>
|
10
|
-
|
10
|
+
|
11
11
|
Synopsis:
|
12
12
|
Crawls a site starting at the given URL and outputs the total number
|
13
13
|
of unique pages on the site.
|
@@ -17,6 +17,6 @@ end
|
|
17
17
|
|
18
18
|
Anemone.crawl(url) do |anemone|
|
19
19
|
anemone.after_crawl do |pages|
|
20
|
-
puts pages.uniq
|
20
|
+
puts pages.uniq!.size
|
21
21
|
end
|
22
22
|
end
|
data/lib/anemone/cli/pagedepth.rb
CHANGED
@@ -17,16 +17,16 @@ end
|
|
17
17
|
|
18
18
|
Anemone.crawl(root) do |anemone|
|
19
19
|
anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
|
20
|
-
|
20
|
+
|
21
21
|
anemone.after_crawl do |pages|
|
22
|
-
pages = pages.shortest_paths!(root).uniq
|
23
|
-
|
22
|
+
pages = pages.shortest_paths!(root).uniq!
|
23
|
+
|
24
24
|
depths = pages.values.inject({}) do |depths, page|
|
25
25
|
depths[page.depth] ||= 0
|
26
26
|
depths[page.depth] += 1
|
27
27
|
depths
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
|
31
31
|
end
|
32
32
|
end
|
data/lib/anemone/core.rb
CHANGED
data/lib/anemone/page.rb
CHANGED
@@ -6,6 +6,8 @@ module Anemone
|
|
6
6
|
|
7
7
|
# The URL of the page
|
8
8
|
attr_reader :url
|
9
|
+
# The raw HTTP response body of the page
|
10
|
+
attr_reader :body
|
9
11
|
# Headers of the HTTP response
|
10
12
|
attr_reader :headers
|
11
13
|
# URL of the page this one redirected to, if any
|
@@ -48,7 +50,9 @@ module Anemone
|
|
48
50
|
@fetched = !params[:code].nil?
|
49
51
|
end
|
50
52
|
|
53
|
+
#
|
51
54
|
# Array of distinct A tag HREFs from the page
|
55
|
+
#
|
52
56
|
def links
|
53
57
|
return @links unless @links.nil?
|
54
58
|
@links = []
|
@@ -64,18 +68,26 @@ module Anemone
|
|
64
68
|
@links
|
65
69
|
end
|
66
70
|
|
71
|
+
#
|
67
72
|
# Nokogiri document for the HTML body
|
73
|
+
#
|
68
74
|
def doc
|
69
75
|
return @doc if @doc
|
70
76
|
@doc = Nokogiri::HTML(@body) if @body && html? rescue nil
|
71
77
|
end
|
72
78
|
|
79
|
+
#
|
73
80
|
# Delete the Nokogiri document and response body to conserve memory
|
81
|
+
#
|
74
82
|
def discard_doc!
|
75
83
|
links # force parsing of page links before we trash the document
|
76
84
|
@doc = @body = nil
|
77
85
|
end
|
78
86
|
|
87
|
+
#
|
88
|
+
# Was the page successfully fetched?
|
89
|
+
# +true+ if the page was fetched with no error, +false+ otherwise.
|
90
|
+
#
|
79
91
|
def fetched?
|
80
92
|
@fetched
|
81
93
|
end
|
data/spec/page_spec.rb
CHANGED
@@ -17,6 +17,12 @@ module Anemone
|
|
17
17
|
fail_page.fetched?.should == false
|
18
18
|
end
|
19
19
|
|
20
|
+
it "should store and expose the response body of the HTTP request" do
|
21
|
+
body = 'test'
|
22
|
+
page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
|
23
|
+
page.body.should == body
|
24
|
+
end
|
25
|
+
|
20
26
|
it "should record any error that occurs during fetch_page" do
|
21
27
|
@page.should respond_to(:error)
|
22
28
|
@page.error.should be_nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-22 00:00:00 -06:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|