anemone 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/anemone/cli/count.rb +2 -2
- data/lib/anemone/cli/pagedepth.rb +4 -4
- data/lib/anemone/core.rb +1 -1
- data/lib/anemone/page.rb +12 -0
- data/spec/page_spec.rb +6 -0
- metadata +2 -2
data/lib/anemone/cli/count.rb
CHANGED
@@ -7,7 +7,7 @@ rescue
|
|
7
7
|
puts <<-INFO
|
8
8
|
Usage:
|
9
9
|
anemone count <url>
|
10
|
-
|
10
|
+
|
11
11
|
Synopsis:
|
12
12
|
Crawls a site starting at the given URL and outputs the total number
|
13
13
|
of unique pages on the site.
|
@@ -17,6 +17,6 @@ end
|
|
17
17
|
|
18
18
|
Anemone.crawl(url) do |anemone|
|
19
19
|
anemone.after_crawl do |pages|
|
20
|
-
puts pages.uniq
|
20
|
+
puts pages.uniq!.size
|
21
21
|
end
|
22
22
|
end
|
data/lib/anemone/cli/pagedepth.rb
CHANGED
@@ -17,16 +17,16 @@ end
|
|
17
17
|
|
18
18
|
Anemone.crawl(root) do |anemone|
|
19
19
|
anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
|
20
|
-
|
20
|
+
|
21
21
|
anemone.after_crawl do |pages|
|
22
|
-
pages = pages.shortest_paths!(root).uniq
|
23
|
-
|
22
|
+
pages = pages.shortest_paths!(root).uniq!
|
23
|
+
|
24
24
|
depths = pages.values.inject({}) do |depths, page|
|
25
25
|
depths[page.depth] ||= 0
|
26
26
|
depths[page.depth] += 1
|
27
27
|
depths
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
|
31
31
|
end
|
32
32
|
end
|
data/lib/anemone/core.rb
CHANGED
data/lib/anemone/page.rb
CHANGED
@@ -6,6 +6,8 @@ module Anemone
|
|
6
6
|
|
7
7
|
# The URL of the page
|
8
8
|
attr_reader :url
|
9
|
+
# The raw HTTP response body of the page
|
10
|
+
attr_reader :body
|
9
11
|
# Headers of the HTTP response
|
10
12
|
attr_reader :headers
|
11
13
|
# URL of the page this one redirected to, if any
|
@@ -48,7 +50,9 @@ module Anemone
|
|
48
50
|
@fetched = !params[:code].nil?
|
49
51
|
end
|
50
52
|
|
53
|
+
#
|
51
54
|
# Array of distinct A tag HREFs from the page
|
55
|
+
#
|
52
56
|
def links
|
53
57
|
return @links unless @links.nil?
|
54
58
|
@links = []
|
@@ -64,18 +68,26 @@ module Anemone
|
|
64
68
|
@links
|
65
69
|
end
|
66
70
|
|
71
|
+
#
|
67
72
|
# Nokogiri document for the HTML body
|
73
|
+
#
|
68
74
|
def doc
|
69
75
|
return @doc if @doc
|
70
76
|
@doc = Nokogiri::HTML(@body) if @body && html? rescue nil
|
71
77
|
end
|
72
78
|
|
79
|
+
#
|
73
80
|
# Delete the Nokogiri document and response body to conserve memory
|
81
|
+
#
|
74
82
|
def discard_doc!
|
75
83
|
links # force parsing of page links before we trash the document
|
76
84
|
@doc = @body = nil
|
77
85
|
end
|
78
86
|
|
87
|
+
#
|
88
|
+
# Was the page successfully fetched?
|
89
|
+
# +true+ if the page was fetched with no error, +false+ otherwise.
|
90
|
+
#
|
79
91
|
def fetched?
|
80
92
|
@fetched
|
81
93
|
end
|
data/spec/page_spec.rb
CHANGED
@@ -17,6 +17,12 @@ module Anemone
|
|
17
17
|
fail_page.fetched?.should == false
|
18
18
|
end
|
19
19
|
|
20
|
+
it "should store and expose the response body of the HTTP request" do
|
21
|
+
body = 'test'
|
22
|
+
page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
|
23
|
+
page.body.should == body
|
24
|
+
end
|
25
|
+
|
20
26
|
it "should record any error that occurs during fetch_page" do
|
21
27
|
@page.should respond_to(:error)
|
22
28
|
@page.error.should be_nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-22 00:00:00 -06:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|