anemone 0.3.0 → 0.3.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ rescue
7
7
  puts <<-INFO
8
8
  Usage:
9
9
  anemone count <url>
10
-
10
+
11
11
  Synopsis:
12
12
  Crawls a site starting at the given URL and outputs the total number
13
13
  of unique pages on the site.
@@ -17,6 +17,6 @@ end
17
17
 
18
18
  Anemone.crawl(url) do |anemone|
19
19
  anemone.after_crawl do |pages|
20
- puts pages.uniq.size
20
+ puts pages.uniq!.size
21
21
  end
22
22
  end
@@ -17,16 +17,16 @@ end
17
17
 
18
18
  Anemone.crawl(root) do |anemone|
19
19
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
20
-
20
+
21
21
  anemone.after_crawl do |pages|
22
- pages = pages.shortest_paths!(root).uniq
23
-
22
+ pages = pages.shortest_paths!(root).uniq!
23
+
24
24
  depths = pages.values.inject({}) do |depths, page|
25
25
  depths[page.depth] ||= 0
26
26
  depths[page.depth] += 1
27
27
  depths
28
28
  end
29
-
29
+
30
30
  depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
31
31
  end
32
32
  end
@@ -7,7 +7,7 @@ require 'anemone/storage'
7
7
 
8
8
  module Anemone
9
9
 
10
- VERSION = '0.3.0';
10
+ VERSION = '0.3.1';
11
11
 
12
12
  #
13
13
  # Convenience method to start a crawl
@@ -6,6 +6,8 @@ module Anemone
6
6
 
7
7
  # The URL of the page
8
8
  attr_reader :url
9
+ # The raw HTTP response body of the page
10
+ attr_reader :body
9
11
  # Headers of the HTTP response
10
12
  attr_reader :headers
11
13
  # URL of the page this one redirected to, if any
@@ -48,7 +50,9 @@ module Anemone
48
50
  @fetched = !params[:code].nil?
49
51
  end
50
52
 
53
+ #
51
54
  # Array of distinct A tag HREFs from the page
55
+ #
52
56
  def links
53
57
  return @links unless @links.nil?
54
58
  @links = []
@@ -64,18 +68,26 @@ module Anemone
64
68
  @links
65
69
  end
66
70
 
71
+ #
67
72
  # Nokogiri document for the HTML body
73
+ #
68
74
  def doc
69
75
  return @doc if @doc
70
76
  @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
71
77
  end
72
78
 
79
+ #
73
80
  # Delete the Nokogiri document and response body to conserve memory
81
+ #
74
82
  def discard_doc!
75
83
  links # force parsing of page links before we trash the document
76
84
  @doc = @body = nil
77
85
  end
78
86
 
87
+ #
88
+ # Was the page successfully fetched?
89
+ # +true+ if the page was fetched with no error, +false+ otherwise.
90
+ #
79
91
  def fetched?
80
92
  @fetched
81
93
  end
@@ -17,6 +17,12 @@ module Anemone
17
17
  fail_page.fetched?.should == false
18
18
  end
19
19
 
20
+ it "should store and expose the response body of the HTTP request" do
21
+ body = 'test'
22
+ page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
23
+ page.body.should == body
24
+ end
25
+
20
26
  it "should record any error that occurs during fetch_page" do
21
27
  @page.should respond_to(:error)
22
28
  @page.error.should be_nil
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-16 00:00:00 -06:00
12
+ date: 2010-01-22 00:00:00 -06:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency