anemone 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,7 +7,7 @@ rescue
7
7
  puts <<-INFO
8
8
  Usage:
9
9
  anemone count <url>
10
-
10
+
11
11
  Synopsis:
12
12
  Crawls a site starting at the given URL and outputs the total number
13
13
  of unique pages on the site.
@@ -17,6 +17,6 @@ end
17
17
 
18
18
  Anemone.crawl(url) do |anemone|
19
19
  anemone.after_crawl do |pages|
20
- puts pages.uniq.size
20
+ puts pages.uniq!.size
21
21
  end
22
22
  end
@@ -17,16 +17,16 @@ end
17
17
 
18
18
  Anemone.crawl(root) do |anemone|
19
19
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
20
-
20
+
21
21
  anemone.after_crawl do |pages|
22
- pages = pages.shortest_paths!(root).uniq
23
-
22
+ pages = pages.shortest_paths!(root).uniq!
23
+
24
24
  depths = pages.values.inject({}) do |depths, page|
25
25
  depths[page.depth] ||= 0
26
26
  depths[page.depth] += 1
27
27
  depths
28
28
  end
29
-
29
+
30
30
  depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
31
31
  end
32
32
  end
@@ -7,7 +7,7 @@ require 'anemone/storage'
7
7
 
8
8
  module Anemone
9
9
 
10
- VERSION = '0.3.0';
10
+ VERSION = '0.3.1';
11
11
 
12
12
  #
13
13
  # Convenience method to start a crawl
@@ -6,6 +6,8 @@ module Anemone
6
6
 
7
7
  # The URL of the page
8
8
  attr_reader :url
9
+ # The raw HTTP response body of the page
10
+ attr_reader :body
9
11
  # Headers of the HTTP response
10
12
  attr_reader :headers
11
13
  # URL of the page this one redirected to, if any
@@ -48,7 +50,9 @@ module Anemone
48
50
  @fetched = !params[:code].nil?
49
51
  end
50
52
 
53
+ #
51
54
  # Array of distinct A tag HREFs from the page
55
+ #
52
56
  def links
53
57
  return @links unless @links.nil?
54
58
  @links = []
@@ -64,18 +68,26 @@ module Anemone
64
68
  @links
65
69
  end
66
70
 
71
+ #
67
72
  # Nokogiri document for the HTML body
73
+ #
68
74
  def doc
69
75
  return @doc if @doc
70
76
  @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
71
77
  end
72
78
 
79
+ #
73
80
  # Delete the Nokogiri document and response body to conserve memory
81
+ #
74
82
  def discard_doc!
75
83
  links # force parsing of page links before we trash the document
76
84
  @doc = @body = nil
77
85
  end
78
86
 
87
+ #
88
+ # Was the page successfully fetched?
89
+ # +true+ if the page was fetched with no error, +false+ otherwise.
90
+ #
79
91
  def fetched?
80
92
  @fetched
81
93
  end
@@ -17,6 +17,12 @@ module Anemone
17
17
  fail_page.fetched?.should == false
18
18
  end
19
19
 
20
+ it "should store and expose the response body of the HTTP request" do
21
+ body = 'test'
22
+ page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
23
+ page.body.should == body
24
+ end
25
+
20
26
  it "should record any error that occurs during fetch_page" do
21
27
  @page.should respond_to(:error)
22
28
  @page.error.should be_nil
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-16 00:00:00 -06:00
12
+ date: 2010-01-22 00:00:00 -06:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency