anemone 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,39 +8,44 @@ module Anemone
8
8
  attr_reader :url
9
9
  # Headers of the HTTP response
10
10
  attr_reader :headers
11
-
11
+ # URL of the page this one redirected to, if any
12
+ attr_reader :redirect_to
13
+ # Exception object, if one was raised during HTTP#fetch_page
14
+ attr_reader :error
15
+
12
16
  # OpenStruct for user-stored data
13
17
  attr_accessor :data
14
- # Nokogiri document for the HTML body
15
- attr_accessor :doc
16
18
  # Integer response code of the page
17
- attr_accessor :code
18
- # Array of redirect-aliases for the page
19
- attr_accessor :aliases
20
- # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
19
+ attr_accessor :code
20
+ # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
21
21
  attr_accessor :visited
22
22
  # Depth of this page from the root of the crawl. This is not necessarily the
23
- # shortest path; use PageHash#shortest_paths! to find that value.
23
+ # shortest path; use PageStore#shortest_paths! to find that value.
24
24
  attr_accessor :depth
25
25
  # URL of the page that brought us to this page
26
26
  attr_accessor :referer
27
27
  # Response time of the request for this page in milliseconds
28
28
  attr_accessor :response_time
29
-
29
+
30
30
  #
31
31
  # Create a new page
32
32
  #
33
# *params* is an options Hash; every key is optional. Keys read here:
# :code, :headers, :aka, :referer, :depth, :redirect_to, :response_time,
# :body and :error.
def initialize(url, params = {})
  @url = url
  @data = OpenStruct.new

  @code = params[:code]
  @headers = params[:headers] || {}
  # Guarantee a content-type entry so content_type can call #first on it
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  # to_absolute returns nil when no redirect target was supplied
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]

  # A page counts as fetched as soon as it carries a response code
  @fetched = !params[:code].nil?
end
45
50
 
46
51
  # Array of distinct A tag HREFs from the page
@@ -48,7 +53,7 @@ module Anemone
48
53
  return @links unless @links.nil?
49
54
  @links = []
50
55
  return @links if !doc
51
-
56
+
52
57
  doc.css('a').each do |a|
53
58
  u = a.attributes['href'].content rescue nil
54
59
  next if u.nil? or u.empty?
@@ -58,52 +63,30 @@ module Anemone
58
63
  @links.uniq!
59
64
  @links
60
65
  end
61
-
66
+
67
# Nokogiri document for the HTML body
#
# Parsed from the stored response body on first use and memoized in @doc.
# Returns nil when there is no body, when html? is false, or when an
# error is raised while checking or parsing.
def doc
  return @doc if @doc

  @doc = Nokogiri::HTML(@body) if @body && html?
rescue StandardError
  nil
end
72
+
73
# Delete the Nokogiri document and response body to conserve memory
#
# Calls links first, so the link list is computed before the document it
# is derived from is dropped. Returns nil.
def discard_doc!
  links # force parsing of page links before we trash the document
  @doc = @body = nil
end
77
78
 
78
- #
79
- # Add a redirect-alias String *aka* to the list of the page's aliases
80
- #
81
- # Returns *self*
82
- #
83
- def add_alias!(aka)
84
- @aliases << aka if !@aliases.include?(aka)
85
- self
79
#
# Returns the @fetched flag, which the constructor sets to +true+ when a
# :code was supplied and to +false+ otherwise.
#
def fetched?
  @fetched
end
87
-
88
- #
89
- # Returns an Array of all links from this page, and all the
90
- # redirect-aliases of those pages, as String objects.
91
- #
92
- # *page_hash* is a PageHash object with the results of the current crawl.
93
- #
94
- def links_and_their_aliases(page_hash)
95
- links.inject([]) do |results, link|
96
- results.concat([link].concat(page_hash[link].aliases))
97
- end
98
- end
99
-
82
+
100
83
  #
101
84
  # The content-type returned by the HTTP request for this page
102
85
  #
103
86
  def content_type
104
87
  headers['content-type'].first
105
88
  end
106
-
89
+
107
90
  #
108
91
  # Returns +true+ if the page is a HTML document, returns +false+
109
92
  # otherwise.
@@ -111,15 +94,15 @@ module Anemone
111
94
  def html?
112
95
  !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
113
96
  end
114
-
97
+
115
98
  #
116
99
  # Returns +true+ if the page is a HTTP redirect, returns +false+
117
100
  # otherwise.
118
- #
101
+ #
119
102
  def redirect?
120
103
  (300..399).include?(@code)
121
104
  end
122
-
105
+
123
106
  #
124
107
  # Returns +true+ if the page was not found (returned 404 code),
125
108
  # returns +false+ otherwise.
@@ -127,12 +110,14 @@ module Anemone
127
110
  def not_found?
128
111
  404 == @code
129
112
  end
130
-
113
+
131
114
  #
132
115
  # Converts relative URL *link* into an absolute URL based on the
133
116
  # location of the page
134
117
  #
135
118
  def to_absolute(link)
119
+ return nil if link.nil?
120
+
136
121
  # remove anchor
137
122
  link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
138
123
 
@@ -143,7 +128,7 @@ module Anemone
143
128
 
144
129
  return absolute
145
130
  end
146
-
131
+
147
132
  #
148
133
  # Returns +true+ if *uri* is in the same domain as the page, returns
149
134
  # +false+ otherwise
@@ -151,5 +136,14 @@ module Anemone
151
136
  def in_domain?(uri)
152
137
  uri.host == @url.host
153
138
  end
139
+
140
#
# Returns the Array of instance-variable values that Marshal stores for
# this page. The order must match the assignment order in marshal_load.
# @doc, @aliases and @error are not part of the dump.
#
def marshal_dump
  [@url, @headers, @data, @body, @links, @code, @visited, @depth,
   @referer, @redirect_to, @response_time, @fetched]
end
143
+
144
#
# Restores the instance variables from *ary*, an Array in the order
# produced by marshal_dump. Missing trailing entries become nil.
#
def marshal_load(ary)
  @url, @headers, @data, @body, @links, @code, @visited, @depth,
    @referer, @redirect_to, @response_time, @fetched = ary
end
147
+
154
148
  end
155
149
  end
@@ -1,21 +1,52 @@
1
+ require 'forwardable'
2
+
1
3
  module Anemone
2
- class PageHash < Hash
3
-
4
+ class PageStore
5
+ extend Forwardable
6
+
7
+ def_delegators :@storage, :keys, :values, :size, :each
8
+
9
# *storage* is the object every lookup and write is forwarded to.
# Defaults to a fresh in-memory Hash.
def initialize(storage = {})
  @storage = storage
end
12
+
4
13
  # We typically index the hash with a URI,
5
14
  # but convert it to a String for easier retrieval
6
15
  def [](index)
7
- super(index.to_s)
16
+ @storage[index.to_s]
8
17
  end
9
-
18
+
10
19
  def []=(index, other)
11
- super(index.to_s, other)
20
+ @storage[index.to_s] = other
21
+ end
22
+
23
# Removes the entry stored under the String form of *key* and returns
# whatever the storage's own delete returns.
def delete(key)
  @storage.delete key.to_s
end
13
-
26
+
14
27
  def has_key?(key)
15
- super(key.to_s)
28
+ @storage.has_key? key.to_s
29
+ end
30
+
31
# Yields every stored value in turn. Built on each, so it works for any
# storage that supports the delegated each.
def each_value
  each { |_key, value| yield value }
end
34
+
35
# Returns an Array of every stored value, gathered through each.
def values
  result = []
  each { |_key, value| result << value }
  result
end
40
+
41
# Stores a new Page built from *key* alone under that key. No :code is
# passed, so the page is created without a response code.
def touch_key(key)
  self[key] = Page.new(key)
end
17
44
 
18
- # Does this PageHash contain the specified URL?
45
# Stores a new Page for every entry of *keys* with a single merge! on the
# storage. Each Page is built from the original key and stored under the
# key's String form. Returns what the storage's merge! returns.
def touch_keys(keys)
  placeholders = keys.each_with_object({}) do |key, pages|
    pages[key.to_s] = Page.new(key)
  end
  @storage.merge! placeholders
end
48
+
49
+ # Does this PageStore contain the specified URL?
19
50
  # HTTP and HTTPS versions of a URL are considered to be the same page.
20
51
  def has_page?(url)
21
52
  schemes = %w(http https)
@@ -24,80 +55,67 @@ module Anemone
24
55
  return schemes.any? { |s| u.scheme = s; has_key?(u) }
25
56
  end
26
57
 
27
- has_key?(url)
58
+ has_key? url
28
59
  end
29
-
60
+
30
61
  #
31
62
  # Use a breadth-first search to calculate the single-source
32
- # shortest paths from *root* to all pages in the PageHash
63
+ # shortest paths from *root* to all pages in the PageStore
33
64
  #
34
65
  def shortest_paths!(root)
35
66
  root = URI(root) if root.is_a?(String)
36
67
  raise "Root node not found" if !has_key?(root)
37
-
38
- each_value {|p| p.visited = false if p}
39
-
68
+
40
69
  q = Queue.new
41
-
42
- q.enq(root)
43
- self[root].depth = 0
44
- self[root].visited = true
45
- while(!q.empty?)
46
- url = q.deq
47
-
48
- next if !has_key?(url)
49
-
50
- page = self[url]
51
-
70
+
71
+ q.enq root
72
+ root_page = self[root]
73
+ root_page.depth = 0
74
+ root_page.visited = true
75
+ self[root] = root_page
76
+ while !q.empty?
77
+ page = self[q.deq]
52
78
  page.links.each do |u|
53
- next if !has_key?(u) or self[u].nil?
54
- link = self[u]
55
- aliases = [link].concat(link.aliases.map {|a| self[a] })
56
-
57
- aliases.each do |node|
58
- if node.depth.nil? or page.depth + 1 < node.depth
59
- node.depth = page.depth + 1
79
+ begin
80
+ link = self[u]
81
+ next if link.nil? || !link.fetched? || link.visited
82
+
83
+ q << u unless link.redirect?
84
+ link.visited = true
85
+ link.depth = page.depth + 1
86
+ self[u] = link
87
+
88
+ if link.redirect?
89
+ u = link.redirect_to
90
+ redo
60
91
  end
61
92
  end
62
-
63
- q.enq(self[u].url) if !self[u].visited
64
- self[u].visited = true
65
93
  end
66
94
  end
67
-
95
+
68
96
  self
69
97
  end
70
-
98
+
71
99
  #
72
- # Returns a new PageHash by removing redirect-aliases for each
73
- # non-redirect Page
100
+ # Removes all Pages from storage where redirect? is true
74
101
  #
75
def uniq!
  # Collect the URLs first so that nothing is deleted from the storage
  # while it is still being iterated; whether that is safe would otherwise
  # depend on the storage back end.
  redirected = []
  each_value { |page| redirected << page.url if page.redirect? }
  redirected.each { |url| delete url }
  self
end
88
-
106
+
89
107
  #
90
108
  # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
91
109
  # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
92
110
  #
93
111
  def pages_linking_to(urls)
94
112
  unless urls.is_a?(Array)
95
- urls = [urls] unless urls.is_a?(Array)
113
+ urls = [urls]
96
114
  single = true
97
115
  end
98
116
 
99
117
  urls.map! do |url|
100
- if url.is_a?(String)
118
+ unless url.is_a?(URI)
101
119
  URI(url) rescue nil
102
120
  else
103
121
  url
@@ -112,7 +130,7 @@ module Anemone
112
130
  end
113
131
 
114
132
  if single and !links.empty?
115
- return links.first
133
+ return links[urls.first]
116
134
  else
117
135
  return links
118
136
  end
@@ -132,11 +150,11 @@ module Anemone
132
150
  links.each { |url, pages| links[url] = pages.map{|p| p.url} }
133
151
 
134
152
  if single and !links.empty?
135
- return links.first
153
+ return links[urls.first]
136
154
  else
137
155
  return links
138
- end
156
+ end
139
157
  end
140
158
 
141
159
  end
142
- end
160
+ end
@@ -0,0 +1,19 @@
1
module Anemone
  # Factory methods, one per storage back end. Each returns a new storage
  # object; presumably these are what gets handed to PageStore.new.
  module Storage

    # In-memory storage: a plain Hash. *args* are passed to Hash.new.
    def self.Hash(*args)
      Hash.new(*args)
    end

    # PStore-file storage. The implementation is required on first use so
    # that it is only loaded when this back end is actually requested.
    def self.PStore(*args)
      require 'anemone/storage/pstore'
      self::PStore.new(*args)
    end

    # TokyoCabinet storage backed by *file*. Required lazily, like PStore.
    def self.TokyoCabinet(file)
      require 'anemone/storage/tokyo_cabinet'
      self::TokyoCabinet.new(file)
    end

  end
end
@@ -0,0 +1,48 @@
1
+ require 'pstore'
2
+ require 'forwardable'
3
+
4
module Anemone
  module Storage
    # Hash-like storage that keeps values in a ::PStore file and tracks the
    # set of keys in an in-memory Hash. has_key?, keys and size are answered
    # from that in-memory index without opening the file.
    class PStore
      extend Forwardable

      def_delegators :@keys, :has_key?, :keys, :size

      # Opens a store backed by *file*. An existing file at that path is
      # deleted first, so every instance starts out empty.
      def initialize(file)
        # File.exists? was removed in Ruby 3.2; File.exist? works on all versions
        File.delete(file) if File.exist?(file)
        @store = ::PStore.new(file)
        @keys = {}
      end

      # Returns the value stored under *key*, or nil when there is none.
      def [](key)
        @store.transaction { |s| s[key] }
      end

      # Stores *value* under *key* and records the key in the index.
      def []=(key,value)
        @keys[key] = nil
        @store.transaction { |s| s[key] = value }
      end

      # Removes *key* from both the index and the file.
      def delete(key)
        @keys.delete(key)
        @store.transaction { |s| s.delete key}
      end

      # Yields each key with its value, using one transaction per key.
      def each
        @keys.each_key do |key|
          value = nil
          @store.transaction { |s| value = s[key] }
          yield key, value
        end
      end

      # Writes every pair of *hash* inside a single transaction.
      # Returns *self*.
      def merge!(hash)
        @store.transaction do |s|
          hash.each { |key, value| s[key] = value; @keys[key] = nil }
        end
        self
      end

    end
  end
end