anemone 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,39 +8,44 @@ module Anemone
8
8
  attr_reader :url
9
9
  # Headers of the HTTP response
10
10
  attr_reader :headers
11
-
11
+ # URL of the page this one redirected to, if any
12
+ attr_reader :redirect_to
13
+ # Exception object, if one was raised during HTTP#fetch_page
14
+ attr_reader :error
15
+
12
16
  # OpenStruct for user-stored data
13
17
  attr_accessor :data
14
- # Nokogiri document for the HTML body
15
- attr_accessor :doc
16
18
  # Integer response code of the page
17
- attr_accessor :code
18
- # Array of redirect-aliases for the page
19
- attr_accessor :aliases
20
- # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
19
+ attr_accessor :code
20
+ # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
21
21
  attr_accessor :visited
22
22
  # Depth of this page from the root of the crawl. This is not necessarily the
23
- # shortest path; use PageHash#shortest_paths! to find that value.
23
+ # shortest path; use PageStore#shortest_paths! to find that value.
24
24
  attr_accessor :depth
25
25
  # URL of the page that brought us to this page
26
26
  attr_accessor :referer
27
27
  # Response time of the request for this page in milliseconds
28
28
  attr_accessor :response_time
29
-
29
+
30
30
  #
31
31
  # Create a new page
32
32
  #
33
- def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
33
+ def initialize(url, params = {})
34
34
  @url = url
35
- @code = code
36
- @headers = headers || {}
37
- @headers['content-type'] ||= ['']
38
- @aliases = Array(aka)
39
35
  @data = OpenStruct.new
40
- @referer = referer
41
- @depth = depth || 0
42
- @response_time = response_time
43
- @doc = Nokogiri::HTML(body) if body && html? rescue nil
36
+
37
+ @code = params[:code]
38
+ @headers = params[:headers] || {}
39
+ @headers['content-type'] ||= ['']
40
+ @aliases = Array(params[:aka]).compact
41
+ @referer = params[:referer]
42
+ @depth = params[:depth] || 0
43
+ @redirect_to = to_absolute(params[:redirect_to])
44
+ @response_time = params[:response_time]
45
+ @body = params[:body]
46
+ @error = params[:error]
47
+
48
+ @fetched = !params[:code].nil?
44
49
  end
45
50
 
46
51
  # Array of distinct A tag HREFs from the page
@@ -48,7 +53,7 @@ module Anemone
48
53
  return @links unless @links.nil?
49
54
  @links = []
50
55
  return @links if !doc
51
-
56
+
52
57
  doc.css('a').each do |a|
53
58
  u = a.attributes['href'].content rescue nil
54
59
  next if u.nil? or u.empty?
@@ -58,52 +63,30 @@ module Anemone
58
63
  @links.uniq!
59
64
  @links
60
65
  end
61
-
66
+
67
+ # Nokogiri document for the HTML body
68
+ def doc
69
+ return @doc if @doc
70
+ @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
71
+ end
72
+
73
+ # Delete the Nokogiri document and response body to conserve memory
62
74
  def discard_doc!
63
75
  links # force parsing of page links before we trash the document
64
- @doc = nil
65
- end
66
-
67
- #
68
- # Return a new page with the same *response* and *url*, but
69
- # with a 200 response code
70
- #
71
- def alias_clone(url)
72
- p = clone
73
- p.add_alias!(@aka) if !@aka.nil?
74
- p.code = 200
75
- p
76
+ @doc = @body = nil
76
77
  end
77
78
 
78
- #
79
- # Add a redirect-alias String *aka* to the list of the page's aliases
80
- #
81
- # Returns *self*
82
- #
83
- def add_alias!(aka)
84
- @aliases << aka if !@aliases.include?(aka)
85
- self
79
+ def fetched?
80
+ @fetched
86
81
  end
87
-
88
- #
89
- # Returns an Array of all links from this page, and all the
90
- # redirect-aliases of those pages, as String objects.
91
- #
92
- # *page_hash* is a PageHash object with the results of the current crawl.
93
- #
94
- def links_and_their_aliases(page_hash)
95
- links.inject([]) do |results, link|
96
- results.concat([link].concat(page_hash[link].aliases))
97
- end
98
- end
99
-
82
+
100
83
  #
101
84
  # The content-type returned by the HTTP request for this page
102
85
  #
103
86
  def content_type
104
87
  headers['content-type'].first
105
88
  end
106
-
89
+
107
90
  #
108
91
  # Returns +true+ if the page is a HTML document, returns +false+
109
92
  # otherwise.
@@ -111,15 +94,15 @@ module Anemone
111
94
  def html?
112
95
  !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
113
96
  end
114
-
97
+
115
98
  #
116
99
  # Returns +true+ if the page is a HTTP redirect, returns +false+
117
100
  # otherwise.
118
- #
101
+ #
119
102
  def redirect?
120
103
  (300..399).include?(@code)
121
104
  end
122
-
105
+
123
106
  #
124
107
  # Returns +true+ if the page was not found (returned 404 code),
125
108
  # returns +false+ otherwise.
@@ -127,12 +110,14 @@ module Anemone
127
110
  def not_found?
128
111
  404 == @code
129
112
  end
130
-
113
+
131
114
  #
132
115
  # Converts relative URL *link* into an absolute URL based on the
133
116
  # location of the page
134
117
  #
135
118
  def to_absolute(link)
119
+ return nil if link.nil?
120
+
136
121
  # remove anchor
137
122
  link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
138
123
 
@@ -143,7 +128,7 @@ module Anemone
143
128
 
144
129
  return absolute
145
130
  end
146
-
131
+
147
132
  #
148
133
  # Returns +true+ if *uri* is in the same domain as the page, returns
149
134
  # +false+ otherwise
@@ -151,5 +136,14 @@ module Anemone
151
136
  def in_domain?(uri)
152
137
  uri.host == @url.host
153
138
  end
139
+
140
+ def marshal_dump
141
+ [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
142
+ end
143
+
144
+ def marshal_load(ary)
145
+ @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
146
+ end
147
+
154
148
  end
155
149
  end
@@ -1,21 +1,52 @@
1
+ require 'forwardable'
2
+
1
3
  module Anemone
2
- class PageHash < Hash
3
-
4
+ class PageStore
5
+ extend Forwardable
6
+
7
+ def_delegators :@storage, :keys, :values, :size, :each
8
+
9
+ def initialize(storage = {})
10
+ @storage = storage
11
+ end
12
+
4
13
  # We typically index the hash with a URI,
5
14
  # but convert it to a String for easier retrieval
6
15
  def [](index)
7
- super(index.to_s)
16
+ @storage[index.to_s]
8
17
  end
9
-
18
+
10
19
  def []=(index, other)
11
- super(index.to_s, other)
20
+ @storage[index.to_s] = other
21
+ end
22
+
23
+ def delete(key)
24
+ @storage.delete key.to_s
12
25
  end
13
-
26
+
14
27
  def has_key?(key)
15
- super(key.to_s)
28
+ @storage.has_key? key.to_s
29
+ end
30
+
31
+ def each_value
32
+ each { |key, value| yield value }
33
+ end
34
+
35
+ def values
36
+ result = []
37
+ each { |key, value| result << value }
38
+ result
39
+ end
40
+
41
+ def touch_key(key)
42
+ self[key] = Page.new(key)
16
43
  end
17
44
 
18
- # Does this PageHash contain the specified URL?
45
+ def touch_keys(keys)
46
+ @storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
47
+ end
48
+
49
+ # Does this PageStore contain the specified URL?
19
50
  # HTTP and HTTPS versions of a URL are considered to be the same page.
20
51
  def has_page?(url)
21
52
  schemes = %w(http https)
@@ -24,80 +55,67 @@ module Anemone
24
55
  return schemes.any? { |s| u.scheme = s; has_key?(u) }
25
56
  end
26
57
 
27
- has_key?(url)
58
+ has_key? url
28
59
  end
29
-
60
+
30
61
  #
31
62
  # Use a breadth-first search to calculate the single-source
32
- # shortest paths from *root* to all pages in the PageHash
63
+ # shortest paths from *root* to all pages in the PageStore
33
64
  #
34
65
  def shortest_paths!(root)
35
66
  root = URI(root) if root.is_a?(String)
36
67
  raise "Root node not found" if !has_key?(root)
37
-
38
- each_value {|p| p.visited = false if p}
39
-
68
+
40
69
  q = Queue.new
41
-
42
- q.enq(root)
43
- self[root].depth = 0
44
- self[root].visited = true
45
- while(!q.empty?)
46
- url = q.deq
47
-
48
- next if !has_key?(url)
49
-
50
- page = self[url]
51
-
70
+
71
+ q.enq root
72
+ root_page = self[root]
73
+ root_page.depth = 0
74
+ root_page.visited = true
75
+ self[root] = root_page
76
+ while !q.empty?
77
+ page = self[q.deq]
52
78
  page.links.each do |u|
53
- next if !has_key?(u) or self[u].nil?
54
- link = self[u]
55
- aliases = [link].concat(link.aliases.map {|a| self[a] })
56
-
57
- aliases.each do |node|
58
- if node.depth.nil? or page.depth + 1 < node.depth
59
- node.depth = page.depth + 1
79
+ begin
80
+ link = self[u]
81
+ next if link.nil? || !link.fetched? || link.visited
82
+
83
+ q << u unless link.redirect?
84
+ link.visited = true
85
+ link.depth = page.depth + 1
86
+ self[u] = link
87
+
88
+ if link.redirect?
89
+ u = link.redirect_to
90
+ redo
60
91
  end
61
92
  end
62
-
63
- q.enq(self[u].url) if !self[u].visited
64
- self[u].visited = true
65
93
  end
66
94
  end
67
-
95
+
68
96
  self
69
97
  end
70
-
98
+
71
99
  #
72
- # Returns a new PageHash by removing redirect-aliases for each
73
- # non-redirect Page
100
+ # Removes all Pages from storage where redirect? is true
74
101
  #
75
- def uniq
76
- results = PageHash.new
77
- each do |url, page|
78
- #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
79
- page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
80
- if !page.redirect? and !page_added
81
- results[url] = page.clone
82
- results[url].aliases = []
83
- end
84
- end
85
-
86
- results
102
+ def uniq!
103
+ each_value { |page| delete page.url if page.redirect? }
104
+ self
87
105
  end
88
-
106
+
89
107
  #
90
108
  # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
91
109
  # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
92
110
  #
93
111
  def pages_linking_to(urls)
94
112
  unless urls.is_a?(Array)
95
- urls = [urls] unless urls.is_a?(Array)
113
+ urls = [urls]
96
114
  single = true
97
115
  end
98
116
 
99
117
  urls.map! do |url|
100
- if url.is_a?(String)
118
+ unless url.is_a?(URI)
101
119
  URI(url) rescue nil
102
120
  else
103
121
  url
@@ -112,7 +130,7 @@ module Anemone
112
130
  end
113
131
 
114
132
  if single and !links.empty?
115
- return links.first
133
+ return links[urls.first]
116
134
  else
117
135
  return links
118
136
  end
@@ -132,11 +150,11 @@ module Anemone
132
150
  links.each { |url, pages| links[url] = pages.map{|p| p.url} }
133
151
 
134
152
  if single and !links.empty?
135
- return links.first
153
+ return links[urls.first]
136
154
  else
137
155
  return links
138
- end
156
+ end
139
157
  end
140
158
 
141
159
  end
142
- end
160
+ end
@@ -0,0 +1,19 @@
1
module Anemone
  #
  # Factory methods that build the storage objects defined in this namespace.
  #
  module Storage

    #
    # Returns a new in-memory Hash. All arguments, and an optional
    # default-value block, are passed straight through to Hash.new.
    #
    def self.Hash(*args, &block)
      # ::Hash pins the lookup to Ruby's core class, so a Hash constant
      # defined inside Anemone or Anemone::Storage can never shadow it.
      ::Hash.new(*args, &block)
    end

    #
    # Returns a new Anemone::Storage::PStore built from *args*.
    # The implementation file is only required when this method is called.
    #
    def self.PStore(*args)
      require 'anemone/storage/pstore'
      self::PStore.new(*args)
    end

    #
    # Returns a new Anemone::Storage::TokyoCabinet built from *file*.
    # The implementation file is only required when this method is called.
    #
    def self.TokyoCabinet(file)
      require 'anemone/storage/tokyo_cabinet'
      self::TokyoCabinet.new(file)
    end

  end
end
@@ -0,0 +1,48 @@
1
+ require 'pstore'
2
+ require 'forwardable'
3
+
4
module Anemone
  module Storage
    #
    # Hash-like storage that keeps its values on disk through Ruby's PStore.
    #
    # The stored keys are mirrored in an in-memory Hash, so has_key?, keys
    # and size are answered from memory without opening a transaction.
    #
    class PStore
      extend Forwardable

      def_delegators :@keys, :has_key?, :keys, :size

      #
      # Create a store backed by *file*. An existing file at that path is
      # deleted first, so a new instance always starts out empty.
      #
      def initialize(file)
        # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
        File.delete(file) if File.exist?(file)
        @store = ::PStore.new(file)
        @keys = {}
      end

      #
      # Returns the value stored under *key*, or nil if there is none.
      #
      def [](key)
        # Lookups never modify the store, so a read-only transaction is enough.
        @store.transaction(true) { |s| s[key] }
      end

      #
      # Stores *value* under *key* and records the key in the in-memory index.
      #
      def []=(key, value)
        @keys[key] = nil
        @store.transaction { |s| s[key] = value }
      end

      #
      # Removes *key* from the index and from the store.
      # Returns the value that was stored under it, if any.
      #
      def delete(key)
        @keys.delete(key)
        @store.transaction { |s| s.delete key }
      end

      #
      # Yields every recorded key together with its stored value.
      #
      # Each value is read in its own transaction and the block runs after
      # that transaction has finished, so the block may call back into this
      # store (PStore rejects nested transactions).
      #
      def each
        @keys.each_key do |key|
          value = @store.transaction(true) { |s| s[key] }
          yield key, value
        end
      end

      #
      # Writes every key/value pair of *hash* within a single transaction.
      #
      # Returns *self*
      #
      def merge!(hash)
        @store.transaction do |s|
          hash.each { |key, value| s[key] = value; @keys[key] = nil }
        end
        self
      end

    end
  end
end