sutch-anemone 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
1
+ require 'nokogiri'
2
+ require 'ostruct'
3
+ require 'webrick/cookie'
4
+
5
+ module Anemone
6
+ class Page
7
+
8
+ # The URL of the page
9
+ attr_reader :url
10
+ # The raw HTTP response body of the page
11
+ attr_reader :body
12
+ # Headers of the HTTP response
13
+ attr_reader :headers
14
+ # URL of the page this one redirected to, if any
15
+ attr_reader :redirect_to
16
+ # Exception object, if one was raised during HTTP#fetch_page
17
+ attr_reader :error
18
+
19
+ # OpenStruct for user-stored data
20
+ attr_accessor :data
21
+ # Integer response code of the page
22
+ attr_accessor :code
23
+ # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
24
+ attr_accessor :visited
25
+ # Depth of this page from the root of the crawl. This is not necessarily the
26
+ # shortest path; use PageStore#shortest_paths! to find that value.
27
+ attr_accessor :depth
28
+ # URL of the page that brought us to this page
29
+ attr_accessor :referer
30
+ # Response time of the request for this page in milliseconds
31
+ attr_accessor :response_time
32
+
33
#
# Build a page for +url+. +params+ may carry :code, :headers, :aka,
# :referer, :depth, :redirect_to, :response_time, :body and :error.
#
def initialize(url, params = {})
  @url  = url
  @data = OpenStruct.new

  status   = params[:code]
  @code    = status
  @headers = params[:headers] || {}
  # Guarantee a content-type entry so #content_type can always call #first.
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth   = params[:depth] || 0
  # Resolved while @body is still unset, as in the statement order below.
  @redirect_to   = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body  = params[:body]
  @error = params[:error]

  # A page counts as fetched as soon as a response code was supplied.
  @fetched = !status.nil?
end
53
+
54
#
# Distinct URLs taken from the page's <a href> attributes, limited to
# the page's own host. Computed on first use and cached in @links.
#
def links
  if @links.nil?
    @links = []
    document = doc
    if document
      document.search("//a[@href]").each do |anchor|
        href = anchor['href']
        next if href.nil? || href.empty?

        begin
          target = to_absolute(href)
        rescue StandardError
          # An href that cannot be resolved is skipped.
          next
        end
        @links << target if in_domain?(target)
      end
      @links.uniq!
    end
  end
  @links
end
71
+
72
#
# Nokogiri document parsed from the body, memoized in @doc.
# Returns nil when there is no body, the page is not HTML, or
# parsing raises.
#
def doc
  return @doc if @doc

  begin
    @doc = Nokogiri::HTML(@body) if @body && html?
  rescue StandardError
    nil
  end
end
79
+
80
#
# Drop the parsed document and the raw body to save memory.
# The links are extracted first so they survive the cleanup.
#
def discard_doc!
  links
  @body = nil
  @doc = nil
end
87
+
88
#
# Was the page fetched? Returns the @fetched flag, which #initialize
# sets to +true+ whenever a response code was supplied.
#
def fetched?
  @fetched
end
95
+
96
#
# Array of cookies received with this page as WEBrick::Cookie objects.
#
# Headers in this class are read with lowercase keys and Array values
# (see 'content-type'), so both 'Set-Cookie' and 'set-cookie' are
# consulted and every header value is parsed. Returns [] when there is
# no cookie header or parsing fails.
#
def cookies
  raw = @headers['Set-Cookie'] || @headers['set-cookie']
  Array(raw).flat_map { |value| WEBrick::Cookie.parse_set_cookies(value) }
rescue StandardError
  # Best effort: a malformed header must not break the crawl.
  []
end
102
+
103
#
# First value of the response's content-type header.
#
def content_type
  values = headers['content-type']
  values.first
end
109
+
110
#
# Returns +true+ if the content type is text/html or
# application/xhtml+xml (optionally followed by parameters such as a
# charset), +false+ otherwise.
#
def html?
  # The "+" must be escaped: unescaped it is a quantifier, so the real
  # XHTML MIME type would never match.
  !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
end
117
+
118
#
# Returns +true+ when the response code lies in 300..307,
# +false+ otherwise.
#
def redirect?
  redirect_codes = 300..307
  redirect_codes.include?(@code)
end
125
+
126
#
# Returns +true+ when the response code is 404, +false+ otherwise.
#
def not_found?
  @code == 404
end
133
+
134
#
# Base URI declared by the document's <head><base href> element, or
# nil when there is no document, no usable href, or the href is empty.
# http://www.w3.org/TR/html4/struct/links.html#edef-BASE
#
def base
  unless @base
    document = doc
    @base =
      if document
        href = document.search('//head/base/@href')
        begin
          URI(href.to_s) unless href.nil?
        rescue StandardError
          nil
        end
      end
  end

  # A missing <base> yields an empty URI; report that as "no base".
  return nil if @base && @base.to_s.empty?
  @base
end
147
+
148
+
149
#
# Converts relative URL *link* into an absolute URI, resolved against
# the document's base URI when one is declared and against the page
# URL otherwise. Returns nil when *link* is nil.
#
# URI.encode / URI.decode were removed in Ruby 3.0, so the equivalent
# escape / unescape methods of URI::DEFAULT_PARSER are used instead.
#
def to_absolute(link)
  return nil if link.nil?

  # Remove a trailing simple anchor ("#name"), then normalise escapes.
  parser = URI::DEFAULT_PARSER
  without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')
  link = parser.escape(parser.unescape(without_anchor))

  relative = URI(link)
  absolute = (base || @url).merge(relative)

  # "http://host" and "http://host/" denote the same page.
  absolute.path = '/' if absolute.path.empty?

  absolute
end
166
+
167
#
# Returns +true+ when *uri* has the same host as this page,
# +false+ otherwise.
#
def in_domain?(uri)
  page_host = @url.host
  uri.host == page_host
end
174
+
175
# State serialized by Marshal, in the order #marshal_load expects.
def marshal_dump
  [
    @url, @headers, @data, @body, @links, @code,
    @visited, @depth, @referer, @redirect_to, @response_time, @fetched
  ]
end
178
+
179
# Restore the state from the Array produced by #marshal_dump.
def marshal_load(ary)
  @url, @headers, @data, @body, @links, @code,
    @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
end
182
+
183
# Flat, String-keyed Hash representation of the page. Headers and
# user data are Marshal-dumped; URIs are stored as Strings.
def to_hash
  serialized = {}
  serialized['url']           = @url.to_s
  serialized['headers']       = Marshal.dump(@headers)
  serialized['data']          = Marshal.dump(@data)
  serialized['body']          = @body
  serialized['links']         = links.map { |link| link.to_s }
  serialized['code']          = @code
  serialized['visited']       = @visited
  serialized['depth']         = @depth
  serialized['referer']       = @referer.to_s
  serialized['redirect_to']   = @redirect_to.to_s
  serialized['response_time'] = @response_time
  serialized['fetched']       = @fetched
  serialized
end
197
+
198
# Rebuild a page from the Hash layout written by #to_hash.
#
# NOTE(review): 'headers' and 'data' go through Marshal.load, which is
# unsafe on untrusted input -- confirm the backing storage is trusted.
def self.from_hash(hash)
  page = new(URI(hash['url']))

  headers = Marshal.load(hash['headers'])
  data    = Marshal.load(hash['data'])
  links   = hash['links'].map { |link| URI(link) }

  # An absent or empty redirect target means "no redirect".
  redirect = hash['redirect_to']
  redirect_to = (redirect && !redirect.empty?) ? URI(redirect) : nil

  [
    ['@headers', headers],
    ['@data', data],
    ['@body', hash['body']],
    ['@links', links],
    ['@code', hash['code'].to_i],
    ['@visited', hash['visited']],
    ['@depth', hash['depth'].to_i],
    ['@referer', hash['referer']],
    ['@redirect_to', redirect_to],
    ['@response_time', hash['response_time'].to_i],
    ['@fetched', hash['fetched']]
  ].each do |name, value|
    page.instance_variable_set(name, value)
  end

  page
end
216
+ end
217
+ end
@@ -0,0 +1,161 @@
1
+ require 'forwardable'
2
+
3
+ module Anemone
4
+ class PageStore
5
+ extend Forwardable
6
+
7
+ def_delegators :@storage, :keys, :values, :size, :each
8
+
9
# Wrap +storage+ (a Hash-like object, {} by default) and keep +opts+,
# from which #touch_key and #touch_keys read :page_class.
def initialize(storage = {}, opts)
  @storage = storage
  @opts = opts
end
13
+
14
# Look up a page. Keys are usually URIs; they are turned into Strings
# so that a URI and its String form address the same entry.
def [](index)
  key = index.to_s
  @storage[key]
end
19
+
20
# Store +other+ under the String form of +index+.
def []=(index, other)
  key = index.to_s
  @storage[key] = other
end
23
+
24
# Remove the entry stored under the String form of +key+.
def delete(key)
  string_key = key.to_s
  @storage.delete(string_key)
end
27
+
28
# Is there an entry under the String form of +key+?
def has_key?(key)
  string_key = key.to_s
  @storage.has_key?(string_key)
end
31
+
32
# Yield every stored page, ignoring its key.
def each_value
  each { |_key, page| yield page }
end
35
+
36
# Array of all stored pages, gathered through #each.
def values
  collected = []
  each { |_key, page| collected.push(page) }
  collected
end
41
+
42
# Store a fresh page (an instance of @opts[:page_class]) under +key+.
def touch_key(key)
  page = @opts[:page_class].new(key)
  self[key] = page
end
45
+
46
# Store a fresh page (an instance of @opts[:page_class]) for every key
# in +keys+, merging them into the storage in one step.
def touch_keys(keys)
  fresh = keys.each_with_object({}) do |key, pages|
    pages[key.to_s] = @opts[:page_class].new(key)
  end
  @storage.merge!(fresh)
end
49
+
50
# Does this PageStore contain the specified URL?
# The http and https variants of a URL count as the same page; URLs
# with any other scheme are looked up as given.
def has_page?(url)
  schemes = %w(http https)
  return has_key?(url) unless schemes.include?(url.scheme)

  # Probe a copy so the caller's URI keeps its scheme.
  candidate = url.dup
  schemes.any? do |scheme|
    candidate.scheme = scheme
    has_key?(candidate)
  end
end
61
+
62
#
# Breadth-first search from *root* (a URI or String) that marks every
# reachable fetched page as visited and records its depth.
# A redirect page is not expanded itself; its target is followed at
# once and gets the same depth. Raises a RuntimeError when *root* is
# not in the store. Returns self.
#
def shortest_paths!(root)
  root = URI(root) if root.is_a?(String)
  raise "Root node not found" unless has_key?(root)

  pending = Queue.new
  pending.push(root)

  start = self[root]
  start.depth = 0
  start.visited = true
  # Written back explicitly rather than relying on in-place mutation.
  self[root] = start

  until pending.empty?
    page = self[pending.pop]
    page.links.each do |url|
      target = url
      # Walk the redirect chain that starts at this link.
      loop do
        link = self[target]
        break if link.nil? || !link.fetched? || link.visited

        pending.push(target) unless link.redirect?
        link.visited = true
        link.depth = page.depth + 1
        self[target] = link

        break unless link.redirect?
        target = link.redirect_to
      end
    end
  end

  self
end
99
+
100
#
# Delete every page for which redirect? is true. Returns self.
#
def uniq!
  each_value do |page|
    next unless page.redirect?

    delete(page.url)
  end
  self
end
107
+
108
#
# If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
# If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
#
# Strings are parsed into URIs; values that cannot be parsed are
# dropped. An Array argument is converted in place, so afterwards it
# holds exactly the URI keys of the returned Hash.
#
def pages_linking_to(urls)
  single = !urls.is_a?(Array)
  urls = [urls] if single

  urls.map! do |url|
    if url.is_a?(URI)
      url
    else
      begin
        URI(url)
      rescue StandardError
        nil
      end
    end
  end
  # compact! (not compact) so unparseable URLs do not survive as nil keys.
  urls.compact!

  links = {}
  urls.each { |url| links[url] = [] }
  values.each do |page|
    urls.each { |url| links[url] << page if page.links.include?(url) }
  end

  # A single URL always yields an Array, even when it could not be parsed.
  single ? links.fetch(urls.first, []) : links
end
139
+
140
#
# If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
# If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
#
# The lookup is done by #pages_linking_to; each Page in its result is
# replaced by that Page's url.
#
def urls_linking_to(urls)
  linking = pages_linking_to(urls)

  # Single URL: a plain list of pages comes back.
  return linking.map { |page| page.url } unless linking.is_a?(Hash)

  linking.each_with_object({}) do |(url, pages), result|
    result[url] = pages.map { |page| page.url }
  end
end
159
+
160
+ end
161
+ end
@@ -0,0 +1,42 @@
1
+ require 'content_urls'
2
+
3
+ # Alternative to Anemone::Page which handles HTML pages and other content types.
4
+
5
+ module Anemone
6
+ class Resource < Anemone::Page
7
+
8
#
# Distinct URLs reported by ContentUrls for this resource's body and
# content type, limited to the resource's own host. Cached in @links.
#
def links
  if @links.nil?
    @links = []
    ContentUrls.urls(body, content_type).each do |candidate|
      next if candidate.nil? || candidate.empty?

      begin
        resolved = to_absolute(candidate)
      rescue StandardError
        # A URL that cannot be resolved is skipped.
        next
      end
      @links << resolved if in_domain?(resolved)
    end
    @links.uniq!
  end
  @links
end
22
+
23
#
# Base URI reported by ContentUrls for this resource, as a URI, or nil
# when there is none, it is empty, or it cannot be parsed.
#
# Anemone::Page#to_absolute expects an instance of URI or nil, so an
# unparseable value must never be stored as a raw String.
#
# The lookup runs once; later calls return the cached result.
#
def base
  return @base unless @body_parsed.nil?
  @body_parsed = true

  raw = ContentUrls.base_url(body, content_type)
  return @base if raw.nil?

  parsed =
    begin
      URI(raw)
    rescue StandardError
      nil
    end
  @base = parsed unless parsed.to_s.empty?

  @base
end
40
+
41
+ end
42
+ end
@@ -0,0 +1,44 @@
1
+ module Anemone
2
+ module Storage
3
+
4
# In-memory storage: a plain Hash built from +args+, given a no-op
# #close for compatibility with Storage::Base.
def self.Hash(*args)
  store = Hash.new(*args)
  def store.close; end
  store
end
10
+
11
# PStore-backed storage; the adapter is loaded on first use and
# receives +args+ unchanged.
def self.PStore(*args)
  require 'anemone/storage/pstore'
  adapter = self::PStore
  adapter.new(*args)
end
15
+
16
# Tokyo Cabinet-backed storage in +file+ ('anemone.tch' by default);
# the adapter is loaded on first use.
def self.TokyoCabinet(file = 'anemone.tch')
  require 'anemone/storage/tokyo_cabinet'
  adapter = self::TokyoCabinet
  adapter.new(file)
end
20
+
21
# Kyoto Cabinet-backed storage in +file+ ('anemone.kch' by default);
# the adapter is loaded on first use.
def self.KyotoCabinet(file = 'anemone.kch')
  require 'anemone/storage/kyoto_cabinet'
  adapter = self::KyotoCabinet
  adapter.new(file)
end
25
+
26
# MongoDB-backed storage. Without +mongo_db+ a connection is opened
# and its 'anemone' database is used. Raises a RuntimeError unless the
# database is a Mongo::DB. Pages go into +collection_name+.
def self.MongoDB(mongo_db = nil, collection_name = 'pages')
  require 'anemone/storage/mongodb'
  mongo_db = Mongo::Connection.new.db('anemone') unless mongo_db
  unless mongo_db.is_a?(Mongo::DB)
    raise "First argument must be an instance of Mongo::DB"
  end
  adapter = self::MongoDB
  adapter.new(mongo_db, collection_name)
end
32
+
33
# Redis-backed storage; +opts+ is handed to the adapter unchanged.
# The adapter is loaded on first use.
def self.Redis(opts = {})
  require 'anemone/storage/redis'
  adapter = self::Redis
  adapter.new(opts)
end
37
+
38
# SQLite3-backed storage in +file+ ('anemone.db' by default);
# the adapter is loaded on first use.
def self.SQLite3(file = 'anemone.db')
  require 'anemone/storage/sqlite3'
  adapter = self::SQLite3
  adapter.new(file)
end
42
+
43
+ end
44
+ end