feed_discover 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/feed_discover.rb +175 -0
  2. metadata +72 -0
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/ruby
2
+ #Copyright (C) 2007 Ben Coffey (me@inelegant.org) <http://inelegant.org/>
3
+ #
4
+ #This program is free software; you can redistribute it and/or
5
+ #modify it under the terms of the GNU General Public License
6
+ #as published by the Free Software Foundation; either version 2
7
+ #of the License, or (at your option) any later version.
8
+ #
9
+ #This program is distributed in the hope that it will be useful,
10
+ #but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ #GNU General Public License for more details.
13
+ #
14
+ #You should have received a copy of the GNU General Public License
15
+ #along with this program; if not, write to the Free Software
16
+ #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17
+ #
18
+ # FeedDiscover performs feed autodiscovery on the given URL.
19
+ #
20
+ # Feed autodiscovery is a process of using standardised heuristics
21
+ # to determine the Atom/RSS feed(s) associated with a given X?HTML
22
+ # document. The process is being standardised for Atom feeds via the
23
+ # IETF: http://tools.ietf.org/html/draft-ietf-atompub-autodiscovery
24
+ # We use the same approach for RSS feeds.
25
+ #
26
+ # When feeds cannot be discovered using the above method, we use a
27
+ # broader, thus less accurate, method of looking for hyperlinks whose
28
+ # URLs end with an extension typical of feeds. This feature is experimental.
29
+ #
30
+ # Author:: Ben Coffey (mailto:me@inelegant.org)
31
+ # Copyright:: Copyright (c) 2007 Ben Coffey
32
+ # License:: GPL
33
+
34
+ require 'rubygems'
35
+ require 'uri'
36
+ require 'simple_http'
37
+
38
class Array #:nodoc: all
  # Ruby >= 2.0 ships its own Array#to_h (array of [key, value] pairs => Hash).
  # Keep a handle on it so that redefining +to_h+ below does not break code
  # that relies on the built-in behaviour. The guard on the alias name keeps
  # a second load of this file from aliasing our own override.
  if method_defined?(:to_h) && !method_defined?(:to_h_without_default)
    alias_method :to_h_without_default, :to_h
  end

  # Builds a lookup table whose keys are the elements of this array.
  #
  # * <tt>to_h(default)</tt> maps every element to +default+.
  # * <tt>to_h { |value| ... }</tt> maps every element to the block's result
  #   (a falsy +default+ also falls through to the block).
  # * <tt>to_h</tt> with neither argument nor block delegates to the built-in
  #   Array#to_h when the running Ruby provides one.
  #
  # Note: the block form keeps this library's historical semantics (block
  # returns the *value*), which differs from the built-in block form of
  # Ruby >= 2.6 (block returns a [key, value] pair).
  #
  # Raises LocalJumpError if a value has to be computed but no block was given.
  def to_h(default = nil)
    if default.nil? && !block_given? && respond_to?(:to_h_without_default)
      return to_h_without_default
    end

    # Fill the hash directly instead of splatting a flattened array into
    # Hash[], which passes one argument per element.
    each_with_object({}) do |value, table|
      table[value] = default || yield(value)
    end
  end
end
43
+
44
+ #This class performs feed autodiscovery. This is the process of
45
+ #determining the Atom/RSS URL(s) for a given X?HTML page.
46
+ #
47
+ #== Sample Usage
48
+ #
49
+ # require 'rubygems'
50
+ # require 'feed_discover'
51
+ # fd = FeedDiscover.new('http://bbc.co.uk/')
52
+ # if fd.feeds?
53
+ # puts fd.feeds.join("\n")
54
+ # else
55
+ # puts "No feeds found"
56
+ # end
57
+ #
58
+ #Or:
59
+ #
60
+ # require 'rubygems'
61
+ # require 'feed_discover'
62
+ # puts FeedDiscover.new('http://inelegant.org/').feed
63
+ #
64
class FeedDiscover
  #Valid MIME types for feeds, stored as a lookup table (MIME type => true).
  #Built with a plain fold so it does not depend on any core-class extension.
  VALID_MIME_TYPE = %w[
    application/x.atom+xml application/atom+xml application/xml
    text/xml application/rss+xml application/rdf+xml
  ].inject({}) { |table, type| table.update(type => true) }

  #Valid extensions for feeds
  VALID_EXT = /\.(?:rss|xml|rdf)$/

  #The class should be initialized with the URL of the page to perform
  #autodiscovery on. The URL should be absolute and use the HTTP (or,
  #presumably, HTTPS -- this depends on SimpleHttp and is not verified here)
  #protocol scheme. The URL should be a string or an object whose
  #+to_s+ method returns a string.
  #An exception (URI::InvalidURIError) will be raised if the URL has
  #invalid syntax. Autodiscovery is performed immediately, so constructing
  #an instance fetches the page.
  def initialize(url)
    @url = URI.parse(url.to_s)
    @feeds = []
    autodiscover
  end

  #Returns an array of absolute feed URLs which were discovered. If no feeds
  #were found, an empty array is returned.
  def feeds
    @feeds || []
  end

  #Returns the absolute URL of the first feed found as a string, or nil if
  #no feeds were found.
  def feed
    feeds.first
  end

  #Returns true if feeds were found; false if not
  def feeds?
    !feeds.empty?
  end

  private

  #TODO: Add to URI class
  #The feed URLs we return need to be absolute. If they're already
  #absolute, we just return them as-is. If they're not, we attempt
  #to absolutize them relative to the current base URL.
  #
  #Returns a URI object. Raises URI::InvalidURIError if +str+ cannot be
  #parsed, and RuntimeError if neither +str+ nor the base URL is absolute.
  def abs_uri_wrt_base(str)
    #In case we're passed a URI object, we normalise it to a string.
    #We remove leading/trailing whitespace because the values of
    #'href' attributes sometimes contain such characters
    str = str.to_s.strip
    parsed = URI.parse(str)
    return parsed if parsed.absolute?
    return @url.merge(str) if @url.absolute?
    raise "abs_uri_wrt_base: neither URL absolute!"
  end

  #Like abs_uri_wrt_base, but returns nil instead of raising when the
  #href is not a syntactically valid URI, so that one malformed attribute
  #in the page does not abort the whole discovery run.
  def resolve_href(href)
    abs_uri_wrt_base(href)
  rescue URI::InvalidURIError
    nil
  end

  #Reduces a Content-Type header value to its bare, lower-cased media type,
  #e.g. "Application/RSS+XML; charset=utf-8" => "application/rss+xml".
  #NOTE(review): the exact shape SimpleHttp uses for header values is not
  #visible here; a String, an Array of Strings, and nil are all accepted.
  def media_type(header)
    Array(header).first.to_s.split(';').first.to_s.strip.downcase
  end

  #Fetches the page and populates @feeds.
  def autodiscover
    res = SimpleHttp.new @url
    #TODO: How to handle exceptions?
    @body = res.get
    return unless @body
    #If the URL that we were called with is already a feed (IOW,
    #its MIME type matches one of the keys of VALID_MIME_TYPE),
    #just return it. Autodiscovery is only specified for X?HTML
    #documents. Parameters such as "; charset=utf-8" and the case of
    #the header value are ignored for this comparison.
    headers = res.response_headers || {}
    if VALID_MIME_TYPE[media_type(headers['content-type'])]
      @feeds << @url.to_s
      return
    end
    find_links
  end

  #Parses the fetched document and inspects every 'a', 'base' and 'link'
  #tag, in the order returned by the parser, dispatching each to the
  #matching handler below.
  def find_links
    #Loaded lazily: the parser is only needed for X?HTML documents.
    require 'rubyful_soup'
    require 'htmlentities'
    coder = HTMLEntities.new
    soup = BeautifulSoup.new(@body)
    soup.find_all(['a', 'base', 'link']).each do |tag|
      case tag.name
      when 'link' then collect_from_link(tag, coder)
      when 'base' then rebase_from(tag)
      when 'a'    then collect_from_anchor(tag)
      end
    end
  end

  #Handles a 'link' tag. link tags must have 'rel', 'type', and 'href'
  #attributes to be considered.
  def collect_from_link(tag, coder)
    return unless tag['rel'] && tag['type'] && tag['href']
    #The 'type' attribute must contain a valid MIME type, as defined in
    #VALID_MIME_TYPE. We ignore its case and any surrounding whitespace.
    #X?HTML entities are decoded before performing the lookup.
    return unless VALID_MIME_TYPE[coder.decode(tag['type']).downcase.strip]
    #The 'rel' attribute contains a whitespace-separated list of values,
    #which we must consider regardless of case. Any X?HTML entities are
    #decoded. We're interested in the presence of one of two specific
    #values.
    rel = coder.decode(tag['rel']).downcase.split(/\s+/)
    return unless rel.include?('alternate') || rel.include?('service.feed')
    #Resolve the URL (the value of the 'href' attribute) relative to
    #the current base URL, then add the result to the @feeds array.
    #An unparsable href is skipped.
    uri = resolve_href(tag['href'])
    @feeds << uri.to_s if uri
  end

  #Handles a 'base' tag.
  #By default, the base URL is equal to the URL we were passed in the
  #constructor; the URL of the HTML page. However, if the document
  #contains a base tag with an 'href' attribute, we use that instead.
  #All subsequently discovered feed URLs are resolved relative to it.
  #An unparsable href leaves the current base URL untouched.
  #TODO: Investigate repercussions of multiple base URLs -- which
  #one should we use?
  def rebase_from(tag)
    return unless tag['href']
    uri = resolve_href(tag['href'])
    @url = uri if uri
  end

  #Handles an 'a' tag.
  #If we haven't found any feeds so far, we switch to another
  #heuristic: look for hyperlinks ('a' tags) whose 'href' attributes
  #contain a URL ending with an extension typical for a feed (VALID_EXT).
  #TODO: Limit the number of URLs found by this method such that blogs
  #with blogroll listings don't generate false positives.
  def collect_from_anchor(tag)
    return unless @feeds.empty? && tag['href']
    uri = resolve_href(tag['href'])
    #Opaque URIs (e.g. mailto:) have no path; treat that as "no match".
    @feeds << uri.to_s if uri && VALID_EXT.match(uri.path.to_s)
  end
end
175
+
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: feed_discover
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2007-05-18 00:00:00 +01:00
8
+ summary: A library for performing feed autodiscovery on X?HTML pages
9
+ require_paths:
10
+ - lib
11
+ email: me@inelegant.org
12
+ homepage: http://inelegant.org/feed-discover/
13
+ rubyforge_project:
14
+ description: This library uses heuristics to determine the Atom/RSS feed associated with a given web page.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Ben Coffey
31
+ files:
32
+ - lib/feed_discover.rb
33
+ test_files: []
34
+
35
+ rdoc_options: []
36
+
37
+ extra_rdoc_files: []
38
+
39
+ executables: []
40
+
41
+ extensions: []
42
+
43
+ requirements: []
44
+
45
+ dependencies:
46
+ - !ruby/object:Gem::Dependency
47
+ name: rubyful_soup
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Version::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.0.4
54
+ version:
55
+ - !ruby/object:Gem::Dependency
56
+ name: simplehttp
57
+ version_requirement:
58
+ version_requirements: !ruby/object:Gem::Version::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 0.1.1
63
+ version:
64
+ - !ruby/object:Gem::Dependency
65
+ name: htmlentities
66
+ version_requirement:
67
+ version_requirements: !ruby/object:Gem::Version::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: 4.0.0
72
+ version: