feed_discover 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/feed_discover.rb +175 -0
- metadata +72 -0
@@ -0,0 +1,175 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
#Copyright (C) 2007 Ben Coffey (me@inelegant.org) <http://inelegant.org/>
|
3
|
+
#
|
4
|
+
#This program is free software; you can redistribute it and/or
|
5
|
+
#modify it under the terms of the GNU General Public License
|
6
|
+
#as published by the Free Software Foundation; either version 2
|
7
|
+
#of the License, or (at your option) any later version.
|
8
|
+
#
|
9
|
+
#This program is distributed in the hope that it will be useful,
|
10
|
+
#but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11
|
+
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12
|
+
#GNU General Public License for more details.
|
13
|
+
#
|
14
|
+
#You should have received a copy of the GNU General Public License
|
15
|
+
#along with this program; if not, write to the Free Software
|
16
|
+
#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
17
|
+
#
|
18
|
+
# FeedDiscover performs feed autodiscovery on the given URL.
|
19
|
+
#
|
20
|
+
# Feed autodiscovery is a process of using standardised heuristics
|
21
|
+
# to determine the Atom/RSS feed(s) associated with a given X?HTML
|
22
|
+
# document. The process is being standardised for Atom feeds via the
|
23
|
+
# IETF: http://tools.ietf.org/html/draft-ietf-atompub-autodiscovery
|
24
|
+
# We use the same approach for RSS feeds.
|
25
|
+
#
|
26
|
+
# When feeds cannot be discovered using the above method, we use a
|
27
|
+
# broader, thus less accurate, method of looking for hyperlinks whose
|
28
|
+
# URLs end with an extension typical of feeds. This feature is experimental.
|
29
|
+
#
|
30
|
+
# Author:: Ben Coffey (mailto:me@inelegant.org)
|
31
|
+
# Copyright:: Copyright (c) 2007 Ben Coffey
|
32
|
+
# License:: GPL
|
33
|
+
|
34
|
+
require 'rubygems'
|
35
|
+
require 'uri'
|
36
|
+
require 'simple_http'
|
37
|
+
|
38
|
+
class Array #:nodoc: all
  #Builds a Hash from this array.
  #
  #* With a +default+, or with a block, every element becomes a key. Its
  #  value is +default+ when that is truthy, otherwise the result of
  #  yielding the element to the block.
  #* With neither, the elements are treated as [key, value] pairs. This
  #  keeps plain <tt>pairs.to_h</tt> calls working even though this method
  #  replaces any Array#to_h the running Ruby already provides.
  #
  #Raises TypeError in the pairs form if an element is not a two-element
  #array.
  def to_h(default = nil)
    if default.nil? && !block_given?
      return inject({}) do |table, pair|
        unless pair.respond_to?(:to_ary) && pair.to_ary.size == 2
          raise TypeError, "wrong element type #{pair.class} (expected array of 2 elements)"
        end
        key, value = pair.to_ary
        table[key] = value
        table
      end
    end

    #The hash is filled in place rather than via Hash[*flat_list], so the
    #array is never splatted into one huge argument list.
    inject({}) do |table, value|
      table[value] = default || yield(value)
      table
    end
  end
end
|
43
|
+
|
44
|
+
#This class performs feed autodiscovery. This is the process of
|
45
|
+
#determining the Atom/RSS URL(s) for a given X?HTML page.
|
46
|
+
#
|
47
|
+
#== Sample Usage
|
48
|
+
#
|
49
|
+
# require 'rubygems'
|
50
|
+
# require 'feed_discover'
|
51
|
+
# fd = FeedDiscover.new('http://bbc.co.uk/')
|
52
|
+
# if fd.feeds?
|
53
|
+
# puts fd.feeds.join("\n")
|
54
|
+
# else
|
55
|
+
# puts "No feeds found"
|
56
|
+
# end
|
57
|
+
#
|
58
|
+
#Or:
|
59
|
+
#
|
60
|
+
# require 'rubygems'
|
61
|
+
# require 'feed_discover'
|
62
|
+
# puts FeedDiscover.new('http://inelegant.org/').feed
|
63
|
+
#
|
64
|
+
class FeedDiscover
  #Valid MIME types for feeds, as a lookup table (MIME type => true).
  #The table is built here directly so that it does not rely on any
  #extension to Array.
  VALID_MIME_TYPE = %w{application/x.atom+xml application/atom+xml application/xml
                       text/xml application/rss+xml application/rdf+xml}.inject({}) do |table, type|
    table[type] = true
    table
  end

  #Valid extensions for feeds
  VALID_EXT = /\.(?:rss|xml|rdf)\z/

  #The class should be initialized with the URL of the page to perform
  #autodiscovery on. The URL should be absolute and use either the HTTP
  #or HTTPS protocol scheme. The URL should be a string or an object whose
  #+to_s+ method returns a string.
  #An exception will be raised if the URL has invalid syntax.
  #Autodiscovery (including the HTTP request) runs immediately.
  def initialize(url)
    @url = URI.parse(url.to_s)
    @feeds = []
    autodiscover
  end

  #Returns an array of absolute feed URLs which were discovered. If no feeds
  #were found, an empty array is returned.
  def feeds
    @feeds || []
  end

  #Returns the absolute URL of the first feed found as a string, or nil
  #if no feeds were found.
  def feed
    @feeds[0]
  end

  #Returns true if feeds were found; false if not
  def feeds?
    !feeds.empty?
  end

  private

  #TODO: Add to URI class
  #The feed URLs we return need to be absolute. If they're already
  #absolute, we just return them as-is. If they're not, we attempt
  #to absolutize them relative to the current base URL.
  #Returns a URI object. Raises URI::InvalidURIError if +str+ cannot be
  #parsed, and RuntimeError if neither +str+ nor the base URL is absolute.
  def abs_uri_wrt_base(str)
    #In case we're passed a URI object, we normalise it to a string.
    #We remove leading/trailing whitespace because the values of
    #'href' attributes sometimes contain such characters
    uri = URI.parse(str.to_s.strip)
    return uri if uri.absolute?
    return @url.merge(uri) if @url.absolute?
    raise "abs_uri_wrt_base: neither URL absolute!"
  end

  #Like abs_uri_wrt_base, but returns nil instead of raising when the
  #value is not a syntactically valid URI. Markup found in the wild often
  #contains malformed 'href' values; one bad link should not abort the
  #whole discovery.
  def resolve_href(href)
    abs_uri_wrt_base(href)
  rescue URI::InvalidURIError
    nil
  end

  #Returns true if +content_type+ names one of the types in VALID_MIME_TYPE.
  #Any parameters (e.g. "; charset=utf-8"), surrounding whitespace and
  #letter case are ignored; nil is treated as "no type".
  def feed_mime_type?(content_type)
    media_type = content_type.to_s.split(';', 2).first.to_s.strip.downcase
    VALID_MIME_TYPE[media_type] ? true : false
  end

  #Fetches @url and fills @feeds. Does nothing if the fetch yields no body.
  def autodiscover
    res = SimpleHttp.new @url
    #TODO: How to handle exceptions?
    @body = res.get
    return unless @body
    #If the URL that we were called with is already a feed (IOW,
    #its MIME type matches one of the keys of VALID_MIME_TYPE),
    #just return it. Autodiscovery is only specified for X?HTML
    #documents.
    if feed_mime_type?(res.response_headers['content-type'])
      @feeds << @url.to_s
      return
    end
    find_links
  end

  #Scans the fetched document for 'link', 'base' and 'a' tags and appends
  #any feed URLs found to @feeds.
  def find_links
    #Loaded here rather than at file level, so the parsing libraries are
    #only required when a document actually has to be scanned.
    require 'rubyful_soup'
    require 'htmlentities'
    coder = HTMLEntities.new
    soup = BeautifulSoup.new(@body)
    soup.find_all(['a', 'base', 'link']).each do |tag|
      #link tags must have 'rel', 'type', and 'href' attributes to be considered
      if tag.name == 'link' && tag['rel'] && tag['type'] && tag['href']
        #The 'type' attribute must contain a valid MIME type, as defined in
        #VALID_MIME_TYPE. We ignore its case, surrounding whitespace and any
        #parameters. X?HTML entities are decoded before performing the lookup.
        next unless feed_mime_type?(coder.decode(tag['type']))
        #The 'rel' attribute contains a whitespace-separated list of values,
        #which we must consider regardless of case. Any X?HTML entities are
        #decoded. We're interested in the presence of one of two specific
        #values.
        rel = coder.decode(tag['rel']).downcase.split(/\s+/)
        next unless rel.include?('alternate') || rel.include?('service.feed')
        #This link tag is a valid source for feed links, so we resolve the
        #URL (the value of its 'href' attribute) relative to the current
        #base URL, then add the result to the @feeds array. Links whose
        #'href' is not a valid URI are skipped.
        href = resolve_href(tag['href'])
        @feeds << href.to_s if href
      elsif tag.name == 'base' && tag['href']
        #By default, the base URL is equal to the URL we were passed in the
        #constructor; the URL of the HTML page. However, if the document
        #contains a base tag with an 'href' attribute, we use that instead.
        #All discovered feed URLs are resolved relative to the base URL.
        #A base tag whose 'href' is not a valid URI is ignored.
        #TODO: Investigate repercussions of multiple base URLs -- which
        #one should we use?
        base = resolve_href(tag['href'])
        @url = base if base
      elsif @feeds.empty? && tag.name == 'a' && tag['href']
        #If we haven't found any feeds so far, we switch to another
        #heuristic: look for hyperlinks ('a' tags) whose 'href' attributes
        #contain a URL ending with an extension typical for a feed (VALID_EXT).
        #NOTE: because @feeds is re-checked for every tag, this branch adds at
        #most one hyperlink; once it has, later hyperlinks are not considered.
        #TODO: Limit the number of URLs found by this method such that blogs
        #with blogroll listings don't generate false positives.
        href = resolve_href(tag['href'])
        #The path may be nil for opaque URIs, hence the to_s.
        @feeds << href.to_s if href && VALID_EXT.match(href.path.to_s)
      end
    end
  end

end
|
175
|
+
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: 1
|
4
|
+
name: feed_discover
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2007-05-18 00:00:00 +01:00
|
8
|
+
summary: A library for performing feed autodiscovery on X?HTML pages
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: me@inelegant.org
|
12
|
+
homepage: http://inelegant.org/feed-discover/
|
13
|
+
rubyforge_project:
|
14
|
+
description: This library uses heuristics to determine the Atom/RSS feed associated with a given web page.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: false
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Ben Coffey
|
31
|
+
files:
|
32
|
+
- lib/feed_discover.rb
|
33
|
+
test_files: []
|
34
|
+
|
35
|
+
rdoc_options: []
|
36
|
+
|
37
|
+
extra_rdoc_files: []
|
38
|
+
|
39
|
+
executables: []
|
40
|
+
|
41
|
+
extensions: []
|
42
|
+
|
43
|
+
requirements: []
|
44
|
+
|
45
|
+
dependencies:
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rubyful_soup
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.0.4
|
54
|
+
version:
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: simplehttp
|
57
|
+
version_requirement:
|
58
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 0.1.1
|
63
|
+
version:
|
64
|
+
- !ruby/object:Gem::Dependency
|
65
|
+
name: htmlentities
|
66
|
+
version_requirement:
|
67
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: 4.0.0
|
72
|
+
version:
|