UnderpantsGnome-rfeedparser 0.9.960
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +68 -0
- data/README +50 -0
- data/RUBY-TESTING +66 -0
- data/lib/rfeedparser.rb +551 -0
- data/lib/rfeedparser/aliases.rb +432 -0
- data/lib/rfeedparser/better_attributelist.rb +41 -0
- data/lib/rfeedparser/better_sgmlparser.rb +264 -0
- data/lib/rfeedparser/encoding_helpers.rb +260 -0
- data/lib/rfeedparser/feedparserdict.rb +106 -0
- data/lib/rfeedparser/loose_feed_parser.rb +75 -0
- data/lib/rfeedparser/markup_helpers.rb +71 -0
- data/lib/rfeedparser/monkey_patches.rb +10 -0
- data/lib/rfeedparser/nokogiri_parser.rb +80 -0
- data/lib/rfeedparser/parser_mixin.rb +1275 -0
- data/lib/rfeedparser/scrub.rb +212 -0
- data/lib/rfeedparser/time_helpers.rb +408 -0
- data/lib/rfeedparser/utilities.rb +23 -0
- metadata +151 -0
data/LICENSE
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
I include this license in good faith effort, and it should be considered the license for the code herein.
|
|
2
|
+
- Jeff Hodges < jeff at somethingsimilar.com >
|
|
3
|
+
--
|
|
4
|
+
Universal Feed Parser (feedparser.py), its testing harness (feedparsertest.py),
|
|
5
|
+
and its unit tests (everything in the tests/ directory) are released under the
|
|
6
|
+
following license:
|
|
7
|
+
|
|
8
|
+
----- begin license block -----
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2002-2005, Mark Pilgrim
|
|
11
|
+
All rights reserved.
|
|
12
|
+
|
|
13
|
+
Redistribution and use in source and binary forms, with or without modification,
|
|
14
|
+
are permitted provided that the following conditions are met:
|
|
15
|
+
|
|
16
|
+
* Redistributions of source code must retain the above copyright notice,
|
|
17
|
+
this list of conditions and the following disclaimer.
|
|
18
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
|
19
|
+
this list of conditions and the following disclaimer in the documentation
|
|
20
|
+
and/or other materials provided with the distribution.
|
|
21
|
+
|
|
22
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
|
23
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
24
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
25
|
+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
26
|
+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
27
|
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
28
|
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
29
|
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
30
|
+
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
31
|
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
32
|
+
POSSIBILITY OF SUCH DAMAGE.
|
|
33
|
+
|
|
34
|
+
----- end license block -----
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
Universal Feed Parser documentation (everything in the docs/ directory) is
|
|
41
|
+
released under the following license:
|
|
42
|
+
|
|
43
|
+
----- begin license block -----
|
|
44
|
+
|
|
45
|
+
Copyright 2004-2005 Mark Pilgrim. All rights reserved.
|
|
46
|
+
|
|
47
|
+
Redistribution and use in source (XML DocBook) and "compiled" forms (SGML,
|
|
48
|
+
HTML, PDF, PostScript, RTF and so forth) with or without modification, are
|
|
49
|
+
permitted provided that the following conditions are met:
|
|
50
|
+
|
|
51
|
+
* Redistributions of source code (XML DocBook) must retain the above copyright
|
|
52
|
+
notice, this list of conditions and the following disclaimer.
|
|
53
|
+
* Redistributions in compiled form (transformed to other DTDs, converted to
|
|
54
|
+
PDF, PostScript, RTF and other formats) must reproduce the above copyright
|
|
55
|
+
notice, this list of conditions and the following disclaimer in the
|
|
56
|
+
documentation and/or other materials provided with the distribution.
|
|
57
|
+
|
|
58
|
+
THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
|
59
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
60
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
61
|
+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
62
|
+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
63
|
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
64
|
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
65
|
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
66
|
+
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
67
|
+
ARISING IN ANY WAY OUT OF THE USE OF THIS DOCUMENTATION, EVEN IF ADVISED OF THE
|
|
68
|
+
POSSIBILITY OF SUCH DAMAGE.
|
data/README
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
= Universal Feed Parser
|
|
2
|
+
http://rfeedparser.rubyforge.org
|
|
3
|
+
Original Python code Copyright (c) 2002-5 by Mark Pilgrim
|
|
4
|
+
Ruby port by Jeff Hodges.
|
|
5
|
+
|
|
6
|
+
== Description
|
|
7
|
+
|
|
8
|
+
Parse nearly any RSS or Atom feed in Ruby. 3000 unit tests. Open source.
|
|
9
|
+
|
|
10
|
+
== Installation
|
|
11
|
+
|
|
12
|
+
For Debian-based systems:
|
|
13
|
+
|
|
14
|
+
$ sudo apt-get install libxml-ruby1.8 # or libxml-parser-ruby1.8
|
|
15
|
+
|
|
16
|
+
TODO: dependency installation instructions for other platforms.
|
|
17
|
+
|
|
18
|
+
And then:
|
|
19
|
+
|
|
20
|
+
$ sudo gem install rfeedparser
|
|
21
|
+
|
|
22
|
+
Or for the latest development version:
|
|
23
|
+
|
|
24
|
+
$ git clone git://github.com/technomancy/rfeedparser.git
|
|
25
|
+
|
|
26
|
+
Dependencies on other gems are handled by rubygems, but rfeedparser
|
|
27
|
+
also relies on having bindings to a native XML parsing library. The
|
|
28
|
+
recommended choice is libxml, which is installable with the
|
|
29
|
+
libxml-ruby1.8 package in Debian-based systems. But it can also fall
|
|
30
|
+
back to expat (the libxml-parser-ruby1.8 Debian package) if libxml is
|
|
31
|
+
not installed.
|
|
32
|
+
|
|
33
|
+
== Usage
|
|
34
|
+
|
|
35
|
+
require 'rubygems' # may omit if installed manually
|
|
36
|
+
require 'rfeedparser'
|
|
37
|
+
|
|
38
|
+
feed = FeedParser.parse("some-feed-stream-filepath-or-url")
|
|
39
|
+
|
|
40
|
+
feed.entries.each do |e|
|
|
41
|
+
puts e.title
|
|
42
|
+
puts e.content
|
|
43
|
+
puts e.published
|
|
44
|
+
puts '----'
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
== Contributing
|
|
48
|
+
|
|
49
|
+
Clone the git repository at git://github.com/jmhodges/rfeedparser.git,
|
|
50
|
+
which has a webpage at http://github.com/jmhodges/rfeedparser/tree/master.
|
data/RUBY-TESTING
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
=== Testing rFeedParser ===
|
|
2
|
+
Simply run rake to run all of the FeedParser tests.
|
|
3
|
+
|
|
4
|
+
Optionally, you can start up rfeedparserserver.rb and, in another shell, run rfeedparser.rb
|
|
5
|
+
against "http://localhost:8097/tests/path/to/testcase.xml" if you want
|
|
6
|
+
to try a test individually. You can, of course, run rfeedparser.rb with
|
|
7
|
+
the file path to the testcase, but there may be HTTP headers required for
|
|
8
|
+
a successful run that can only be given by running rfeedparser against
|
|
9
|
+
the HTTP server path. I'll probably merge feedparserserver.rb into
|
|
10
|
+
rfeedparsertest.rb soon. Note that there are many test failures that occur
|
|
11
|
+
solely because of problems with a platform's iconv installation. Inconsistency
|
|
12
|
+
in iconv is the leading cause of errors in rfp's test cases.
|
|
13
|
+
|
|
14
|
+
=== Last Count 20070321 ===
|
|
15
|
+
By my last count, rfeedparsertest.rb says that there are 45 assertions
|
|
16
|
+
that fail, and 4 that error out. I've included here a few tests that
|
|
17
|
+
"Failed, Sort Of". By that I mean, the behaviors the tests are meant to
|
|
18
|
+
check are correct, but the test fails because of some other superficial
|
|
19
|
+
or unrelated behavior.
|
|
20
|
+
|
|
21
|
+
=== Tests Failed, Sort Of ===
|
|
22
|
+
|
|
23
|
+
Problem:
|
|
24
|
+
Hpricot adds end tags when it sees an unclosed tag. This means that
|
|
25
|
+
certain tests that rely on feedparser.py's _HTMLSanitizer not closing
|
|
26
|
+
tags will fail. Many of the tests affected (actually, all the ones
|
|
27
|
+
affected, AFAICT) would otherwise pass.
|
|
28
|
+
|
|
29
|
+
Tests Affected:
|
|
30
|
+
* tests/wellformed/rss/item_description_not_a_doctype.xml (extraneous trailing </a>)
|
|
31
|
+
* tests/illformed/rss/item_description_not_a_doctype.xml (ditto)
|
|
32
|
+
==
|
|
33
|
+
Problem:
|
|
34
|
+
The Hpricot#scrub method I've written does not remove the dangerous
|
|
35
|
+
markup in the same way feedparser.py does, but the output is still safe.
|
|
36
|
+
|
|
37
|
+
Tests Affected:
|
|
38
|
+
* tests/wellformed/sanitize/entry_content_crazy.xml
|
|
39
|
+
* tests/wellformed/sanitize/entry_summary_crazy.xml
|
|
40
|
+
* tests/wellformed/sanitize/entry_title_crazy.xml
|
|
41
|
+
* tests/wellformed/sanitize/feed_copyright_crazy.xml
|
|
42
|
+
* tests/wellformed/sanitize/feed_info_crazy.xml
|
|
43
|
+
* tests/wellformed/sanitize/feed_subtitle_crazy.xml
|
|
44
|
+
* tests/wellformed/sanitize/feed_tagline_crazy.xml
|
|
45
|
+
* tests/wellformed/sanitize/feed_title_crazy.xml
|
|
46
|
+
* tests/wellformed/sanitize/item_content_encoded_crazy.xml
|
|
47
|
+
* tests/wellformed/sanitize/item_description_crazy.xml
|
|
48
|
+
* tests/wellformed/sanitize/item_fullitem_crazy.xml
|
|
49
|
+
* tests/illformed/sanitize/entry_content_crazy.xml
|
|
50
|
+
* tests/illformed/sanitize/entry_summary_crazy.xml
|
|
51
|
+
* tests/illformed/sanitize/entry_title_crazy.xml
|
|
52
|
+
* tests/illformed/sanitize/feed_copyright_crazy.xml
|
|
53
|
+
* tests/illformed/sanitize/feed_info_crazy.xml
|
|
54
|
+
* tests/illformed/sanitize/feed_subtitle_crazy.xml
|
|
55
|
+
* tests/illformed/sanitize/feed_tagline_crazy.xml
|
|
56
|
+
* tests/illformed/sanitize/feed_title_crazy.xml
|
|
57
|
+
* tests/illformed/sanitize/item_content_encoded_crazy.xml
|
|
58
|
+
* tests/illformed/sanitize/item_description_crazy.xml
|
|
59
|
+
* tests/illformed/sanitize/item_fullitem_crazy.xml
|
|
60
|
+
==
|
|
61
|
+
|
|
62
|
+
Problem:
|
|
63
|
+
My current system lacks a few encodings that rfeedparser and Iconv need.
|
|
64
|
+
This results in failures that will probably not occur on other machines.
|
|
65
|
+
|
|
66
|
+
Tests Affected:
|
data/lib/rfeedparser.rb
ADDED
|
@@ -0,0 +1,551 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# Universal feed parser in Ruby
|
|
3
|
+
#
|
|
4
|
+
# Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
|
|
5
|
+
#
|
|
6
|
+
# Visit http://feedparser.org/ for the latest version in Python
|
|
7
|
+
# Visit http://feedparser.org/docs/ for the latest documentation
|
|
8
|
+
# Email Jeff Hodges at jeff@somethingsimilar.com with questions
|
|
9
|
+
#
|
|
10
|
+
# Required: Ruby 1.8
|
|
11
|
+
|
|
12
|
+
$KCODE = 'UTF8'
|
|
13
|
+
require 'stringio'
|
|
14
|
+
require 'uri'
|
|
15
|
+
require 'open-uri'
|
|
16
|
+
require 'cgi' # escaping html
|
|
17
|
+
require 'time'
|
|
18
|
+
require 'pp'
|
|
19
|
+
require 'base64'
|
|
20
|
+
require 'iconv'
|
|
21
|
+
require 'zlib'
|
|
22
|
+
|
|
23
|
+
require 'rubygems'
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
|
|
27
|
+
# See http://git.bitwi.se/ruby-character-encodings.git/.
|
|
28
|
+
begin
|
|
29
|
+
gem 'character-encodings', ">=0.2.0"
|
|
30
|
+
require 'encoding/character/utf-8'
|
|
31
|
+
rescue LoadError
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# TODO: require these in the files that need them, not in the toplevel
|
|
35
|
+
gem 'hpricot', "~>0.6"
|
|
36
|
+
require 'hpricot'
|
|
37
|
+
|
|
38
|
+
gem 'htmltools', ">=1.10"
|
|
39
|
+
require 'html/sgml-parser'
|
|
40
|
+
|
|
41
|
+
gem 'htmlentities', ">=4.0.0"
|
|
42
|
+
require 'htmlentities'
|
|
43
|
+
|
|
44
|
+
gem 'addressable', ">= 1.0.4"
|
|
45
|
+
require 'addressable/uri'
|
|
46
|
+
|
|
47
|
+
gem 'rchardet', ">=1.0"
|
|
48
|
+
require 'rchardet'
|
|
49
|
+
$chardet = true
|
|
50
|
+
|
|
51
|
+
$debug = false
|
|
52
|
+
$compatible = true
|
|
53
|
+
|
|
54
|
+
$LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
|
|
55
|
+
require 'rfeedparser/utilities'
|
|
56
|
+
require 'rfeedparser/better_sgmlparser'
|
|
57
|
+
require 'rfeedparser/better_attributelist'
|
|
58
|
+
require 'rfeedparser/feedparserdict'
|
|
59
|
+
require 'rfeedparser/parser_mixin'
|
|
60
|
+
|
|
61
|
+
require 'rfeedparser/loose_feed_parser'
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
require 'rfeedparser/nokogiri_parser'
|
|
65
|
+
StrictFeedParser = FeedParser::Nokogiri::StrictFeedParser
|
|
66
|
+
|
|
67
|
+
require 'rfeedparser/monkey_patches'
|
|
68
|
+
|
|
69
|
+
module FeedParser
|
|
70
|
+
extend FeedParserUtilities
|
|
71
|
+
|
|
72
|
+
VERSION = "0.9.960"
|
|
73
|
+
|
|
74
|
+
AUTHOR = "Mark Pilgrim <http://diveintomark.org/>"
|
|
75
|
+
PORTER = "Jeff Hodges <http://somethingsimilar.com>"
|
|
76
|
+
CONTRIBUTERS = ["Jason Diamond <http://injektilo.org/>",
|
|
77
|
+
"John Beimler <http://john.beimler.org/>",
|
|
78
|
+
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
|
|
79
|
+
"Aaron Swartz <http://aaronsw.com/>",
|
|
80
|
+
"Kevin Marks <http://epeus.blogspot.com/>",
|
|
81
|
+
"Jesse Newland <http://jnewland.com/>",
|
|
82
|
+
"Charlie Savage <http://cfis.savagexi.com/>",
|
|
83
|
+
"Phil Hagelberg <http://technomancy.us>"]
|
|
84
|
+
|
|
85
|
+
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
|
86
|
+
# If you are embedding feedparser in a larger application, you should
|
|
87
|
+
# change this to your application name and URL.
|
|
88
|
+
USER_AGENT = "rFeedParser/#{VERSION} +http://rfeedparser.rubyforge.org/"
|
|
89
|
+
|
|
90
|
+
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
|
|
91
|
+
# want to send an Accept header, set this to nil.
|
|
92
|
+
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
|
|
96
|
+
# this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
|
|
97
|
+
# or utidylib <http://utidylib.berlios.de/>.
|
|
98
|
+
#TIDY_MARKUP = false #FIXME untranslated
|
|
99
|
+
|
|
100
|
+
# List of Python interfaces for HTML Tidy, in order of preference. Only useful
|
|
101
|
+
# if TIDY_MARKUP = true
|
|
102
|
+
#PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# ---------- don't touch these ----------
|
|
106
|
+
# Common ancestor of the "soft" problems that parse records in
# result['bozo_exception'] while still going on to parse the document.
class ThingsNobodyCaresAboutButMe < StandardError; end

# Recorded by parse when the document was decoded with an encoding other
# than the one it was declared with.
class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe; end

# Recorded by parse when none of the candidate encodings could be used to
# convert the document to UTF-8.
class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe; end

# Recorded by parse when the HTTP Content-Type header is missing or does not
# name an XML media type.
class NonXMLContentType < ThingsNobodyCaresAboutButMe; end

# Not raised anywhere in this file; presumably used by the parser mixin when
# an element uses a namespace prefix that was never declared -- TODO confirm.
class UndeclaredNamespace < StandardError; end
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
SUPPORTED_VERSIONS = {'' => 'unknown',
|
|
119
|
+
'rss090' => 'RSS 0.90',
|
|
120
|
+
'rss091n' => 'RSS 0.91 (Netscape)',
|
|
121
|
+
'rss091u' => 'RSS 0.91 (Userland)',
|
|
122
|
+
'rss092' => 'RSS 0.92',
|
|
123
|
+
'rss093' => 'RSS 0.93',
|
|
124
|
+
'rss094' => 'RSS 0.94',
|
|
125
|
+
'rss20' => 'RSS 2.0',
|
|
126
|
+
'rss10' => 'RSS 1.0',
|
|
127
|
+
'rss' => 'RSS (unknown version)',
|
|
128
|
+
'atom01' => 'Atom 0.1',
|
|
129
|
+
'atom02' => 'Atom 0.2',
|
|
130
|
+
'atom03' => 'Atom 0.3',
|
|
131
|
+
'atom10' => 'Atom 1.0',
|
|
132
|
+
'atom' => 'Atom (unknown version)',
|
|
133
|
+
'cdf' => 'CDF',
|
|
134
|
+
'hotrss' => 'Hot RSS'
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
# Turn a URL, file path, stream or literal string into a readable IO-like
# object.
#
# url_file_stream_or_string - one of:
#   * an object responding to #read, returned untouched;
#   * the string '-', which selects $stdin;
#   * an http/https/ftp URL, fetched through open-uri;
#   * a path to a local file;
#   * anything else, whose string form is wrapped in a StringIO.
# options - Hash; :agent, :modified, :etag and :referrer are turned into
#           request headers for URL fetches. :handlers is defaulted to [].
#           :modified may be a date String, a Time, or an Array accepted by
#           py2rtime.
#
# Returns an object responding to #read. For HTTP error responses the
# response body IO carried by OpenURI::HTTPError is returned.
def open_resource(url_file_stream_or_string, options)
  options[:handlers] ||= []

  return url_file_stream_or_string if url_file_stream_or_string.respond_to?(:read)
  return $stdin if url_file_stream_or_string == '-'

  # open-uri freaks out if there are leading spaces. Work on a stripped copy
  # so the caller's string is never mutated (and frozen strings are accepted).
  source = url_file_stream_or_string.to_s.strip

  uri = Addressable::URI.parse(source)
  if uri && ['http', 'https', 'ftp'].include?(uri.scheme)
    auth = nil

    if uri.host && uri.password
      # encode64 wraps its output every 60 characters; a header value must
      # not contain line breaks, so remove all of them, not just the last.
      auth = Base64.encode64("#{uri.user}:#{uri.password}").delete("\n")
      uri.password = nil
      source = uri.to_s
    end

    req_headers = {}
    req_headers["User-Agent"] = options[:agent] || USER_AGENT
    req_headers["If-None-Match"] = options[:etag] if options[:etag]

    modified = options[:modified]
    modified_time = case modified
                    when String then parse_date(modified)
                    when Time   then modified
                    when Array  then py2rtime(modified)
                    end
    # An unparseable date yields no header instead of a NoMethodError on nil.
    if modified_time.respond_to?(:httpdate)
      req_headers["If-Modified-Since"] = modified_time.httpdate
    end

    req_headers["Referer"] = options[:referrer] if options[:referrer]
    req_headers["Accept-encoding"] = 'gzip, deflate' # FIXME make tests
    req_headers["Authorization"] = "Basic #{auth}" if auth
    req_headers['Accept'] = ACCEPT_HEADER if ACCEPT_HEADER
    req_headers['A-IM'] = 'feed' # RFC 3229 support

    begin
      # Newer Rubies only fetch URLs through URI.open; older ones only
      # through the Kernel#open that open-uri patches.
      if URI.respond_to?(:open)
        return URI.open(source, req_headers)
      else
        return open(source, req_headers)
      end
    rescue OpenURI::HTTPError => e
      # Hand back the error response so the caller can still read its
      # status and headers.
      return e.io
    rescue
      # Best effort: any other failure falls through to the file and
      # literal-string handling below.
    end
  end

  # Try to open as a local file. File.open is used rather than Kernel#open
  # because the latter runs a subprocess when the string starts with "|",
  # and this string may be untrusted feed text.
  begin
    return File.open(source)
  rescue
    # Not a readable file; treat the input as literal feed data.
  end

  StringIO.new(source)
end
|
|
198
|
+
module_function(:open_resource)
|
|
199
|
+
|
|
200
|
+
# Parse a feed from a URL, file, stream or string.
#
# url_file_stream_or_string - anything open_resource accepts.
# options - Hash of optional settings:
#   :compatible - when non-nil, replaces the global $compatible flag
#   :strict     - strict parser class to use (default StrictFeedParser)
#   :loose      - loose parser class to use (default LooseFeedParser)
#   :handlers   - defaulted to [] when absent
#   plus the request options read by open_resource.
#
# Returns a FeedParserDict that always holds 'feed', 'entries' and 'bozo'.
# Problems are not raised: they set result['bozo'] to true and are stored in
# result['bozo_exception'].
def parse(url_file_stream_or_string, options = {})
  # Use the default compatibility if compatible is nil
  $compatible = options[:compatible] unless options[:compatible].nil?

  strictklass = options[:strict] || StrictFeedParser
  looseklass = options[:loose] || LooseFeedParser
  options[:handlers] ||= []

  result = FeedParserDict.new
  result['feed'] = FeedParserDict.new
  result['entries'] = []
  result['bozo'] = false

  begin
    f = open_resource(url_file_stream_or_string, options)
    data = f.read
  rescue => e
    result['bozo'] = true
    result['bozo_exception'] = e
    data = ''
    f = nil
  end

  if f and !(data.nil? || data.empty?) and f.respond_to?(:meta)
    case f.meta['content-encoding']
    when 'gzip'
      # if feed is gzip-compressed, decompress it
      begin
        gz = Zlib::GzipReader.new(StringIO.new(data))
        begin
          data = gz.read
        ensure
          gz.close
        end
      rescue => e
        # Some feeds claim to be gzipped but they're not, so
        # we get garbage. Ideally, we should re-request the
        # feed without the 'Accept-encoding: gzip' header,
        # but we don't.
        result['bozo'] = true
        result['bozo_exception'] = e
        data = ''
      end
    when 'deflate'
      begin
        begin
          data = Zlib::Inflate.inflate(data)
        rescue Zlib::DataError
          # Some servers send a raw deflate stream without the zlib header;
          # negative window bits tell zlib to expect exactly that.
          raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
          begin
            data = raw.inflate(data)
          ensure
            raw.close
          end
        end
      rescue => e
        result['bozo'] = true
        result['bozo_exception'] = e
        data = ''
      end
    end
  end

  if f.respond_to?(:meta)
    result['etag'] = f.meta['etag']
    result['modified_time'] = parse_date(f.meta['last-modified'])
    result['modified'] = extract_tuple(result['modified_time'])
    result['headers'] = f.meta
  end

  # FIXME open-uri does not return a non-nil base_uri in its HTTPErrors.
  if f.respond_to?(:base_uri)
    result['href'] = f.base_uri.to_s # URI => String
    result['status'] = '200'
  end

  if f.respond_to?(:status)
    result['status'] = f.status[0]
  end

  # there are four encodings to keep track of:
  # - http_encoding is the encoding declared in the Content-Type HTTP header
  # - xml_encoding is the encoding declared in the <?xml declaration
  # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
  # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
  http_headers = result['headers'] || {}
  result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
    getCharacterEncoding(http_headers, data)

  if !(http_headers.nil? || http_headers.empty?) && !acceptable_content_type
    if http_headers['content-type']
      bozo_message = "#{http_headers['content-type']} is not an XML media type"
    else
      bozo_message = 'no Content-type specified'
    end

    result['bozo'] = true
    result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
  end

  result['version'], data = stripDoctype(data)

  baseuri = http_headers['content-location'] || result['href']
  baselang = http_headers['content-language']

  # if server sent 304, we're done. The status taken from f.status is a
  # String, so compare textually rather than against the Integer 304.
  if result['status'].to_s == '304'
    result['version'] = ''
    result['debug_message'] = "The feed has not changed since you last checked, " +
      "so the server sent no data. This is a feature, not a bug!"
    return result
  end

  # if there was a problem downloading, we're done
  if data.nil? or data.empty?
    return result
  end

  # determine character encoding
  use_strict_parser = false
  known_encoding = false
  tried_encodings = []
  proposed_encoding = nil

  # Try one candidate encoding. Does nothing once an encoding has worked, or
  # for blank or already-tried candidates. On success the converted data and
  # the winning encoding are stored in the enclosing locals, which is
  # assigned explicitly here instead of relying on block-parameter scoping.
  try_encoding = lambda do |candidate|
    unless known_encoding || candidate.nil? || candidate.empty? || tried_encodings.include?(candidate)
      tried_encodings << candidate
      begin
        data = toUTF8(data, candidate)
        proposed_encoding = candidate
        known_encoding = use_strict_parser = true
      rescue
        # Conversion failed; data is unchanged and the next candidate runs.
      end
    end
  end

  # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
  [result['encoding'], xml_encoding, sniffed_xml_encoding].each do |enc|
    try_encoding.call(enc)
  end

  # if no luck and we have auto-detection library, try that
  if not known_encoding and $chardet
    begin
      try_encoding.call(CharDet.detect(data)['encoding'])
    rescue
      # Detection itself failed; fall through to the fixed fallbacks.
    end
  end

  # if still no luck, try utf-8 and then windows-1252 (unless already tried)
  try_encoding.call('utf-8')
  try_encoding.call('windows-1252')

  # NOTE this isn't in FeedParser.py 4.1: a further iso-8859-2 fallback was
  # written for this port but is deliberately left disabled.

  # if still no luck, give up
  if not known_encoding
    result['bozo'] = true
    result['bozo_exception'] = CharacterEncodingUnknown.new("document encoding unknown, I tried #{result['encoding']}, #{xml_encoding}, utf-8 and windows-1252 but nothing worked")
    result['encoding'] = ''
  elsif proposed_encoding != result['encoding']
    result['bozo'] = true
    result['bozo_exception'] = CharacterEncodingOverride.new("documented declared as #{result['encoding']}, but parsed as #{proposed_encoding}")
    result['encoding'] = proposed_encoding
  end

  use_strict_parser = false unless strictklass

  feedparser = nil
  if use_strict_parser
    begin
      parser = strictklass.new(baseuri, baselang)
      feedparser = parser.handler
      parser.parse(data)
    rescue => err
      $stderr << "xml parsing failed: #{err.message}\n#{err.backtrace.join("\n")}" if $debug
      result['bozo'] = true
      # feedparser is still nil when the parser could not even be built.
      result['bozo_exception'] = (feedparser && feedparser.exc) || err
      use_strict_parser = false
    end
  end

  if not use_strict_parser
    $stderr << "Using LooseFeed\n\n" if $debug
    feedparser = looseklass.new(baseuri, baselang, (known_encoding ? 'utf-8' : ''))
    feedparser.parse(data)
  end

  result['feed'] = feedparser.feeddata
  result['entries'] = feedparser.entries
  result['version'] = result['version'] || feedparser.version
  result['namespaces'] = feedparser.namespacesInUse
  return result
end
|
|
415
|
+
module_function(:parse)
|
|
416
|
+
end # End FeedParser module
|
|
417
|
+
|
|
418
|
+
# Top-level shorthand for FeedParser.parse: forwards both arguments
# unchanged and returns whatever FeedParser.parse returns.
def rfp(url_file_stream_or_string, options = {})
  FeedParser.parse(url_file_stream_or_string, options)
end
|
|
421
|
+
|
|
422
|
+
# Base class for the command-line output formats: it only stores the parse
# results for subclasses to write out.
class Serializer
  # results - the object to serialize (the value returned by
  #           FeedParser.parse when used from the command line).
  def initialize(results)
    @results = results
  end
end

# Writes the results as flat "dotted.path=value" lines, one per leaf value.
class TextSerializer < Serializer
  # stream - any object supporting << (defaults to $stdout).
  def write(stream=$stdout)
    writer(stream, @results, '')
  end

  # Recursively emit node under the given dotted prefix.
  #
  # * nil and empty containers/strings produce no output;
  # * hash-like nodes (anything responding to #keys) are walked in sorted key
  #   order, skipping 'description' and 'link' and any key that has a
  #   companion "<key>_detail" or "<key>_parsed" entry;
  # * arrays append "[index]" to the parent path;
  # * everything else is written as "path=value".
  def writer(stream, node, prefix)
    # Leaves such as false or Integers have no #empty?, so only ask objects
    # that actually respond to it.
    return if node.nil? || (node.respond_to?(:empty?) && node.empty?)

    if node.respond_to?(:keys)
      node.keys.sort.each do |key|
        next if ['description','link'].include? key
        next if node.has_key? key+'_detail'
        next if node.has_key? key+'_parsed'
        writer(stream, node[key], prefix+key+'.')
      end
    elsif node.class == Array
      node.each_with_index do |thing, index|
        # prefix ends with '.', which is dropped before the index is added.
        writer(stream, thing, prefix[0..-2] + '[' + index.to_s + '].')
      end
    else
      begin
        # NOTE(review): u is defined outside this file; presumably a
        # unicode-conversion helper -- confirm.
        s = u(node.to_s)
        stream << prefix[0..-2]
        stream << '='
        stream << s
        stream << "\n"
      rescue
        # Best effort: a value that cannot be rendered is skipped.
      end
    end
  end
end

# Writes the feed's href followed by a pretty-printed dump of the results.
class PprintSerializer < Serializer
  # stream - any object supporting << (defaults to $stdout). The dump goes to
  # this stream as well, rather than always to $stdout.
  def write(stream = $stdout)
    stream << @results['href'].to_s + "\n\n"
    PP.pp(@results, stream)
    stream << "\n"
  end
end
|
|
466
|
+
|
|
467
|
+
# Command-line driver: parse every URL or file named on the command line and
# print the result with the selected serializer.
if $0 == __FILE__
  require 'optparse'
  require 'ostruct'

  options = OpenStruct.new
  options.etag = options.modified = options.agent = options.referrer = nil
  options.content_language = options.content_location = options.ctype = nil
  # Symbol, to match the values OptionParser yields for --format.
  options.format = :pprint
  options.compatible = $compatible
  options.verbose = false

  option_parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{File.basename($0)} [options] url_or_file ..."
    opts.separator ""

    opts.on("-A", "--user-agent [AGENT]",
            "User-Agent for HTTP URLs") { |agent| options.agent = agent }

    opts.on("-e", "--referrer [URL]",
            "Referrer for HTTP URLs") { |referrer| options.referrer = referrer }

    opts.on("-t", "--etag [TAG]",
            "ETag/If-None-Match for HTTP URLs") { |etag| options.etag = etag }

    opts.on("-m", "--last-modified [DATE]",
            "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") { |modified|
      options.modified = modified
    }

    opts.on("-f", "--format [FORMAT]", [:text, :pprint],
            "output results in FORMAT (text, pprint)") { |format| options.format = format }

    opts.on("-v", "--[no-]verbose",
            "write debugging information to stderr") { |v| options.verbose = v }

    opts.on("-c", "--[no-]compatible",
            "strip element attributes like feedparser.py 4.1 (default)") { |comp|
      options.compatible = comp
    }

    opts.on("-l", "--content-location [LOCATION]",
            "default Content-Location HTTP header") { |loc| options.content_location = loc }

    opts.on("-a", "--content-language [LANG]",
            "default Content-Language HTTP header") { |lang| options.content_language = lang }

    # -t already means --etag, so the content type gets its own short
    # switch instead of registering -t a second time.
    opts.on("-T", "--content-type [TYPE]",
            "default Content-type HTTP header") { |ctype| options.ctype = ctype }
  end

  # parse! removes the recognised switches, leaving only the URLs in ARGV.
  option_parser.parse!(ARGV)
  $debug = true if options.verbose
  $compatible = options.compatible unless options.compatible.nil?

  serializer = options.format == :text ? TextSerializer : PprintSerializer

  ARGV.each do |url|
    results = FeedParser.parse(url, :etag => options.etag,
                               :modified => options.modified,
                               :agent => options.agent,
                               :referrer => options.referrer,
                               :content_location => options.content_location,
                               :content_language => options.content_language,
                               :content_type => options.ctype)
    serializer.new(results).write($stdout)
  end
end
|