syndication 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +6 -0
- data/DEVELOPER +5 -0
- data/IMPLEMENTATION +23 -1
- data/README +31 -17
- data/lib/syndication/atom.rb +2 -0
- data/lib/syndication/common.rb +7 -2
- data/lib/syndication/content.rb +4 -0
- data/lib/syndication/dublincore.rb +20 -14
- data/lib/syndication/podcast.rb +5 -0
- data/lib/syndication/rss.rb +2 -0
- data/lib/syndication/syndication.rb +4 -0
- data/lib/syndication/tagsoup.rb +49 -0
- data/rakefile +52 -0
- data/test/atomtest.rb +4 -0
- data/test/rsstest.rb +4 -0
- data/test/tagsouptest.rb +87 -0
- metadata +10 -2
data/CHANGES
ADDED
data/DEVELOPER
ADDED
data/IMPLEMENTATION
CHANGED
@@ -1,4 +1,26 @@
|
|
1
|
-
# =
|
1
|
+
# = Implementation notes
|
2
|
+
# == Syndication 0.5
|
3
|
+
#
|
4
|
+
# For this release, I added a parser called TagSoup. The name is taken from
|
5
|
+
# the jargon term used for HTML written without any regard to the rules of
|
6
|
+
# HTML structure, i.e. HTML containing many common authoring mistakes.
|
7
|
+
#
|
8
|
+
# TagSoup is a very small and very dumb parser which implements the stream
|
9
|
+
# API of REXML. The test code compares it against REXML for some simple
|
10
|
+
# example XML and makes sure it calls the same callbacks in the same order
|
11
|
+
# with the same parameters.
|
12
|
+
#
|
13
|
+
# Note that hacking together your own XML parser is, generally speaking, the
|
14
|
+
# wrong thing to do. Using TagSoup as a general replacement for REXML is very
|
15
|
+
# definitely the wrong thing to do. Please don't do it.
|
16
|
+
#
|
17
|
+
# A real XML parser does all kinds of things that TagSoup doesn't, like pay
|
18
|
+
# attention to DTDs, handle quoted special characters in element attributes,
|
19
|
+
# handle whitespace in a documented standard way, and so on. The fact that
|
20
|
+
# TagSoup is defective in many areas is intentional. It's designed to be
|
21
|
+
# used as a last resort, for parsing web syndication feeds which are invalid.
|
22
|
+
#
|
23
|
+
# == Syndication 0.4
|
2
24
|
#
|
3
25
|
# As discussed in the README, this is really my fourth attempt at writing
|
4
26
|
# RSS parsing code. For the record, I thought I'd list the approaches I
|
data/README
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
#
|
2
|
-
# = Syndication 0.4
|
1
|
+
# = Syndication 0.5
|
3
2
|
#
|
4
3
|
# This module provides classes for parsing web syndication feeds in RSS and
|
5
4
|
# Atom formats.
|
@@ -77,7 +76,7 @@
|
|
77
76
|
#
|
78
77
|
# - Less source code than the standard library rss module.
|
79
78
|
#
|
80
|
-
# - Faster than the standard library (at least, in my tests
|
79
|
+
# - Faster than the standard library (at least, in my tests).
|
81
80
|
#
|
82
81
|
# Other features:
|
83
82
|
#
|
@@ -93,7 +92,8 @@
|
|
93
92
|
#
|
94
93
|
# - Simple to extend to support your own RSS extensions, uses reflection.
|
95
94
|
#
|
96
|
-
# - Uses REXML fast stream parsing API for speed
|
95
|
+
# - Uses REXML fast stream parsing API for speed, or built-in TagSoup parser
|
96
|
+
# for invalid feeds.
|
97
97
|
#
|
98
98
|
# - Non-validating, tries to be as forgiving as possible of structural errors.
|
99
99
|
#
|
@@ -109,8 +109,6 @@
|
|
109
109
|
#
|
110
110
|
# - Different API, not a drop-in replacement.
|
111
111
|
#
|
112
|
-
# - No way to choose a different XML parser (yet).
|
113
|
-
#
|
114
112
|
# - Incomplete support for Atom 0.3 draft. (Anyone still using it?)
|
115
113
|
#
|
116
114
|
# - No support for base64 data in Atom feeds (yet).
|
@@ -150,11 +148,31 @@
|
|
150
148
|
# For the record, I started work on my library long before simple-rss was
|
151
149
|
# announced.
|
152
150
|
#
|
153
|
-
# = feedtools
|
151
|
+
# = feedtools
|
154
152
|
#
|
155
153
|
# http://rubyforge.org/projects/feedtools/
|
156
154
|
#
|
157
|
-
#
|
155
|
+
# This one solves most of the same problems as Syndication; however, the two
|
156
|
+
# were developed in parallel, in ignorance of each other.
|
157
|
+
#
|
158
|
+
# Feedtools builds in database caching and persistence, and HTTP fetching.
|
159
|
+
# Personally, I don't think those belong in a feed parsing library--they
|
160
|
+
# are easily implemented using other standard libraries if you want them.
|
161
|
+
#
|
162
|
+
# Pros:
|
163
|
+
# - Lots of test cases.
|
164
|
+
#
|
165
|
+
# - Used by lots of Rails people.
|
166
|
+
#
|
167
|
+
# - Knows about many more namespaces.
|
168
|
+
#
|
169
|
+
# Cons:
|
170
|
+
# - Skimpy documentation.
|
171
|
+
#
|
172
|
+
# - Uses HTree then XPath parsing, rather than a single stream parse.
|
173
|
+
#
|
174
|
+
# - Tries to unify RSS and Atom APIs, at the expense of Atom functionality.
|
175
|
+
# (Which could also be a pro, depending on your viewpoint.)
|
158
176
|
#
|
159
177
|
# == Design philosophy
|
160
178
|
#
|
@@ -180,6 +198,9 @@
|
|
180
198
|
#
|
181
199
|
# - Get well-formed feeds parsing reliably, then worry about broken feeds.
|
182
200
|
#
|
201
|
+
# - Atom will hopefully be the future. Provide full support for RSS, but don't
|
202
|
+
# hold Atom back by trying to force it into an RSS data model.
|
203
|
+
#
|
183
204
|
# == Future plans
|
184
205
|
#
|
185
206
|
# Here are some possible improvements:
|
@@ -187,12 +208,6 @@
|
|
187
208
|
# - RSS and Atom generation. Create objects, then call Syndication::FeedMaker
|
188
209
|
# to generate XML in various flavors.
|
189
210
|
#
|
190
|
-
# - More lenient parsing. The limiting factor right now appears to be REXML,
|
191
|
-
# which although a non-validating parser, does require fairly well-formed
|
192
|
-
# XML. (In particular, failure to match tags will cause errors.) Perhaps
|
193
|
-
# the answer is to find or build a 'tag soup' parser that implements the
|
194
|
-
# REXML stream parsing API?
|
195
|
-
#
|
196
211
|
# - Faster date parsing. It turns out that when I asked for parsed dates in
|
197
212
|
# my test code, the profiler showed Date.parse chewing up 25% of the total
|
198
213
|
# CPU time used. A more specific date parser that didn't use heuristics
|
@@ -202,7 +217,6 @@
|
|
202
217
|
#
|
203
218
|
# == Feedback
|
204
219
|
#
|
205
|
-
#
|
206
|
-
#
|
207
|
-
# <meta@pobox.com>.
|
220
|
+
# There are doubtless things I could have done better. Comments, suggestions,
|
221
|
+
# etc. are welcome; e-mail <meta@pobox.com>.
|
208
222
|
#
|
data/lib/syndication/atom.rb
CHANGED
data/lib/syndication/common.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
#
|
3
3
|
# Copyright © mathew <meta@pobox.com> 2005.
|
4
4
|
# Licensed under the same terms as Ruby.
|
5
|
+
#
|
6
|
+
# $Header: /var/cvs/syndication/syndication/lib/syndication/common.rb,v 1.3 2005/10/17 15:05:21 meta Exp $
|
5
7
|
|
6
8
|
require 'uri'
|
7
9
|
require 'rexml/parsers/streamparser'
|
@@ -174,8 +176,11 @@ module Syndication
|
|
174
176
|
# Parse the text provided. Returns a Syndication::Atom::Feed or
|
175
177
|
# Syndication::RSS::Feed object, according to which concrete Parser
|
176
178
|
# class is being used.
|
177
|
-
|
178
|
-
|
179
|
+
# The second argument is optional and determines the parser engine to
|
180
|
+
# use. The default is REXML. To use TagSoup, pass in the value
|
181
|
+
# Syndication::TagSoup
|
182
|
+
def parse(text, classname = REXML::Document)
|
183
|
+
classname.parse_stream(text, self)
|
179
184
|
return @parsetree
|
180
185
|
end
|
181
186
|
|
data/lib/syndication/content.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright © mathew <meta@pobox.com> 2005.
|
2
|
+
# Licensed under the same terms as Ruby.
|
3
|
+
#
|
4
|
+
# $Header: /var/cvs/syndication/syndication/lib/syndication/dublincore.rb,v 1.3 2005/10/17 15:05:21 meta Exp $
|
1
5
|
|
2
6
|
module Syndication
|
3
7
|
|
@@ -70,23 +74,25 @@ module Syndication
|
|
70
74
|
end
|
71
75
|
|
72
76
|
#:enddoc:
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
77
|
+
module RSS
|
78
|
+
# Now we mix in the DublinCore elements to all the Syndication classes that
|
79
|
+
# can contain them. There's probably some clever way to do this via
|
80
|
+
# reflection, but there _is_ such a thing as being too clever.
|
81
|
+
class Item
|
82
|
+
include DublinCore
|
83
|
+
end
|
79
84
|
|
80
|
-
|
81
|
-
|
82
|
-
|
85
|
+
class Channel
|
86
|
+
include DublinCore
|
87
|
+
end
|
83
88
|
|
84
|
-
|
85
|
-
|
86
|
-
|
89
|
+
class Image
|
90
|
+
include DublinCore
|
91
|
+
end
|
87
92
|
|
88
|
-
|
89
|
-
|
93
|
+
class TextInput
|
94
|
+
include DublinCore
|
95
|
+
end
|
90
96
|
end
|
91
97
|
|
92
98
|
end
|
data/lib/syndication/podcast.rb
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
# Copyright © mathew <meta@pobox.com> 2005.
|
2
|
+
# Licensed under the same terms as Ruby.
|
3
|
+
#
|
4
|
+
# $Header: /var/cvs/syndication/syndication/lib/syndication/podcast.rb,v 1.2 2005/10/17 15:05:21 meta Exp $
|
5
|
+
|
1
6
|
module Syndication
|
2
7
|
|
3
8
|
# Mixin for iTunes podcast RSS elements.
|
data/lib/syndication/rss.rb
CHANGED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Copyright © mathew <meta@pobox.com> 2005.
|
2
|
+
# Licensed under the same terms as Ruby.
|
3
|
+
#
|
4
|
+
# $Header: /var/cvs/syndication/syndication/lib/syndication/tagsoup.rb,v 1.2 2005/10/17 15:05:21 meta Exp $
|
5
|
+
|
6
|
+
require 'cgi'
|
7
|
+
|
8
|
+
module Syndication
|
9
|
+
|
10
|
+
# TagSoup is a tiny completely non-validating XML parser which implements the
|
11
|
+
# tag_start, tag_end and text methods of the REXML StreamListener interface.
|
12
|
+
#
|
13
|
+
# It's designed for permissive parsing of RSS and Atom feeds; using it for
|
14
|
+
# anything more complex (like HTML with CSS and JavaScript) is not advised.
|
15
|
+
class TagSoup
|
16
|
+
|
17
|
+
# Parse data String and send events to listener
|
18
|
+
def TagSoup.parse_stream(data, listener)
|
19
|
+
data.scan(/(<\/[^>]*>|<[^>]*>|[^<>]*)/m) do |match|
|
20
|
+
thing = match.first.strip
|
21
|
+
if thing[0,1] == '<'
|
22
|
+
# It's a tag_start or tag_end
|
23
|
+
(tag,rest) = thing.match(/<\/?([^>\s]+)([^>]*)/)[1,2]
|
24
|
+
if thing[1,1] == '/'
|
25
|
+
listener.tag_end(tag)
|
26
|
+
else
|
27
|
+
# Parse the attr=val pairs
|
28
|
+
pairs = Hash.new
|
29
|
+
rest.scan(/([\w:]+)=("([^"]*)"|'([^']*)')/) {|a,j,v1,v2|
|
30
|
+
if v1 == nil
|
31
|
+
v = v2
|
32
|
+
else
|
33
|
+
v = v1
|
34
|
+
end
|
35
|
+
if a
|
36
|
+
pairs[a] = v
|
37
|
+
end
|
38
|
+
}
|
39
|
+
listener.tag_start(tag, pairs)
|
40
|
+
end
|
41
|
+
else
|
42
|
+
# It's text
|
43
|
+
listener.text(CGI.unescapeHTML(thing))
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
end
|
data/rakefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
require 'rake/rdoctask'
|
3
|
+
require 'rake/packagetask'
|
4
|
+
require 'rake/gempackagetask'
|
5
|
+
require 'rubygems'
|
6
|
+
|
7
|
+
PKG_VERSION = "0.5.0"
|
8
|
+
|
9
|
+
desc "Create HTML documentation from RDOC"
|
10
|
+
Rake::RDocTask.new do |rd|
|
11
|
+
rd.main = "README"
|
12
|
+
rd.rdoc_files.include("README", "CHANGES", "IMPLEMENTATION", "DEVELOPER",
|
13
|
+
"lib/**/*.rb", "test/**/*.rb", "examples/**/*.rb")
|
14
|
+
end
|
15
|
+
|
16
|
+
desc "Make tar distribution"
|
17
|
+
Rake::PackageTask.new('syndication', PKG_VERSION) do |t|
|
18
|
+
t.need_tar_bz2 = true
|
19
|
+
t.package_files.include("README", "CHANGES", "IMPLEMENTATION", "DEVELOPER", "lib/**/*.rb", "test/**/*.rb", "examples/**/*.rb", "rakefile", "setup.rb")
|
20
|
+
t.package_dir = "pkg"
|
21
|
+
end
|
22
|
+
|
23
|
+
spec = Gem::Specification.new do |s|
|
24
|
+
s.name = "syndication"
|
25
|
+
s.version = PKG_VERSION
|
26
|
+
s.author = "mathew"
|
27
|
+
s.email = "meta@pobox.com"
|
28
|
+
s.homepage = "http://www.pobox.com/~meta/"
|
29
|
+
s.platform = Gem::Platform::RUBY
|
30
|
+
s.summary = "A web syndication parser for Atom and RSS with a uniform API"
|
31
|
+
candidates = Dir.glob("{bin,docs,lib,test,examples}/**/*")
|
32
|
+
candidates << "rakefile"
|
33
|
+
s.files = candidates.delete_if do |item|
|
34
|
+
item.include?("CVS") || item.include?("html")
|
35
|
+
end
|
36
|
+
s.require_path = "lib"
|
37
|
+
s.test_files = ["test/atomtest.rb", "test/rsstest.rb",
|
38
|
+
"test/tagsouptest.rb"]
|
39
|
+
s.has_rdoc = true
|
40
|
+
s.extra_rdoc_files = ["README", "IMPLEMENTATION", "CHANGES", "DEVELOPER"]
|
41
|
+
end
|
42
|
+
|
43
|
+
desc "Make RubyGems gem distribution"
|
44
|
+
Rake::GemPackageTask.new(spec) do |pkg|
|
45
|
+
pkg.need_zip = true
|
46
|
+
pkg.need_tar = true
|
47
|
+
end
|
48
|
+
|
49
|
+
task :default do
|
50
|
+
puts "This is a pure Ruby library, no compilation is required."
|
51
|
+
puts "Try rake --tasks"
|
52
|
+
end
|
data/test/atomtest.rb
CHANGED
data/test/rsstest.rb
CHANGED
data/test/tagsouptest.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
# Copyright © mathew <meta@pobox.com> 2005.
|
2
|
+
# Licensed under the same terms as Ruby.
|
3
|
+
#
|
4
|
+
# $Header: /var/cvs/syndication/syndication/test/tagsouptest.rb,v 1.2 2005/10/17 20:06:51 meta Exp $
|
5
|
+
|
6
|
+
require 'syndication/tagsoup'
|
7
|
+
require 'test/unit'
|
8
|
+
require 'rexml/document'
|
9
|
+
require 'pp'
|
10
|
+
|
11
|
+
module Syndication
|
12
|
+
|
13
|
+
# This class contains the unit tests for the Syndication module.
|
14
|
+
class Tests < Test::Unit::TestCase
|
15
|
+
|
16
|
+
def tag_start(x, pairs)
|
17
|
+
@events << "tag_start(#{x.strip})"
|
18
|
+
lst = nil
|
19
|
+
if pairs
|
20
|
+
for p in pairs
|
21
|
+
if lst
|
22
|
+
lst = lst + ","
|
23
|
+
else
|
24
|
+
lst = ""
|
25
|
+
end
|
26
|
+
lst << "#{p[0]}=#{p[1]}"
|
27
|
+
end
|
28
|
+
@events << "attrs(#{lst})"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def tag_end(x)
|
33
|
+
@events << "tag_end(#{x.strip})"
|
34
|
+
end
|
35
|
+
|
36
|
+
def text(x)
|
37
|
+
@events << "text(#{x.strip})"
|
38
|
+
end
|
39
|
+
|
40
|
+
# Minimal test
|
41
|
+
def test_tagsoup
|
42
|
+
xml = <<-EOF
|
43
|
+
<a>
|
44
|
+
<b>one
|
45
|
+
<c></c></b>
|
46
|
+
<d arg1="alpha">two</d>
|
47
|
+
<e arg2='beta'>
|
48
|
+
three<fourc™
|
49
|
+
</e>
|
50
|
+
</a>
|
51
|
+
<feed xmlns="http://www.w3.org/2005/Atom">
|
52
|
+
<title>One good turn usually gets most of the blanket.</title>
|
53
|
+
<updated>2005-08-20T21:14:38Z</updated>
|
54
|
+
<id>urn:uuid:035d3aa3022c1b1b2a17e37ae2dcc376</id>
|
55
|
+
<entry>
|
56
|
+
<title>Quidquid latine dictum sit, altum viditur.</title>
|
57
|
+
<link href="http://example.com/05/08/20/2114.html"/>
|
58
|
+
<id>urn:uuid:89d96d76a99426264f6f1f520c1b93c2</id>
|
59
|
+
<updated>2005-08-20T21:14:38Z</updated>
|
60
|
+
</entry>
|
61
|
+
</feed>
|
62
|
+
EOF
|
63
|
+
@events = Array.new
|
64
|
+
Syndication::TagSoup.parse_stream(xml, self)
|
65
|
+
@tagsoup = @events
|
66
|
+
@events = Array.new
|
67
|
+
REXML::Document.parse_stream(xml, self)
|
68
|
+
@rexml = @events
|
69
|
+
puts "REXML\n-----"
|
70
|
+
pp @rexml
|
71
|
+
puts "\nTAGSOUP\n-------"
|
72
|
+
pp @tagsoup
|
73
|
+
errs = false
|
74
|
+
for tsevt in @tagsoup
|
75
|
+
rxevt = @rexml.shift
|
76
|
+
if rxevt
|
77
|
+
if tsevt.to_s != rxevt.to_s
|
78
|
+
errs = true
|
79
|
+
puts "TagSoup: [#{tsevt}]\nREXML: [#{rxevt}]"
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
assert(!errs, "TagSoup and REXML parse results didn't match")
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: syndication
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.4.0
|
7
|
-
date: 2005-
|
6
|
+
version: 0.5.0
|
7
|
+
date: 2005-10-17 00:00:00 -05:00
|
8
8
|
summary: A web syndication parser for Atom and RSS with a uniform API
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,21 +34,29 @@ files:
|
|
34
34
|
- lib/syndication/common.rb
|
35
35
|
- lib/syndication/podcast.rb
|
36
36
|
- lib/syndication/content.rb
|
37
|
+
- lib/syndication/tagsoup.rb
|
37
38
|
- lib/syndication/rss.rb
|
38
39
|
- lib/syndication/syndication.rb
|
39
40
|
- lib/syndication/atom.rb
|
41
|
+
- test/tagsouptest.rb
|
40
42
|
- test/rsstest.rb
|
41
43
|
- test/atomtest.rb
|
42
44
|
- examples/yahoo.rb
|
45
|
+
- rakefile
|
43
46
|
- README
|
44
47
|
- IMPLEMENTATION
|
48
|
+
- CHANGES
|
49
|
+
- DEVELOPER
|
45
50
|
test_files:
|
46
51
|
- test/atomtest.rb
|
47
52
|
- test/rsstest.rb
|
53
|
+
- test/tagsouptest.rb
|
48
54
|
rdoc_options: []
|
49
55
|
extra_rdoc_files:
|
50
56
|
- README
|
51
57
|
- IMPLEMENTATION
|
58
|
+
- CHANGES
|
59
|
+
- DEVELOPER
|
52
60
|
executables: []
|
53
61
|
extensions: []
|
54
62
|
requirements: []
|