syndication 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +6 -0
- data/DEVELOPER +5 -0
- data/IMPLEMENTATION +23 -1
- data/README +31 -17
- data/lib/syndication/atom.rb +2 -0
- data/lib/syndication/common.rb +7 -2
- data/lib/syndication/content.rb +4 -0
- data/lib/syndication/dublincore.rb +20 -14
- data/lib/syndication/podcast.rb +5 -0
- data/lib/syndication/rss.rb +2 -0
- data/lib/syndication/syndication.rb +4 -0
- data/lib/syndication/tagsoup.rb +49 -0
- data/rakefile +52 -0
- data/test/atomtest.rb +4 -0
- data/test/rsstest.rb +4 -0
- data/test/tagsouptest.rb +87 -0
- metadata +10 -2
data/CHANGES
ADDED
data/DEVELOPER
ADDED
data/IMPLEMENTATION
CHANGED
@@ -1,4 +1,26 @@
|
|
1
|
-
# =
|
1
|
+
# = Implementation notes
|
2
|
+
# == Syndication 0.5
|
3
|
+
#
|
4
|
+
# For this release, I added a parser called TagSoup. The name is taken from
|
5
|
+
# the jargon term used for HTML written without any regard to the rules of
|
6
|
+
# HTML structure, i.e. HTML containing many common authoring mistakes.
|
7
|
+
#
|
8
|
+
# TagSoup is a very small and very dumb parser which implements the stream
|
9
|
+
# API of REXML. The test code compares it against REXML for some simple
|
10
|
+
# example XML and makes sure it calls the same callbacks in the same order
|
11
|
+
# with the same parameters.
|
12
|
+
#
|
13
|
+
# Note that hacking together your own XML parser is, generally speaking, the
|
14
|
+
# wrong thing to do. Using TagSoup as a general replacement for REXML is very
|
15
|
+
# definitely the wrong thing to do. Please don't do it.
|
16
|
+
#
|
17
|
+
# A real XML parser does all kinds of things that TagSoup doesn't, like pay
|
18
|
+
# attention to DTDs, handle quoted special characters in element attributes,
|
19
|
+
# handle whitespace in a documented standard way, and so on. The fact that
|
20
|
+
# TagSoup is defective in many areas is intentional. It's designed to be
|
21
|
+
# used as a last resort, for parsing web syndication feeds which are invalid.
|
22
|
+
#
|
23
|
+
# == Syndication 0.4
|
2
24
|
#
|
3
25
|
# As discussed in the README, this is really my fourth attempt at writing
|
4
26
|
# RSS parsing code. For the record, I thought I'd list the approaches I
|
data/README
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
#
|
2
|
-
# = Syndication 0.4
|
1
|
+
# = Syndication 0.5
|
3
2
|
#
|
4
3
|
# This module provides classes for parsing web syndication feeds in RSS and
|
5
4
|
# Atom formats.
|
@@ -77,7 +76,7 @@
|
|
77
76
|
#
|
78
77
|
# - Less source code than the standard library rss module.
|
79
78
|
#
|
80
|
-
# - Faster than the standard library (at least, in my tests
|
79
|
+
# - Faster than the standard library (at least, in my tests).
|
81
80
|
#
|
82
81
|
# Other features:
|
83
82
|
#
|
@@ -93,7 +92,8 @@
|
|
93
92
|
#
|
94
93
|
# - Simple to extend to support your own RSS extensions, uses reflection.
|
95
94
|
#
|
96
|
-
# - Uses REXML fast stream parsing API for speed
|
95
|
+
# - Uses REXML fast stream parsing API for speed, or built-in TagSoup parser
|
96
|
+
# for invalid feeds.
|
97
97
|
#
|
98
98
|
# - Non-validating, tries to be as forgiving as possible of structural errors.
|
99
99
|
#
|
@@ -109,8 +109,6 @@
|
|
109
109
|
#
|
110
110
|
# - Different API, not a drop-in replacement.
|
111
111
|
#
|
112
|
-
# - No way to choose a different XML parser (yet).
|
113
|
-
#
|
114
112
|
# - Incomplete support for Atom 0.3 draft. (Anyone still using it?)
|
115
113
|
#
|
116
114
|
# - No support for base64 data in Atom feeds (yet).
|
@@ -150,11 +148,31 @@
|
|
150
148
|
# For the record, I started work on my library long before simple-rss was
|
151
149
|
# announced.
|
152
150
|
#
|
153
|
-
# = feedtools
|
151
|
+
# = feedtools
|
154
152
|
#
|
155
153
|
# http://rubyforge.org/projects/feedtools/
|
156
154
|
#
|
157
|
-
#
|
155
|
+
# This one solves most of the same problems as Syndication; however the two
|
156
|
+
# were developed in parallel, in ignorance of each other.
|
157
|
+
#
|
158
|
+
# Feedtools builds in database caching and persistence, and HTTP fetching.
|
159
|
+
# Personally, I don't think those belong in a feed parsing library--they
|
160
|
+
# are easily implemented using other standard libraries if you want them.
|
161
|
+
#
|
162
|
+
# Pros:
|
163
|
+
# - Lots of test cases.
|
164
|
+
#
|
165
|
+
# - Used by lots of Rails people.
|
166
|
+
#
|
167
|
+
# - Knows about many more namespaces.
|
168
|
+
#
|
169
|
+
# Cons:
|
170
|
+
# - Skimpy documentation.
|
171
|
+
#
|
172
|
+
# - Uses HTree then XPath parsing, rather than a single stream parse.
|
173
|
+
#
|
174
|
+
# - Tries to unify RSS and Atom APIs, at the expense of Atom functionality.
|
175
|
+
# (Which could also be a pro, depending on your viewpoint.)
|
158
176
|
#
|
159
177
|
# == Design philosophy
|
160
178
|
#
|
@@ -180,6 +198,9 @@
|
|
180
198
|
#
|
181
199
|
# - Get well-formed feeds parsing reliably, then worry about broken feeds.
|
182
200
|
#
|
201
|
+
# - Atom will hopefully be the future. Provide full support for RSS, but don't
|
202
|
+
# hold Atom back by trying to force it into an RSS data model.
|
203
|
+
#
|
183
204
|
# == Future plans
|
184
205
|
#
|
185
206
|
# Here are some possible improvements:
|
@@ -187,12 +208,6 @@
|
|
187
208
|
# - RSS and Atom generation. Create objects, then call Syndication::FeedMaker
|
188
209
|
# to generate XML in various flavors.
|
189
210
|
#
|
190
|
-
# - More lenient parsing. The limiting factor right now appears to be REXML,
|
191
|
-
# which although a non-validating parser, does require fairly well-formed
|
192
|
-
# XML. (In particular, failure to match tags will cause errors.) Perhaps
|
193
|
-
# the answer is to find or build a 'tag soup' parser that implements the
|
194
|
-
# REXML stream parsing API?
|
195
|
-
#
|
196
211
|
# - Faster date parsing. It turns out that when I asked for parsed dates in
|
197
212
|
# my test code, the profiler showed Date.parse chewing up 25% of the total
|
198
213
|
# CPU time used. A more specific date parser that didn't use heuristics
|
@@ -202,7 +217,6 @@
|
|
202
217
|
#
|
203
218
|
# == Feedback
|
204
219
|
#
|
205
|
-
#
|
206
|
-
#
|
207
|
-
# <meta@pobox.com>.
|
220
|
+
# There are doubtless things I could have done better. Comments, suggestions,
|
221
|
+
# etc. are welcome; e-mail <meta@pobox.com>.
|
208
222
|
#
|
data/lib/syndication/atom.rb
CHANGED
data/lib/syndication/common.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
#
|
3
3
|
# Copyright © mathew <meta@pobox.com> 2005.
|
4
4
|
# Licensed under the same terms as Ruby.
|
5
|
+
#
|
6
|
+
# $Header: /var/cvs/syndication/syndication/lib/syndication/common.rb,v 1.3 2005/10/17 15:05:21 meta Exp $
|
5
7
|
|
6
8
|
require 'uri'
|
7
9
|
require 'rexml/parsers/streamparser'
|
@@ -174,8 +176,11 @@ module Syndication
|
|
174
176
|
# Parse the text provided. Returns a Syndication::Atom::Feed or
|
175
177
|
# Syndication::RSS::Feed object, according to which concrete Parser
|
176
178
|
# class is being used.
|
177
|
-
|
178
|
-
|
179
|
+
# The second argument is optional and determines the parser engine to
|
180
|
+
# use. The default is REXML. To use TagSoup, pass in the value
|
181
|
+
# Syndication::TagSoup
|
182
|
+
def parse(text, classname = REXML::Document)
|
183
|
+
classname.parse_stream(text, self)
|
179
184
|
return @parsetree
|
180
185
|
end
|
181
186
|
|
data/lib/syndication/content.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright © mathew <meta@pobox.com> 2005.
|
2
|
+
# Licensed under the same terms as Ruby.
|
3
|
+
#
|
4
|
+
# $Header: /var/cvs/syndication/syndication/lib/syndication/dublincore.rb,v 1.3 2005/10/17 15:05:21 meta Exp $
|
1
5
|
|
2
6
|
module Syndication
|
3
7
|
|
@@ -70,23 +74,25 @@ module Syndication
|
|
70
74
|
end
|
71
75
|
|
72
76
|
#:enddoc:
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
77
|
+
module RSS
|
78
|
+
# Now we mix in the DublinCore elements to all the Syndication classes that
|
79
|
+
# can contain them. There's probably some clever way to do this via
|
80
|
+
# reflection, but there _is_ such a thing as being too clever.
|
81
|
+
class Item
|
82
|
+
include DublinCore
|
83
|
+
end
|
79
84
|
|
80
|
-
|
81
|
-
|
82
|
-
|
85
|
+
class Channel
|
86
|
+
include DublinCore
|
87
|
+
end
|
83
88
|
|
84
|
-
|
85
|
-
|
86
|
-
|
89
|
+
class Image
|
90
|
+
include DublinCore
|
91
|
+
end
|
87
92
|
|
88
|
-
|
89
|
-
|
93
|
+
class TextInput
|
94
|
+
include DublinCore
|
95
|
+
end
|
90
96
|
end
|
91
97
|
|
92
98
|
end
|
data/lib/syndication/podcast.rb
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
# Copyright © mathew <meta@pobox.com> 2005.
|
2
|
+
# Licensed under the same terms as Ruby.
|
3
|
+
#
|
4
|
+
# $Header: /var/cvs/syndication/syndication/lib/syndication/podcast.rb,v 1.2 2005/10/17 15:05:21 meta Exp $
|
5
|
+
|
1
6
|
module Syndication
|
2
7
|
|
3
8
|
# Mixin for iTunes podcast RSS elements.
|
data/lib/syndication/rss.rb
CHANGED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Copyright © mathew <meta@pobox.com> 2005.
|
2
|
+
# Licensed under the same terms as Ruby.
|
3
|
+
#
|
4
|
+
# $Header: /var/cvs/syndication/syndication/lib/syndication/tagsoup.rb,v 1.2 2005/10/17 15:05:21 meta Exp $
|
5
|
+
|
6
|
+
require 'cgi'
|
7
|
+
|
8
|
+
module Syndication
|
9
|
+
|
10
|
+
# TagSoup is a tiny completely non-validating XML parser which implements the
|
11
|
+
# tag_start, tag_end and text methods of the REXML StreamListener interface.
|
12
|
+
#
|
13
|
+
# It's designed for permissive parsing of RSS and Atom feeds; using it for
|
14
|
+
# anything more complex (like HTML with CSS and JavaScript) is not advised.
|
15
|
+
class TagSoup
|
16
|
+
|
17
|
+
# Parse data String and send events to listener
|
18
|
+
def TagSoup.parse_stream(data, listener)
|
19
|
+
data.scan(/(<\/[^>]*>|<[^>]*>|[^<>]*)/m) do |match|
|
20
|
+
thing = match.first.strip
|
21
|
+
if thing[0,1] == '<'
|
22
|
+
# It's a tag_start or tag_end
|
23
|
+
(tag,rest) = thing.match(/<\/?([^>\s]+)([^>]*)/)[1,2]
|
24
|
+
if thing[1,1] == '/'
|
25
|
+
listener.tag_end(tag)
|
26
|
+
else
|
27
|
+
# Parse the attr=val pairs
|
28
|
+
pairs = Hash.new
|
29
|
+
rest.scan(/([\w:]+)=("([^"]*)"|'([^']*)')/) {|a,j,v1,v2|
|
30
|
+
if v1 == nil
|
31
|
+
v = v2
|
32
|
+
else
|
33
|
+
v = v1
|
34
|
+
end
|
35
|
+
if a
|
36
|
+
pairs[a] = v
|
37
|
+
end
|
38
|
+
}
|
39
|
+
listener.tag_start(tag, pairs)
|
40
|
+
end
|
41
|
+
else
|
42
|
+
# It's text
|
43
|
+
listener.text(CGI.unescapeHTML(thing))
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
end
|
data/rakefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
require 'rake/rdoctask'
|
3
|
+
require 'rake/packagetask'
|
4
|
+
require 'rake/gempackagetask'
|
5
|
+
require 'rubygems'
|
6
|
+
|
7
|
+
PKG_VERSION = "0.5.0"
|
8
|
+
|
9
|
+
desc "Create HTML documentation from RDOC"
|
10
|
+
Rake::RDocTask.new do |rd|
|
11
|
+
rd.main = "README"
|
12
|
+
rd.rdoc_files.include("README", "CHANGES", "IMPLEMENTATION", "DEVELOPER",
|
13
|
+
"lib/**/*.rb", "test/**/*.rb", "examples/**/*.rb")
|
14
|
+
end
|
15
|
+
|
16
|
+
desc "Make tar distribution"
|
17
|
+
Rake::PackageTask.new('syndication', PKG_VERSION) do |t|
|
18
|
+
t.need_tar_bz2 = true
|
19
|
+
t.package_files.include("README", "CHANGES", "IMPLEMENTATION", "DEVELOPER", "lib/**/*.rb", "test/**/*.rb", "examples/**/*.rb", "rakefile", "setup.rb")
|
20
|
+
t.package_dir = "pkg"
|
21
|
+
end
|
22
|
+
|
23
|
+
spec = Gem::Specification.new do |s|
|
24
|
+
s.name = "syndication"
|
25
|
+
s.version = PKG_VERSION
|
26
|
+
s.author = "mathew"
|
27
|
+
s.email = "meta@pobox.com"
|
28
|
+
s.homepage = "http://www.pobox.com/~meta/"
|
29
|
+
s.platform = Gem::Platform::RUBY
|
30
|
+
s.summary = "A web syndication parser for Atom and RSS with a uniform API"
|
31
|
+
candidates = Dir.glob("{bin,docs,lib,test,examples}/**/*")
|
32
|
+
candidates << "rakefile"
|
33
|
+
s.files = candidates.delete_if do |item|
|
34
|
+
item.include?("CVS") || item.include?("html")
|
35
|
+
end
|
36
|
+
s.require_path = "lib"
|
37
|
+
s.test_files = ["test/atomtest.rb", "test/rsstest.rb",
|
38
|
+
"test/tagsouptest.rb"]
|
39
|
+
s.has_rdoc = true
|
40
|
+
s.extra_rdoc_files = ["README", "IMPLEMENTATION", "CHANGES", "DEVELOPER"]
|
41
|
+
end
|
42
|
+
|
43
|
+
desc "Make RubyGems gem distribution"
|
44
|
+
Rake::GemPackageTask.new(spec) do |pkg|
|
45
|
+
pkg.need_zip = true
|
46
|
+
pkg.need_tar = true
|
47
|
+
end
|
48
|
+
|
49
|
+
task :default do
|
50
|
+
puts "This is a pure Ruby library, no compilation is required."
|
51
|
+
puts "Try rake --tasks"
|
52
|
+
end
|
data/test/atomtest.rb
CHANGED
data/test/rsstest.rb
CHANGED
data/test/tagsouptest.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
# Copyright © mathew <meta@pobox.com> 2005.
|
2
|
+
# Licensed under the same terms as Ruby.
|
3
|
+
#
|
4
|
+
# $Header: /var/cvs/syndication/syndication/test/tagsouptest.rb,v 1.2 2005/10/17 20:06:51 meta Exp $
|
5
|
+
|
6
|
+
require 'syndication/tagsoup'
|
7
|
+
require 'test/unit'
|
8
|
+
require 'rexml/document'
|
9
|
+
require 'pp'
|
10
|
+
|
11
|
+
module Syndication
|
12
|
+
|
13
|
+
# This class contains the unit tests for the Syndication module.
|
14
|
+
class Tests < Test::Unit::TestCase
|
15
|
+
|
16
|
+
def tag_start(x, pairs)
|
17
|
+
@events << "tag_start(#{x.strip})"
|
18
|
+
lst = nil
|
19
|
+
if pairs
|
20
|
+
for p in pairs
|
21
|
+
if lst
|
22
|
+
lst = lst + ","
|
23
|
+
else
|
24
|
+
lst = ""
|
25
|
+
end
|
26
|
+
lst << "#{p[0]}=#{p[1]}"
|
27
|
+
end
|
28
|
+
@events << "attrs(#{lst})"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def tag_end(x)
|
33
|
+
@events << "tag_end(#{x.strip})"
|
34
|
+
end
|
35
|
+
|
36
|
+
def text(x)
|
37
|
+
@events << "text(#{x.strip})"
|
38
|
+
end
|
39
|
+
|
40
|
+
# Minimal test
|
41
|
+
def test_tagsoup
|
42
|
+
xml = <<-EOF
|
43
|
+
<a>
|
44
|
+
<b>one
|
45
|
+
<c></c></b>
|
46
|
+
<d arg1="alpha">two</d>
|
47
|
+
<e arg2='beta'>
|
48
|
+
three<fourc™
|
49
|
+
</e>
|
50
|
+
</a>
|
51
|
+
<feed xmlns="http://www.w3.org/2005/Atom">
|
52
|
+
<title>One good turn usually gets most of the blanket.</title>
|
53
|
+
<updated>2005-08-20T21:14:38Z</updated>
|
54
|
+
<id>urn:uuid:035d3aa3022c1b1b2a17e37ae2dcc376</id>
|
55
|
+
<entry>
|
56
|
+
<title>Quidquid latine dictum sit, altum viditur.</title>
|
57
|
+
<link href="http://example.com/05/08/20/2114.html"/>
|
58
|
+
<id>urn:uuid:89d96d76a99426264f6f1f520c1b93c2</id>
|
59
|
+
<updated>2005-08-20T21:14:38Z</updated>
|
60
|
+
</entry>
|
61
|
+
</feed>
|
62
|
+
EOF
|
63
|
+
@events = Array.new
|
64
|
+
Syndication::TagSoup.parse_stream(xml, self)
|
65
|
+
@tagsoup = @events
|
66
|
+
@events = Array.new
|
67
|
+
REXML::Document.parse_stream(xml, self)
|
68
|
+
@rexml = @events
|
69
|
+
puts "REXML\n-----"
|
70
|
+
pp @rexml
|
71
|
+
puts "\nTAGSOUP\n-------"
|
72
|
+
pp @tagsoup
|
73
|
+
errs = false
|
74
|
+
for tsevt in @tagsoup
|
75
|
+
rxevt = @rexml.shift
|
76
|
+
if rxevt
|
77
|
+
if tsevt.to_s != rxevt.to_s
|
78
|
+
errs = true
|
79
|
+
puts "TagSoup: [#{tsevt}]\nREXML: [#{rxevt}]"
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
assert(!errs, "TagSoup and REXML parse results didn't match")
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: syndication
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date: 2005-
|
6
|
+
version: 0.5.0
|
7
|
+
date: 2005-10-17 00:00:00 -05:00
|
8
8
|
summary: A web syndication parser for Atom and RSS with a uniform API
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,21 +34,29 @@ files:
|
|
34
34
|
- lib/syndication/common.rb
|
35
35
|
- lib/syndication/podcast.rb
|
36
36
|
- lib/syndication/content.rb
|
37
|
+
- lib/syndication/tagsoup.rb
|
37
38
|
- lib/syndication/rss.rb
|
38
39
|
- lib/syndication/syndication.rb
|
39
40
|
- lib/syndication/atom.rb
|
41
|
+
- test/tagsouptest.rb
|
40
42
|
- test/rsstest.rb
|
41
43
|
- test/atomtest.rb
|
42
44
|
- examples/yahoo.rb
|
45
|
+
- rakefile
|
43
46
|
- README
|
44
47
|
- IMPLEMENTATION
|
48
|
+
- CHANGES
|
49
|
+
- DEVELOPER
|
45
50
|
test_files:
|
46
51
|
- test/atomtest.rb
|
47
52
|
- test/rsstest.rb
|
53
|
+
- test/tagsouptest.rb
|
48
54
|
rdoc_options: []
|
49
55
|
extra_rdoc_files:
|
50
56
|
- README
|
51
57
|
- IMPLEMENTATION
|
58
|
+
- CHANGES
|
59
|
+
- DEVELOPER
|
52
60
|
executables: []
|
53
61
|
extensions: []
|
54
62
|
requirements: []
|