rfeedparser 0.9.8 → 0.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rfeedparser.rb +170 -3345
- data/lib/rfeedparser/aliases.rb +432 -0
- data/lib/rfeedparser/better_attributelist.rb +41 -0
- data/lib/rfeedparser/better_sgmlparser.rb +264 -0
- data/lib/rfeedparser/encoding_helpers.rb +257 -0
- data/lib/rfeedparser/feedparserdict.rb +93 -0
- data/lib/rfeedparser/forgiving_uri.rb +93 -0
- data/lib/rfeedparser/markup_helpers.rb +73 -0
- data/lib/rfeedparser/parser_mixin.rb +1235 -0
- data/lib/rfeedparser/parsers.rb +177 -0
- data/lib/rfeedparser/scrub.rb +207 -0
- data/lib/rfeedparser/time_helpers.rb +408 -0
- data/tests/rfeedparsertest.rb +3 -1
- metadata +3271 -3250
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
module FeedParserUtilities
  # A Hash that understands the different names feed formats use for the
  # same piece of information.
  #
  # The naming of a certain common attribute (such as, "When was the last
  # time this feed was updated?") can differ depending on the type of feed
  # being handled.  This class satisfies both the developer who knows the
  # feed type and the developer who wants one consistent interface.
  #
  # @@keymap maps an alias (the hash key) to the name the value is really
  # stored under (the hash value).  When the value is an Array, any of the
  # listed names may hold the data on lookup, and the first one is used when
  # storing.  #[] and #[]= consult @@keymap whenever they are handed an alias.
  #
  # Missing keys can also be read and written as methods (dict.title,
  # dict.title = 'x') through #method_missing.
  class FeedParserDict < Hash
    @@keymap = {'channel' => 'feed',
                'items' => 'entries',
                'guid' => 'id',
                'date' => 'updated',
                'date_parsed' => 'updated_parsed',
                'description' => ['subtitle', 'summary'],
                'url' => ['href'],
                'modified' => 'updated',
                'modified_parsed' => 'updated_parsed',
                'issued' => 'published',
                'issued_parsed' => 'published_parsed',
                'copyright' => 'rights',
                'copyright_detail' => 'rights_detail',
                'tagline' => 'subtitle',
                'tagline_detail' => 'subtitle_detail'}

    # Hash already has an #entries method (from Enumerable), so the call
    # would never reach #method_missing; route it to the 'entries' key.
    def entries
      return self['entries']
    end

    # Builds a dictionary, optionally pre-populated.
    #
    # pairs - nil, an Array of [key, value] pairs (stored through #[]=, so
    #         aliases are translated), or a Hash (merged as-is, without
    #         alias translation).  Anything else is ignored.
    def initialize(pairs=nil)
      super()
      if pairs.is_a?(Array) and pairs[0].is_a?(Array) and pairs[0].length == 2
        pairs.each { |k, v| self[k] = v }
      elsif pairs.is_a?(Hash)
        merge!(pairs)
      end
    end

    # Looks up key, translating aliases.
    #
    # 'category' and 'categories' are synthesized from the 'tags' entry and
    # yield nil when there are no tags.  For every other key the value
    # stored under the literal key wins over the aliased one, except for
    # list-valued aliases, where the first stored candidate is returned.
    def [](key)
      if key == 'category'
        tags = self['tags']
        first_tag = tags && tags[0]
        return first_tag && first_tag['term']
      end
      if key == 'categories'
        tags = self['tags']
        return tags && tags.collect { |tag| [tag['scheme'], tag['term']] }
      end
      realkey = @@keymap[key] || key
      if realkey.class == Array
        # The block variable must not be called `key`: on Ruby 1.8 that
        # would overwrite the method argument used below.
        realkey.each { |candidate| return self[candidate] if has_key?(candidate) }
      end
      # Note that the original key is preferred over the realkey we (might
      # have) found in @@keymap
      return super(key) if has_key?(key)
      return super(realkey)
    end

    # Stores value under the real name for key (the first candidate when
    # the alias maps to a list).
    def []=(key, value)
      if @@keymap.key?(key)
        key = @@keymap[key]
        key = key[0] if key.class == Array
      end
      super(key, value)
    end

    # Treats unknown methods as key access: `name=` stores, `name` reads.
    # Names ending in `!` or `?`, or starting with `_`, raise NoMethodError.
    def method_missing(msym, *args)
      methodname = msym.to_s
      # Slice with ranges so the comparison is String-vs-String on every
      # Ruby version (String#[] with an Integer returned a Fixnum on 1.8).
      last_char = methodname[-1..-1]
      if last_char == '='
        return self[methodname[0..-2]] = args[0]
      elsif last_char != '!' and last_char != '?' and methodname[0..0] != "_" # FIXME implement with private?
        return self[methodname]
      else
        raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
module URI
  # NOTE: the URI module is reopened here because the author found no better
  # way to reach the module's internals (ABS_URI, REL_URI, InvalidURIError).

  # Forgiving replacement for URI.split.
  #
  # Returns the nine-element array
  #   [scheme, userinfo, host, port, registry, path, opaque, query, fragment]
  # for +uri+.  A string matching neither ABS_URI nor REL_URI does not raise:
  # it is handed back whole in the third slot with every other slot nil.
  # InvalidURIError is still raised for an absolute URI that lacks a scheme
  # or lacks all of opaque part, path, host and registry.
  def self.split(uri)
    scheme = opaque = userinfo = host = port = nil
    registry = path = query = fragment = nil

    case uri
    when ''
      # null uri: nothing to extract

    when ABS_URI
      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
      # absoluteURI   = scheme ":" ( hier_part | opaque_part )
      # hier_part     = ( net_path | abs_path ) [ "?" query ]
      # opaque_part   = uric_no_slash *uric
      # abs_path      = "/"  path_segments
      # net_path      = "//" authority [ abs_path ]
      # authority     = server | reg_name
      # server        = [ [ userinfo "@" ] hostport ]
      scheme, opaque, userinfo, host, port,
        registry, path, query, fragment = Regexp.last_match.captures

      unless scheme
        raise InvalidURIError,
          "bad URI(absolute but no scheme): #{uri}"
      end
      unless opaque || path || host || registry
        raise InvalidURIError,
          "bad URI(absolute but no path): #{uri}"
      end

    when REL_URI
      # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
      # net_path      = "//" authority [ abs_path ]
      # abs_path      = "/"  path_segments
      # rel_path      = rel_segment [ abs_path ]
      # authority     = server | reg_name
      # server        = [ [ userinfo "@" ] hostport ]
      userinfo, host, port, registry,
        rel_segment, abs_path, query, fragment = Regexp.last_match.captures

      path = if rel_segment && abs_path
               rel_segment + abs_path
             else
               rel_segment || abs_path
             end

    else
      # NOTE this is the only part of the code that differs from the "clean"
      # URI module.
      return [nil, nil, uri, nil, nil, nil, nil, nil, nil]
    end

    path = '' unless path || opaque # (see RFC2396 Section 5.2)

    [scheme, userinfo, host, port, registry, path, opaque, query, fragment]
  end
end
|
81
|
+
|
82
|
+
# Resolves +uri+ against +base+ and returns the result as a String.
#
# Surplus slashes directly after "scheme://" in +uri+ are removed before
# joining.  When URI.join raises URI::BadURIError the method falls back to
# +uri+ itself: normalized through URI.parse when +base+ is relative, and
# the slash-fixed string otherwise, so a String is returned instead of the
# implicit nil the fall-through used to produce.
# Other URI errors (e.g. URI::InvalidURIError) still propagate.
def urljoin(base, uri)
  urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
  uri = uri.sub(urifixer, '\1\3')
  begin
    return URI.join(base, uri).to_s
  rescue URI::BadURIError
    return URI.parse(uri).to_s if URI.parse(base).relative?
    return uri
  end
end
|
93
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
module FeedParserUtilities
  #FIXME we need to find a better place for this method

  # Strips ENTITY declarations and the first DOCTYPE from an XML document.
  #
  # Returns [rss_version, stripped_data]:
  #   rss_version   - 'rss091n' when the DOCTYPE mentions "netscape"
  #                   (case-insensitive), otherwise nil
  #   stripped_data - the same document minus every <!ENTITY ...> declaration
  #                   and minus the first <!DOCTYPE ...>
  def stripDoctype(data)
    # the /m flag lets "." span newlines (Regexp::MULTILINE)
    without_entities = data.gsub(/<!ENTITY(.*?)>/m, '')

    doctype_pattern = /<!DOCTYPE(.*?)>/m
    found = doctype_pattern.match(without_entities)
    doctype = found ? found[1] : ''

    version = doctype.downcase.include?('netscape') ? 'rss091n' : nil
    return version, without_entities.sub(doctype_pattern, '')
  end

  # Rewrites the relative URIs found in the URI-carrying attributes of
  # +htmlSource+ so that they are resolved against +baseURI+ (via urljoin),
  # and returns the resulting markup.  +encoding+ is accepted but not used.
  def resolveRelativeURIs(htmlSource, baseURI, encoding)
    $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
    # [element name, attribute name] pairs whose value may be a relative URI
    relative_uris = [
      ['a', 'href'],
      ['applet', 'codebase'],
      ['area', 'href'],
      ['blockquote', 'cite'],
      ['body', 'background'],
      ['del', 'cite'],
      ['form', 'action'],
      ['frame', 'longdesc'],
      ['frame', 'src'],
      ['iframe', 'longdesc'],
      ['iframe', 'src'],
      ['head', 'profile'],
      ['img', 'longdesc'],
      ['img', 'src'],
      ['img', 'usemap'],
      ['input', 'src'],
      ['input', 'usemap'],
      ['ins', 'cite'],
      ['link', 'href'],
      ['object', 'classid'],
      ['object', 'codebase'],
      ['object', 'data'],
      ['object', 'usemap'],
      ['q', 'cite'],
      ['script', 'src'],
    ]
    doc = Hpricot(htmlSource)
    relative_uris.each do |tag_name, attr_name|
      doc.search(tag_name).each do |elem|
        euri = elem.attributes[attr_name]
        next unless euri
        next if euri.empty?
        next unless URI.parse(URI.encode(euri)).relative?
        elem.attributes[attr_name] = urljoin(baseURI, euri)
      end
    end
    return doc.to_html
  end
end
|
72
|
+
|
73
|
+
|
@@ -0,0 +1,1235 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
module FeedParserMixin
|
3
|
+
attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
|
4
|
+
|
5
|
+
# Resets all parser state.
#
# baseuri  - base URI that relative URIs are resolved against ('' when nil)
# baselang - default language; when given it is also stored, with '_'
#            replaced by '-', as the feed's 'language'
# encoding - character encoding of the document (default 'utf-8')
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
  $stderr << "initializing FeedParser\n" if $debug

  # namespace URI => standard prefix ('' marks a core feed namespace)
  @namespaces = {
    '' => '',
    'http://backend.userland.com/rss' => '',
    'http://blogs.law.harvard.edu/tech/rss' => '',
    'http://purl.org/rss/1.0/' => '',
    'http://my.netscape.com/rdf/simple/0.9/' => '',
    'http://example.com/newformat#' => '',
    'http://example.com/necho' => '',
    'http://purl.org/echo/' => '',
    'uri/of/echo/namespace#' => '',
    'http://purl.org/pie/' => '',
    'http://purl.org/atom/ns#' => '',
    'http://www.w3.org/2005/Atom' => '',
    'http://purl.org/rss/1.0/modules/rss091#' => '',
    'http://webns.net/mvcb/' => 'admin',
    'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
    'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
    'http://media.tangent.org/rss/1.0/' => 'audio',
    'http://backend.userland.com/blogChannelModule' => 'blogChannel',
    'http://web.resource.org/cc/' => 'cc',
    'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
    'http://purl.org/rss/1.0/modules/company' => 'co',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://purl.org/rss/1.0/modules/email/' => 'email',
    'http://purl.org/rss/1.0/modules/event/' => 'ev',
    'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
    'http://freshmeat.net/rss/fm/' => 'fm',
    'http://xmlns.com/foaf/0.1/' => 'foaf',
    'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
    'http://postneo.com/icbm/' => 'icbm',
    'http://purl.org/rss/1.0/modules/image/' => 'image',
    'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://purl.org/rss/1.0/modules/link/' => 'l',
    'http://search.yahoo.com/mrss' => 'media',
    'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
    'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
    'http://purl.org/rss/1.0/modules/reference/' => 'ref',
    'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
    'http://purl.org/rss/1.0/modules/search/' => 'search',
    'http://purl.org/rss/1.0/modules/slash/' => 'slash',
    'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
    'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
    'http://hacks.benhammersley.com/rss/streaming/' => 'str',
    'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
    'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
    'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
    'http://purl.org/rss/1.0/modules/threading/' => 'thr',
    'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
    'http://madskills.com/public/xml/rss/module/trackback/' => 'trackback',
    'http://wellformedweb.org/commentAPI/' => 'wfw',
    'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
    'http://www.w3.org/1999/xhtml' => 'xhtml',
    'http://www.w3.org/XML/1998/namespace' => 'xml',
    'http://www.w3.org/1999/xlink' => 'xlink',
    'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
  }

  # same table keyed by the lower-cased URI, for case-insensitive matching
  @matchnamespaces = {}
  @namespaces.each { |ns_uri, ns_prefix| @matchnamespaces[ns_uri.downcase] = ns_prefix }

  @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
  @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @html_types = ['text/html', 'application/xhtml+xml']

  @feeddata = FeedParserDict.new # feed-level data
  @encoding = encoding           # character encoding
  @entries = []                  # list of entry-level data
  @version = ''                  # feed type/version
  @namespacesInUse = {}          # hash of namespaces defined by the feed

  # the following are used internally to track state;
  # this is really out of control and should be refactored
  @infeed = false
  @inentry = false
  @incontent = 0 # a counter, not a flag: pushContent/popContent increment and decrement it
  @intextinput = false
  @inimage = false
  @inauthor = false
  @incontributor = false
  @inpublisher = false
  @insource = false
  @sourcedata = FeedParserDict.new
  @contentparams = FeedParserDict.new
  @summaryKey = nil
  @namespacemap = {}
  @elementstack = []
  @basestack = []
  @langstack = []

  @baseuri = baseuri || ''
  @lang = baselang || nil
  @feeddata['language'] = baselang.gsub('_','-') if baselang

  # names of the date parsing methods
  @date_handlers = [
    :_parse_date_rfc822,
    :_parse_date_hungarian,
    :_parse_date_greek,
    :_parse_date_mssql,
    :_parse_date_nate,
    :_parse_date_onblog,
    :_parse_date_w3dtf,
    :_parse_date_iso8601
  ]
  $stderr << "Leaving startup\n" if $debug # My addition
end
|
112
|
+
|
113
|
+
# Called for every start tag.
#
# tag    - element name as written in the document (may carry a prefix)
# attrsd - the attributes, either a Hash or a list of [name, value] pairs
#
# Normalizes the attributes, tracks xml:base / xml:lang / namespace
# declarations, re-emits the tag as text while inside inline XHTML content,
# and otherwise dispatches to the matching _start_<prefix>_<name> handler,
# falling back to push when no handler exists.
def unknown_starttag(tag, attrsd)
  $stderr << "start #{tag} with #{attrsd}\n" if $debug
  # LooseFeedParser needs this because SGMLParser sends attrs as a
  # list of lists (like [['type','text/html'],['mode','escaped']])
  attrsd = Hash[*attrsd.flatten] if attrsd.class == Array

  # normalize attrs: lower-case every name, and lower-case the value of the
  # 'rel' and 'type' attributes.  A copy of the value is stored so the
  # caller's attribute strings are never modified.
  attrsD = {}
  attrsd.each do |old_k, value|
    k = old_k.downcase
    value = value.downcase if ['rel', 'type'].include?(k) and value
    attrsD[k] = value
  end

  # track xml:base and xml:lang
  baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
  @baseuri = urljoin(@baseuri, baseuri)
  lang = attrsD['xml:lang'] || attrsD['lang']
  if lang == ''
    # xml:lang could be explicitly set to '', we need to capture that
    lang = nil
  elsif lang.nil?
    # if no xml:lang is specified, use parent lang
    lang = @lang
  end
  if lang and not lang.empty?
    if ['feed', 'rss', 'rdf:RDF'].include?(tag)
      @feeddata['language'] = lang.gsub('_','-')
    end
  end
  @lang = lang
  @basestack << @baseuri
  @langstack << lang

  # track namespaces
  attrsd.each do |prefix, uri|
    if /^xmlns:/ =~ prefix # prefix begins with xmlns:
      trackNamespace(prefix[6..-1], uri)
    elsif prefix == 'xmlns'
      trackNamespace(nil, uri)
    end
  end

  # track inline content
  if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
    # Note: probably shouldn't simply recreate localname here, but
    # our namespace handling isn't actually 100% correct in cases where
    # the feed redefines the default namespace (which is actually
    # the usual case for inline content, thanks Sam), so here we
    # cheat and just reconstruct the element based on localname
    # because that compensates for the bugs in our namespace handling.
    # This will horribly munge inline content with non-empty qnames,
    # but nobody actually does that, so I'm not fixing it.
    tag = tag.split(':')[-1]
    attrsA = attrsd.to_a.collect{|l| "#{l[0]}=\"#{l[1]}\""}
    attrsS = ' '+attrsA.join(' ')
    return handle_data("<#{tag}#{attrsS}>", false)
  end

  # match namespaces
  if /:/ =~ tag
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix and not prefix.empty?
    prefix = prefix + '_'
  end

  # special hack for better tracking of empty textinput/image elements in
  # illformed feeds: an un-prefixed element that cannot be a child of
  # textinput/image means that element has ended
  if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description','name'].include?(tag))
    @intextinput = false
  end
  if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?(tag))
    @inimage = false
  end

  # call special handler (if defined) or default handler
  begin
    return send('_start_'+prefix+suffix, attrsD)
  rescue NoMethodError
    return push(prefix + suffix, true)
  end
end # End unknown_starttag
|
206
|
+
|
207
|
+
# Called for every end tag.
#
# Dispatches to the matching _end_<prefix>_<name> handler (falling back to
# pop), re-emits the closing tag as text while inside inline XHTML content,
# and restores the xml:base / xml:lang values of the enclosing element.
def unknown_endtag(tag)
  $stderr << "end #{tag}\n" if $debug
  # match namespaces
  if tag.index(':')
    prefix, suffix = tag.split(':',2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix and not prefix.empty?
    prefix = prefix + '_'
  end

  # call special handler (if defined) or default handler
  begin
    send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
  rescue NoMethodError
    pop(prefix + suffix)
  end

  # track inline content
  if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but it isn't really
    # (same test as in unknown_starttag: only non-XML types are switched)
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
    tag = tag.split(':')[-1]
    handle_data("</#{tag}>", false)
  end

  # track xml:base and xml:lang going out of scope
  if @basestack and not @basestack.empty?
    @basestack.pop
    if @basestack and @basestack[-1] and not (@basestack.empty? or @basestack[-1].empty?)
      @baseuri = @basestack[-1]
    end
  end
  if @langstack and not @langstack.empty?
    @langstack.pop
    if @langstack and not @langstack.empty? # and @langstack[-1] and not @langstack.empty?
      @lang = @langstack[-1]
    end
  end
end
|
251
|
+
|
252
|
+
# LooseParserOnly
# Called for each character reference, e.g. for '&#160;', ref will be '160'.
#
# References to the five XML-special characters (" & ' < >) are appended to
# the current element still escaped; every other reference is appended as
# the UTF-8 encoding of its code point.  Does nothing when no element is open.
# +ref+ itself is left unmodified.
def handle_charref(ref)
  $stderr << "entering handle_charref with #{ref}\n" if $debug
  return if @elementstack.nil? or @elementstack.empty?
  # work on a lower-cased copy: the caller's string may be shared or frozen
  ref = ref.downcase
  chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
  if chars.include?(ref)
    text = "&##{ref};"
  else
    if ref[0..0] == 'x'
      c = (ref[1..-1]).to_i(16)
    else
      c = ref.to_i
    end
    text = [c].pack('U*')
  end
  @elementstack[-1][2] << text
end
|
271
|
+
|
272
|
+
# LooseParserOnly
# Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
#
# The five predefined XML entities are appended to the current element
# still escaped; any other entity is passed through
# HTMLEntities::decode_entities first.  Does nothing when no element is open.
def handle_entityref(ref)
  return if @elementstack.nil? or @elementstack.empty?
  $stderr << "entering handle_entityref with #{ref}\n" if $debug
  entity = "&#{ref};"
  predefined = ['lt', 'gt', 'quot', 'amp', 'apos'].include?(ref)
  text = predefined ? entity : HTMLEntities::decode_entities(entity)
  @elementstack[-1][2] << text
end
|
286
|
+
|
287
|
+
# Called for each block of plain text, i.e. outside of any tag and not
# containing any character or entity references.
#
# text   - the text to append to the innermost open element
# escape - when true and the current content type is
#          'application/xhtml+xml', the text is XML-escaped (String#to_xs)
#          before being appended
#
# Does nothing when no element is open.
def handle_data(text, escape=true)
  return if @elementstack.nil? or @elementstack.empty?
  text = text.to_xs if escape and @contentparams['type'] == 'application/xhtml+xml'
  @elementstack[-1][2] << text
end
|
296
|
+
|
297
|
+
# Called for each comment, e.g. <!-- insert message here -->.
# Intentionally a no-op.
def handle_comment(comment); end

# Receives the text of a processing instruction.  Intentionally a no-op.
def handle_pi(text); end

# Receives the text of a markup declaration.  Intentionally a no-op.
def handle_decl(text); end
|
306
|
+
|
307
|
+
# for LooseFeedParser
# Handles the markup declaration that starts at index +i+ of @rawdata and
# returns the index at which parsing should resume.
#
# A CDATA section has its content XML-escaped and passed to handle_data; an
# unterminated section runs to the end of the data.  Any other declaration
# is skipped up to and including the next '>' (when there is none, 1 is
# returned, because nil.to_i is 0).
def parse_declaration(i)
  $stderr << "entering parse_declaration\n" if $debug
  unless @rawdata[i...i+9] == '<![CDATA['
    return @rawdata.index(/>/,i).to_i + 1
  end

  body_start = i + 9
  body_end = @rawdata.index(/\]\]>/u, body_start) || @rawdata.length
  handle_data(@rawdata[body_start...body_end].to_xs, false)
  return body_end + 3
end
|
320
|
+
|
321
|
+
# Returns the lower-cased MIME type for +contentType+, expanding the Atom
# shorthands 'text', 'html' and 'xhtml' to 'text/plain', 'text/html' and
# 'application/xhtml+xml'.
#
# The argument is not modified (it may be a frozen literal or a string that
# is still referenced from an attribute hash).
def mapContentType(contentType)
  contentType = contentType.downcase
  case contentType
  when 'text'
    contentType = 'text/plain'
  when 'html'
    contentType = 'text/html'
  when 'xhtml'
    contentType = 'application/xhtml+xml'
  end
  return contentType
end
|
333
|
+
|
334
|
+
# Records a namespace declaration.
#
# prefix - the declared prefix, or nil for the default namespace
# uri    - the namespace URI
#
# Some namespaces identify the feed format and set @version when no version
# has been detected yet.  Any URI containing "backend.userland.com/rss" is
# normalized to 'http://backend.userland.com/rss'.  A known URI maps the
# declared prefix to the standard one in @namespacemap; every declaration is
# recorded in @namespacesInUse.
def trackNamespace(prefix, uri)
  loweruri = uri.downcase.strip

  detected = if prefix.nil? and loweruri == 'http://my.netscape.com/rdf/simple/0.9/'
               'rss090'
             elsif loweruri == 'http://purl.org/rss/1.0/'
               'rss10'
             elsif loweruri == 'http://www.w3.org/2005/atom'
               'atom10'
             end

  if detected and (@version.nil? or @version.empty?)
    @version = detected
  elsif /backend\.userland\.com\/rss/ =~ loweruri
    # match any backend.userland.com namespace
    uri = 'http://backend.userland.com/rss'
    loweruri = uri
  end

  unless @matchnamespaces.has_key?(loweruri)
    @namespacesInUse[prefix || ''] = uri
    return
  end
  standard_prefix = @matchnamespaces[loweruri]
  @namespacemap[prefix] = standard_prefix
  @namespacesInUse[standard_prefix] = uri
end
|
355
|
+
|
356
|
+
# Resolves +uri+ against the current base URI ('' when none is set) using
# urljoin and returns the result.
def resolveURI(uri)
  base = @baseuri || ''
  urljoin(base, uri)
end
|
359
|
+
|
360
|
+
# Hook for decoding entities inside the text of +element+.
# This implementation hands +data+ back unchanged.
def decodeEntities(element, data)
  data
end
|
363
|
+
|
364
|
+
# Opens +element+ by pushing a new frame onto @elementstack.
#
# A frame is [element name, expectingText flag, list of text pieces].
def push(element, expectingText)
  frame = [element, expectingText, []]
  @elementstack << frame
end
|
367
|
+
|
368
|
+
# Closes +element+ and returns its accumulated text.
#
# element         - name of the element being closed; nothing happens (nil
#                   is returned) unless it is the innermost open element
# stripWhitespace - strip leading/trailing whitespace from the text
#
# When the element was pushed with expectingText false, the joined text is
# returned right away.  Otherwise the text is base64-decoded, resolved
# against the base URI, sanitized and converted to UTF-8 as applicable, and
# then stored in the current entry or feed context before being returned.
def pop(element, stripWhitespace=true)
  return if @elementstack.nil? or @elementstack.empty?
  return if @elementstack[-1][0] != element

  element, expectingText, pieces = @elementstack.pop
  # the pieces slot may have been replaced with a plain string
  output = (pieces.class == Array) ? pieces.join('') : pieces
  output.strip! if stripWhitespace
  return output unless expectingText

  # decode base64 content
  if @contentparams['base64']
    out64 = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
    output = out64 unless output.empty? or out64.empty?
  end

  # resolve relative URIs
  if @can_be_relative_uri.include?(element) and output and not output.empty?
    output = resolveURI(output)
  end

  # decode entities within embedded markup
  output = decodeEntities(element, output) unless @contentparams['base64']

  # remove temporary cruft from contentparams
  @contentparams.delete('mode')
  @contentparams.delete('base64')

  # resolve relative URIs within embedded markup
  if @html_types.include?(mapContentType(@contentparams['type'] || 'text/html')) and @can_contain_relative_uris.include?(element)
    output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
  end
  # sanitize embedded markup
  if @html_types.include?(mapContentType(@contentparams['type'] || 'text/html')) and @can_contain_dangerous_markup.include?(element)
    output = FeedParser.sanitizeHTML(output, @encoding)
  end

  if @encoding and not @encoding.empty? and @encoding != 'utf-8'
    # FIXME everything is turned into utf-8, not unicode; originally because
    # REXML was being used, now because the alternative is untested.
    output = uconvert(output, @encoding, 'utf-8')
  end

  # categories/tags/keywords/whatever are handled in _end_category
  return output if element == 'category'

  # store output in appropriate place(s)
  if @inentry and not @insource
    entry = @entries[-1]
    case element
    when 'content'
      entry[element] ||= []
      contentparams = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
      contentparams['value'] = output
      entry[element] << contentparams
    when 'link'
      entry[element] = output
      entry['links'][-1]['href'] = output if output and not output.empty?
    else
      element = 'summary' if element == 'description'
      entry[element] = output
      if @incontent != 0
        contentparams = Marshal.load(Marshal.dump(@contentparams))
        contentparams['value'] = output
        entry[element + '_detail'] = contentparams
      end
    end
  elsif (@infeed or @insource) and not @intextinput and not @inimage
    context = getContext()
    element = 'subtitle' if element == 'description'
    context[element] = output
    if element == 'link'
      context['links'][-1]['href'] = output
    elsif @incontent != 0
      contentparams = Marshal.load(Marshal.dump(@contentparams))
      contentparams['value'] = output
      context[element + '_detail'] = contentparams
    end
  end
  return output
end
|
460
|
+
|
461
|
+
# Opens a content-bearing element.
#
# Increments the @incontent nesting counter, rebuilds @contentparams (type,
# language, base and the base64 flag) from the element's attributes, then
# pushes the element.
#
# tag                - element name to push
# attrsD             - normalized attributes of the element
# defaultContentType - type used when the element has no 'type' attribute
# expectingText      - forwarded to push
def pushContent(tag, attrsD, defaultContentType, expectingText)
  @incontent += 1 # a nesting counter rather than a flag
  declared_type = attrsD['type'] || defaultContentType
  @contentparams = FeedParserDict.new({
    'type' => mapContentType(declared_type),
    'language' => @lang,
    'base' => @baseuri
  })
  @contentparams['base64'] = isBase64(attrsD, @contentparams)
  push(tag, expectingText)
end
|
468
|
+
|
469
|
+
# Closes a content-bearing element: pops +tag+, decrements the @incontent
# nesting counter, empties @contentparams and returns the popped value.
def popContent(tag)
  popped = pop(tag)
  @incontent -= 1
  @contentparams.clear
  popped
end
|
475
|
+
|
476
|
+
# Rewrites the namespace prefix of a qualified +name+ ("prefix:suffix") to
# the standard prefix recorded in @namespacemap, when there is one.
# Names without a colon are returned unchanged.
def mapToStandardPrefix(name)
  colonpos = name.index(':')
  if colonpos
    # exclusive range: with the colon at index 0 the prefix is '' rather
    # than the whole string (name[0..-1])
    prefix = name[0...colonpos]
    suffix = name[colonpos+1..-1]
    prefix = @namespacemap[prefix] || prefix
    name = prefix + ':' + suffix
  end
  return name
end
|
486
|
+
|
487
|
+
# Fetches attribute +name+ from +attrsD+ after translating the name's
# namespace prefix with mapToStandardPrefix.
def getAttribute(attrsD, name)
  standard_name = mapToStandardPrefix(name)
  attrsD[standard_name]
end
|
490
|
+
|
491
|
+
# Decides whether an element's content is base64-encoded.
#
# True when the element declares mode="base64"; otherwise false for types
# starting with "text/" or ending in "+xml" or "/xml", and true for
# everything else.
def isBase64(attrsD, contentparams)
  return true if attrsD['mode'] == 'base64'
  textual = /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
  textual ? false : true
end
|
498
|
+
|
499
|
+
# Normalizes the link attribute of +attrsD+ in place: the first value found
# under 'url', 'uri' or 'href' (in that order) is stored as 'href' and the
# 'url' / 'uri' entries are removed.  Returns +attrsD+.
def itsAnHrefDamnIt(attrsD)
  href = attrsD['url'] || attrsD['uri'] || attrsD['href']
  return attrsD unless href
  ['url', 'uri'].each { |legacy| attrsD.delete(legacy) }
  attrsD['href'] = href
  attrsD
end
|
508
|
+
|
509
|
+
|
510
|
+
# Stores +value+ under +key+ in the current context (see getContext) unless
# that key already holds a truthy value.
def _save(key, value)
  getContext()[key] ||= value
end
|
514
|
+
|
515
|
+
# Handles <rss>: derives the feed version from the 'version' attribute
# unless a version has already been detected.
#
# 0.91-0.94 map to 'rss091u', 'rss092', 'rss093', 'rss094'; any "2.x"
# becomes 'rss20'; everything else is plain 'rss'.
def _start_rss(attrsD)
  return if @version and not @version.empty?

  versionmap = {
    '0.91' => 'rss091u',
    '0.92' => 'rss092',
    '0.93' => 'rss093',
    '0.94' => 'rss094'
  }
  attr_version = attrsD['version'] || ''
  @version = versionmap[attr_version] || (/^2\./ =~ attr_version ? 'rss20' : 'rss')
end
|
534
|
+
|
535
|
+
# Handles <dlhottitles>: sets the feed version to 'hotrss'.
# The attributes are not used.
def _start_dlhottitles(attrsD)
  @version = 'hotrss'
end
|
538
|
+
|
539
|
+
# Handles <channel> (and <feedinfo>): marks the parser as being inside the
# feed-level element and processes the CDF-style attributes.
def _start_channel(attrsD)
  @infeed = true
  _cdf_common(attrsD)
end
alias :_start_feedinfo :_start_channel
|
544
|
+
|
545
|
+
# Handles attributes shared by CDF-style elements.
#
# A 'lastmod' attribute is processed as if it were the text of a <modified>
# child element, and an 'href' attribute as if it were the text of a <link>
# child: the corresponding start handler is called, the pieces slot of the
# frame on top of @elementstack is replaced with the attribute value, and
# the end handler is called.
def _cdf_common(attrsD)
  if attrsD.key?('lastmod')
    _start_modified({})
    @elementstack[-1][-1] = attrsD['lastmod']
    _end_modified
  end
  if attrsD.key?('href')
    _start_link({})
    @elementstack[-1][-1] = attrsD['href']
    _end_link
  end
end
|
557
|
+
|
558
|
+
# Handles <feed>: marks the parser as being inside the feed-level element
# and, unless a version has already been detected, derives the Atom version
# from the 'version' attribute ('0.1'-'0.3' map to 'atom01'-'atom03';
# anything else is plain 'atom').
def _start_feed(attrsD)
  @infeed = true
  versionmap = {'0.1' => 'atom01',
                '0.2' => 'atom02',
                '0.3' => 'atom03'
  }

  if not @version or @version.empty?
    attr_version = attrsD['version']
    # decide on the value just looked up, not on @version, which is known
    # to be empty inside this branch
    version = versionmap[attr_version]
    if version and not version.empty?
      @version = version
    else
      @version = 'atom'
    end
  end
end
|
575
|
+
|
576
|
+
# Handler for </channel> (and, via alias, </feed>): clears the flag that
# says parsing is inside the feed-level element.
def _end_channel
  @infeed = false
end
alias _end_feed _end_channel
|
580
|
+
|
581
|
+
# Handler for <image>: flags that parsing is inside an image, pushes the
# element, and makes sure the current context has an 'image' dictionary.
def _start_image(attrsD)
  @inimage = true
  push('image', false)
  current = getContext()
  current['image'] ||= FeedParserDict.new
end
|
587
|
+
|
588
|
+
# Handler for </image>: pops the element, then clears the in-image flag.
def _end_image
  pop 'image'
  @inimage = false
end
|
592
|
+
|
593
|
+
# Handler for <textinput>/<textInput>: flags that parsing is inside a text
# input, pushes the element, and makes sure the current context has a
# 'textinput' dictionary.
def _start_textinput(attrsD)
  @intextinput = true
  push('textinput', false)
  current = getContext()
  current['textinput'] ||= FeedParserDict.new
end
alias _start_textInput _start_textinput
|
600
|
+
|
601
|
+
# Handler for </textinput>/</textInput>: pops the element, then clears the
# in-text-input flag.
def _end_textinput
  pop 'textinput'
  @intextinput = false
end
alias _end_textInput _end_textinput
|
606
|
+
|
607
|
+
# Handler for the opening of an author-like element (author,
# managingeditor, dc:author, dc:creator, itunes:author): sets the in-author
# flag and pushes 'author' expecting text.
def _start_author(attrsD)
  @inauthor = true
  push 'author', true
end
alias _start_managingeditor _start_author
alias _start_dc_author _start_author
alias _start_dc_creator _start_author
alias _start_itunes_author _start_author
|
615
|
+
|
616
|
+
# Handler for the close of an author-like element: pops 'author', clears
# the in-author flag, and reconciles the author string with author_detail.
def _end_author
  pop 'author'
  @inauthor = false
  _sync_author_detail
end
alias _end_managingeditor _end_author
alias _end_dc_author _end_author
alias _end_dc_creator _end_author
alias _end_itunes_author _end_author
|
625
|
+
|
626
|
+
# Handler for <itunes:owner>: sets the in-publisher flag and pushes
# 'publisher' without expecting text.
def _start_itunes_owner(attrsD)
  @inpublisher = true
  push 'publisher', false
end
|
630
|
+
|
631
|
+
# Handler for </itunes:owner>: pops 'publisher', clears the in-publisher
# flag, and reconciles the publisher string with publisher_detail.
def _end_itunes_owner
  pop 'publisher'
  @inpublisher = false
  _sync_author_detail 'publisher'
end
|
636
|
+
|
637
|
+
# Handler for <contributor>: sets the in-contributor flag, appends a fresh
# dictionary to the context's 'contributors' list (creating the list when
# needed), and pushes 'contributor'.
def _start_contributor(attrsD)
  @incontributor = true
  current = getContext()
  (current['contributors'] ||= []) << FeedParserDict.new
  push('contributor', false)
end
|
644
|
+
|
645
|
+
# Handler for </contributor>: pops the element, then clears the
# in-contributor flag.
def _end_contributor
  pop 'contributor'
  @incontributor = false
end
|
649
|
+
|
650
|
+
# Handler for <dc:contributor>: sets the in-contributor flag, appends a
# fresh dictionary to the context's 'contributors' list (creating the list
# when needed), and pushes 'name' so the element text is handled as a name.
def _start_dc_contributor(attrsD)
  @incontributor = true
  current = getContext()
  (current['contributors'] ||= []) << FeedParserDict.new
  push('name', false)
end
|
657
|
+
|
658
|
+
# Handler for </dc:contributor>: delegates to _end_name to store the text,
# then clears the in-contributor flag.
def _end_dc_contributor
  _end_name()
  @incontributor = false
end
|
662
|
+
|
663
|
+
# Handler for <name> (and <itunes:name>): pushes 'name' without expecting
# text.
def _start_name(attrsD)
  push 'name', false
end
alias _start_itunes_name _start_name
|
667
|
+
|
668
|
+
# Handler for </name> (and </itunes:name>): pops the text and files it with
# the first enclosing construct that is active, checked in this order:
# publisher, author, contributor, text input.
def _end_name
  value = pop('name')
  case
  when @inpublisher   then _save_author('name', value, 'publisher')
  when @inauthor      then _save_author('name', value)
  when @incontributor then _save_contributor('name', value)
  when @intextinput   then getContext()['textinput']['name'] = value
  end
end
alias _end_itunes_name _end_name
|
682
|
+
|
683
|
+
# Handler for <width>: pushes 'width' without expecting text.
def _start_width(attrsD)
  push 'width', false
end
|
686
|
+
|
687
|
+
# Handler for </width>: pops the text, converts it with to_i, and stores it
# as the image width when parsing is inside an image.
def _end_width
  width = pop('width').to_i
  return unless @inimage

  getContext()['image']['width'] = width
end
|
694
|
+
|
695
|
+
# Handler for <height>: pushes 'height' without expecting text.
def _start_height(attrsD)
  push 'height', false
end
|
698
|
+
|
699
|
+
# Handler for </height>: pops the text, converts it with to_i, and stores
# it as the image height when parsing is inside an image.
def _end_height
  height = pop('height').to_i
  return unless @inimage

  getContext()['image']['height'] = height
end
|
706
|
+
|
707
|
+
# Handler for <url> (and <homepage>, <uri>): pushes 'href' expecting text.
def _start_url(attrsD)
  push 'href', true
end
alias _start_homepage _start_url
alias _start_uri _start_url
|
712
|
+
|
713
|
+
# Handler for </url> (and </homepage>, </uri>): pops the text and files it
# with the first enclosing construct that is active, checked in this order:
# author, contributor, image (as 'href'), text input (as 'link').
def _end_url
  value = pop('href')
  case
  when @inauthor      then _save_author('href', value)
  when @incontributor then _save_contributor('href', value)
  when @inimage       then getContext()['image']['href'] = value
  when @intextinput   then getContext()['textinput']['link'] = value
  end
end
alias _end_homepage _end_url
alias _end_uri _end_url
|
729
|
+
|
730
|
+
# Handler for <email> (and <itunes:email>): pushes 'email' without
# expecting text.
def _start_email(attrsD)
  push 'email', false
end
alias _start_itunes_email _start_email
|
734
|
+
|
735
|
+
# Handler for </email> (and </itunes:email>): pops the text and files it
# with the first enclosing construct that is active, checked in this order:
# publisher, author, contributor.
def _end_email
  value = pop('email')
  case
  when @inpublisher   then _save_author('email', value, 'publisher')
  when @inauthor      then _save_author('email', value)
  when @incontributor then _save_contributor('email', value)
  end
end
alias _end_itunes_email _end_email
|
746
|
+
|
747
|
+
# Returns the dictionary that parsed values should currently be written to:
# @sourcedata while inside a source element, otherwise the last entry while
# inside an entry, otherwise @feeddata.
def getContext
  return @sourcedata if @insource
  return @entries[-1] if @inentry

  @feeddata
end
|
757
|
+
|
758
|
+
# Writes key/value into the "<prefix>_detail" dictionary of the current
# context (creating the dictionary when missing), then calls
# _sync_author_detail with its default key.
def _save_author(key, value, prefix='author')
  detail_key = prefix + '_detail'
  current = getContext()
  current[detail_key] ||= FeedParserDict.new
  current[detail_key][key] = value
  _sync_author_detail
end
|
764
|
+
|
765
|
+
# Writes key/value into the most recent contributor dictionary of the
# current context, seeding the 'contributors' list with one empty
# dictionary when it does not exist yet.
def _save_contributor(key, value)
  current = getContext()
  current['contributors'] ||= [FeedParserDict.new]
  latest = current['contributors'][-1]
  latest[key] = value
end
|
770
|
+
|
771
|
+
# Keeps context[key] (a display string) and context["<key>_detail"]
# (a dictionary with 'name' and 'email') consistent with each other.
#
# * When a non-empty detail dictionary exists, the display string is
#   rebuilt from it: "name (email)" when both are non-empty, otherwise
#   whichever one is non-empty.
# * Otherwise the display string is searched for an e-mail address; when
#   one is found it is split off and the remainder (stripped of
#   surrounding parentheses and whitespace) becomes the name in a newly
#   populated detail dictionary. When no address is found nothing changes.
#
# key:: 'author' (default) or another prefix such as 'publisher'.
def _sync_author_detail(key='author')
  context = getContext()
  detail = context["#{key}_detail"]
  if detail and not detail.empty?
    name = detail['name']
    email = detail['email']

    # Both parts must be non-empty. The earlier code tested name.empty?
    # twice, so an empty e-mail produced strings such as "John ()".
    if name and email and not (name.empty? or email.empty?)
      context[key] = "#{name} (#{email})"
    elsif name and not name.empty?
      context[key] = name
    elsif email and not email.empty?
      context[key] = email
    end
  else
    author = context[key].dup unless context[key].nil?
    return if not author or author.empty?
    emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
    # An author string without an e-mail address yields no match; bail out
    # instead of calling [] on nil.
    return if emailmatch.nil?
    email = emailmatch[1]
    # Remove the address, then any leftover "()" wrapper and whitespace.
    author.gsub!(email, '')
    author.gsub!("\(\)", '')
    author.strip!
    author.gsub!(/^\(/,'')
    author.gsub!(/\)$/,'')
    author.strip!
    context["#{key}_detail"] ||= FeedParserDict.new
    context["#{key}_detail"]['name'] = author
    context["#{key}_detail"]['email'] = email
  end
end
|
801
|
+
|
802
|
+
# Handler for <subtitle> (and <tagline>, <itunes:subtitle>): begins
# collecting 'subtitle' content with a default type of text/plain.
def _start_subtitle(attrsD)
  pushContent 'subtitle', attrsD, 'text/plain', true
end
alias _start_tagline _start_subtitle
alias _start_itunes_subtitle _start_subtitle
|
807
|
+
|
808
|
+
# Handler for </subtitle> (and </tagline>, </itunes:subtitle>): finishes
# collecting 'subtitle' content.
def _end_subtitle
  popContent 'subtitle'
end
alias _end_tagline _end_subtitle
alias _end_itunes_subtitle _end_subtitle
|
813
|
+
|
814
|
+
# Handler for <rights> (and <dc:rights>, <copyright>): begins collecting
# 'rights' content with a default type of text/plain.
def _start_rights(attrsD)
  pushContent 'rights', attrsD, 'text/plain', true
end
alias _start_dc_rights _start_rights
alias _start_copyright _start_rights
|
819
|
+
|
820
|
+
# Handler for </rights> (and </dc:rights>, </copyright>): finishes
# collecting 'rights' content.
def _end_rights
  popContent 'rights'
end
alias _end_dc_rights _end_rights
alias _end_copyright _end_rights
|
825
|
+
|
826
|
+
# Handler for <item> (and <entry>, <product>): appends a new entry
# dictionary, pushes 'item', sets the in-entry flag, resets the
# guid-is-link flag, records a non-empty rdf:about attribute as the entry
# id, and lets _cdf_common process the attributes.
def _start_item(attrsD)
  @entries << FeedParserDict.new
  push('item', false)
  @inentry = true
  @guidislink = false

  about = getAttribute(attrsD, 'rdf:about')
  getContext()['id'] = about if about && !about.empty?

  _cdf_common(attrsD)
end
alias _start_entry _start_item
alias _start_product _start_item
|
840
|
+
|
841
|
+
# Handler for </item> (and </entry>): pops 'item', then clears the in-entry
# flag.
def _end_item
  pop 'item'
  @inentry = false
end
alias _end_entry _end_item
|
846
|
+
|
847
|
+
# Handler for <dc:language> (and <language>): pushes 'language' expecting
# text.
def _start_dc_language(attrsD)
  push 'language', true
end
alias _start_language _start_dc_language
|
851
|
+
|
852
|
+
# Handler for </dc:language> (and </language>): pops the text and keeps it
# in @lang.
def _end_dc_language
  language = pop('language')
  @lang = language
end
alias _end_language _end_dc_language
|
856
|
+
|
857
|
+
# Handler for <dc:publisher> (and <webmaster>): pushes 'publisher'
# expecting text.
def _start_dc_publisher(attrsD)
  push 'publisher', true
end
alias _start_webmaster _start_dc_publisher
|
861
|
+
|
862
|
+
# Handler for </dc:publisher> (and </webmaster>): pops 'publisher' and
# reconciles the publisher string with publisher_detail.
def _end_dc_publisher
  pop 'publisher'
  _sync_author_detail 'publisher'
end
alias _end_webmaster _end_dc_publisher
|
867
|
+
|
868
|
+
# Handler for <published> (and <dcterms:issued>, <issued>): pushes
# 'published' expecting text.
def _start_published(attrsD)
  push 'published', true
end
alias _start_dcterms_issued _start_published
alias _start_issued _start_published
|
873
|
+
|
874
|
+
# Handler for </published> (and </dcterms:issued>, </issued>): pops the
# text, runs it through parse_date, and saves the result as
# 'published_parsed'.
def _end_published
  raw = pop('published')
  parsed = parse_date(raw)
  _save('published_parsed', parsed)
end
alias _end_dcterms_issued _end_published
alias _end_issued _end_published
|
880
|
+
|
881
|
+
# Handler for <updated> (and <modified>, <dcterms:modified>, <pubdate>,
# <dc:date>): pushes 'updated' expecting text.
def _start_updated(attrsD)
  push 'updated', true
end
alias _start_modified _start_updated
alias _start_dcterms_modified _start_updated
alias _start_pubdate _start_updated
alias _start_dc_date _start_updated
|
888
|
+
|
889
|
+
# Handler for </updated> (and </modified>, </dcterms:modified>,
# </pubdate>, </dc:date>): pops the text, runs it through parse_date, and
# saves the result as 'updated_parsed'.
def _end_updated
  raw = pop('updated')
  parsed = parse_date(raw)
  _save('updated_parsed', parsed)
end
alias _end_modified _end_updated
alias _end_dcterms_modified _end_updated
alias _end_pubdate _end_updated
alias _end_dc_date _end_updated
|
897
|
+
|
898
|
+
# Handler for <created> (and <dcterms:created>): pushes 'created' expecting
# text.
def _start_created(attrsD)
  push 'created', true
end
alias _start_dcterms_created _start_created
|
902
|
+
|
903
|
+
# Handler for </created> (and </dcterms:created>): pops the text, runs it
# through parse_date, and saves the result as 'created_parsed'.
def _end_created
  raw = pop('created')
  parsed = parse_date(raw)
  _save('created_parsed', parsed)
end
alias _end_dcterms_created _end_created
|
908
|
+
|
909
|
+
# Handler for <expirationdate>: pushes 'expired' expecting text.
def _start_expirationdate(attrsD)
  push 'expired', true
end
|
912
|
+
# Handler for </expirationdate>: pops the text, runs it through parse_date,
# and saves the result as 'expired_parsed'.
def _end_expirationdate
  raw = pop('expired')
  parsed = parse_date(raw)
  _save('expired_parsed', parsed)
end
|
915
|
+
|
916
|
+
# Handler for <cc:license>: pushes 'license'; when the element carries a
# non-empty rdf:resource attribute, that value is appended to the pieces of
# the top element-stack entry and the element is popped immediately.
def _start_cc_license(attrsD)
  push('license', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  return if resource.nil? || resource.empty?

  @elementstack[-1][2] << resource
  pop('license')
end
|
924
|
+
|
925
|
+
# Handler for <creativecommons:license>: pushes 'license' expecting text.
def _start_creativecommons_license(attrsD)
  push 'license', true
end
|
928
|
+
|
929
|
+
# Handler for </creativecommons:license>: pops 'license'.
def _end_creativecommons_license
  pop 'license'
end
|
932
|
+
|
933
|
+
# Adds a tag dictionary ('term', 'scheme', 'label') to the current
# context's 'tags' list. The list is created when missing, even if nothing
# ends up being added. Nothing is added when all three values are nil or
# empty, or when an equal tag is already in the list.
def addTag(term, scheme, label)
  context = getContext()
  context['tags'] ||= []
  existing = context['tags']

  blank = lambda { |text| text.nil? || text.empty? }
  return if blank.call(term) && blank.call(scheme) && blank.call(label)

  candidate = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  return if existing.include?(candidate)

  context['tags'] << candidate
end
|
945
|
+
|
946
|
+
# Handler for <category> (and <dc:subject>, <keywords>): registers a tag
# built from the 'term', 'scheme' (falling back to 'domain') and 'label'
# attributes, then pushes 'category' expecting text. Emits a trace line on
# stderr when $debug is set.
def _start_category(attrsD)
  $stderr << "entering _start_category with #{attrsD}\n" if $debug

  addTag(attrsD['term'], attrsD['scheme'] || attrsD['domain'], attrsD['label'])
  push('category', true)
end
alias _start_dc_subject _start_category
alias _start_keywords _start_category
|
957
|
+
|
958
|
+
# Handler for </itunes:keywords>: pops the text, splits it on whitespace,
# and registers each piece as a tag under the iTunes scheme.
def _end_itunes_keywords
  keywords = pop('itunes_keywords').split
  keywords.each { |keyword| addTag(keyword, 'http://www.itunes.com/', nil) }
end
|
963
|
+
|
964
|
+
# Handler for <itunes:category>: registers the 'text' attribute as a tag
# under the iTunes scheme, then pushes 'category' expecting text.
def _start_itunes_category(attrsD)
  category_text = attrsD['text']
  addTag(category_text, 'http://www.itunes.com/', nil)
  push('category', true)
end
|
968
|
+
|
969
|
+
# Handler for </category> (and </dc:subject>, </keywords>,
# </itunes:category>): pops the element text and, when it is non-empty,
# either fills it in as the 'term' of the most recent tag (if that tag has
# no term yet) or registers it as a new tag of its own.
def _end_category
  value = pop('category')
  return if value.nil? or value.empty?
  context = getContext()
  # Treat a missing tag list as empty so the text is still recorded via
  # addTag instead of raising on nil.
  tags = context['tags'] || []
  # value is known to be non-empty here, so only the tag list needs
  # checking. The condition previously ended in ':', a Ruby 1.8-only
  # spelling of `then` that later interpreters reject.
  if not tags.empty? and not tags[-1]['term']
    tags[-1]['term'] = value
  else
    addTag(value, nil, nil)
  end
end
alias :_end_dc_subject :_end_category
alias :_end_keywords :_end_category
alias :_end_itunes_category :_end_category
|
983
|
+
|
984
|
+
# Handler for <cloud>: stores the element's attributes in the current
# context under 'cloud'.
def _start_cloud(attrsD)
  cloud = FeedParserDict.new(attrsD)
  getContext()['cloud'] = cloud
end
|
987
|
+
|
988
|
+
# Handler for <link> (and <producturl>). Defaults 'rel' to 'alternate' and
# 'type' to 'text/html', normalizes url/uri into 'href', resolves the href,
# and appends the attributes to the context's 'links' list. Links with
# rel="enclosure" are also passed to _start_enclosure. When an href is
# present and the link is an alternate with an HTML-ish type, it becomes
# the context's 'link'; when no href is present the element is pushed so
# its text can be collected.
def _start_link(attrsD)
  attrsD['rel'] ||= 'alternate'
  attrsD['type'] ||= 'text/html'
  attrsD = itsAnHrefDamnIt(attrsD)
  attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')

  expectingText = @infeed || @inentry || @insource
  context = getContext()
  context['links'] ||= []
  context['links'] << FeedParserDict.new(attrsD)

  _start_enclosure(attrsD) if attrsD['rel'] == 'enclosure'

  if !attrsD.has_key?('href')
    push('link', expectingText)
  elsif attrsD['rel'] == 'alternate' && @html_types.include?(mapContentType(attrsD['type']))
    context['link'] = attrsD['href']
  end
end
alias _start_producturl _start_link
|
1012
|
+
|
1013
|
+
# Handler for </link> (and </producturl>): pops the text and copies it into
# the text input and/or image dictionaries when parsing is inside them.
def _end_link
  value = pop('link')
  context = getContext()
  context['textinput']['link'] = value if @intextinput
  context['image']['link'] = value if @inimage
end
alias _end_producturl _end_link
|
1024
|
+
|
1025
|
+
# Handler for <guid>: records whether the guid doubles as a link (true when
# 'ispermalink' is absent or exactly 'true') and pushes 'id' expecting text.
def _start_guid(attrsD)
  permalink_flag = attrsD['ispermalink'] || 'true'
  @guidislink = (permalink_flag == 'true')
  push('id', true)
end
|
1029
|
+
|
1030
|
+
# Handler for </guid>: pops the id text, saves 'guidislink' (true only when
# the guid is a permalink and the context has no 'link' yet), and, for
# permalink guids, saves the id as 'link'.
def _end_guid
  value = pop('id')
  _save('guidislink', (@guidislink and not getContext().has_key?('link')))
  # The condition previously ended in ':', a Ruby 1.8-only spelling of
  # `then` that later interpreters reject.
  if @guidislink
    # guid acts as link, but only if 'ispermalink' is not present or is 'true',
    # and only if the item doesn't already have a link element
    _save('link', value)
  end
end
|
1039
|
+
|
1040
|
+
|
1041
|
+
# Handler for <title> (and <dc:title>, <media:title>): begins collecting
# 'title' content as text/plain; text is expected only while inside a feed,
# entry, or source.
def _start_title(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('title', attrsD, 'text/plain', expecting_text)
end
alias _start_dc_title _start_title
alias _start_media_title _start_title
|
1046
|
+
|
1047
|
+
# Handler for </title> (and </dc:title>, </media:title>): finishes
# collecting the title and copies it into the text input dictionary, or
# else the image dictionary, when parsing is inside one of them.
def _end_title
  value = popContent('title')
  context = getContext()
  case
  when @intextinput then context['textinput']['title'] = value
  when @inimage     then context['image']['title'] = value
  end
end
alias _end_dc_title _end_title
alias _end_media_title _end_title
|
1058
|
+
|
1059
|
+
# Handler for <description>. When the context already has a 'summary', the
# element is treated as content (and @summaryKey is set to 'content' so the
# end handler knows); otherwise 'description' content collection starts
# with a default type of text/html.
def _start_description(attrsD)
  unless getContext().has_key?('summary')
    return pushContent('description', attrsD, 'text/html', @infeed || @inentry || @insource)
  end

  @summaryKey = 'content'
  _start_content(attrsD)
end
|
1068
|
+
|
1069
|
+
# Handler for <abstract>: begins collecting 'description' content as
# text/plain; text is expected only while inside a feed, entry, or source.
def _start_abstract(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('description', attrsD, 'text/plain', expecting_text)
end
|
1072
|
+
|
1073
|
+
# Handler for </description> (and </abstract>). When the start handler
# rerouted the element to content (@summaryKey == 'content'), the content
# end handler runs; otherwise the description is finished and copied into
# the text input dictionary, or else the image dictionary, when parsing is
# inside one of them. @summaryKey is always reset afterwards.
def _end_description
  if @summaryKey == 'content'
    _end_content()
  else
    value = popContent('description')
    context = getContext()
    if @intextinput
      context['textinput']['description'] = value
    # The elsif previously ended in ':', a Ruby 1.8-only spelling of `then`
    # that later interpreters reject.
    elsif @inimage
      context['image']['description'] = value
    end
  end
  @summaryKey = nil
end
alias :_end_abstract :_end_description
|
1088
|
+
|
1089
|
+
# Handler for <info> (and <feedburner:browserfriendly>): begins collecting
# 'info' content with a default type of text/plain.
def _start_info(attrsD)
  pushContent 'info', attrsD, 'text/plain', true
end
alias _start_feedburner_browserfriendly _start_info
|
1093
|
+
|
1094
|
+
# Handler for </info> (and </feedburner:browserfriendly>): finishes
# collecting 'info' content.
def _end_info
  popContent 'info'
end
alias _end_feedburner_browserfriendly _end_info
|
1098
|
+
|
1099
|
+
# Handler for <generator>: when attributes are present, normalizes url/uri
# into 'href' and resolves it; stores the attributes as 'generator_detail'
# in the current context and pushes 'generator' expecting text.
def _start_generator(attrsD)
  if attrsD && !attrsD.empty?
    attrsD = itsAnHrefDamnIt(attrsD)
    attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')
  end

  detail = FeedParserDict.new(attrsD)
  getContext()['generator_detail'] = detail
  push('generator', true)
end
|
1109
|
+
|
1110
|
+
# Handler for </generator>: pops the text and records it as the 'name' of
# the context's 'generator_detail', when that dictionary exists.
def _end_generator
  generator_name = pop('generator')
  context = getContext()
  return unless context.has_key?('generator_detail')

  context['generator_detail']['name'] = generator_name
end
|
1117
|
+
|
1118
|
+
# Handler for <admin:generatorAgent>: pushes 'generator', feeds a non-empty
# rdf:resource attribute in as the element's text, pops it straight away,
# and stores the resource as the 'href' of 'generator_detail'.
def _start_admin_generatoragent(attrsD)
  push('generator', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << resource if resource && !resource.empty?
  pop('generator')
  getContext()['generator_detail'] = FeedParserDict.new({'href' => resource})
end
|
1127
|
+
|
1128
|
+
# Handler for <admin:errorReportsTo>: pushes 'errorreportsto', feeds a
# non-empty rdf:resource attribute in as the element's text, and pops it
# straight away.
def _start_admin_errorreportsto(attrsD)
  push('errorreportsto', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << resource if resource && !resource.empty?
  pop('errorreportsto')
end
|
1136
|
+
|
1137
|
+
# Handler for <summary> (and <itunes:summary>). When the context already
# has a 'summary', the element is treated as content; otherwise 'summary'
# content collection starts as text/plain. @summaryKey records which route
# was taken so the end handler can mirror it.
def _start_summary(attrsD)
  summary_seen = getContext().has_key?('summary')
  @summaryKey = summary_seen ? 'content' : 'summary'
  return _start_content(attrsD) if summary_seen

  pushContent(@summaryKey, attrsD, 'text/plain', true)
end
alias _start_itunes_summary _start_summary
|
1148
|
+
|
1149
|
+
# Handler for </summary> (and </itunes:summary>). Runs the content end
# handler when the start handler rerouted the element to content, otherwise
# finishes collecting under @summaryKey (falling back to 'summary').
# @summaryKey is always reset afterwards.
def _end_summary
  # The condition previously ended in ':', a Ruby 1.8-only spelling of
  # `then` that later interpreters reject.
  if @summaryKey == 'content'
    _end_content()
  else
    popContent(@summaryKey || 'summary')
  end
  @summaryKey = nil
end
alias :_end_itunes_summary :_end_summary
|
1158
|
+
|
1159
|
+
# Handler for <enclosure>: normalizes url/uri into 'href', appends the
# attributes to the context's 'enclosures' list (creating it when needed),
# and uses a non-empty href as the context 'id' when no id is set yet.
def _start_enclosure(attrsD)
  attrsD = itsAnHrefDamnIt(attrsD)
  context = getContext()
  context['enclosures'] ||= []
  context['enclosures'] << FeedParserDict.new(attrsD)

  href = attrsD['href']
  return if href.nil? || href.empty?

  context['id'] = href unless context['id']
end
|
1171
|
+
|
1172
|
+
# Handler for <source>: sets the in-source flag, which makes getContext
# hand out @sourcedata. The attributes are not inspected.
def _start_source(attrsD)
  @insource = true
end
|
1175
|
+
|
1176
|
+
# Handler for </source>: clears the in-source flag, stores a deep copy of
# the collected source data (made with a Marshal round trip) under 'source'
# in the now-current context, and empties @sourcedata.
def _end_source
  @insource = false
  snapshot = Marshal.load(Marshal.dump(@sourcedata))
  getContext()['source'] = snapshot
  @sourcedata.clear
end
|
1181
|
+
|
1182
|
+
# Handler for <content>: begins collecting 'content' with a default type of
# text/plain, records a non-empty 'src' attribute in @contentparams, and
# pushes 'content' expecting text.
def _start_content(attrsD)
  pushContent('content', attrsD, 'text/plain', true)
  src = attrsD['src']
  # The condition previously ended in ':', a Ruby 1.8-only spelling of
  # `then` that later interpreters reject.
  if src and not src.empty?
    @contentparams['src'] = src
  end
  push('content', true)
end
|
1190
|
+
|
1191
|
+
# Handler for <prodlink>: begins collecting 'content' with a default type
# of text/html.
def _start_prodlink(attrsD)
  pushContent 'content', attrsD, 'text/html', true
end
|
1194
|
+
|
1195
|
+
# Handler for <body> (and <xhtml:body>): begins collecting 'content' with a
# default type of application/xhtml+xml.
def _start_body(attrsD)
  pushContent 'content', attrsD, 'application/xhtml+xml', true
end
alias _start_xhtml_body _start_body
|
1199
|
+
|
1200
|
+
# Handler for <content:encoded> (and <fullitem>): begins collecting
# 'content' with a default type of text/html.
def _start_content_encoded(attrsD)
  pushContent 'content', attrsD, 'text/html', true
end
alias _start_fullitem _start_content_encoded
|
1204
|
+
|
1205
|
+
# Handler for </content> and the other content-like closing tags aliased
# below. Finishes collecting 'content'; when the content type (after
# mapContentType) is text/plain or one of @html_types, the value is also
# saved as 'description'.
def _end_content
  # Decide before popping: the type is read from @contentparams first, as
  # in the original statement order.
  copyToDescription = (['text/plain'] + @html_types).include? mapContentType(@contentparams['type'])
  value = popContent('content')
  if copyToDescription
    _save('description', value)
  end
end
# These aliases used to sit inside the method body (the closing `end` came
# after them), so none of them existed until _end_content had been called
# once. They are now defined together with the method.
alias :_end_body :_end_content
alias :_end_xhtml_body :_end_content
alias :_end_content_encoded :_end_content
alias :_end_fullitem :_end_content
alias :_end_prodlink :_end_content
|
1217
|
+
|
1218
|
+
# Handler for <itunes:image> (and <itunes:link>): pushes 'itunes_image' and
# replaces the context's 'image' with a dictionary holding only the
# element's 'href' attribute.
def _start_itunes_image(attrsD)
  push('itunes_image', false)
  image = FeedParserDict.new({'href' => attrsD['href']})
  getContext()['image'] = image
end
alias _start_itunes_link _start_itunes_image
|
1223
|
+
|
1224
|
+
# Handler for </itunes:block>: pops the text and stores, under
# 'itunes_block', whether it is exactly 'yes'.
def _end_itunes_block
  value = pop('itunes_block', false)
  is_blocked = (value == 'yes')
  getContext()['itunes_block'] = is_blocked
end
|
1228
|
+
|
1229
|
+
# Handler for </itunes:explicit>: pops the text and stores, under
# 'itunes_explicit', whether it is exactly 'yes'.
def _end_itunes_explicit
  value = pop('itunes_explicit', false)
  is_explicit = (value == 'yes')
  getContext()['itunes_explicit'] = is_explicit
end
|
1233
|
+
end # End FeedParserMixin
|
1234
|
+
|
1235
|
+
|