UnderpantsGnome-rfeedparser 0.9.960

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,106 @@
1
+ #!/usr/bin/env ruby
2
+ module FeedParser
3
# The same attribute (for example "when was this feed last updated?") goes by
# different names depending on the type of feed being handled. This Hash
# subclass lets a developer who knows the feed type use its native names,
# while a developer who wants one consistent interface can use the common
# names.
#
# @@keymap maps an alias to the key (or list of candidate keys) that the value
# is really stored under. #[] and #[]= consult @@keymap to work out which
# attribute the caller "really means" when the requested name is one of its
# keys.
class FeedParserDict < Hash
  @@keymap = {
    'channel' => 'feed',
    'items' => 'entries',
    'guid' => 'id',
    'date' => 'updated',
    'date_parsed' => 'updated_parsed',
    'description' => ['subtitle', 'summary'],
    'url' => ['href'],
    'modified' => 'updated',
    'modified_parsed' => 'updated_parsed',
    'issued' => 'published',
    'issued_parsed' => 'published_parsed',
    'copyright' => 'rights',
    'copyright_detail' => 'rights_detail',
    'tagline' => 'subtitle',
    'tagline_detail' => 'subtitle_detail'
  }

  # Hash already has an #entries method (from Enumerable); keep it reachable
  # as #hash_entries and make #entries return the 'entries' value instead.
  alias :hash_entries :entries
  def entries
    self['entries']
  end

  # Added to avoid deprecated-method warnings (Object#type on old Rubies).
  def type
    self['type']
  end

  # Builds a dict from either a list of [key, value] pairs or another Hash.
  # Pairs are stored through #[]= (so aliases are remapped); a Hash is merged
  # as-is. Any other argument yields an empty dict.
  def initialize(pairs=nil)
    super()
    if pairs.is_a?(Array) && pairs[0].is_a?(Array) && pairs[0].length == 2
      pairs.each { |k, v| self[k] = v }
    elsif pairs.is_a?(Hash)
      merge!(pairs)
    end
  end

  # Looks +key+ up, honouring @@keymap aliases.
  #
  # 'category' returns the term of the first tag and 'categories' returns
  # [scheme, term] pairs for every tag; when no 'tags' are stored they return
  # nil and [] respectively instead of raising NoMethodError on nil.
  def [](key)
    if key == 'category'
      first_tag = (self['tags'] || []).first
      return first_tag && first_tag['term']
    end

    if key == 'categories'
      return (self['tags'] || []).map { |tag| [tag['scheme'], tag['term']] }
    end

    realkey = @@keymap[key] || key
    if realkey.is_a?(Array)
      # An alias with several candidates: the first one present wins.
      realkey.each { |candidate| return super(candidate) if has_key?(candidate) }
    end

    # Note that the original key is preferred over the realkey we (might
    # have) found in @@keymap.
    return super(key) if has_key?(key)

    super(realkey.is_a?(Array) ? key : realkey)
  end

  # Stores +value+ under the key that +key+ aliases (the first candidate when
  # the alias lists several), or under +key+ itself when it is not an alias.
  def []=(key, value)
    mapped = @@keymap[key]
    mapped = mapped[0] if mapped.is_a?(Array)
    super(mapped || key, value)
  end

  # Attribute-style access: dict.foo reads self['foo'] and dict.foo = x writes
  # self['foo']. Names ending in '!' or '?' or starting with '_' are rejected
  # with NoMethodError.
  def method_missing(msym, *args)
    methodname = msym.to_s
    if methodname[-1,1] == '='
      self[methodname[0..-2]] = args[0]
    elsif methodname[-1,1] != '!' && methodname[-1,1] != '?' && methodname[0,1] != "_" # FIXME implement with private?
      self[methodname]
    else
      raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
    end
  end
end
106
+ end
@@ -0,0 +1,75 @@
1
+ module FeedParser
2
# Lenient, SGML-based feed parser.
#
# We write the methods that were in BaseHTMLProcessor in the python code
# in here directly. We do this because if we inherited from
# BaseHTMLProcessor but then included from FeedParserMixin, the methods
# of Mixin would overwrite the methods we inherited from
# BaseHTMLProcessor. This is exactly the opposite of what we want to
# happen!
class LooseFeedParser < BetterSGMLParser
  include FeedParserMixin

  attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse

  Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
    'img', 'input', 'isindex', 'link', 'meta', 'param']
  New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/

  # #feed must map to the parsed feed data, not to the inherited parser's
  # #feed; the inherited method stays reachable as #sgml_feed.
  alias :sgml_feed :feed
  def feed
    @feeddata
  end
  def feed=(data)
    @feeddata = data
  end

  def initialize(baseuri, baselang, encoding)
    startup(baseuri, baselang, encoding)
    super() # Keep the parentheses! No touchy.
  end

  def reset
    @pieces = []
    super
  end

  # Pre-processes +data+ (in place) and hands it to the inherited parser via
  # #sgml_feed:
  # * "<!" that does not open a DOCTYPE, comment or marked section is escaped;
  # * self-closing tags other than the Elements_No_End_Tag ones are expanded
  #   to an explicit open/close pair;
  # * numeric references for the apostrophe and the double quote are replaced
  #   with the literal characters.
  def parse(data)
    data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
    data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
      clean = tag[1..-3].strip
      if Elements_No_End_Tag.include?(clean)
        tag
      else
        '<'+clean+'></'+clean+'>'
      end
    end

    data.gsub!(/&#39;/, "'")
    # &#34; is the double quote; this used to be replaced with an apostrophe.
    data.gsub!(/&#34;/, '"')
    if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
      data = uconvert(data,'utf-8',@encoding)
    end
    sgml_feed(data) # see the alias above
  end

  # Rewrites numeric references for < > & " ' in +data+ (in place) to their
  # named entities; when the current content declares a type that does not
  # end in "xml", those named entities are then replaced by the literal
  # characters. Returns +data+.
  def decodeEntities(element, data)
    data.gsub!('&#60;', '&lt;')
    data.gsub!('&#x3c;', '&lt;')
    data.gsub!('&#62;', '&gt;')
    data.gsub!('&#x3e;', '&gt;')
    data.gsub!('&#38;', '&amp;')
    data.gsub!('&#x26;', '&amp;')
    data.gsub!('&#34;', '&quot;')
    data.gsub!('&#x22;', '&quot;')
    data.gsub!('&#39;', '&apos;')
    data.gsub!('&#x27;', '&apos;')
    if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
      data.gsub!('&lt;', '<')
      data.gsub!('&gt;', '>')
      data.gsub!('&amp;', '&')
      data.gsub!('&quot;', '"')
      data.gsub!('&apos;', "'")
    end
    return data
  end
end
75
+ end
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
# Helpers shared by the feed parsers.
module FeedParserUtilities
  # [element, attribute] pairs whose attribute value is a URI that
  # resolveRelativeURIs rewrites when it is relative. Built once instead of on
  # every call.
  RELATIVE_URI_ATTRIBUTES = [
    ['a','href'],
    ['applet','codebase'],
    ['area','href'],
    ['blockquote','cite'],
    ['body','background'],
    ['del','cite'],
    ['form','action'],
    ['frame','longdesc'],
    ['frame','src'],
    ['iframe','longdesc'],
    ['iframe','src'],
    ['head','profile'],
    ['img','longdesc'],
    ['img','src'],
    ['img','usemap'],
    ['input','src'],
    ['input','usemap'],
    ['ins','cite'],
    ['link','href'],
    ['object','classid'],
    ['object','codebase'],
    ['object','data'],
    ['object','usemap'],
    ['q','cite'],
    ['script','src'],
  ].freeze

  # Strips ENTITY declarations and the first DOCTYPE from an XML document.
  #
  # Returns [rss_version, stripped_data]: rss_version is 'rss091n' when the
  # DOCTYPE mentions "netscape" (case-insensitively) and nil otherwise;
  # stripped_data is a new string, +data+ itself is not modified.
  def stripDoctype(data)
    data = data.gsub(/<!ENTITY(.*?)>/m, '') # m is for Regexp::MULTILINE

    doctype_pattern = /<!DOCTYPE(.*?)>/m
    doctype = data[doctype_pattern, 1] || ''
    version = /netscape/ =~ doctype.downcase ? 'rss091n' : nil

    return version, data.sub(doctype_pattern, '')
  end

  # Parses +htmlSource+ with Hpricot, joins every relative URI found in the
  # attributes listed in RELATIVE_URI_ATTRIBUTES onto +baseURI+ (via urljoin)
  # and returns the resulting HTML. +encoding+ is accepted but not used here.
  def resolveRelativeURIs(htmlSource, baseURI, encoding)
    $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
    h = Hpricot(htmlSource)
    RELATIVE_URI_ATTRIBUTES.each do |ename, eattr|
      h.search(ename).each do |elem|
        euri = elem.attributes[eattr]
        next if euri.nil? || euri.empty?
        # A value that cannot be parsed as a URI is left untouched.
        uri = begin
          Addressable::URI.parse(Addressable::URI.encode(euri))
        rescue StandardError
          nil
        end
        elem.raw_attributes[eattr] = urljoin(baseURI, euri) if uri and uri.relative?
      end
    end
    return h.to_html
  end
end
70
+
71
+
@@ -0,0 +1,10 @@
1
require 'date' # ::Date.gregorian_leap? is used below; do not rely on another file having loaded it

class Time
  class << self
    # Days per month in a non-leap year, indexed by month number (1-12).
    COMMON_YEAR_DAYS_IN_MONTH = [nil, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31].freeze

    # Returns the number of days in +month+ (1-12) of +year+, which defaults
    # to the current year. February yields 29 in Gregorian leap years.
    def days_in_month(month, year = now.year)
      return 29 if month == 2 && ::Date.gregorian_leap?(year)
      COMMON_YEAR_DAYS_IN_MONTH[month]
    end
  end
end
@@ -0,0 +1,80 @@
1
+ gem 'nokogiri', '~>1.2'
2
+ require 'nokogiri'
3
+
4
module FeedParser
  module Nokogiri

    # Raised (and stored in the handler's #exc) when the SAX parser reports an error.
    class NokogiriSyntaxError < StandardError; end

    # Strict parser front-end: feeds data through a Nokogiri SAX parser whose
    # events are delivered to a StrictFeedParserHandler.
    class StrictFeedParser
      attr_reader :handler
      def initialize(baseuri, baselang)
        @handler = StrictFeedParserHandler.new(baseuri, baselang, 'utf-8')
      end

      def parse(data)
        saxparser = ::Nokogiri::XML::SAX::Parser.new(@handler)

        saxparser.parse data
      end
    end

    # SAX document handler that translates parser events into the
    # FeedParserMixin callbacks (unknown_starttag / unknown_endtag / handle_data).
    class StrictFeedParserHandler < ::Nokogiri::XML::SAX::Document
      include FeedParserMixin

      attr_accessor :bozo, :entries, :feeddata, :exc

      def initialize(baseuri, baselang, encoding)
        $stderr.puts "trying Nokogiri::StrictFeedParser" if $debug
        startup(baseuri, baselang, encoding)
        @bozo = false
      end

      def start_element(name, attrs)
        unknown_starttag(qualified_name(name), attrs)
      end

      def characters(text)
        handle_data(text)
      end

      def cdata_block(text)
        handle_data(text)
      end

      # Uses the same name normalisation as #start_element so that the end
      # tag handed to the mixin matches the start tag. (Previously the
      # prefixed name was computed into an unused local and the raw name was
      # passed on.)
      def end_element(name)
        unknown_endtag(qualified_name(name))
      end

      # Marks the feed as not well-formed, remembers the error and raises it.
      def error(error_string)
        @bozo = true
        @exc = NokogiriSyntaxError.new(error_string)
        raise @exc
      end

      private

      # Turns an element name of the form "namespaceuri;localname" (or just
      # "localname") into the downcased "prefix:localname" form, where prefix
      # is looked up in @matchnamespaces. Does not modify +raw+.
      def qualified_name(raw)
        match = /^(([^;]*);)?(.+)$/.match(raw) # Snag namespaceuri from name
        return raw.to_s.downcase unless match

        namespaceuri = (match[2] || '').downcase
        if /backend\.userland\.com\/rss/ =~ namespaceuri
          # match any backend.userland.com namespace
          namespaceuri = 'http://backend.userland.com/rss'
        end
        prefix = @matchnamespaces[namespaceuri]

        localname = match[3]
        localname = prefix + ':' + localname if prefix && !prefix.empty?
        localname.downcase
      end
    end
  end
end
@@ -0,0 +1,1275 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module FeedParser
4
+ module FeedParserMixin
5
+ include FeedParserUtilities
6
+ attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
7
+
8
# (Re)initialises all parser state: the known-namespace tables, the lists of
# elements that need URI resolution / sanitising, the result containers and
# the flags used to track where in the document the parser currently is.
#
# baseuri  - base URI for relative links ('' when nil)
# baselang - default language; also stored, with '_' replaced by '-', as the
#            feed's 'language'
# encoding - character encoding name
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
  $stderr << "initializing FeedParser\n" if $debug

  # namespace URI => canonical prefix ('' for the core feed vocabularies)
  @namespaces = {
    '' => '',
    'http://backend.userland.com/rss' => '',
    'http://blogs.law.harvard.edu/tech/rss' => '',
    'http://purl.org/rss/1.0/' => '',
    'http://my.netscape.com/rdf/simple/0.9/' => '',
    'http://example.com/newformat#' => '',
    'http://example.com/necho' => '',
    'http://purl.org/echo/' => '',
    'uri/of/echo/namespace#' => '',
    'http://purl.org/pie/' => '',
    'http://purl.org/atom/ns#' => '',
    'http://www.w3.org/2005/Atom' => '',
    'http://purl.org/rss/1.0/modules/rss091#' => '',
    'http://webns.net/mvcb/' => 'admin',
    'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
    'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
    'http://media.tangent.org/rss/1.0/' => 'audio',
    'http://backend.userland.com/blogChannelModule' => 'blogChannel',
    'http://web.resource.org/cc/' => 'cc',
    'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
    'http://purl.org/rss/1.0/modules/company' => 'co',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://purl.org/rss/1.0/modules/email/' => 'email',
    'http://purl.org/rss/1.0/modules/event/' => 'ev',
    'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
    'http://freshmeat.net/rss/fm/' => 'fm',
    'http://xmlns.com/foaf/0.1/' => 'foaf',
    'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
    'http://postneo.com/icbm/' => 'icbm',
    'http://purl.org/rss/1.0/modules/image/' => 'image',
    'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://purl.org/rss/1.0/modules/link/' => 'l',
    'http://search.yahoo.com/mrss' => 'media',
    'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
    'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
    'http://purl.org/rss/1.0/modules/reference/' => 'ref',
    'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
    'http://purl.org/rss/1.0/modules/search/' => 'search',
    'http://purl.org/rss/1.0/modules/slash/' => 'slash',
    'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
    'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
    'http://hacks.benhammersley.com/rss/streaming/' => 'str',
    'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
    'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
    'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
    'http://purl.org/rss/1.0/modules/threading/' => 'thr',
    'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
    'http://madskills.com/public/xml/rss/module/trackback/' =>'trackback',
    'http://wellformedweb.org/commentAPI/' => 'wfw',
    'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
    'http://www.w3.org/1999/xhtml' => 'xhtml',
    'http://www.w3.org/XML/1998/namespace' => 'xml',
    'http://www.w3.org/1999/xlink' => 'xlink',
    'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
  }
  # The same table keyed by the downcased URI, for case-insensitive lookups.
  @matchnamespaces = {}
  @namespaces.each { |uri, canonical_prefix| @matchnamespaces[uri.downcase] = canonical_prefix }

  @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
  @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @html_types = ['text/html', 'application/xhtml+xml']

  @feeddata = FeedParserDict.new # feed-level data
  @encoding = encoding # character encoding
  @entries = [] # list of entry-level data
  @version = '' # feed type/version
  @namespacesInUse = {} # hash of namespaces defined by the feed

  # Internal state tracking.
  @infeed = false
  @inentry = false
  @incontent = 0 # a nesting counter, see pushContent / popContent
  @intextinput = false
  @inimage = false
  @inauthor = false
  @incontributor = false
  @inpublisher = false
  @insource = false
  @sourcedata = FeedParserDict.new
  @contentparams = FeedParserDict.new
  @summaryKey = nil
  @namespacemap = {}
  @elementstack = []
  @basestack = []
  @langstack = []
  @baseuri = baseuri || ''
  @lang = baselang || nil
  @has_title = false
  @feeddata['language'] = baselang.gsub('_','-') if baselang

  $stderr << "Leaving startup\n" if $debug # My addition
end
112
+
113
# Called for every start tag. Normalises the attributes, tracks
# xml:base / xml:lang and namespace declarations, passes tags inside inline
# XHTML content through as text, and otherwise dispatches to a
# _start_<prefix_><name> handler (falling back to #push when none exists).
#
# tag    - element name, optionally "prefix:name"
# attrsd - attributes as a Hash, or as an Array (flat, or of [name, value]
#          pairs) which is converted to a Hash
def unknown_starttag(tag, attrsd)
  $stderr << "start #{tag} with #{attrsd.inspect}\n" if $debug
  # LooseFeedParser needs the conversion because SGMLParser sends attrs as a
  # list of lists (like [['type','text/html'],['mode','escaped']])
  attrsd = Hash[*attrsd.flatten] if attrsd.class == Array

  # normalize attrs: downcase every key, and downcase the value when the key
  # is 'rel' or 'type'. The value is copied, not downcased in place, so the
  # caller's strings are left alone.
  attrsD = {}
  attrsd.each do |old_k, value|
    k = old_k.downcase
    value = value.downcase if value && ['rel', 'type'].include?(k)
    attrsD[k] = value
  end

  # track xml:base and xml:lang
  baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
  @baseuri = urljoin(@baseuri, baseuri)
  lang = attrsD['xml:lang'] || attrsD['lang']
  if lang == ''
    # xml:lang could be explicitly set to '', we need to capture that
    lang = nil
  elsif lang.nil?
    # if no xml:lang is specified, use parent lang
    lang = @lang
  end

  if lang && !lang.empty? && ['feed', 'rss', 'rdf:RDF'].include?(tag)
    @feeddata['language'] = lang.gsub('_','-')
  end
  @lang = lang
  @basestack << @baseuri
  @langstack << lang

  # track namespaces
  attrsd.each do |prefix, uri|
    if /^xmlns:/ =~ prefix # prefix begins with xmlns:
      trackNamespace(prefix[6..-1], uri)
    elsif prefix == 'xmlns'
      trackNamespace(nil, uri)
    end
  end

  # track inline content
  if @incontent != 0 && @contentparams.has_key?('type') && ! ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 && @contentparams['type'] == 'application/xhtml+xml'
    # Note: probably shouldn't simply recreate localname here, but
    # our namespace handling isn't actually 100% correct in cases where
    # the feed redefines the default namespace (which is actually
    # the usual case for inline content, thanks Sam), so here we
    # cheat and just reconstruct the element based on localname
    # because that compensates for the bugs in our namespace handling.
    # This will horribly munge inline content with non-empty qnames,
    # but nobody actually does that, so I'm not fixing it.
    tag = tag.split(':')[-1]
    attrsA = attrsd.to_a.collect { |l| "#{l[0]}=\"#{l[1]}\"" }
    attrsS = ' ' + attrsA.join(' ')
    return handle_data("<#{tag}#{attrsS}>", false)
  end

  # match namespaces
  if /:/ =~ tag
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix && ! prefix.empty?
    prefix = prefix + '_'
  end

  # special hack for better tracking of empty textinput/image elements in illformed feeds
  unprefixed = prefix.nil? || prefix.empty?
  if unprefixed && ! ['title', 'link', 'description', 'name'].include?(tag)
    @intextinput = false
  end
  if unprefixed && ! ['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?(tag)
    @inimage = false
  end

  # call special handler (if defined) or default handler
  begin
    return send('_start_' + prefix + suffix, attrsD)
  rescue NoMethodError
    return push(prefix + suffix, true)
  end
end # End unknown_starttag
207
+
208
# Called for every end tag. Dispatches to an _end_<prefix_><name> handler
# (falling back to #pop when none exists), echoes the end tag into inline
# XHTML content, and takes xml:base / xml:lang out of scope.
def unknown_endtag(tag)
  $stderr << "end #{tag}\n" if $debug
  # match namespaces
  if tag.index(':')
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix && ! prefix.empty?
    prefix = prefix + '_'
  end

  # call special handler (if defined) or default handler
  begin
    send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
  rescue NoMethodError
    pop(prefix + suffix)
  end

  # track inline content
  # The type check must be negated exactly as in unknown_starttag: content
  # whose declared type does NOT end in "xml" but which contains real tags
  # is treated as XHTML. Without the negation the start tag was echoed into
  # the content but the end tag was not.
  if @incontent != 0 && @contentparams.has_key?('type') && ! ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but it isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 && @contentparams['type'] == 'application/xhtml+xml'
    tag = tag.split(':')[-1]
    handle_data("</#{tag}>", false)
  end

  # track xml:base and xml:lang going out of scope
  if @basestack && ! @basestack.empty?
    @basestack.pop
    if @basestack && @basestack[-1] && ! (@basestack.empty? || @basestack[-1].empty?)
      @baseuri = @basestack[-1]
    end
  end
  if @langstack && ! @langstack.empty?
    @langstack.pop
    if @langstack && ! @langstack.empty?
      @lang = @langstack[-1]
    end
  end
end
252
+
253
# LooseParserOnly
# Called for each character reference, e.g. for '&#160;', ref will be '160'.
# References to " & ' < > are appended to the current element's text still
# escaped (as "&#...;"); any other reference is appended as the UTF-8
# character it denotes. Does nothing when no element is open.
#
# +ref+ is not modified (it used to be downcased in place, which raised on
# frozen strings).
def handle_charref(ref)
  $stderr << "entering handle_charref with #{ref}\n" if $debug
  return if @elementstack.nil? || @elementstack.empty?

  ref = ref.downcase
  chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
  if chars.include?(ref)
    text = "&##{ref};"
  else
    # a leading 'x' marks a hexadecimal code point
    c = ref[0..0] == 'x' ? ref[1..-1].to_i(16) : ref.to_i
    text = [c].pack('U*')
  end
  @elementstack[-1][2] << text
end
273
+
274
# LooseParserOnly
# Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
# lt / gt / quot / amp / apos are appended to the current element's text
# still escaped; every other entity is decoded through HTMLEntities first.
# Does nothing when no element is open.
def handle_entityref(ref)
  return if @elementstack.nil? || @elementstack.empty?
  $stderr << "entering handle_entityref with #{ref}\n" if $debug

  kept_escaped = ['lt', 'gt', 'quot', 'amp', 'apos'].include?(ref)
  text = kept_escaped ? "&#{ref};" : HTMLEntities::decode_entities("&#{ref};")
  @elementstack[-1][2] << text
end
288
+
289
# Called for each block of plain text, i.e. outside of any tag and not
# containing any character or entity references. Appends +text+ to the
# current element's pieces; when +escape+ is true and the current content
# type is application/xhtml+xml the text is passed through String#to_xs
# first. Does nothing when no element is open.
def handle_data(text, escape=true)
  return if @elementstack.nil? || @elementstack.empty?

  needs_escaping = escape && @contentparams['type'] == 'application/xhtml+xml'
  text = text.to_xs if needs_escaping
  @elementstack[-1][2] << text
end
298
+
299
# Called for each comment, e.g. <!-- insert message here -->.
# Comments are ignored.
def handle_comment(comment)
  nil
end
302
+
303
# Hook taking one text argument; intentionally does nothing.
# NOTE(review): presumably invoked for processing instructions - confirm against the parser.
def handle_pi(text)
  nil
end
305
+
306
# Hook taking one text argument; intentionally does nothing.
# NOTE(review): presumably invoked for markup declarations - confirm against the parser.
def handle_decl(text)
  nil
end
308
+
309
# for LooseFeedParser
# Handles the declaration starting at index +i+ of @rawdata and returns the
# index just past it. A CDATA section has its body (up to "]]>", or to the
# end of the data when unterminated) escaped with String#to_xs and passed to
# handle_data; any other declaration is skipped up to its closing '>'.
def parse_declaration(i)
  $stderr << "entering parse_declaration\n" if $debug
  unless @rawdata[i...i+9] == '<![CDATA['
    return @rawdata.index(/>/, i).to_i + 1
  end

  body_start = i + 9
  body_end = @rawdata.index(/\]\]>/u, body_start) || @rawdata.length
  handle_data(@rawdata[body_start...body_end].to_xs, false)
  body_end + 3
end
322
+
323
# Returns the downcased content type, expanding the shorthands 'text',
# 'html' and 'xhtml' to their full MIME types. The argument is left
# untouched (it used to be downcased in place, which raised on frozen
# strings).
def mapContentType(contentType)
  lowered = contentType.downcase
  shorthands = {
    'text' => 'text/plain',
    'html' => 'text/html',
    'xhtml' => 'application/xhtml+xml'
  }
  return shorthands.fetch(lowered, lowered)
end
335
+
336
# Records a namespace declaration (+prefix+ is nil for the default
# namespace). When no version has been detected yet, the RSS 0.90, RSS 1.0
# and Atom 1.0 namespaces set @version. Known namespaces are mapped to their
# canonical prefix in @namespacemap; every namespace is recorded in
# @namespacesInUse.
def trackNamespace(prefix, uri)
  loweruri = uri.downcase.strip
  version_unset = @version.nil? || @version.empty?

  if version_unset && [prefix, loweruri] == [nil, 'http://my.netscape.com/rdf/simple/0.9/']
    @version = 'rss090'
  elsif version_unset && loweruri == 'http://purl.org/rss/1.0/'
    @version = 'rss10'
  elsif version_unset && loweruri == 'http://www.w3.org/2005/atom'
    @version = 'atom10'
  elsif /backend\.userland\.com\/rss/ =~ loweruri
    # match any backend.userland.com namespace
    uri = 'http://backend.userland.com/rss'
    loweruri = uri
  end

  unless @matchnamespaces.has_key?(loweruri)
    return @namespacesInUse[prefix || ''] = uri
  end
  canonical = @matchnamespaces[loweruri]
  @namespacemap[prefix] = canonical
  @namespacesInUse[canonical] = uri
end
357
+
358
# Joins +uri+ onto the current base URI ('' when none is set) via urljoin.
def resolveURI(uri)
  base = @baseuri || ''
  urljoin(base, uri)
end
361
+
362
# Default implementation: returns +data+ unchanged, ignoring +element+.
def decodeEntities(element, data)
  data
end
365
+
366
# Opens +element+: appends [element, expectingText, pieces] to the element
# stack, where pieces starts as an empty list of text fragments.
def push(element, expectingText)
  frame = [element, expectingText, []]
  @elementstack << frame
end
369
+
370
# Closes +element+ if (and only if) it is the element on top of the stack,
# and returns its text. The pieces collected for the element are joined and,
# unless +stripWhitespace+ is false, stripped. When the element was pushed
# as expecting text, the value is additionally base64-decoded, URI-resolved,
# entity-decoded, sanitised and re-encoded as applicable, and then stored on
# the current entry or feed context.
#
# Returns nil when the stack is empty or its top is a different element.
def pop(element, stripWhitespace=true)
  return if @elementstack.nil? || @elementstack.empty?
  return if @elementstack[-1][0] != element
  element, expectingText, pieces = @elementstack.pop

  output = pieces.class == Array ? pieces.join('') : pieces
  output.strip! if stripWhitespace
  return output unless expectingText

  # decode base64 content
  if @contentparams['base64']
    decoded = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
    output = decoded unless output.empty? || decoded.empty?
  end

  # resolve relative URIs
  if @can_be_relative_uri.include?(element) && output && !output.empty?
    output = resolveURI(output)
  end

  # decode entities within embedded markup
  output = decodeEntities(element, output) unless @contentparams['base64']

  # remove temporary cruft from contentparams
  @contentparams.delete('mode')
  @contentparams.delete('base64')

  # resolve relative URIs within embedded markup
  if @html_types.include?(mapContentType(@contentparams['type'] || 'text/html')) && @can_contain_relative_uris.include?(element)
    output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
  end
  # sanitize embedded markup
  if @html_types.include?(mapContentType(@contentparams['type'] || 'text/html')) && @can_contain_dangerous_markup.include?(element)
    output = FeedParser.sanitizeHTML(output, @encoding)
  end

  if @encoding && ! @encoding.empty? && @encoding != 'utf-8'
    # FIXME everything is turned into utf-8, not unicode; untested.
    output = uconvert(output, @encoding, 'utf-8')
  end

  # categories/tags/keywords/whatever are handled in _end_category
  return output if element == 'category'

  return output if element == 'title' && @has_title

  # store output in appropriate place(s)
  if @inentry && ! @insource
    entry = @entries[-1]
    case element
    when 'content'
      entry[element] ||= []
      detail = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
      detail['value'] = output
      entry[element] << detail
    when 'link'
      entry[element] = output
      entry['links'][-1]['href'] = output if output && ! output.empty?
    else
      element = 'summary' if element == 'description'
      entry[element] = output
      if @incontent != 0
        detail = Marshal.load(Marshal.dump(@contentparams))
        detail['value'] = output
        entry[element + '_detail'] = detail
      end
    end
  elsif (@infeed || @insource) && ! @intextinput && ! @inimage
    context = getContext()
    element = 'subtitle' if element == 'description'
    context[element] = output
    if element == 'link'
      context['links'][-1]['href'] = output
    elsif @incontent != 0
      detail = Marshal.load(Marshal.dump(@contentparams))
      detail['value'] = output
      context[element + '_detail'] = detail
    end
  end

  return output
end
466
+
467
# Opens a content-bearing element: bumps the content nesting counter,
# rebuilds @contentparams (type, language, base and the base64 flag) for it
# and pushes +tag+ onto the element stack.
def pushContent(tag, attrsD, defaultContentType, expectingText)
  @incontent += 1
  declared_type = attrsD['type'] || defaultContentType
  @contentparams = FeedParserDict.new({
    'type' => mapContentType(declared_type),
    'language' => @lang,
    'base' => @baseuri
  })
  @contentparams['base64'] = isBase64(attrsD, @contentparams)
  push(tag, expectingText)
end
474
+
475
# Closes a content-bearing element: pops +tag+, decrements the content
# nesting counter, empties @contentparams and returns the popped value.
def popContent(tag)
  popped = pop(tag)
  @incontent -= 1
  @contentparams.clear
  popped
end
481
+
482
# Rewrites "prefix:suffix" so that the prefix is the one @namespacemap maps
# it to (unchanged when it has no mapping). Names without a colon are
# returned as they are. Only the first colon separates prefix from suffix.
#
# The prefix is taken with an exclusive range: with an inclusive
# 0..colonpos-1 a name starting with ':' yielded the whole name as prefix.
def mapToStandardPrefix(name)
  colonpos = name.index(':')
  if colonpos
    prefix = name[0...colonpos]
    suffix = name[colonpos+1..-1]
    prefix = @namespacemap[prefix] || prefix
    name = prefix + ':' + suffix
  end
  return name
end
492
+
493
# Looks up the attribute +name+ in +attrsD+ after rewriting its namespace
# prefix with mapToStandardPrefix.
def getAttribute(attrsD, name)
  standard_name = mapToStandardPrefix(name)
  attrsD[standard_name]
end
496
+
497
# Decides whether an element's content is base64 encoded: true when the
# 'mode' attribute says so; otherwise false for text/* types and types
# ending in +xml or /xml, and true for everything else.
def isBase64(attrsD, contentparams)
  return true if attrsD['mode'] == 'base64'

  textual = /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
  textual ? false : true
end
504
+
505
# Normalises link attributes in place: whichever of 'url', 'uri' or 'href'
# is present (checked in that order) ends up under 'href', and 'url' / 'uri'
# are removed. Returns +attrsD+.
def itsAnHrefDamnIt(attrsD)
  target = attrsD['url'] || attrsD['uri'] || attrsD['href']
  return attrsD unless target

  ['url', 'uri'].each { |legacy_key| attrsD.delete(legacy_key) }
  attrsD['href'] = target
  attrsD
end
514
+
515
+
516
# Stores +value+ under +key+ in the current context unless the context
# already holds a truthy value for that key.
def _save(key, value)
  target = getContext()
  target[key] ||= value
end
520
+
521
# <rss>: when no version has been detected yet, derives it from the
# 'version' attribute - a known 0.9x value maps to its own name, anything
# starting with "2." is 'rss20', everything else is plain 'rss'.
def _start_rss(attrsD)
  return unless ! @version || @version.empty?

  attr_version = attrsD['version'] || ''
  known = {
    '0.91' => 'rss091u',
    '0.92' => 'rss092',
    '0.93' => 'rss093',
    '0.94' => 'rss094'
  }[attr_version]

  @version = if known && ! known.empty?
    known
  elsif /^2\./ =~ attr_version
    'rss20'
  else
    'rss'
  end
end
540
+
541
# <dlhottitles>: marks the feed version as 'hotrss', whatever it was before.
def _start_dlhottitles(attrsD)
  @version = 'hotrss'
end
544
+
545
# <channel> (also aliased for <feedinfo>): notes that we are inside the
# feed-level element and processes the attributes handled by _cdf_common.
def _start_channel(attrsD)
  @infeed = true
  _cdf_common(attrsD)
end
549
+ alias :_start_feedinfo :_start_channel
550
+
551
# Treats the 'lastmod' and 'href' attributes as if they had been given as
# <modified> and <link> child elements: the matching start handler is run,
# the attribute value is put in place of the new element's collected text,
# and the end handler is run.
def _cdf_common(attrsD)
  if attrsD.key?('lastmod')
    _start_modified({})
    @elementstack[-1][-1] = attrsD['lastmod']
    _end_modified
  end
  if attrsD.key?('href')
    _start_link({})
    @elementstack[-1][-1] = attrsD['href']
    _end_link
  end
end
563
+
564
# <feed>: notes that we are inside the feed-level element and, when no
# version has been detected yet, derives it from the 'version' attribute
# ('atom01' / 'atom02' / 'atom03'), falling back to plain 'atom'.
def _start_feed(attrsD)
  @infeed = true
  versionmap = {
    '0.1' => 'atom01',
    '0.2' => 'atom02',
    '0.3' => 'atom03'
  }

  if ! @version || @version.empty?
    version = versionmap[attrsD['version']]
    # Test the looked-up version, not @version: @version is known to be
    # empty in this branch, so testing it always produced 'atom'.
    if version && ! version.empty?
      @version = version
    else
      @version = 'atom'
    end
  end
end
581
+
582
# </channel> (also aliased for </feed>): we have left the feed-level element.
def _end_channel
  @infeed = false
end
585
+ alias :_end_feed :_end_channel
586
+
587
# <image>: flags that we are inside an image, resets the title flag, opens
# the element and makes sure the current context has an 'image' dict.
def _start_image(attrsD)
  @inimage = true
  @has_title = false
  push('image', false)
  target = getContext()
  target['image'] ||= FeedParserDict.new
end
594
+
595
# </image>: closes the element and clears the in-image flag.
def _end_image
  pop('image')
  @inimage = false
end
599
+
600
# <textinput> (also aliased as <textInput>): flags that we are inside a text
# input, resets the title flag, opens the element and makes sure the current
# context has a 'textinput' dict.
def _start_textinput(attrsD)
  @intextinput = true
  @has_title = false
  push('textinput', false)
  target = getContext()
  target['textinput'] ||= FeedParserDict.new
end
607
+ alias :_start_textInput :_start_textinput
608
+
609
# </textinput> (also aliased as </textInput>): closes the element and clears
# the in-textinput flag.
def _end_textinput
  pop('textinput')
  @intextinput = false
end
613
+ alias :_end_textInput :_end_textinput
614
+
615
# <author> and its aliases (managingeditor, dc:author, dc:creator,
# itunes:author): flags that we are inside an author and opens the element,
# expecting text.
def _start_author(attrsD)
  @inauthor = true
  push('author', true)
end
619
+ alias :_start_managingeditor :_start_author
620
+ alias :_start_dc_author :_start_author
621
+ alias :_start_dc_creator :_start_author
622
+ alias :_start_itunes_author :_start_author
623
+
624
# </author> and its aliases: closes the element, clears the in-author flag
# and runs _sync_author_detail.
def _end_author
  pop('author')
  @inauthor = false
  _sync_author_detail
end
629
+ alias :_end_managingeditor :_end_author
630
+ alias :_end_dc_author :_end_author
631
+ alias :_end_dc_creator :_end_author
632
+ alias :_end_itunes_author :_end_author
633
+
634
# <itunes:owner>: flags that we are inside a publisher and opens a
# 'publisher' element that does not expect text.
def _start_itunes_owner(attrsD)
  @inpublisher = true
  push('publisher', false)
end
638
+
639
# </itunes:owner>: closes the 'publisher' element, clears the in-publisher
# flag and runs _sync_author_detail for 'publisher'.
def _end_itunes_owner
  pop('publisher')
  @inpublisher = false
  _sync_author_detail('publisher')
end
644
+
645
# <contributor>: flags that we are inside a contributor, appends a fresh
# dict to the current context's 'contributors' list (creating the list when
# needed) and opens the element.
def _start_contributor(attrsD)
  @incontributor = true
  target = getContext()
  target['contributors'] ||= []
  target['contributors'] << FeedParserDict.new
  push('contributor', false)
end
652
+
653
# Closes a <contributor> element: pops 'contributor' and lowers the
# @incontributor flag.
def _end_contributor
  pop('contributor')
  @incontributor = false
end
657
+
658
# Opens a <dc:contributor> element. Its text is treated as the
# contributor's name, so after adding a fresh dict to 'contributors' this
# pushes 'name' rather than 'contributor'.
#
# attrsD - element attributes (not consulted here).
def _start_dc_contributor(attrsD)
  @incontributor = true
  target = getContext
  target['contributors'] ||= []
  target['contributors'] << FeedParserDict.new
  push('name', false)
end
665
+
666
# Closes a <dc:contributor> element: delegates to _end_name to store the
# collected name, then lowers @incontributor.
def _end_dc_contributor
  _end_name()
  @incontributor = false
end
670
+
671
# Opens a <name> (or <itunes:name>) element by pushing 'name', not
# expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_name(attrsD)
  expecting_text = false
  push('name', expecting_text)
end
alias :_start_itunes_name :_start_name
675
+
676
# Closes a <name> element and files the popped value under whichever
# construct is currently open. Precedence: publisher, author,
# contributor, then textinput. Outside all of them the value is dropped.
def _end_name
  value = pop('name')
  return _save_author('name', value, 'publisher') if @inpublisher
  return _save_author('name', value) if @inauthor
  return _save_contributor('name', value) if @incontributor

  getContext()['textinput']['name'] = value if @intextinput
end
alias :_end_itunes_name :_end_name
690
+
691
# Opens a <width> element by pushing 'width', not expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_width(attrsD)
  expecting_text = false
  push('width', expecting_text)
end
694
+
695
# Closes a <width> element: converts the popped value with to_i and, when
# inside an image, stores it as the image's 'width'.
def _end_width
  width = pop('width').to_i
  getContext()['image']['width'] = width if @inimage
end
702
+
703
# Opens a <height> element by pushing 'height', not expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_height(attrsD)
  expecting_text = false
  push('height', expecting_text)
end
706
+
707
# Closes a <height> element: converts the popped value with to_i and,
# when inside an image, stores it as the image's 'height'.
def _end_height
  height = pop('height').to_i
  getContext()['image']['height'] = height if @inimage
end
714
+
715
# Opens a <url>, <homepage> or <uri> element. All three are collected
# under the key 'href', expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_url(attrsD)
  expecting_text = true
  push('href', expecting_text)
end
alias :_start_homepage :_start_url
alias :_start_uri :_start_url
720
+
721
# Closes a <url>/<homepage>/<uri> element and files the popped value under
# the construct that is currently open. Precedence: author, contributor,
# image ('href'), then textinput ('link').
def _end_url
  value = pop('href')
  return _save_author('href', value) if @inauthor
  return _save_contributor('href', value) if @incontributor

  if @inimage
    getContext()['image']['href'] = value
  elsif @intextinput
    getContext()['textinput']['link'] = value
  end
end
alias :_end_homepage :_end_url
alias :_end_uri :_end_url
737
+
738
# Opens an <email> (or <itunes:email>) element by pushing 'email', not
# expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_email(attrsD)
  expecting_text = false
  push('email', expecting_text)
end
alias :_start_itunes_email :_start_email
742
+
743
# Closes an <email> element and files the popped value under the construct
# that is currently open. Precedence: publisher, author, then contributor.
def _end_email
  value = pop('email')
  return _save_author('email', value, 'publisher') if @inpublisher
  return _save_author('email', value) if @inauthor

  _save_contributor('email', value) if @incontributor
end
alias :_end_itunes_email :_end_email
754
+
755
# Returns the dict that parsed values should currently be written to:
# @sourcedata while inside a source, otherwise the last entry while inside
# an entry, otherwise the feed-level @feeddata.
def getContext
  return @sourcedata if @insource
  return @entries[-1] if @inentry

  @feeddata
end
765
+
766
# Stores one field of a person construct in the current context.
#
# key    - field name inside the detail dict (e.g. 'name', 'email').
# value  - value to store.
# prefix - which construct to write to; the detail dict lives under
#          "<prefix>_detail" (default 'author').
#
# After storing, _sync_author_detail is invoked with its default key.
def _save_author(key, value, prefix='author')
  detail_key = prefix + '_detail'
  context = getContext
  context[detail_key] ||= FeedParserDict.new
  context[detail_key][key] = value
  _sync_author_detail()
end
772
+
773
# Stores one field on the most recently added contributor of the current
# context, creating a one-element 'contributors' list if none exists.
#
# key   - field name (e.g. 'name', 'href', 'email').
# value - value to store.
def _save_contributor(key, value)
  target = getContext
  target['contributors'] ||= [FeedParserDict.new]
  latest = target['contributors'][-1]
  latest[key] = value
end
778
+
779
# Keeps a person string (context[key]) and its structured counterpart
# (context["<key>_detail"]) in agreement.
#
# key - 'author' (default) or another construct name such as 'publisher'.
#
# When a non-empty detail dict exists, the string is rebuilt from it:
# "name (email)" if both are present, otherwise whichever one is present.
# When there is no detail dict, the string is parsed instead: an email
# address is extracted if one is found, and what is left (minus wrapping
# parentheses) becomes the name.
def _sync_author_detail(key='author')
  context = getContext()
  detail = context["#{key}_detail"]
  if detail && ! detail.empty?
    name = detail['name']
    email = detail['email']
    has_name = name && ! name.empty?
    # This used to test name twice, so an empty email produced "name ()".
    has_email = email && ! email.empty?

    if has_name && has_email
      context[key] = "#{name} (#{email})"
    elsif has_name
      context[key] = name
    elsif has_email
      context[key] = email
    end
  else
    return if context[key].nil?
    author = context[key].dup
    return if author.empty?

    emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
    # A plain name without an email address is valid input; indexing a nil
    # match here used to raise NoMethodError.
    email = emailmatch && emailmatch[1]
    if email
      author.gsub!(email, '')
      author.gsub!('()', '')
      author.strip!
      author.gsub!(/^\(/, '')
      author.gsub!(/\)$/, '')
      author.strip!
    end

    context["#{key}_detail"] ||= FeedParserDict.new
    context["#{key}_detail"]['name'] = author
    context["#{key}_detail"]['email'] = email if email
  end
end
809
+
810
# Opens a <subtitle>, <tagline> or <itunes:subtitle> element as content
# stored under 'subtitle', with a default type of text/plain.
#
# attrsD - element attributes, forwarded to pushContent.
def _start_subtitle(attrsD)
  default_type = 'text/plain'
  pushContent('subtitle', attrsD, default_type, true)
end
alias :_start_tagline :_start_subtitle
alias :_start_itunes_subtitle :_start_subtitle
815
+
816
# Closes a <subtitle>, <tagline> or <itunes:subtitle> element by popping
# the 'subtitle' content.
def _end_subtitle
  popContent('subtitle')
end
alias :_end_tagline :_end_subtitle
alias :_end_itunes_subtitle :_end_subtitle
821
+
822
# Opens a <rights>, <dc:rights> or <copyright> element as content stored
# under 'rights', with a default type of text/plain.
#
# attrsD - element attributes, forwarded to pushContent.
def _start_rights(attrsD)
  default_type = 'text/plain'
  pushContent('rights', attrsD, default_type, true)
end
alias :_start_dc_rights :_start_rights
alias :_start_copyright :_start_rights
827
+
828
# Closes a <rights>, <dc:rights> or <copyright> element by popping the
# 'rights' content.
def _end_rights
  popContent('rights')
end
alias :_end_dc_rights :_end_rights
alias :_end_copyright :_end_rights
833
+
834
# Opens an <item>, <entry> or <product> element: appends a fresh entry
# dict, pushes 'item', switches into entry mode with @has_title and
# @guidislink set to false, and uses a non-empty rdf:about attribute as
# the entry id before handing the attributes to _cdf_common.
#
# attrsD - element attributes.
def _start_item(attrsD)
  @entries << FeedParserDict.new
  push('item', false)
  @inentry = true
  @has_title = false
  @guidislink = false
  about = getAttribute(attrsD, 'rdf:about')
  getContext()['id'] = about if about && ! about.empty?
  _cdf_common(attrsD)
end
alias :_start_entry :_start_item
alias :_start_product :_start_item
849
+
850
# Closes an <item>/<entry> element: pops 'item' and leaves entry mode.
def _end_item
  pop('item')
  @inentry = false
end
alias :_end_entry :_end_item
855
+
856
# Opens a <dc:language> or <language> element by pushing 'language',
# expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_dc_language(attrsD)
  expecting_text = true
  push('language', expecting_text)
end
alias :_start_language :_start_dc_language
860
+
861
# Closes a <dc:language>/<language> element and stores the popped value
# in @lang.
def _end_dc_language
  language = pop('language')
  @lang = language
end
alias :_end_language :_end_dc_language
865
+
866
# Opens a <dc:publisher> or <webmaster> element by pushing 'publisher',
# expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_dc_publisher(attrsD)
  expecting_text = true
  push('publisher', expecting_text)
end
alias :_start_webmaster :_start_dc_publisher
870
+
871
# Closes a <dc:publisher>/<webmaster> element: pops 'publisher' and
# synchronises the publisher string with its detail dict.
def _end_dc_publisher
  pop('publisher')
  _sync_author_detail('publisher')
end
alias :_end_webmaster :_end_dc_publisher
876
+
877
# Opens a <published>, <dcterms:issued> or <issued> element by pushing
# 'published', expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_published(attrsD)
  expecting_text = true
  push('published', expecting_text)
end
alias :_start_dcterms_issued :_start_published
alias :_start_issued :_start_published
882
+
883
# Closes a publication-date element: parses the popped text with
# parse_date and saves both the extract_tuple form ('published_parsed')
# and the parsed value itself ('published_time').
def _end_published
  parsed = parse_date(pop('published'))
  _save('published_parsed', extract_tuple(parsed))
  _save('published_time', parsed)
end
alias :_end_dcterms_issued :_end_published
alias :_end_issued :_end_published
891
+
892
# Opens an <updated>, <modified>, <dcterms:modified>, <pubdate> or
# <dc:date> element by pushing 'updated', expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_updated(attrsD)
  expecting_text = true
  push('updated', expecting_text)
end
alias :_start_modified :_start_updated
alias :_start_dcterms_modified :_start_updated
alias :_start_pubdate :_start_updated
alias :_start_dc_date :_start_updated
899
+
900
# Closes an update-date element: parses the popped text with parse_date
# and saves both the extract_tuple form ('updated_parsed') and the parsed
# value itself ('updated_time').
def _end_updated
  parsed = parse_date(pop('updated'))
  _save('updated_parsed', extract_tuple(parsed))
  _save('updated_time', parsed)
end
alias :_end_modified :_end_updated
alias :_end_dcterms_modified :_end_updated
alias :_end_pubdate :_end_updated
alias :_end_dc_date :_end_updated
910
+
911
# Opens a <created> or <dcterms:created> element by pushing 'created',
# expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_created(attrsD)
  expecting_text = true
  push('created', expecting_text)
end
alias :_start_dcterms_created :_start_created
915
+
916
# Closes a creation-date element: parses the popped text with parse_date
# and saves both the extract_tuple form ('created_parsed') and the parsed
# value itself ('created_time').
def _end_created
  parsed = parse_date(pop('created'))
  _save('created_parsed', extract_tuple(parsed))
  _save('created_time', parsed)
end
alias :_end_dcterms_created :_end_created
923
+
924
# Opens an <expirationdate> element; its text is collected under the key
# 'expired'.
#
# attrsD - element attributes (not consulted here).
def _start_expirationdate(attrsD)
  expecting_text = true
  push('expired', expecting_text)
end
927
# Closes an <expirationdate> element: parses the popped text with
# parse_date and saves both the extract_tuple form ('expired_parsed') and
# the parsed value itself ('expired_time').
def _end_expirationdate
  raw = pop('expired')
  parsed = parse_date(raw)
  _save('expired_parsed', extract_tuple(parsed))
  _save('expired_time', parsed)
end
932
+
933
# Handles a <cc:license> element, whose value is carried in its
# rdf:resource attribute rather than in element text.
#
# attrsD - element attributes.
#
# 'license' is pushed, a non-empty rdf:resource is appended to the pushed
# element's text pieces, and the element is popped again straight away.
# The pop is unconditional, matching _start_admin_generatoragent and
# _start_admin_errorreportsto. Previously it only ran when rdf:resource
# was present, which left 'license' on the element stack otherwise.
def _start_cc_license(attrsD)
  push('license', true)
  value = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << value if value && ! value.empty?
  pop('license')
end
941
+
942
# Opens a <creativeCommons:license> element by pushing 'license',
# expecting text.
#
# attrsD - element attributes (not consulted here).
def _start_creativecommons_license(attrsD)
  expecting_text = true
  push('license', expecting_text)
end
945
+
946
# Closes a <creativeCommons:license> element by popping 'license'.
def _end_creativecommons_license
  pop('license')
end
949
+
950
# Records a tag on the current context unless an equal one is already
# present.
#
# term   - tag term, or nil.
# scheme - tag scheme/domain, or nil.
# label  - human-readable label, or nil.
#
# The context's 'tags' list is created on demand. A tag whose three parts
# are all nil or empty is ignored.
def addTag(term, scheme, label)
  context = getContext
  context['tags'] ||= []
  existing = context['tags']
  blank = lambda { |part| part.nil? || part.empty? }
  return if blank.call(term) && blank.call(scheme) && blank.call(label)

  candidate = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  unless existing.include?(candidate)
    context['tags'] << FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  end
end
962
+
963
# Opens a <category>, <dc:subject> or <keywords> element: records a tag
# built from the term/scheme/label attributes ('domain' stands in for a
# missing 'scheme') and pushes 'category', expecting text.
#
# attrsD - element attributes.
def _start_category(attrsD)
  $stderr << "entering _start_category with #{attrsD}\n" if $debug

  term = attrsD['term']
  scheme = attrsD['scheme']
  scheme ||= attrsD['domain']
  label = attrsD['label']
  addTag(term, scheme, label)
  push('category', true)
end
alias :_start_dc_subject :_start_category
alias :_start_keywords :_start_category
974
+
975
# Closes an <itunes:keywords> element: splits the popped text on
# whitespace and records each word as a tag under the iTunes scheme.
def _end_itunes_keywords
  keywords = pop('itunes_keywords').split
  keywords.each { |term| addTag(term, 'http://www.itunes.com/', nil) }
end
980
+
981
# Opens an <itunes:category> element: records its 'text' attribute as a
# tag under the iTunes scheme and pushes 'category', expecting text.
#
# attrsD - element attributes.
def _start_itunes_category(attrsD)
  category_text = attrsD['text']
  addTag(category_text, 'http://www.itunes.com/', nil)
  push('category', true)
end
985
+
986
# Closes a category-like element (<category>, <dc:subject>, <keywords>,
# <itunes:category>).
#
# Empty element text is ignored. Otherwise, if the most recent tag has no
# term yet (it was created from attributes only), the text becomes that
# tag's term. In every other case the text is recorded as a new tag.
def _end_category
  value = pop('category')
  return if value.nil? || value.empty?

  # Tolerate a context that has no 'tags' list yet instead of calling
  # empty? on nil.
  tags = getContext()['tags'] || []
  # The condition below used a Python-style trailing colon, which Ruby 1.9
  # and later reject as a syntax error.
  if ! tags.empty? && ! tags[-1]['term']
    tags[-1]['term'] = value
  else
    addTag(value, nil, nil)
  end
end
alias :_end_dc_subject :_end_category
alias :_end_keywords :_end_category
alias :_end_itunes_category :_end_category
1000
+
1001
# Handles a <cloud> element by storing its attributes, wrapped in a
# FeedParserDict, under 'cloud' in the current context.
#
# attrsD - element attributes.
def _start_cloud(attrsD)
  target = getContext
  target['cloud'] = FeedParserDict.new(attrsD)
end
1004
+
1005
# Opens a <link> (or <producturl>) element.
#
# attrsD - element attributes; 'rel' defaults to 'alternate' and 'type' to
#          'text/html' (both written back into attrsD).
#
# The attributes are passed through itsAnHrefDamnIt, any href is resolved
# with resolveURI, and the result is appended to the context's 'links'
# list. A rel="enclosure" link is also handed to _start_enclosure. With an
# href present, an alternate link whose mapped type is in @html_types
# becomes the context's 'link'. Without one, 'link' is pushed so the URL
# can arrive as element text.
def _start_link(attrsD)
  attrsD['rel'] ||= 'alternate'
  attrsD['type'] ||= 'text/html'
  attrsD = itsAnHrefDamnIt(attrsD)
  attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')

  expectingText = @infeed || @inentry || @insource
  context = getContext
  context['links'] ||= []
  context['links'] << FeedParserDict.new(attrsD)
  _start_enclosure(attrsD) if attrsD['rel'] == 'enclosure'

  if attrsD.has_key?('href')
    is_alternate = (attrsD['rel'] == 'alternate')
    if is_alternate && @html_types.include?(mapContentType(attrsD['type']))
      context['link'] = attrsD['href']
    end
  else
    push('link', expectingText)
  end
end
alias :_start_producturl :_start_link
1029
+
1030
# Closes a <link>/<producturl> element: the popped value is stored as the
# 'link' of the textinput and/or image currently being parsed.
def _end_link
  value = pop('link')
  context = getContext
  context['textinput']['link'] = value if @intextinput
  context['image']['link'] = value if @inimage
end
alias :_end_producturl :_end_link
1041
+
1042
# Opens a <guid> element: remembers in @guidislink whether the guid is a
# permalink (attribute 'ispermalink' absent or equal to 'true') and pushes
# 'id', expecting text.
#
# attrsD - element attributes.
def _start_guid(attrsD)
  permalink_flag = attrsD['ispermalink'] || 'true'
  @guidislink = (permalink_flag == 'true')
  push('id', true)
end
1046
+
1047
# Closes a <guid> element.
#
# Saves 'guidislink' as true only when the guid was flagged as a permalink
# and the context has no 'link' yet, then saves the guid as 'link' when it
# is a permalink.
def _end_guid
  value = pop('id')
  _save('guidislink', (@guidislink && ! getContext().has_key?('link')))
  # guid acts as link, but only if 'ispermalink' is not present or is 'true',
  # and only if the item doesn't already have a link element.
  # (The Python-style trailing colon on this condition was removed: it is
  # a syntax error on Ruby 1.9 and later.)
  _save('link', value) if @guidislink
end
1056
+
1057
+
1058
# Opens a <title>, <dc:title> or <media:title> element as text/plain
# content; text is expected only while inside a feed, entry or source.
#
# attrsD - element attributes, forwarded to pushContent.
def _start_title(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('title', attrsD, 'text/plain', expecting_text)
end
alias :_start_dc_title :_start_title
alias :_start_media_title :_start_title
1063
+
1064
# Closes a <title>/<dc:title> element: inside a textinput or image the
# popped content is also stored as that construct's 'title' (textinput
# takes precedence). Always sets @has_title to true.
def _end_title
  value = popContent('title')
  context = getContext
  section = if @intextinput
              'textinput'
            elsif @inimage
              'image'
            end
  context[section]['title'] = value if section
  @has_title = true
end
alias :_end_dc_title :_end_title
1075
+
1076
# Closes a <media:title> element like a regular title, but restores
# @has_title afterwards so that it keeps its previous value.
def _end_media_title
  previous = @has_title
  _end_title()
  @has_title = previous
end
1081
+
1082
# Opens a <description> element. If the context already has a 'summary',
# the element is treated as content (@summaryKey = 'content'). Otherwise
# it is pushed as 'description' with a default type of text/html, text
# being expected only inside a feed, entry or source.
#
# attrsD - element attributes.
def _start_description(attrsD)
  if getContext.has_key?('summary')
    @summaryKey = 'content'
    _start_content(attrsD)
  else
    expecting_text = @infeed || @inentry || @insource
    pushContent('description', attrsD, 'text/html', expecting_text)
  end
end
1091
+
1092
# Opens an <abstract> element as 'description' content with a default
# type of text/plain; text is expected only inside a feed, entry or source.
#
# attrsD - element attributes, forwarded to pushContent.
def _start_abstract(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('description', attrsD, 'text/plain', expecting_text)
end
1095
+
1096
# Closes a <description> or <abstract> element.
#
# When the element was opened as content (@summaryKey == 'content') it is
# closed through _end_content. Otherwise the popped 'description' content
# is also stored on the textinput or image currently being parsed
# (textinput takes precedence). @summaryKey is always reset to nil.
def _end_description
  if @summaryKey == 'content'
    _end_content()
  else
    value = popContent('description')
    context = getContext()
    if @intextinput
      context['textinput']['description'] = value
    elsif @inimage
      # The Python-style trailing colon after this condition was removed:
      # it is a syntax error on Ruby 1.9 and later.
      context['image']['description'] = value
    end
  end
  @summaryKey = nil
end
alias :_end_abstract :_end_description
1111
+
1112
# Opens an <info> (or <feedburner:browserFriendly>) element as 'info'
# content with a default type of text/plain.
#
# attrsD - element attributes, forwarded to pushContent.
def _start_info(attrsD)
  default_type = 'text/plain'
  pushContent('info', attrsD, default_type, true)
end
alias :_start_feedburner_browserfriendly :_start_info
1116
+
1117
# Closes an <info> (or <feedburner:browserFriendly>) element by popping
# the 'info' content.
def _end_info
  popContent('info')
end
alias :_end_feedburner_browserfriendly :_end_info
1121
+
1122
# Opens a <generator> element: non-empty attributes are passed through
# itsAnHrefDamnIt and any href is resolved, then the attributes are stored
# as 'generator_detail' and 'generator' is pushed, expecting text.
#
# attrsD - element attributes (may be nil or empty).
def _start_generator(attrsD)
  if attrsD && ! attrsD.empty?
    attrsD = itsAnHrefDamnIt(attrsD)
    attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')
  end
  target = getContext
  target['generator_detail'] = FeedParserDict.new(attrsD)
  push('generator', true)
end
1132
+
1133
# Closes a <generator> element: the popped text becomes the 'name' of the
# context's 'generator_detail', when such a detail dict exists.
def _end_generator
  generator_name = pop('generator')
  context = getContext
  context['generator_detail']['name'] = generator_name if context.has_key?('generator_detail')
end
1140
+
1141
# Handles an <admin:generatorAgent> element, whose value is carried in its
# rdf:resource attribute: pushes 'generator', appends a non-empty resource
# to the pushed element's text pieces, pops it again and records the
# resource as the href of 'generator_detail'.
#
# attrsD - element attributes.
def _start_admin_generatoragent(attrsD)
  push('generator', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << resource if resource && ! resource.empty?
  pop('generator')
  getContext()['generator_detail'] = FeedParserDict.new({'href' => resource})
end
1150
+
1151
# Handles an <admin:errorReportsTo> element, whose value is carried in its
# rdf:resource attribute: pushes 'errorreportsto', appends a non-empty
# resource to the pushed element's text pieces and pops it again.
#
# attrsD - element attributes.
def _start_admin_errorreportsto(attrsD)
  push('errorreportsto', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << resource if resource && ! resource.empty?
  pop('errorreportsto')
end
1159
+
1160
# Opens a <summary> (or <itunes:summary>) element. If the context already
# has a 'summary', the element is treated as content. Otherwise it is
# pushed as 'summary' content with a default type of text/plain.
# @summaryKey records which of the two was chosen.
#
# attrsD - element attributes.
def _start_summary(attrsD)
  if getContext.has_key?('summary')
    @summaryKey = 'content'
    _start_content(attrsD)
  else
    @summaryKey = 'summary'
    pushContent(@summaryKey, attrsD, 'text/plain', true)
  end
end
alias :_start_itunes_summary :_start_summary
1171
+
1172
# Closes a <summary> (or <itunes:summary>) element.
#
# When the element was opened as content (@summaryKey == 'content') it is
# closed through _end_content. Otherwise the content stored under
# @summaryKey (falling back to 'summary') is popped. @summaryKey is always
# reset to nil afterwards.
def _end_summary
  # The Python-style trailing colon after this condition was removed: it
  # is a syntax error on Ruby 1.9 and later.
  if @summaryKey == 'content'
    _end_content()
  else
    popContent(@summaryKey || 'summary')
  end
  @summaryKey = nil
end
alias :_end_itunes_summary :_end_summary
1181
+
1182
# Handles an <enclosure>, <media:content> or <media:thumbnail> element:
# the attributes (after itsAnHrefDamnIt) are appended to the context's
# 'enclosures' list, and a non-empty href becomes the context's 'id' when
# no id has been set yet.
#
# attrsD - element attributes.
def _start_enclosure(attrsD)
  attrsD = itsAnHrefDamnIt(attrsD)
  context = getContext
  context['enclosures'] ||= []
  context['enclosures'] << FeedParserDict.new(attrsD)

  href = attrsD['href']
  if href && ! href.empty?
    context['id'] = href unless context['id']
  end
end
alias :_start_media_content :_start_enclosure
alias :_start_media_thumbnail :_start_enclosure
1196
+
1197
# Opens a <source> element: raises @insource (which makes getContext
# return @sourcedata) and sets @has_title to false.
#
# attrsD - element attributes (not consulted here).
def _start_source(attrsD)
  @insource = true
  @has_title = false
end
1201
+
1202
# Closes a <source> element: leaves source mode, stores a deep copy of the
# collected @sourcedata (made with a Marshal round-trip) under 'source' in
# the now-current context, and empties @sourcedata.
def _end_source
  @insource = false
  snapshot = Marshal.load(Marshal.dump(@sourcedata))
  getContext()['source'] = snapshot
  @sourcedata.clear
end
1207
+
1208
# Opens a <content> element as 'content' with a default type of
# text/plain. A non-empty 'src' attribute is recorded in @contentparams.
# 'content' is then pushed once more, expecting text.
#
# attrsD - element attributes.
def _start_content(attrsD)
  pushContent('content', attrsD, 'text/plain', true)
  src = attrsD['src']
  # The Python-style trailing colon after this condition was removed: it
  # is a syntax error on Ruby 1.9 and later.
  @contentparams['src'] = src if src && ! src.empty?
  push('content', true)
end
1216
+
1217
# Opens a <prodlink> element as 'content' with a default type of
# text/html.
#
# attrsD - element attributes, forwarded to pushContent.
def _start_prodlink(attrsD)
  default_type = 'text/html'
  pushContent('content', attrsD, default_type, true)
end
1220
+
1221
# Opens a <body> (or <xhtml:body>) element as 'content' with a default
# type of application/xhtml+xml.
#
# attrsD - element attributes, forwarded to pushContent.
def _start_body(attrsD)
  default_type = 'application/xhtml+xml'
  pushContent('content', attrsD, default_type, true)
end
alias :_start_xhtml_body :_start_body
1225
+
1226
# Opens a <content:encoded> (or <fullitem>) element as 'content' with a
# default type of text/html.
#
# attrsD - element attributes, forwarded to pushContent.
def _start_content_encoded(attrsD)
  default_type = 'text/html'
  pushContent('content', attrsD, default_type, true)
end
alias :_start_fullitem :_start_content_encoded
1230
+
1231
# Closes a content-like element. The decision whether to mirror the value
# into 'description' is taken from @contentparams before the content is
# popped: it is mirrored when the mapped content type is text/plain or one
# of @html_types.
def _end_content
  mirrored_types = ['text/plain'] + @html_types
  copy_to_description = mirrored_types.include?(mapContentType(@contentparams['type']))
  value = popContent('content')
  _save('description', value) if copy_to_description
end
alias :_end_body :_end_content
alias :_end_xhtml_body :_end_content
alias :_end_content_encoded :_end_content
alias :_end_fullitem :_end_content
alias :_end_prodlink :_end_content
1243
+
1244
# Handles an <itunes:image> (or <itunes:link>) element: pushes
# 'itunes_image' (not expecting text) and replaces the context's 'image'
# with a dict holding only the element's href attribute.
#
# attrsD - element attributes.
def _start_itunes_image(attrsD)
  push('itunes_image', false)
  image = FeedParserDict.new({'href' => attrsD['href']})
  getContext()['image'] = image
end
alias :_start_itunes_link :_start_itunes_image
1249
+
1250
# Closes an <itunes:block> element and stores a strict boolean under
# 'itunes_block': true only when the popped value equals 'yes'.
def _end_itunes_block
  value = pop('itunes_block', false)
  getContext()['itunes_block'] = (value == 'yes') ? true : false
end
1254
+
1255
# Closes an <itunes:explicit> element and stores a strict boolean under
# 'itunes_explicit': true only when the popped value, lower-cased, equals
# 'yes'.
def _end_itunes_explicit
  value = pop('itunes_explicit', false)
  getContext()['itunes_explicit'] = (value.downcase == 'yes') ? true : false
end
1259
+
1260
+ end # End FeedParserMixin
1261
+ end
1262
+
1263
# Resolves uri against base.
#
# base - base URI string.
# uri  - URI string to resolve; surplus slashes directly after its
#        "scheme://" prefix are collapsed first.
#
# Returns the joined URI as a String when base parses as absolute and uri
# parses as relative. Otherwise returns the (slash-normalised) uri
# unchanged. Any StandardError raised while parsing is swallowed and
# treated as "could not parse".
def urljoin(base, uri)
  urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
  uri = uri.sub(urifixer, '\1\3')

  parsed_base = begin
    Addressable::URI.parse(base)
  rescue StandardError
    nil
  end
  return uri unless parsed_base && parsed_base.absolute?

  parsed_uri = begin
    Addressable::URI.parse(uri)
  rescue StandardError
    nil
  end
  return uri unless parsed_uri && parsed_uri.relative?

  # ForgivingURI.join does the wrong thing. What the hell.
  # NOTE(review): this gsub deletes the character preceding a run of two
  # or more slashes along with the slashes themselves; confirm that is
  # intended rather than collapsing the run to a single slash.
  return Addressable::URI.join(base, uri).to_s.gsub(/[^:]\/{2,}/, '')
end