jgre-rfeedparser 0.9.961

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,106 @@
1
+ #!/usr/bin/env ruby
2
+ module FeedParser
3
# A Hash whose keys can be reached under several names.
#
# The naming of a certain common attribute (such as, "When was the last
# time this feed was updated?") can differ depending on the type of feed
# being handled. This class satisfies both the developer who knows the
# feed type and the developer who wants one consistent interface.
#
# @@keymap maps a common/legacy name (its keys) to the name -- or list of
# candidate names -- the value is really stored under (its values). #[]
# and #[]= consult @@keymap, and #method_missing exposes every key as a
# reader/writer method (dict.title, dict.title = 'x').
class FeedParserDict < Hash
  @@keymap = {
    'channel' => 'feed',
    'items' => 'entries',
    'guid' => 'id',
    'date' => 'updated',
    'date_parsed' => 'updated_parsed',
    'description' => ['subtitle', 'summary'],
    'url' => ['href'],
    'modified' => 'updated',
    'modified_parsed' => 'updated_parsed',
    'issued' => 'published',
    'issued_parsed' => 'published_parsed',
    'copyright' => 'rights',
    'copyright_detail' => 'rights_detail',
    'tagline' => 'subtitle',
    'tagline_detail' => 'subtitle_detail'
  }

  # Hash already has an #entries method; keep it reachable under
  # #hash_entries and make #entries return the stored 'entries' value.
  alias :hash_entries :entries
  def entries
    self['entries']
  end

  # Added to avoid deprecated method warnings
  def type
    self['type']
  end

  # Builds a dict from either a list of [key, value] pairs (each pair goes
  # through #[]=, so @@keymap is applied) or from a Hash (merged as-is).
  # Any other argument yields an empty dict.
  def initialize(pairs=nil)
    super()
    if pairs.is_a?(Array) && pairs[0].is_a?(Array) && pairs[0].length == 2
      pairs.each do |pair|
        k, v = pair
        self[k] = v
      end
    elsif pairs.is_a? Hash
      self.merge!(pairs)
    end
  end

  # Looks up +key+, honouring the aliases in @@keymap.
  #
  # 'category' returns the term of the first tag and 'categories' returns
  # [scheme, term] pairs for every tag; both return nil when there are no
  # tags instead of failing on nil.
  def [](key)
    if key == 'category'
      tags = self['tags']
      first_tag = tags && tags[0]
      return first_tag && first_tag['term']
    end

    if key == 'categories'
      tags = self['tags']
      return tags && tags.collect { |tag| [tag['scheme'], tag['term']] }
    end

    realkey = @@keymap[key] || key
    if realkey.is_a? Array
      # First candidate that is actually present wins. The block variable
      # is deliberately not called `key` so the requested key is preserved
      # for the check below.
      realkey.each { |candidate| return self[candidate] if has_key?(candidate) }
    end

    # Note that the original key is preferred over the realkey we (might
    # have) found in @@keymap
    return super(key) if has_key?(key)

    super(realkey)
  end

  # Stores +value+ under the canonical name for +key+; when @@keymap lists
  # several candidates the first one is used.
  def []=(key,value)
    if @@keymap.key?(key)
      key = @@keymap[key]
      key = key[0] if key.is_a? Array
    end
    super(key,value)
  end

  # Turns dict.foo into self['foo'] and dict.foo = v into self['foo'] = v.
  # Names ending in '!' or '?' or starting with '_' are not treated as
  # attributes and raise NoMethodError.
  def method_missing(msym, *args)
    methodname = msym.to_s
    if methodname[-1,1] == '='
      self[methodname[0..-2]] = args[0]
    elsif methodname[-1,1] != '!' && methodname[-1,1] != '?' && methodname[0,1] != "_" # FIXME implement with private?
      self[methodname]
    else
      raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
    end
  end
end
106
+ end
@@ -0,0 +1,75 @@
1
+ module FeedParser
2
+ class LooseFeedParser < BetterSGMLParser
3
+ include FeedParserMixin
4
+ # We write the methods that were in BaseHTMLProcessor in the python code
5
+ # in here directly. We do this because if we inherited from
6
+ # BaseHTMLProcessor but then included from FeedParserMixin, the methods
7
+ # of Mixin would overwrite the methods we inherited from
8
+ # BaseHTMLProcessor. This is exactly the opposite of what we want to
9
+ # happen!
10
+
11
+ attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse
12
+
13
+ Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
14
+ 'img', 'input', 'isindex', 'link', 'meta', 'param']
15
+ New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/
16
+ alias :sgml_feed :feed # feed needs to mapped to feeddata, not the SGMLParser method feed. I think.
17
# Reader/writer pair exposing @feeddata under the name `feed`.
def feed; @feeddata; end

def feed=(data); @feeddata = data; end
23
+
24
# Runs the mixin's startup with the given base URI, language and encoding,
# then the superclass initializer.
def initialize(baseuri, baselang, encoding)
  startup(baseuri, baselang, encoding)
  # The empty parentheses are required: a bare `super` would forward all
  # three arguments to the superclass.
  super()
end
28
+
29
# Clears the collected pieces and then lets the superclass reset itself.
def reset
  @pieces = []
  super
end
33
+
34
# Pre-processes +data+ and hands it to the SGML parser (via sgml_feed).
#
# Steps, in order:
# * escape "<!" unless it starts a DOCTYPE, a comment or a "<![" section;
# * expand self-closing tags (<x/>) into <x></x>, except for the elements
#   in Elements_No_End_Tag, which are left alone;
# * replace the numeric references &#39; and &#34; with ' and ";
# * convert from utf-8 to @encoding when one is set.
#
# The caller's string is not modified.
def parse(data)
  data = data.gsub(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
  data = data.gsub(/<([^<\s]+?)\s*\/>/) do |tag|
    # tag is e.g. "<br/>"; drop the leading "<" and the trailing "/>"
    clean = tag[1..-3].strip
    if Elements_No_End_Tag.include?(clean)
      tag
    else
      '<'+clean+'></'+clean+'>'
    end
  end

  data = data.gsub(/&#39;/, "'")
  # &#34; is the double quote (it used to be replaced with a single quote)
  data = data.gsub(/&#34;/, '"')
  if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
    data = uconvert(data,'utf-8',@encoding)
  end
  sgml_feed(data) # sgml_feed is the alias of the SGML parser's own feed method
end
52
+
53
+
54
# Normalises character references in +data+ (modified in place and
# returned).
#
# Numeric references for < > & " ' are first rewritten to their named
# form. If @contentparams declares a 'type' that does not end in "xml",
# the named forms are then replaced by the literal characters.
def decodeEntities(element, data)
  [
    ['&#60;', '&lt;'], ['&#x3c;', '&lt;'],
    ['&#62;', '&gt;'], ['&#x3e;', '&gt;'],
    ['&#38;', '&amp;'], ['&#x26;', '&amp;'],
    ['&#34;', '&quot;'], ['&#x22;', '&quot;'],
    ['&#39;', '&apos;'], ['&#x27;', '&apos;']
  ].each { |numeric, named| data.gsub!(numeric, named) }

  declared_type = @contentparams['type'] || 'xml'
  if @contentparams.has_key?('type') && declared_type !~ /xml$/u
    # Order matters: '&amp;' is handled after '&lt;' and '&gt;'.
    [
      ['&lt;', '<'], ['&gt;', '>'], ['&amp;', '&'],
      ['&quot;', '"'], ['&apos;', "'"]
    ].each { |named, literal| data.gsub!(named, literal) }
  end
  data
end
74
+ end
75
+ end
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
# Helpers shared by the feed parsers: DOCTYPE stripping and relative-URI
# resolution inside embedded HTML.
module FeedParserUtilities
  # Strips ENTITY declarations and the first DOCTYPE from an XML document.
  #
  # Returns [rss_version, stripped_data]. rss_version is 'rss091n' when the
  # first DOCTYPE mentions "netscape" (case-insensitive) and nil otherwise.
  # stripped_data is the same document minus every <!ENTITY ...> and minus
  # the first <!DOCTYPE ...>.
  def stripDoctype(data)
    # /m lets "." match newlines, so multi-line declarations are covered
    without_entities = data.gsub(/<!ENTITY(.*?)>/m, '')

    doctype_pattern = /<!DOCTYPE(.*?)>/m
    first_hit = without_entities.scan(doctype_pattern)[0]
    doctype = first_hit ? first_hit[0] : ''

    version = (/netscape/ =~ doctype.downcase) ? 'rss091n' : nil
    return version, without_entities.sub(doctype_pattern, '')
  end

  # Rewrites the relative URIs found in well-known element/attribute pairs
  # of +htmlSource+ so that they are joined onto +baseURI+ (via urljoin),
  # and returns the resulting HTML. +encoding+ is accepted but not used
  # here.
  def resolveRelativeURIs(htmlSource, baseURI, encoding)
    $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
    # [element name, attribute that may carry a URI]
    uri_attributes = [
      ['a','href'],
      ['applet','codebase'],
      ['area','href'],
      ['blockquote','cite'],
      ['body','background'],
      ['del','cite'],
      ['form','action'],
      ['frame','longdesc'],
      ['frame','src'],
      ['iframe','longdesc'],
      ['iframe','src'],
      ['head','profile'],
      ['img','longdesc'],
      ['img','src'],
      ['img','usemap'],
      ['input','src'],
      ['input','usemap'],
      ['ins','cite'],
      ['link','href'],
      ['object','classid'],
      ['object','codebase'],
      ['object','data'],
      ['object','usemap'],
      ['q','cite'],
      ['script','src'],
    ]
    doc = Hpricot(htmlSource)
    uri_attributes.each do |ename, eattr|
      doc.search(ename).each do |elem|
        euri = elem.attributes[eattr]
        # Anything that cannot be encoded/parsed counts as "not a URI"
        uri = begin
          Addressable::URI.parse(Addressable::URI.encode(euri))
        rescue StandardError
          nil
        end
        next unless euri and not euri.empty? and uri and uri.relative?
        elem.raw_attributes[eattr] = urljoin(baseURI, euri)
      end
    end
    return doc.to_html
  end
end
70
+
71
+
@@ -0,0 +1,10 @@
1
# Date.gregorian_leap? is used below, so make sure Date is loaded.
require 'date'

class Time
  class << self
    # Days per month in a non-leap year, indexed by month number (1..12);
    # index 0 is a nil placeholder.
    COMMON_YEAR_DAYS_IN_MONTH = [nil, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31].freeze

    # Returns the number of days in +month+ (1..12) of +year+, which
    # defaults to the current year. February yields 29 in Gregorian leap
    # years. A month outside 1..12 yields nil.
    def days_in_month(month, year = now.year)
      return 29 if month == 2 && ::Date.gregorian_leap?(year)
      COMMON_YEAR_DAYS_IN_MONTH[month]
    end
  end
end
@@ -0,0 +1,80 @@
1
+ gem 'nokogiri', '~>1.2'
2
+ require 'nokogiri'
3
+
4
+ module FeedParser
5
+ module Nokogiri
6
+
7
# Raised (and stored in the handler's #exc) when the SAX parser reports an
# error.
class NokogiriSyntaxError < StandardError
end
8
+
9
# Thin wrapper that pairs a StrictFeedParserHandler with a Nokogiri SAX
# parser.
class StrictFeedParser
  attr_reader :handler

  # Creates the SAX handler for the given base URI and language; the
  # handler is always told the encoding is 'utf-8'.
  def initialize(baseuri, baselang)
    @handler = StrictFeedParserHandler.new(baseuri, baselang, 'utf-8')
  end

  # Runs a fresh SAX parser over +data+, feeding events to #handler.
  def parse(data)
    ::Nokogiri::XML::SAX::Parser.new(@handler).parse(data)
  end
end
21
+
22
+ class StrictFeedParserHandler < ::Nokogiri::XML::SAX::Document
23
+ include FeedParserMixin
24
+
25
+ attr_accessor :bozo, :entries, :feeddata, :exc
26
+
27
# Runs the mixin's startup and starts with the bozo flag cleared.
def initialize(baseuri, baselang, encoding)
  $stderr.puts("trying Nokogiri::StrictFeedParser") if $debug
  startup(baseuri, baselang, encoding)
  @bozo = false
end
32
+
33
# SAX callback for an opening tag.
#
# +name+ may carry a namespace URI in front of the local name, separated
# by ';'. The URI is looked up in @matchnamespaces and, when it maps to a
# non-empty prefix, the element is reported as "prefix:localname". The
# name is lower-cased before being passed to unknown_starttag.
def start_element(name, attrs)
  name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
  namespaceuri = (Regexp.last_match(2) || '').downcase
  name = Regexp.last_match(3)

  # match any backend.userland.com namespace
  namespaceuri = 'http://backend.userland.com/rss' if /backend\.userland\.com\/rss/ =~ namespaceuri

  prefix = @matchnamespaces[namespaceuri]
  name = prefix + ':' + name if prefix && !prefix.empty?

  name.downcase!
  unknown_starttag(name, attrs)
end
50
+
51
# SAX callback for character data: passed straight to handle_data.
def characters(text); handle_data(text); end

# SAX callback for a CDATA section: treated exactly like character data.
def cdata_block(text); handle_data(text); end
58
+
59
# SAX callback for a closing tag.
#
# Mirrors start_element so that the name passed to unknown_endtag is built
# exactly like the one passed to unknown_starttag: an optional
# "namespaceuri;" in front of the local name is split off, the URI is
# mapped to its prefix through @matchnamespaces, and the result is
# lower-cased. The caller's string is not modified.
def end_element(name)
  name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
  namespaceuri = ($2 || '').downcase
  name = $3
  if /backend\.userland\.com\/rss/ =~ namespaceuri
    # match any backend.userland.com namespace
    namespaceuri = 'http://backend.userland.com/rss'
  end

  prefix = @matchnamespaces[namespaceuri]
  if prefix && !prefix.empty?
    name = prefix + ':' + name
  end

  unknown_endtag(name.downcase)
end
72
+
73
# SAX error callback: flags the feed as bozo, remembers the error in @exc
# as a NokogiriSyntaxError and raises it.
def error(error_string)
  @bozo = true
  raise(@exc = NokogiriSyntaxError.new(error_string))
end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,1275 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module FeedParser
4
+ module FeedParserMixin
5
+ include FeedParserUtilities
6
+ attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
7
+
8
# Sets up all parser state. Called by the including parser's initialize.
#
# baseuri  - base URI for resolving relative links ('' when nil)
# baselang - default language; when given it is also stored, with '_'
#            replaced by '-', as the feed's 'language'
# encoding - character encoding of the document
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
  $stderr << "initializing FeedParser\n" if $debug

  # namespace URI => canonical prefix ('' means "core feed element")
  @namespaces = {'' => '',
    'http://backend.userland.com/rss' => '',
    'http://blogs.law.harvard.edu/tech/rss' => '',
    'http://purl.org/rss/1.0/' => '',
    'http://my.netscape.com/rdf/simple/0.9/' => '',
    'http://example.com/newformat#' => '',
    'http://example.com/necho' => '',
    'http://purl.org/echo/' => '',
    'uri/of/echo/namespace#' => '',
    'http://purl.org/pie/' => '',
    'http://purl.org/atom/ns#' => '',
    'http://www.w3.org/2005/Atom' => '',
    'http://purl.org/rss/1.0/modules/rss091#' => '',
    'http://webns.net/mvcb/' => 'admin',
    'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
    'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
    'http://media.tangent.org/rss/1.0/' => 'audio',
    'http://backend.userland.com/blogChannelModule' => 'blogChannel',
    'http://web.resource.org/cc/' => 'cc',
    'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
    'http://purl.org/rss/1.0/modules/company' => 'co',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://purl.org/rss/1.0/modules/email/' => 'email',
    'http://purl.org/rss/1.0/modules/event/' => 'ev',
    'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
    'http://freshmeat.net/rss/fm/' => 'fm',
    'http://xmlns.com/foaf/0.1/' => 'foaf',
    'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
    'http://postneo.com/icbm/' => 'icbm',
    'http://purl.org/rss/1.0/modules/image/' => 'image',
    'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://purl.org/rss/1.0/modules/link/' => 'l',
    'http://search.yahoo.com/mrss' => 'media',
    'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
    'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
    'http://purl.org/rss/1.0/modules/reference/' => 'ref',
    'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
    'http://purl.org/rss/1.0/modules/search/' => 'search',
    'http://purl.org/rss/1.0/modules/slash/' => 'slash',
    'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
    'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
    'http://hacks.benhammersley.com/rss/streaming/' => 'str',
    'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
    'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
    'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
    'http://purl.org/rss/1.0/modules/threading/' => 'thr',
    'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
    'http://madskills.com/public/xml/rss/module/trackback/' =>'trackback',
    'http://wellformedweb.org/commentAPI/' => 'wfw',
    'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
    'http://www.w3.org/1999/xhtml' => 'xhtml',
    'http://www.w3.org/XML/1998/namespace' => 'xml',
    'http://www.w3.org/1999/xlink' => 'xlink',
    'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
  }
  # Same table keyed by the lower-cased URI, for case-insensitive lookups
  @matchnamespaces = {}
  @namespaces.each { |uri, prefix| @matchnamespaces[uri.downcase] = prefix }

  @can_be_relative_uri = %w(link id wfw_comment wfw_commentrss docs url href comments license icon logo)
  @can_contain_relative_uris = %w(content title summary info tagline subtitle copyright rights description)
  @can_contain_dangerous_markup = %w(content title summary info tagline subtitle copyright rights description)
  @html_types = %w(text/html application/xhtml+xml)

  @feeddata = FeedParserDict.new # feed-level data
  @encoding = encoding           # character encoding
  @entries = []                  # list of entry-level data
  @version = ''                  # feed type/version, see SUPPORTED_VERSIONS
  @namespacesInUse = {}          # hash of namespaces defined by the feed

  # the following are used internally to track state;
  # this is really out of control and should be refactored
  @infeed = false
  @inentry = false
  @incontent = 0 # a nesting counter, incremented/decremented by pushContent/popContent
  @intextinput = false
  @inimage = false
  @inauthor = false
  @incontributor = false
  @inpublisher = false
  @insource = false
  @sourcedata = FeedParserDict.new
  @contentparams = FeedParserDict.new
  @summaryKey = nil
  @namespacemap = {}
  @elementstack = []
  @basestack = []
  @langstack = []
  @baseuri = baseuri || ''
  @lang = baselang || nil
  @has_title = false
  @feeddata['language'] = baselang.gsub('_','-') if baselang
  $stderr << "Leaving startup\n" if $debug # My addition
end
112
+
113
# Handles every opening tag.
#
# tag    - element name, optionally "prefix:name"
# attrsd - attributes, either a Hash or a list of [name, value] pairs
#
# Tracks xml:base / xml:lang and namespace declarations, re-serialises
# tags that appear inside inline XHTML content, and finally dispatches to
# the matching _start_<prefix_><name> handler. When no handler exists the
# element is simply pushed on the element stack.
def unknown_starttag(tag, attrsd)
  $stderr << "start #{tag} with #{attrsd.inspect}\n" if $debug
  # LooseFeedParser needs this because SGMLParser sends attrs as a
  # list of lists (like [['type','text/html'],['mode','escaped']])
  attrsd = Hash[*attrsd.flatten] if attrsd.class == Array

  # normalize attrs: lower-case every name, and lower-case the value of
  # 'rel' and 'type' (on a copy -- the caller's strings are left alone)
  attrsD = {}
  attrsd.each do |old_k, value|
    k = old_k.downcase
    value = value.downcase if value && ['rel', 'type'].include?(k)
    attrsD[k] = value
  end

  # track xml:base and xml:lang
  baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
  @baseuri = urljoin(@baseuri, baseuri)
  lang = attrsD['xml:lang'] || attrsD['lang']
  if lang == ''
    # xml:lang could be explicitly set to '', we need to capture that
    lang = nil
  elsif lang.nil?
    # if no xml:lang is specified, use parent lang
    lang = @lang
  end

  if lang && ! lang.empty?
    if ['feed', 'rss', 'rdf:RDF'].include?(tag)
      @feeddata['language'] = lang.gsub('_','-')
    end
  end
  @lang = lang
  @basestack << @baseuri
  @langstack << lang

  # track namespaces
  attrsd.each do |prefix, uri|
    if /^xmlns:/ =~ prefix # prefix begins with xmlns:
      trackNamespace(prefix[6..-1], uri)
    elsif prefix == 'xmlns'
      trackNamespace(nil, uri)
    end
  end

  # track inline content
  if @incontent != 0 && @contentparams.has_key?('type') && ! ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 && @contentparams['type'] == 'application/xhtml+xml'
    # Note: probably shouldn't simply recreate localname here, but
    # our namespace handling isn't actually 100% correct in cases where
    # the feed redefines the default namespace (which is actually
    # the usual case for inline content, thanks Sam), so here we
    # cheat and just reconstruct the element based on localname
    # because that compensates for the bugs in our namespace handling.
    # This will horribly munge inline content with non-empty qnames,
    # but nobody actually does that, so I'm not fixing it.
    tag = tag.split(':')[-1]
    # Each attribute brings its own leading space, so a tag without
    # attributes is rebuilt as "<p>" rather than "<p >".
    attrsS = attrsd.to_a.collect { |l| " #{l[0]}=\"#{l[1]}\"" }.join('')
    return handle_data("<#{tag}#{attrsS}>", false)
  end

  # match namespaces
  if /:/ =~ tag
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix && ! prefix.empty?
    prefix = prefix + '_'
  end

  # special hack for better tracking of empty textinput/image elements in illformed feeds
  unprefixed = prefix.nil? || prefix.empty?
  if unprefixed && ! ['title', 'link', 'description', 'name'].include?(tag)
    @intextinput = false
  end
  if unprefixed && ! ['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?(tag)
    @inimage = false
  end

  # call special handler (if defined) or default handler.
  # NOTE(review): the rescue also catches a NoMethodError raised inside a
  # handler, not only a missing handler; kept as-is because callers may
  # depend on that leniency.
  begin
    return send('_start_'+prefix+suffix, attrsD)
  rescue NoMethodError
    return push(prefix + suffix, true)
  end
end # End unknown_starttag
207
+
208
# Handles every closing tag.
#
# Dispatches to the matching _end_<prefix_><name> handler (or pops the
# element when there is none), re-serialises closing tags that appear
# inside inline XHTML content, and restores the xml:base / xml:lang that
# were in effect before the element was opened.
def unknown_endtag(tag)
  $stderr << "end #{tag}\n" if $debug
  # match namespaces
  if tag.index(':')
    prefix, suffix = tag.split(':',2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix && ! prefix.empty?
    prefix = prefix + '_'
  end

  # call special handler (if defined) or default handler
  begin
    send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
  rescue NoMethodError
    pop(prefix + suffix)
  end

  # track inline content
  # Same test as in unknown_starttag: a declared type that does NOT end in
  # "xml" but contains real markup is treated as XHTML.
  if @incontent != 0 && @contentparams.has_key?('type') && ! ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but it isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 && @contentparams['type'] == 'application/xhtml+xml'
    tag = tag.split(':')[-1]
    handle_data("</#{tag}>", false)
  end

  # track xml:base and xml:lang going out of scope
  if @basestack && ! @basestack.empty?
    @basestack.pop
    if @basestack && @basestack[-1] && ! (@basestack.empty? || @basestack[-1].empty?)
      @baseuri = @basestack[-1]
    end
  end
  if @langstack && ! @langstack.empty?
    @langstack.pop
    if @langstack && ! @langstack.empty?
      @lang = @langstack[-1]
    end
  end
end
252
+
253
# LooseParserOnly
# Called for each character reference, e.g. for '&#160;', ref will be '160'.
#
# References to the five XML-special characters (" & ' < >) are kept as
# references; every other one is replaced by the UTF-8 encoding of its
# code point. The text is appended to the element on top of the stack, and
# nothing happens when the stack is empty. +ref+ itself is not modified.
def handle_charref(ref)
  $stderr << "entering handle_charref with #{ref}\n" if $debug
  return if @elementstack.nil? || @elementstack.empty?

  ref = ref.downcase
  chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
  if chars.include?(ref)
    text = '&#' + ref + ';'
  else
    # a leading 'x' marks a hexadecimal reference
    c = ref[0..0] == 'x' ? ref[1..-1].to_i(16) : ref.to_i
    text = [c].pack('U*')
  end
  @elementstack[-1][2] << text
end
273
+
274
# LooseParserOnly
# Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
#
# lt, gt, quot, amp and apos are kept as entity references; anything else
# is decoded through HTMLEntities. The text is appended to the element on
# top of the stack; nothing happens when the stack is empty.
def handle_entityref(ref)
  return if @elementstack.nil? || @elementstack.empty?
  $stderr << "entering handle_entityref with #{ref}\n" if $debug

  text = if %w(lt gt quot amp apos).include?(ref)
    "&#{ref};"
  else
    HTMLEntities::decode_entities("&#{ref};")
  end
  @elementstack[-1][2] << text
end
288
+
289
# Called for each block of plain text, i.e. outside of any tag and not
# containing any character or entity references.
#
# Appends +text+ to the element on top of the stack. When +escape+ is true
# and the current content type is 'application/xhtml+xml', the text is
# first passed through String#to_xs.
def handle_data(text, escape=true)
  return if @elementstack.nil? || @elementstack.empty?
  text = text.to_xs if escape && @contentparams['type'] == 'application/xhtml+xml'
  @elementstack.last[2] << text
end
298
+
299
# Called for each comment, e.g. <!-- insert message here -->.
# Comments are ignored.
def handle_comment(comment); end

# Called with the text of a processing instruction, which is ignored.
def handle_pi(text); end

# Called with the text of a declaration, which is ignored.
def handle_decl(text); end
308
+
309
# for LooseFeedParser
# Parses the declaration that starts at index +i+ of @rawdata and returns
# the index at which parsing should continue.
#
# A CDATA section has its body (XML-escaped with String#to_xs) passed to
# handle_data; an unterminated section runs to the end of the data. Any
# other declaration is skipped up to and including the next '>'.
def parse_declaration(i)
  $stderr << "entering parse_declaration\n" if $debug
  unless @rawdata[i...i+9] == '<![CDATA['
    # a missing '>' makes index return nil, and nil.to_i is 0
    return @rawdata.index(/>/,i).to_i + 1
  end

  k = @rawdata.index(/\]\]>/u,i+9) || @rawdata.length
  handle_data(@rawdata[i+9...k].to_xs,false)
  k + 3
end
322
+
323
# Expands the Atom shorthand content types to full MIME types.
#
# 'text' => 'text/plain', 'html' => 'text/html',
# 'xhtml' => 'application/xhtml+xml'; anything else is returned
# lower-cased. The comparison is case-insensitive and the argument itself
# is not modified, so frozen strings and strings owned by the caller are
# safe to pass.
def mapContentType(contentType)
  normalized = contentType.downcase
  case normalized
  when 'text'
    'text/plain'
  when 'html'
    'text/html'
  when 'xhtml'
    'application/xhtml+xml'
  else
    normalized
  end
end
335
+
336
# Records a namespace declaration (xmlns / xmlns:prefix).
#
# While no feed version is known yet, a few well-known namespace URIs
# decide it (rss090, rss10, atom10). Any backend.userland.com/rss URI is
# normalised to 'http://backend.userland.com/rss'. Known namespaces are
# entered in @namespacemap (declared prefix => canonical prefix) and in
# @namespacesInUse under the canonical prefix; unknown ones are entered
# in @namespacesInUse under the declared prefix ('' for the default).
def trackNamespace(prefix, uri)
  loweruri = uri.downcase.strip
  version_unknown = @version.nil? || @version.empty?

  if version_unknown && [prefix, loweruri] == [nil, 'http://my.netscape.com/rdf/simple/0.9/']
    @version = 'rss090'
  elsif version_unknown && loweruri == 'http://purl.org/rss/1.0/'
    @version = 'rss10'
  elsif version_unknown && loweruri == 'http://www.w3.org/2005/atom'
    @version = 'atom10'
  elsif /backend\.userland\.com\/rss/ =~ loweruri
    # match any backend.userland.com namespace
    uri = 'http://backend.userland.com/rss'
    loweruri = uri
  end

  unless @matchnamespaces.has_key? loweruri
    return @namespacesInUse[prefix || ''] = uri
  end
  @namespacemap[prefix] = @matchnamespaces[loweruri]
  @namespacesInUse[@matchnamespaces[loweruri]] = uri
end
357
+
358
# Joins +uri+ onto the current base URI ('' when none is set).
def resolveURI(uri)
  base = @baseuri || ''
  urljoin(base, uri)
end
361
+
362
+ def decodeEntities(element, data)
363
+ return data
364
+ end
365
+
366
# Opens +element+: appends [element, expectingText, pieces] to the element
# stack, with an empty list of text pieces.
def push(element, expectingText)
  @elementstack.push([element, expectingText, []])
end
369
+
370
# Closes +element+ and returns its accumulated text.
#
# Returns nil when the stack is empty or +element+ is not the one on top.
# Otherwise the pieces are joined (and stripped unless stripWhitespace is
# false). If the element was pushed with expectingText false, the text is
# returned right away. For text elements the value is then, in order:
# base64-decoded, resolved against the base URI, entity-decoded, has its
# embedded relative URIs resolved, sanitized and converted to utf-8 --
# each step only when applicable -- and finally stored in the current
# entry or feed context.
def pop(element, stripWhitespace=true)
  return if @elementstack.nil? || @elementstack.empty?
  return unless @elementstack[-1][0] == element
  element, expectingText, pieces = @elementstack.pop

  # pieces is normally a list of strings, but _cdf_common stores a plain
  # string there
  output = pieces.class == Array ? pieces.join('') : pieces
  output.strip! if stripWhitespace
  return output unless expectingText

  # decode base64 content
  if @contentparams['base64']
    decoded = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
    output = decoded unless output.empty? || decoded.empty?
  end

  # resolve relative URIs
  if @can_be_relative_uri.include?(element) && output && !output.empty?
    output = resolveURI(output)
  end

  # decode entities within embedded markup
  output = decodeEntities(element, output) unless @contentparams['base64']

  # remove temporary cruft from contentparams
  @contentparams.delete('mode')
  @contentparams.delete('base64')

  # resolve relative URIs within embedded markup
  if @html_types.include?(mapContentType(@contentparams['type'] || 'text/html')) &&
     @can_contain_relative_uris.include?(element)
    output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
  end
  # sanitize embedded markup
  if @html_types.include?(mapContentType(@contentparams['type'] || 'text/html')) &&
     @can_contain_dangerous_markup.include?(element)
    output = FeedParser.sanitizeHTML(output, @encoding)
  end

  if @encoding && ! @encoding.empty? && @encoding != 'utf-8'
    output = uconvert(output, @encoding, 'utf-8')
    # FIXME I turn everything into utf-8, not unicode, originally because REXML was being used but now because I haven't tested it out yet.
  end

  # categories/tags/keywords/whatever are handled in _end_category
  return output if element == 'category'

  return output if element == 'title' && @has_title

  # Deep copy of the current content parameters with the value attached
  detail_for = lambda do |value|
    detail = Marshal.load(Marshal.dump(@contentparams))
    detail['value'] = value
    detail
  end

  # store output in appropriate place(s)
  if @inentry && ! @insource
    entry = @entries[-1]
    case element
    when 'content'
      entry[element] ||= []
      entry[element] << detail_for.call(output)
    when 'link'
      entry[element] = output
      entry['links'][-1]['href'] = output if output && ! output.empty?
    else
      element = 'summary' if element == 'description'
      entry[element] = output
      entry[element + '_detail'] = detail_for.call(output) if @incontent != 0
    end
  elsif (@infeed || @insource) && ! @intextinput && ! @inimage
    context = getContext()
    element = 'subtitle' if element == 'description'
    context[element] = output
    if element == 'link'
      context['links'][-1]['href'] = output
    elsif @incontent != 0
      context[element + '_detail'] = detail_for.call(output)
    end
  end

  output
end
466
+
467
# Opens a content-bearing element.
#
# Increments the @incontent counter, replaces @contentparams with a fresh
# dict holding the mapped content type, the current language and base URI
# plus a 'base64' flag, and pushes +tag+ on the element stack.
def pushContent(tag, attrsD, defaultContentType, expectingText)
  @incontent += 1 # Yes, I hate this.
  content_type = mapContentType(attrsD['type'] || defaultContentType)
  @contentparams = FeedParserDict.new({'type' => content_type,'language' => @lang,'base' => @baseuri})
  @contentparams['base64'] = isBase64(attrsD, @contentparams)
  push(tag, expectingText)
end
474
+
475
# Closes a content-bearing element: pops +tag+, decrements the @incontent
# counter, empties @contentparams and returns the popped value.
def popContent(tag)
  popped = pop(tag)
  @incontent -= 1
  @contentparams.clear
  popped
end
481
+
482
# Rewrites the prefix of a "prefix:suffix" name to the canonical prefix
# recorded in @namespacemap; unknown prefixes and names without a colon
# are returned unchanged. Only the first colon separates prefix from
# suffix.
def mapToStandardPrefix(name)
  return name unless name.index(':')

  # split (rather than slicing up to colonpos-1) so that a name starting
  # with ':' gets an empty prefix instead of the whole string
  prefix, suffix = name.split(':', 2)
  prefix = @namespacemap[prefix] || prefix
  prefix + ':' + suffix
end
492
+
493
# Fetches attribute +name+ from +attrsD+ after mapping its prefix to the
# canonical one.
def getAttribute(attrsD, name)
  attrsD[mapToStandardPrefix(name)]
end
496
+
497
# Decides whether an element's content is base64-encoded: true when the
# 'mode' attribute says so, false for text/*, */xml and *+xml types, and
# true for everything else.
def isBase64(attrsD, contentparams)
  textual = /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
  attrsD['mode'] == 'base64' || !textual
end
504
+
505
# Normalises the link attribute name: whichever of 'url', 'uri' or 'href'
# is present (checked in that order) ends up under 'href', and 'url' and
# 'uri' are removed. +attrsD+ is modified in place and returned.
def itsAnHrefDamnIt(attrsD)
  href = attrsD['url'] || attrsD['uri'] || attrsD['href']
  return attrsD unless href

  ['url', 'uri'].each { |legacy| attrsD.delete(legacy) }
  attrsD['href'] = href
  attrsD
end
514
+
515
+
516
# Stores +value+ under +key+ in the current context unless a truthy value
# is already there; returns whatever ends up stored.
def _save(key, value)
  target = getContext()
  target[key] || (target[key] = value)
end
520
+
521
# <rss>: derives the feed version from the 'version' attribute unless a
# version is already known. 0.91-0.94 map to rss091u/rss092/rss093/rss094,
# any 2.x becomes 'rss20' and everything else plain 'rss'.
def _start_rss(attrsD)
  return if @version && ! @version.empty?

  declared = attrsD['version'] || ''
  known = {
    '0.91' => 'rss091u',
    '0.92' => 'rss092',
    '0.93' => 'rss093',
    '0.94' => 'rss094'
  }[declared]

  @version =
    if known && ! known.empty?
      known
    elsif /^2\./ =~ declared
      'rss20'
    else
      'rss'
    end
end
540
+
541
# <dlhottitles>: marks the feed version as 'hotrss' (overriding any
# earlier value).
def _start_dlhottitles(attrsD)
  @version = 'hotrss'
end
544
+
545
# <channel> / <feedinfo>: enters feed-level state and applies the CDF
# attributes handled by _cdf_common.
def _start_channel(attrsD)
  @infeed = true
  _cdf_common(attrsD)
end
alias :_start_feedinfo :_start_channel
550
+
551
# Turns the CDF attributes 'lastmod' and 'href' into the equivalent child
# elements: each one is replayed as a start handler, the attribute value
# placed directly in the pieces slot of the element just opened, and the
# matching end handler.
def _cdf_common(attrsD)
  if attrsD.has_key?('lastmod')
    _start_modified({})
    @elementstack.last[-1] = attrsD['lastmod']
    _end_modified
  end
  if attrsD.has_key?('href')
    _start_link({})
    @elementstack.last[-1] = attrsD['href']
    _end_link
  end
end
563
+
564
# <feed>: enters feed-level state and, unless a version is already known,
# derives the Atom version from the 'version' attribute: 0.1/0.2/0.3 map
# to atom01/atom02/atom03 and anything else becomes plain 'atom'.
def _start_feed(attrsD)
  @infeed = true
  versionmap = {'0.1' => 'atom01',
    '0.2' => 'atom02',
    '0.3' => 'atom03'
  }

  if ! @version || @version.empty?
    attr_version = attrsD['version']
    version = versionmap[attr_version]
    # Test the looked-up version, not @version (which is known to be
    # empty inside this branch).
    if version && ! version.empty?
      @version = version
    else
      @version = 'atom'
    end
  end
end
581
+
582
# </channel> / </feed>: leaves feed-level state.
def _end_channel
  @infeed = false
end
alias :_end_feed :_end_channel
586
+
587
# <image>: enters image state, resets the title flag, opens the element
# and makes sure the current context has an 'image' dict.
def _start_image(attrsD)
  @inimage = true
  @has_title = false
  push('image', false)
  ctx = getContext()
  ctx['image'] || (ctx['image'] = FeedParserDict.new)
end
594
+
595
# </image>: closes the element and leaves image state.
def _end_image
  pop('image')
  @inimage = false
end
599
+
600
# <textinput> / <textInput>: enters text-input state, resets the title
# flag, opens the element and makes sure the current context has a
# 'textinput' dict.
def _start_textinput(attrsD)
  @intextinput = true
  @has_title = false
  push('textinput', false)
  ctx = getContext()
  ctx['textinput'] || (ctx['textinput'] = FeedParserDict.new)
end
alias :_start_textInput :_start_textinput
608
+
609
# </textinput> / </textInput>: closes the element and leaves text-input
# state.
def _end_textinput
  pop('textinput')
  @intextinput = false
end
alias :_end_textInput :_end_textinput
614
+
615
# <author> and its synonyms (managingEditor, dc:author, dc:creator,
# itunes:author): enters author state and opens an 'author' text element.
def _start_author(attrsD)
  @inauthor = true
  push('author', true)
end
alias :_start_managingeditor :_start_author
alias :_start_dc_author :_start_author
alias :_start_dc_creator :_start_author
alias :_start_itunes_author :_start_author
623
+
624
# </author> and its synonyms: closes the element, leaves author state and
# calls _sync_author_detail.
def _end_author
  pop('author')
  @inauthor = false
  _sync_author_detail
end
alias :_end_managingeditor :_end_author
alias :_end_dc_author :_end_author
alias :_end_dc_creator :_end_author
alias :_end_itunes_author :_end_author
633
+
634
# <itunes:owner>: enters publisher state and opens a 'publisher' element
# that collects no text of its own.
def _start_itunes_owner(attrsD)
  @inpublisher = true
  push('publisher', false)
end
638
+
639
# Closing handler for <itunes:owner>: pops the 'publisher' element, clears the
# in-publisher flag and synchronises the 'publisher' string/detail pair.
def _end_itunes_owner
  pop('publisher')
  @inpublisher = false
  _sync_author_detail('publisher')
end
644
+
645
# Opening handler for <contributor>: sets the in-contributor flag, appends a
# fresh dictionary to the context's 'contributors' list (creating the list if
# needed) and pushes a 'contributor' element.
def _start_contributor(attrsD)
  @incontributor = true
  ctx = getContext
  ctx['contributors'] ||= []
  ctx['contributors'] << FeedParserDict.new
  push('contributor', false)
end
652
+
653
# Closing handler for <contributor>: pops the element, then clears the
# in-contributor flag.
def _end_contributor
  pop('contributor')
  @incontributor = false
end
657
+
658
# Opening handler for <dc:contributor>: like _start_contributor it registers a
# new contributor dictionary, but it pushes a 'name' element because the
# element's text is handled by _end_name (see _end_dc_contributor).
def _start_dc_contributor(attrsD)
  @incontributor = true
  ctx = getContext
  ctx['contributors'] ||= []
  ctx['contributors'] << FeedParserDict.new
  push('name', false)
end
665
+
666
# Closing handler for <dc:contributor>: delegates to _end_name while the
# in-contributor flag is still set, then clears that flag.
def _end_dc_contributor
  _end_name
  @incontributor = false
end
670
+
671
# Opening handler for <name> / <itunes:name>: pushes a 'name' element.
def _start_name(attrsD)
  push('name', false)
end
alias _start_itunes_name _start_name
675
+
676
# Closing handler for <name> / <itunes:name>: pops the element and stores its
# value according to the enclosing construct. Precedence is publisher, then
# author, then contributor, then textinput; outside all of them the value is
# discarded.
def _end_name
  name = pop('name')
  if @inpublisher
    _save_author('name', name, 'publisher')
  elsif @inauthor
    _save_author('name', name)
  elsif @incontributor
    _save_contributor('name', name)
  elsif @intextinput
    getContext['textinput']['name'] = name
  end
end
alias _end_itunes_name _end_name
690
+
691
# Opening handler for <width>: pushes a 'width' element.
def _start_width(attrsD)
  push('width', false)
end
694
+
695
# Closing handler for <width>: pops the element, converts it with #to_i and,
# when inside an image, stores it as the image's 'width'.
def _end_width
  width = pop('width').to_i
  getContext['image']['width'] = width if @inimage
end
702
+
703
# Opening handler for <height>: pushes a 'height' element.
def _start_height(attrsD)
  push('height', false)
end
706
+
707
# Closing handler for <height>: pops the element, converts it with #to_i and,
# when inside an image, stores it as the image's 'height'.
def _end_height
  height = pop('height').to_i
  getContext['image']['height'] = height if @inimage
end
714
+
715
# Opening handler for <url>, <homepage> and <uri>: pushes an 'href' element.
def _start_url(attrsD)
  push('href', true)
end
alias _start_homepage _start_url
alias _start_uri _start_url
720
+
721
# Closing handler for <url>, <homepage> and <uri>: pops the 'href' element and
# stores it according to the enclosing construct. Precedence is author,
# contributor, image ('href'), then textinput ('link').
def _end_url
  href = pop('href')
  if @inauthor
    _save_author('href', href)
  elsif @incontributor
    _save_contributor('href', href)
  elsif @inimage
    getContext['image']['href'] = href
  elsif @intextinput
    getContext['textinput']['link'] = href
  end
end
alias _end_homepage _end_url
alias _end_uri _end_url
737
+
738
# Opening handler for <email> / <itunes:email>: pushes an 'email' element.
def _start_email(attrsD)
  push('email', false)
end
alias _start_itunes_email _start_email
742
+
743
# Closing handler for <email> / <itunes:email>: pops the element and stores it
# on the publisher, author or contributor being parsed (in that precedence).
def _end_email
  address = pop('email')
  if @inpublisher
    _save_author('email', address, 'publisher')
  elsif @inauthor
    _save_author('email', address)
  elsif @incontributor
    _save_contributor('email', address)
  end
end
alias _end_itunes_email _end_email
754
+
755
# Returns the dictionary that parsed values should currently be written to:
# the source data while inside <source>, otherwise the most recent entry while
# inside an entry, otherwise the feed-level data.
def getContext
  return @sourcedata if @insource
  return @entries[-1] if @inentry
  @feeddata
end
765
+
766
# Stores one field of a person construct in the current context.
#
# key    - field name inside the detail dictionary (e.g. 'name', 'email').
# value  - value to store.
# prefix - which construct to write to; the detail lives under
#          "<prefix>_detail" (default 'author').
#
# Afterwards _sync_author_detail is invoked with its default key, regardless
# of the prefix that was given.
def _save_author(key, value, prefix='author')
  detail_key = prefix + '_detail'
  ctx = getContext
  ctx[detail_key] ||= FeedParserDict.new
  ctx[detail_key][key] = value
  _sync_author_detail
end
772
+
773
# Stores key/value on the most recently added contributor of the current
# context; if no contributors list exists yet, one holding a single empty
# dictionary is created first.
def _save_contributor(key, value)
  ctx = getContext
  ctx['contributors'] ||= [FeedParserDict.new]
  ctx['contributors'][-1][key] = value
end
778
+
779
# Keeps the flat person string (context[key]) and its structured counterpart
# (context["<key>_detail"]) consistent with each other.
#
# key - base key of the person construct, 'author' by default.
#
# * When a non-empty detail dictionary exists, the flat string is rebuilt from
#   it: "name (email)" if both are present, otherwise whichever one is.
# * Otherwise the flat string is scanned for an e-mail address; if one is
#   found, it is split into 'name' and 'email' and stored in the detail
#   dictionary. Strings without an e-mail address are left untouched.
def _sync_author_detail(key='author')
  context = getContext
  detail_key = "#{key}_detail"
  detail = context[detail_key]
  if detail && !detail.empty?
    name = detail['name']
    email = detail['email']
    have_name = name && !name.empty?
    # The e-mail must be checked for emptiness too, otherwise a blank address
    # yields strings such as "Ann ()".
    have_email = email && !email.empty?

    if have_name && have_email
      context[key] = "#{name} (#{email})"
    elsif have_name
      context[key] = name
    elsif have_email
      context[key] = email
    end
  else
    author = context[key]
    return if author.nil? || author.empty?
    author = author.dup # work on a copy; the stored string is not modified
    emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
    # No address in the string: nothing to split (and nil[1] would raise).
    return unless emailmatch
    email = emailmatch[1]
    # Strip the address and any parentheses that were wrapping it or the name.
    author.gsub!(email, '')
    author.gsub!('()', '')
    author.strip!
    author.gsub!(/^\(/, '')
    author.gsub!(/\)$/, '')
    author.strip!
    context[detail_key] ||= FeedParserDict.new
    context[detail_key]['name'] = author
    context[detail_key]['email'] = email
  end
end
809
+
810
# Opening handler for <subtitle>, <tagline> and <itunes:subtitle>: begins a
# 'subtitle' content element with a default type of text/plain.
def _start_subtitle(attrsD)
  pushContent('subtitle', attrsD, 'text/plain', true)
end
alias _start_tagline _start_subtitle
alias _start_itunes_subtitle _start_subtitle
815
+
816
# Closing handler for <subtitle>, <tagline> and <itunes:subtitle>: finishes
# the 'subtitle' content element.
def _end_subtitle
  popContent('subtitle')
end
alias _end_tagline _end_subtitle
alias _end_itunes_subtitle _end_subtitle
821
+
822
# Opening handler for <rights>, <dc:rights> and <copyright>: begins a 'rights'
# content element with a default type of text/plain.
def _start_rights(attrsD)
  pushContent('rights', attrsD, 'text/plain', true)
end
alias _start_dc_rights _start_rights
alias _start_copyright _start_rights
827
+
828
# Closing handler for <rights>, <dc:rights> and <copyright>: finishes the
# 'rights' content element.
def _end_rights
  popContent('rights')
end
alias _end_dc_rights _end_rights
alias _end_copyright _end_rights
833
+
834
# Opening handler for <item>, <entry> and <product>: appends a new entry
# dictionary, pushes an 'item' element, enters entry mode and resets the
# per-entry flags. A non-empty rdf:about attribute becomes the entry's 'id'.
# Finally the attributes are passed to _cdf_common.
def _start_item(attrsD)
  @entries << FeedParserDict.new
  push('item', false)
  @inentry = true
  @has_title = false
  @guidislink = false
  about = getAttribute(attrsD, 'rdf:about')
  getContext['id'] = about if about && !about.empty?
  _cdf_common(attrsD)
end
alias _start_entry _start_item
alias _start_product _start_item
849
+
850
# Closing handler for <item> / <entry>: pops the 'item' element and leaves
# entry mode.
def _end_item
  pop('item')
  @inentry = false
end
alias _end_entry _end_item
855
+
856
# Opening handler for <dc:language> / <language>: pushes a 'language' element.
def _start_dc_language(attrsD)
  push('language', true)
end
alias _start_language _start_dc_language
860
+
861
# Closing handler for <dc:language> / <language>: pops the element and
# remembers its value in @lang.
def _end_dc_language
  @lang = pop('language')
end
alias _end_language _end_dc_language
865
+
866
# Opening handler for <dc:publisher> / <webmaster>: pushes a 'publisher'
# element.
def _start_dc_publisher(attrsD)
  push('publisher', true)
end
alias _start_webmaster _start_dc_publisher
870
+
871
# Closing handler for <dc:publisher> / <webmaster>: pops the 'publisher'
# element and synchronises the 'publisher' string/detail pair.
def _end_dc_publisher
  pop('publisher')
  _sync_author_detail('publisher')
end
alias _end_webmaster _end_dc_publisher
876
+
877
# Opening handler for <published>, <dcterms:issued> and <issued>: pushes a
# 'published' element.
def _start_published(attrsD)
  push('published', true)
end
alias _start_dcterms_issued _start_published
alias _start_issued _start_published
882
+
883
# Closing handler for <published>, <dcterms:issued> and <issued>: pops the
# element, parses it with parse_date and saves both the extract_tuple form
# ('published_parsed') and the parsed value itself ('published_time').
def _end_published
  parsed = parse_date(pop('published'))
  _save('published_parsed', extract_tuple(parsed))
  _save('published_time', parsed)
end
alias _end_dcterms_issued _end_published
alias _end_issued _end_published
891
+
892
# Opening handler for <updated>, <modified>, <dcterms:modified>, <pubdate> and
# <dc:date>: pushes an 'updated' element.
def _start_updated(attrsD)
  push('updated', true)
end
alias _start_modified _start_updated
alias _start_dcterms_modified _start_updated
alias _start_pubdate _start_updated
alias _start_dc_date _start_updated
899
+
900
# Closing handler for <updated> and its aliases: pops the element, parses it
# with parse_date and saves both the extract_tuple form ('updated_parsed') and
# the parsed value itself ('updated_time').
def _end_updated
  parsed = parse_date(pop('updated'))
  _save('updated_parsed', extract_tuple(parsed))
  _save('updated_time', parsed)
end
alias _end_modified _end_updated
alias _end_dcterms_modified _end_updated
alias _end_pubdate _end_updated
alias _end_dc_date _end_updated
910
+
911
# Opening handler for <created> / <dcterms:created>: pushes a 'created'
# element.
def _start_created(attrsD)
  push('created', true)
end
alias _start_dcterms_created _start_created
915
+
916
# Closing handler for <created> / <dcterms:created>: pops the element, parses
# it with parse_date and saves both the extract_tuple form ('created_parsed')
# and the parsed value itself ('created_time').
def _end_created
  parsed = parse_date(pop('created'))
  _save('created_parsed', extract_tuple(parsed))
  _save('created_time', parsed)
end
alias _end_dcterms_created _end_created
923
+
924
# Opening handler for <expirationdate>: pushes an 'expired' element.
def _start_expirationdate(attrsD)
  push('expired', true)
end
927
# Closing handler for <expirationdate>: pops the 'expired' element, parses it
# with parse_date and saves both the extract_tuple form ('expired_parsed') and
# the parsed value itself ('expired_time').
def _end_expirationdate
  raw = pop('expired')
  parsed = parse_date(raw)
  _save('expired_parsed', extract_tuple(parsed))
  _save('expired_time', parsed)
end
932
+
933
# Handler for <cc:license>, whose value is carried by its rdf:resource
# attribute rather than by element text.
#
# A 'license' element is pushed, the attribute value (if any) is appended to
# that element's collected pieces, and the element is popped again right away.
# The pop is unconditional, as in the sibling _start_admin_* handlers:
# no _end_cc_license handler is defined alongside this one, so an element left
# on the stack when the attribute is missing would have nothing to pop it.
def _start_cc_license(attrsD)
  push('license', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  if resource && !resource.empty?
    @elementstack[-1][2] << resource
  end
  pop('license')
end
941
+
942
# Opening handler for <creativeCommons:license>: pushes a 'license' element.
def _start_creativecommons_license(attrsD)
  push('license', true)
end
945
+
946
# Closing handler for <creativeCommons:license>: pops the 'license' element.
def _end_creativecommons_license
  pop('license')
end
949
+
950
# Adds a tag to the current context's 'tags' list unless an equal tag is
# already present.
#
# term, scheme, label - the tag's fields; each may be nil or empty.
#
# The 'tags' list is always created if missing. Nothing is appended when all
# three fields are nil or empty.
def addTag(term, scheme, label)
  context = getContext
  context['tags'] ||= []
  tags = context['tags']
  blank = lambda { |field| field.nil? || field.empty? }
  return if blank.call(term) && blank.call(scheme) && blank.call(label)

  # Build the tag once and reuse it for both the duplicate check and the
  # append (it used to be constructed twice).
  tag = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  tags << tag unless tags.include?(tag)
end
962
+
963
# Opening handler for <category>, <dc:subject> and <keywords>: registers a tag
# built from the 'term', 'scheme' (falling back to 'domain') and 'label'
# attributes, then pushes a 'category' element.
def _start_category(attrsD)
  $stderr << "entering _start_category with #{attrsD}\n" if $debug

  addTag(attrsD['term'], attrsD['scheme'] || attrsD['domain'], attrsD['label'])
  push('category', true)
end
alias _start_dc_subject _start_category
alias _start_keywords _start_category
974
+
975
# Closing handler for <itunes:keywords>: pops the element, splits its text on
# whitespace and registers each word as a tag in the iTunes scheme.
def _end_itunes_keywords
  # to_s guards against pop returning nil (other handlers in this file check
  # pop's result for nil), in which case no tags are added.
  pop('itunes_keywords').to_s.split.each do |term|
    addTag(term, 'http://www.itunes.com/', nil)
  end
end
980
+
981
# Opening handler for <itunes:category>: registers the 'text' attribute as a
# tag in the iTunes scheme, then pushes a 'category' element.
def _start_itunes_category(attrsD)
  addTag(attrsD['text'], 'http://www.itunes.com/', nil)
  push('category', true)
end
985
+
986
# Closing handler for <category>, <dc:subject>, <keywords> and
# <itunes:category>: pops the element text and turns it into a tag term.
#
# If the most recently added tag has no term yet, the text completes that tag;
# otherwise a new tag with only a term is added. Empty text is ignored.
def _end_category
  value = pop('category')
  return if value.nil? || value.empty?

  tags = getContext['tags']
  # tags may be missing if no start handler created the list; in that case
  # fall through to addTag, which creates it.
  if tags && !tags.empty? && !tags[-1]['term']
    tags[-1]['term'] = value
  else
    addTag(value, nil, nil)
  end
end
alias _end_dc_subject _end_category
alias _end_keywords _end_category
alias _end_itunes_category _end_category
1000
+
1001
# Handler for <cloud>: stores the element's attributes as the context's
# 'cloud' dictionary.
def _start_cloud(attrsD)
  cloud = FeedParserDict.new(attrsD)
  getContext['cloud'] = cloud
end
1004
+
1005
# Opening handler for <link> / <producturl>.
#
# Defaults 'rel' to 'alternate' and 'type' to 'text/html' (written into the
# attribute hash that was passed in), normalises the attributes through
# itsAnHrefDamnIt, resolves any 'href', and records the link in the context's
# 'links' list. Enclosure links are additionally handed to _start_enclosure.
# A link carrying an href is complete: if it is an HTML alternate it also
# becomes the context's 'link'. A link without href gets a 'link' element
# pushed so its text can be collected.
def _start_link(attrsD)
  attrsD['rel'] ||= 'alternate'
  attrsD['type'] ||= 'text/html'
  attrsD = itsAnHrefDamnIt(attrsD)
  attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')

  expecting_text = @infeed || @inentry || @insource
  ctx = getContext
  ctx['links'] ||= []
  ctx['links'] << FeedParserDict.new(attrsD)
  _start_enclosure(attrsD) if attrsD['rel'] == 'enclosure'

  if attrsD.has_key?('href')
    html_alternate = (attrsD['rel'] == 'alternate') && @html_types.include?(mapContentType(attrsD['type']))
    ctx['link'] = attrsD['href'] if html_alternate
  else
    push('link', expecting_text)
  end
end
alias _start_producturl _start_link
1029
+
1030
# Closing handler for <link> / <producturl>: pops the 'link' element and, when
# inside a textinput and/or an image, stores the value as that construct's
# 'link'.
def _end_link
  link = pop('link')
  ctx = getContext
  ctx['textinput']['link'] = link if @intextinput
  ctx['image']['link'] = link if @inimage
end
alias _end_producturl _end_link
1041
+
1042
# Opening handler for <guid>: records whether the guid doubles as a permalink
# (the 'ispermalink' attribute is absent or equals 'true') and pushes an 'id'
# element.
def _start_guid(attrsD)
  permalink = attrsD['ispermalink'] || 'true'
  @guidislink = (permalink == 'true')
  push('id', true)
end
1046
+
1047
# Closing handler for <guid>: pops the 'id' element, saves a 'guidislink' flag
# and, when the guid is a permalink, also saves it as 'link'.
#
# 'guidislink' is true only when the guid is a permalink AND the context has
# no 'link' yet.
def _end_guid
  value = pop('id')
  _save('guidislink', (@guidislink && !getContext.has_key?('link')))
  # The Ruby 1.8-only trailing ':' after the condition has been removed; it is
  # a syntax error on Ruby 1.9 and later.
  if @guidislink
    # guid acts as link, but only if 'ispermalink' is not present or is 'true',
    # and only if the item doesn't already have a link element
    _save('link', value)
  end
end
1056
+
1057
+
1058
# Opening handler for <title>, <dc:title> and <media:title>: begins a 'title'
# content element (default type text/plain). Text is expected only while
# inside a feed, entry or source.
def _start_title(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('title', attrsD, 'text/plain', expecting_text)
end
alias _start_dc_title _start_title
alias _start_media_title _start_title
1063
+
1064
# Closing handler for <title> / <dc:title>: finishes the 'title' content
# element, copies the value into the enclosing textinput or image (textinput
# takes precedence) and marks that a title has been seen.
def _end_title
  title = popContent('title')
  ctx = getContext
  holder = if @intextinput then 'textinput' elsif @inimage then 'image' end
  ctx[holder]['title'] = title if holder
  @has_title = true
end
alias _end_dc_title _end_title
1075
+
1076
# Closing handler for <media:title>: behaves like _end_title but restores the
# previous value of the title-seen flag afterwards.
def _end_media_title
  had_title = @has_title
  _end_title
  @has_title = had_title
end
1081
+
1082
# Opening handler for <description>. If the context already has a 'summary',
# the element is treated as content (and @summaryKey records that so the end
# handler can follow suit); otherwise a 'description' content element with a
# default type of text/html is started.
def _start_description(attrsD)
  if getContext.has_key?('summary')
    @summaryKey = 'content'
    _start_content(attrsD)
  else
    expecting_text = @infeed || @inentry || @insource
    pushContent('description', attrsD, 'text/html', expecting_text)
  end
end
1091
+
1092
# Opening handler for <abstract>: begins a 'description' content element with
# a default type of text/plain. Text is expected only while inside a feed,
# entry or source.
def _start_abstract(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('description', attrsD, 'text/plain', expecting_text)
end
1095
+
1096
# Closing handler for <description> / <abstract>.
#
# When the start handler redirected the element to content (@summaryKey is
# 'content'), _end_content finishes it. Otherwise the 'description' content
# element is finished and its value copied into the enclosing textinput or
# image (textinput takes precedence). @summaryKey is always reset.
def _end_description
  if @summaryKey == 'content'
    _end_content
  else
    value = popContent('description')
    context = getContext
    if @intextinput
      context['textinput']['description'] = value
    elsif @inimage # trailing ':' (Ruby 1.8-only syntax) removed
      context['image']['description'] = value
    end
  end
  @summaryKey = nil
end
alias _end_abstract _end_description
1111
+
1112
# Opening handler for <info> / <feedburner:browserFriendly>: begins an 'info'
# content element with a default type of text/plain.
def _start_info(attrsD)
  pushContent('info', attrsD, 'text/plain', true)
end
alias _start_feedburner_browserfriendly _start_info
1116
+
1117
# Closing handler for <info> / <feedburner:browserFriendly>: finishes the
# 'info' content element.
def _end_info
  popContent('info')
end
alias _end_feedburner_browserfriendly _end_info
1121
+
1122
# Opening handler for <generator>: normalises non-empty attributes through
# itsAnHrefDamnIt, resolves any 'href', stores the attributes as the context's
# 'generator_detail' and pushes a 'generator' element.
def _start_generator(attrsD)
  unless attrsD.nil? || attrsD.empty?
    attrsD = itsAnHrefDamnIt(attrsD)
    attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')
  end
  getContext['generator_detail'] = FeedParserDict.new(attrsD)
  push('generator', true)
end
1132
+
1133
# Closing handler for <generator>: pops the element and, if the context has a
# 'generator_detail', stores the text there as 'name'.
def _end_generator
  generator_name = pop('generator')
  ctx = getContext
  ctx['generator_detail']['name'] = generator_name if ctx.has_key?('generator_detail')
end
1140
+
1141
# Handler for <admin:generatorAgent>, whose value is carried by rdf:resource.
# A 'generator' element is pushed, the attribute value (if non-empty) is
# appended to its collected pieces, the element is popped immediately, and the
# context's 'generator_detail' is replaced by a dictionary holding the value
# as 'href'.
def _start_admin_generatoragent(attrsD)
  push('generator', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << resource if resource && !resource.empty?
  pop('generator')
  getContext['generator_detail'] = FeedParserDict.new({'href' => resource})
end
1150
+
1151
# Handler for <admin:errorReportsTo>, whose value is carried by rdf:resource.
# An 'errorreportsto' element is pushed, the attribute value (if non-empty) is
# appended to its collected pieces, and the element is popped immediately.
def _start_admin_errorreportsto(attrsD)
  push('errorreportsto', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << resource if resource && !resource.empty?
  pop('errorreportsto')
end
1159
+
1160
# Opening handler for <summary> / <itunes:summary>. If the context already has
# a 'summary', this element is treated as content instead; otherwise a
# 'summary' content element (default type text/plain) is started. @summaryKey
# remembers which path was taken for the end handler.
def _start_summary(attrsD)
  if getContext.has_key?('summary')
    @summaryKey = 'content'
    _start_content(attrsD)
  else
    @summaryKey = 'summary'
    pushContent(@summaryKey, attrsD, 'text/plain', true)
  end
end
alias _start_itunes_summary _start_summary
1171
+
1172
# Closing handler for <summary> / <itunes:summary>.
#
# Finishes the element along the path chosen by the start handler: via
# _end_content when it was redirected to content, otherwise by popping the
# content element named by @summaryKey (defaulting to 'summary').
# @summaryKey is always reset.
def _end_summary
  # The Ruby 1.8-only trailing ':' after the condition has been removed.
  if @summaryKey == 'content'
    _end_content
  else
    popContent(@summaryKey || 'summary')
  end
  @summaryKey = nil
end
alias _end_itunes_summary _end_summary
1181
+
1182
# Handler for <enclosure>, <media:content> and <media:thumbnail>: normalises
# the attributes through itsAnHrefDamnIt and appends them to the context's
# 'enclosures' list. A non-empty href also becomes the context's 'id' when no
# id has been set yet.
def _start_enclosure(attrsD)
  attrsD = itsAnHrefDamnIt(attrsD)
  ctx = getContext
  ctx['enclosures'] ||= []
  ctx['enclosures'] << FeedParserDict.new(attrsD)
  href = attrsD['href']
  if href && !href.empty?
    ctx['id'] = href unless ctx['id']
  end
end
alias _start_media_content _start_enclosure
alias _start_media_thumbnail _start_enclosure
1196
+
1197
# Opening handler for <source>: enters source mode and resets the title-seen
# flag.
def _start_source(attrsD)
  @insource = true
  @has_title = false
end
1201
+
1202
# Closing handler for <source>: leaves source mode, stores a Marshal round-trip
# copy of the collected source data under the (now non-source) context's
# 'source' key, and empties the collector.
def _end_source
  @insource = false
  snapshot = Marshal.load(Marshal.dump(@sourcedata))
  getContext['source'] = snapshot
  @sourcedata.clear
end
1207
+
1208
# Opening handler for <content>: begins a 'content' content element (default
# type text/plain), records a non-empty 'src' attribute in @contentparams and
# then pushes a further 'content' element.
def _start_content(attrsD)
  pushContent('content', attrsD, 'text/plain', true)
  src = attrsD['src']
  # The Ruby 1.8-only trailing ':' after the condition has been removed.
  if src && !src.empty?
    @contentparams['src'] = src
  end
  push('content', true)
end
1216
+
1217
# Opening handler for <prodlink>: begins a 'content' content element with a
# default type of text/html.
def _start_prodlink(attrsD)
  pushContent('content', attrsD, 'text/html', true)
end
1220
+
1221
# Opening handler for <body> / <xhtml:body>: begins a 'content' content
# element with a default type of application/xhtml+xml.
def _start_body(attrsD)
  pushContent('content', attrsD, 'application/xhtml+xml', true)
end
alias _start_xhtml_body _start_body
1225
+
1226
# Opening handler for <content:encoded> / <fullitem>: begins a 'content'
# content element with a default type of text/html.
def _start_content_encoded(attrsD)
  pushContent('content', attrsD, 'text/html', true)
end
alias _start_fullitem _start_content_encoded
1230
+
1231
# Closing handler for every content-like element (see aliases): finishes the
# 'content' content element and, when its mapped type is text/plain or one of
# @html_types, also saves the value as 'description'.
def _end_content
  # The type is inspected before popContent runs, as in the original ordering.
  textual_types = ['text/plain'] + @html_types
  copy_to_description = textual_types.include?(mapContentType(@contentparams['type']))
  content = popContent('content')
  _save('description', content) if copy_to_description
end
alias _end_body _end_content
alias _end_xhtml_body _end_content
alias _end_content_encoded _end_content
alias _end_fullitem _end_content
alias _end_prodlink _end_content
1243
+
1244
# Opening handler for <itunes:image> / <itunes:link>: pushes an 'itunes_image'
# element and replaces the context's 'image' with a dictionary holding the
# 'href' attribute.
def _start_itunes_image(attrsD)
  push('itunes_image', false)
  image = FeedParserDict.new({'href' => attrsD['href']})
  getContext['image'] = image
end
alias _start_itunes_link _start_itunes_image
1249
+
1250
# Closing handler for <itunes:block>: pops the element and stores true in the
# context's 'itunes_block' when the text is exactly 'yes', false otherwise.
def _end_itunes_block
  text = pop('itunes_block', false)
  getContext['itunes_block'] = (text == 'yes')
end
1254
+
1255
# Closing handler for <itunes:explicit>: pops the element and stores true in
# the context's 'itunes_explicit' when the text equals 'yes' ignoring case,
# false otherwise.
def _end_itunes_explicit
  value = pop('itunes_explicit', false)
  # to_s guards against pop returning nil (other handlers in this file check
  # pop's result for nil); a missing value counts as not explicit.
  getContext['itunes_explicit'] = (value.to_s.downcase == 'yes')
end
1259
+
1260
+ end # End FeedParserMixin
1261
+ end
1262
+
1263
# Resolves +uri+ against +base+ and returns the result as a String.
#
# base - base URI string; joining only happens when it parses as absolute.
# uri  - URI string to resolve; extra slashes directly after its scheme
#        ("http:////host") are removed first.
#
# When +base+ is absolute and +uri+ is relative, the two are joined with
# Addressable::URI.join and runs of slashes in the result (other than the
# "://" after the scheme) are collapsed to a single slash. In every other
# case, including unparsable input, the cleaned-up +uri+ is returned as is.
def urljoin(base, uri)
  urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
  uri = uri.sub(urifixer, '\1\3')
  pbase = Addressable::URI.parse(base) rescue nil
  if pbase && pbase.absolute?
    puri = Addressable::URI.parse(uri) rescue nil
    if puri && puri.relative?
      # ForgivingURI.join does the wrong thing. What the hell.
      joined = Addressable::URI.join(base, uri).to_s
      # Keep the character preceding the slash run and leave one slash behind;
      # replacing the whole match with '' also deleted that character.
      return joined.gsub(/([^:])\/{2,}/, '\1/')
    end
  end
  uri
end