jgre-rfeedparser 0.9.961
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +68 -0
- data/README +50 -0
- data/RUBY-TESTING +66 -0
- data/lib/rfeedparser.rb +551 -0
- data/lib/rfeedparser/aliases.rb +432 -0
- data/lib/rfeedparser/better_attributelist.rb +41 -0
- data/lib/rfeedparser/better_sgmlparser.rb +264 -0
- data/lib/rfeedparser/encoding_helpers.rb +260 -0
- data/lib/rfeedparser/feedparserdict.rb +106 -0
- data/lib/rfeedparser/loose_feed_parser.rb +75 -0
- data/lib/rfeedparser/markup_helpers.rb +71 -0
- data/lib/rfeedparser/monkey_patches.rb +10 -0
- data/lib/rfeedparser/nokogiri_parser.rb +80 -0
- data/lib/rfeedparser/parser_mixin.rb +1275 -0
- data/lib/rfeedparser/scrub.rb +212 -0
- data/lib/rfeedparser/time_helpers.rb +408 -0
- data/lib/rfeedparser/utilities.rb +23 -0
- metadata +187 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
module FeedParser
  # A Hash whose keys can be read and written under several historical names.
  #
  # The same piece of feed information (for example, "when was this feed last
  # updated?") goes by different names depending on the feed format. This
  # class satisfies both the developer who knows the feed type and asks for
  # the format-specific name, and the developer who wants one consistent
  # interface.
  #
  # @@keymap maps an alias (the key) to the canonical attribute name, or to a
  # list of candidate canonical names (the value). #[] and #[]= consult
  # @@keymap to work out which attribute the caller "really means" when the
  # requested key is one of the aliases.
  #
  # Unknown method calls are treated as attribute access (see #method_missing),
  # so `dict.title` reads `dict['title']` and `dict.title = x` writes it.
  class FeedParserDict < Hash
    @@keymap = {
      'channel' => 'feed',
      'items' => 'entries',
      'guid' => 'id',
      'date' => 'updated',
      'date_parsed' => 'updated_parsed',
      'description' => ['subtitle', 'summary'],
      'url' => ['href'],
      'modified' => 'updated',
      'modified_parsed' => 'updated_parsed',
      'issued' => 'published',
      'issued_parsed' => 'published_parsed',
      'copyright' => 'rights',
      'copyright_detail' => 'rights_detail',
      'tagline' => 'subtitle',
      'tagline_detail' => 'subtitle_detail'
    }

    # Hash already has an #entries method (from Enumerable); keep it reachable
    # as #hash_entries and make #entries return the feed's 'entries' value.
    alias :hash_entries :entries
    def entries
      self['entries']
    end

    # Defined explicitly to avoid deprecated-method warnings for Object#type.
    def type
      self['type']
    end

    # Builds a dict, optionally pre-populated.
    #
    # pairs - nil, a Hash (merged as-is, without alias translation), or an
    #         Array of [key, value] pairs (each stored through #[]=, so
    #         aliases are translated).
    def initialize(pairs=nil)
      super()
      if pairs.is_a?(Array) && pairs[0].is_a?(Array) && pairs[0].length == 2
        pairs.each do |k, v|
          self[k] = v
        end
      elsif pairs.is_a? Hash
        self.merge!(pairs)
      end
    end

    # Reads a value, resolving aliases.
    #
    # 'category' returns the term of the first tag and 'categories' returns
    # [scheme, term] pairs for every tag; both return nil when there are no
    # tags, like any other missing key. For everything else the key as given
    # wins over its canonical name when both are present.
    def [](key)
      if key == 'category'
        first_tag = (self['tags'] || [])[0]
        return first_tag && first_tag['term']
      end

      if key == 'categories'
        tags = self['tags']
        return tags && tags.collect { |tag| [tag['scheme'], tag['term']] }
      end

      realkey = @@keymap[key] || key
      if realkey.is_a? Array
        # Several canonical names are possible: the first one present wins.
        realkey.each { |candidate| return self[candidate] if has_key?(candidate) }
      end

      # Note that the original key is preferred over the realkey we (might
      # have) found in @@keymap.
      return super(key) if has_key?(key)

      super(realkey)
    end

    # Stores a value under the canonical name for +key+. When the alias maps
    # to several candidates the first candidate is used.
    def []=(key, value)
      if @@keymap.key?(key)
        key = @@keymap[key]
        key = key[0] if key.is_a? Array
      end
      super(key, value)
    end

    # Attribute-style access: `name=` writes self['name'], a plain `name`
    # reads self['name']. Names ending in `!` or `?`, or starting with `_`,
    # are not attributes and raise NoMethodError.
    #
    # respond_to_missing? is intentionally NOT defined: these dicts are deep
    # copied with Marshal, which asks respond_to?(:marshal_dump), and
    # answering "yes" for arbitrary names would break that.
    def method_missing(msym, *args)
      methodname = msym.to_s
      if methodname[-1,1] == '='
        self[methodname[0..-2]] = args[0]
      elsif methodname[-1,1] != '!' && methodname[-1,1] != '?' && methodname[0,1] != "_" # FIXME implement with private?
        self[methodname]
      else
        raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
      end
    end
  end
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
module FeedParser
  # Forgiving, SGML-based feed parser used for feeds that are not well-formed
  # XML.
  #
  # The methods that lived in BaseHTMLProcessor in the Python code are written
  # here directly. If we inherited from BaseHTMLProcessor and then included
  # FeedParserMixin, the mixin's methods would override the inherited ones,
  # which is exactly the opposite of what we want.
  class LooseFeedParser < BetterSGMLParser
    include FeedParserMixin

    attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse

    Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']
    New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/

    # `feed` must expose the parsed feed data, so the SGML parser's own
    # `feed` (which consumes input) is kept reachable as `sgml_feed`.
    alias :sgml_feed :feed
    def feed
      @feeddata
    end
    def feed=(data)
      @feeddata = data
    end

    def initialize(baseuri, baselang, encoding)
      startup(baseuri, baselang, encoding)
      super() # Keep the parentheses! The superclass takes no arguments.
    end

    def reset
      @pieces = []
      super
    end

    # Pre-processes +data+ (in place) and hands it to the SGML parser.
    def parse(data)
      # Neutralise "<!" sequences that are not DOCTYPE, comments or marked
      # sections so the SGML parser does not treat them as declarations.
      data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
      # Expand self-closing tags (<foo/>) into <foo></foo>, except for
      # elements that never take an end tag.
      data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
        clean = tag[1..-3].strip
        if Elements_No_End_Tag.include?(clean)
          tag
        else
          '<'+clean+'></'+clean+'>'
        end
      end

      # Numeric references for quotes are turned into the literal characters.
      data.gsub!(/&#39;/, "'")
      data.gsub!(/&#34;/, '"')
      if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
        data = uconvert(data,'utf-8',@encoding)
      end
      sgml_feed(data) # see the alias above
    end

    # Normalises the numeric forms of the five XML special characters to
    # their named entities; for non-XML content types the named entities are
    # then decoded to the literal characters. Mutates and returns +data+.
    def decodeEntities(element, data)
      data.gsub!('&#60;', '&lt;')
      data.gsub!('&#x3c;', '&lt;')
      data.gsub!('&#62;', '&gt;')
      data.gsub!('&#x3e;', '&gt;')
      data.gsub!('&#38;', '&amp;')
      data.gsub!('&#x26;', '&amp;')
      data.gsub!('&#34;', '&quot;')
      data.gsub!('&#x22;', '&quot;')
      data.gsub!('&#39;', '&apos;')
      data.gsub!('&#x27;', '&apos;')
      if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
        data.gsub!('&lt;', '<')
        data.gsub!('&gt;', '>')
        data.gsub!('&amp;', '&')
        data.gsub!('&quot;', '"')
        data.gsub!('&apos;', "'")
      end
      return data
    end
  end
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
module FeedParserUtilities
  # Strips ENTITY declarations and the DOCTYPE from an XML document.
  #
  # Returns [rss_version, stripped_data], where rss_version is 'rss091n'
  # when the DOCTYPE mentions Netscape (case-insensitively) and nil
  # otherwise, and stripped_data is a new string; +data+ is not modified.
  def stripDoctype(data)
    doctype_pattern = /<!DOCTYPE(.*?)>/m # m is for Regexp::MULTILINE

    # Entity declarations go first so an internal subset cannot end the
    # DOCTYPE match early.
    without_entities = data.gsub(/<!ENTITY(.*?)>/m, '')

    declaration = without_entities[doctype_pattern, 1] || ''
    version = (/netscape/ =~ declaration.downcase) ? 'rss091n' : nil

    return version, without_entities.sub(doctype_pattern, '')
  end

  # Rewrites relative URIs found in known URI-carrying attributes of
  # +htmlSource+ so that they are absolute with respect to +baseURI+, and
  # returns the resulting HTML. +encoding+ is accepted but not used here.
  def resolveRelativeURIs(htmlSource, baseURI, encoding)
    $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger

    # [element name, attribute name] pairs whose value is a URI.
    uri_attributes = [
      ['a','href'],
      ['applet','codebase'],
      ['area','href'],
      ['blockquote','cite'],
      ['body','background'],
      ['del','cite'],
      ['form','action'],
      ['frame','longdesc'],
      ['frame','src'],
      ['iframe','longdesc'],
      ['iframe','src'],
      ['head','profile'],
      ['img','longdesc'],
      ['img','src'],
      ['img','usemap'],
      ['input','src'],
      ['input','usemap'],
      ['ins','cite'],
      ['link','href'],
      ['object','classid'],
      ['object','codebase'],
      ['object','data'],
      ['object','usemap'],
      ['q','cite'],
      ['script','src'],
    ]

    document = Hpricot(htmlSource)
    uri_attributes.each do |element_name, attribute_name|
      document.search(element_name).each do |node|
        raw_value = node.attributes[attribute_name]
        # A value that cannot be parsed as a URI is left untouched.
        parsed = begin
          Addressable::URI.parse(Addressable::URI.encode(raw_value))
        rescue StandardError
          nil
        end
        next unless raw_value && !raw_value.empty? && parsed && parsed.relative?
        node.raw_attributes[attribute_name] = urljoin(baseURI, raw_value)
      end
    end
    return document.to_html
  end
end
|
|
70
|
+
|
|
71
|
+
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
require 'date' # Date.gregorian_leap? is used below

class Time
  class << self
    # Days per month in a non-leap year, indexed by month number (1-12);
    # index 0 is a nil placeholder.
    COMMON_YEAR_DAYS_IN_MONTH = [nil, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31].freeze

    # Returns the number of days in +month+ (1-12) of +year+, which defaults
    # to the current year. February yields 29 in Gregorian leap years.
    # A month index outside the table returns nil.
    def days_in_month(month, year = now.year)
      return 29 if month == 2 && ::Date.gregorian_leap?(year)
      COMMON_YEAR_DAYS_IN_MONTH[month]
    end
  end
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
gem 'nokogiri', '~>1.2'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
|
|
4
|
+
module FeedParser
  module Nokogiri

    # Raised (and stored on the handler) when the SAX parser reports an error.
    class NokogiriSyntaxError < StandardError; end

    # Strict feed parser backed by Nokogiri's SAX parser. All feed state is
    # collected by #handler.
    class StrictFeedParser
      attr_reader :handler

      def initialize(baseuri, baselang)
        @handler = StrictFeedParserHandler.new(baseuri, baselang, 'utf-8')
      end

      # Runs the SAX parser over +data+, driving the handler callbacks.
      def parse(data)
        saxparser = ::Nokogiri::XML::SAX::Parser.new(@handler)

        saxparser.parse data
      end
    end

    # SAX document handler that translates Nokogiri callbacks into the
    # FeedParserMixin tag/data events.
    class StrictFeedParserHandler < ::Nokogiri::XML::SAX::Document
      include FeedParserMixin

      attr_accessor :bozo, :entries, :feeddata, :exc

      def initialize(baseuri, baselang, encoding)
        $stderr.puts "trying Nokogiri::StrictFeedParser" if $debug
        startup(baseuri, baselang, encoding)
        @bozo = false
      end

      def start_element(name, attrs)
        unknown_starttag(normalized_tag_name(name), attrs)
      end

      def characters(text)
        handle_data(text)
      end

      def cdata_block(text)
        handle_data(text)
      end

      # Uses the same name normalisation as #start_element so that the end
      # tag always matches the start tag that was reported for the element.
      def end_element(name)
        unknown_endtag(normalized_tag_name(name))
      end

      # Marks the feed as ill-formed, remembers the error and aborts parsing.
      def error(error_string)
        @bozo = true
        @exc = NokogiriSyntaxError.new(error_string)
        raise @exc
      end

      private

      # Converts a SAX element name, optionally given as "namespaceuri;local",
      # into the lowercase "prefix:local" form used by the mixin. The prefix
      # comes from @matchnamespaces; names in a namespace with no (or an
      # empty) known prefix are returned without one. The argument itself is
      # never modified.
      def normalized_tag_name(name)
        match = /^(([^;]*);)?(.+)$/.match(name) # Snag namespaceuri from name
        return name.downcase unless match

        namespaceuri = (match[2] || '').downcase
        localname = match[3]
        if /backend\.userland\.com\/rss/ =~ namespaceuri
          # match any backend.userland.com namespace
          namespaceuri = 'http://backend.userland.com/rss'
        end

        prefix = @matchnamespaces[namespaceuri]
        localname = prefix + ':' + localname if prefix && !prefix.empty?
        localname.downcase
      end
    end
  end
end
|
|
@@ -0,0 +1,1275 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
module FeedParser
|
|
4
|
+
module FeedParserMixin
|
|
5
|
+
include FeedParserUtilities
|
|
6
|
+
attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
|
|
7
|
+
|
|
8
|
+
# Resets every piece of parser state. Called by the including parser's
# constructor.
#
# baseuri  - base URI used to resolve relative URIs (nil becomes '').
# baselang - default language; when given it is also stored (with '_'
#            replaced by '-') as the feed's 'language'.
# encoding - character encoding of the document.
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
  $stderr << "initializing FeedParser\n" if $debug

  # Known namespace URI => the prefix this parser uses for it.
  @namespaces = {
    '' => '',
    'http://backend.userland.com/rss' => '',
    'http://blogs.law.harvard.edu/tech/rss' => '',
    'http://purl.org/rss/1.0/' => '',
    'http://my.netscape.com/rdf/simple/0.9/' => '',
    'http://example.com/newformat#' => '',
    'http://example.com/necho' => '',
    'http://purl.org/echo/' => '',
    'uri/of/echo/namespace#' => '',
    'http://purl.org/pie/' => '',
    'http://purl.org/atom/ns#' => '',
    'http://www.w3.org/2005/Atom' => '',
    'http://purl.org/rss/1.0/modules/rss091#' => '',
    'http://webns.net/mvcb/' => 'admin',
    'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
    'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
    'http://media.tangent.org/rss/1.0/' => 'audio',
    'http://backend.userland.com/blogChannelModule' => 'blogChannel',
    'http://web.resource.org/cc/' => 'cc',
    'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
    'http://purl.org/rss/1.0/modules/company' => 'co',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://purl.org/rss/1.0/modules/email/' => 'email',
    'http://purl.org/rss/1.0/modules/event/' => 'ev',
    'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
    'http://freshmeat.net/rss/fm/' => 'fm',
    'http://xmlns.com/foaf/0.1/' => 'foaf',
    'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
    'http://postneo.com/icbm/' => 'icbm',
    'http://purl.org/rss/1.0/modules/image/' => 'image',
    'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://purl.org/rss/1.0/modules/link/' => 'l',
    'http://search.yahoo.com/mrss' => 'media',
    'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
    'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
    'http://purl.org/rss/1.0/modules/reference/' => 'ref',
    'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
    'http://purl.org/rss/1.0/modules/search/' => 'search',
    'http://purl.org/rss/1.0/modules/slash/' => 'slash',
    'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
    'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
    'http://hacks.benhammersley.com/rss/streaming/' => 'str',
    'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
    'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
    'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
    'http://purl.org/rss/1.0/modules/threading/' => 'thr',
    'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
    'http://madskills.com/public/xml/rss/module/trackback/' => 'trackback',
    'http://wellformedweb.org/commentAPI/' => 'wfw',
    'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
    'http://www.w3.org/1999/xhtml' => 'xhtml',
    'http://www.w3.org/XML/1998/namespace' => 'xml',
    'http://www.w3.org/1999/xlink' => 'xlink',
    'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
  }

  # Same table keyed by the lowercased URI, for case-insensitive lookups.
  @matchnamespaces = {}
  @namespaces.each { |uri, prefix| @matchnamespaces[uri.downcase] = prefix }

  @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
  @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @html_types = ['text/html', 'application/xhtml+xml']

  @feeddata = FeedParserDict.new # feed-level data
  @encoding = encoding           # character encoding
  @entries = []                  # list of entry-level data
  @version = ''                  # feed type/version
  @namespacesInUse = {}          # hash of namespaces defined by the feed

  # Everything below is internal state tracking; this is really out of
  # control and should be refactored.
  @infeed = false
  @inentry = false
  @incontent = 0 # a nesting counter, not a flag: see pushContent/popContent
  @intextinput = false
  @inimage = false
  @inauthor = false
  @incontributor = false
  @inpublisher = false
  @insource = false
  @sourcedata = FeedParserDict.new
  @contentparams = FeedParserDict.new
  @summaryKey = nil
  @namespacemap = {}
  @elementstack = []
  @basestack = []
  @langstack = []
  @baseuri = baseuri || ''
  @lang = baselang || nil
  @has_title = false
  @feeddata['language'] = baselang.gsub('_','-') if baselang

  $stderr << "Leaving startup\n" if $debug
end
|
|
112
|
+
|
|
113
|
+
# Handles the start of any element.
#
# tag    - element name, possibly "prefix:local".
# attrsd - attributes as a Hash, as a list of [name, value] pairs (what
#          SGMLParser sends, e.g. [['type','text/html'],['mode','escaped']])
#          or as a flat [name, value, name, value, ...] list.
#
# Tracks xml:base / xml:lang and namespace declarations, re-serialises the
# tag when inside inline XHTML content, and otherwise dispatches to a
# `_start_<prefix_><name>` handler, falling back to #push when none exists.
def unknown_starttag(tag, attrsd)
  $stderr << "start #{tag} with #{attrsd.inspect}\n" if $debug

  attrsd ||= {}
  attrsd = Hash[*attrsd.flatten] if attrsd.is_a?(Array)

  # Normalize attrs: keys are downcased, and the values of 'rel' and 'type'
  # are downcased too. Values are copied, never modified in place.
  attrsD = {}
  attrsd.each do |old_k, value|
    k = old_k.downcase
    value = value.downcase if value && ['rel', 'type'].include?(k)
    attrsD[k] = value
  end

  # track xml:base and xml:lang
  baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
  @baseuri = urljoin(@baseuri, baseuri)
  lang = attrsD['xml:lang'] || attrsD['lang']
  if lang == ''
    # xml:lang could be explicitly set to '', we need to capture that
    lang = nil
  elsif lang.nil?
    # if no xml:lang is specified, use parent lang
    lang = @lang
  end

  if lang && ! lang.empty?
    # NOTE(review): callers visible in this project downcase the tag before
    # calling, so 'rdf:RDF' may never match here -- confirm intended casing.
    if ['feed', 'rss', 'rdf:RDF'].include?(tag)
      @feeddata['language'] = lang.gsub('_','-')
    end
  end
  @lang = lang
  @basestack << @baseuri
  @langstack << lang

  # track namespaces
  attrsd.each do |prefix, uri|
    if /^xmlns:/ =~ prefix # prefix begins with xmlns:
      trackNamespace(prefix[6..-1], uri)
    elsif prefix == 'xmlns'
      trackNamespace(nil, uri)
    end
  end

  # track inline content
  if @incontent != 0 && @contentparams.has_key?('type') && ! ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 && @contentparams['type'] == 'application/xhtml+xml'
    # Note: probably shouldn't simply recreate localname here, but
    # our namespace handling isn't actually 100% correct in cases where
    # the feed redefines the default namespace (which is actually
    # the usual case for inline content, thanks Sam), so here we
    # cheat and just reconstruct the element based on localname
    # because that compensates for the bugs in our namespace handling.
    # This will horribly munge inline content with non-empty qnames,
    # but nobody actually does that, so I'm not fixing it.
    tag = tag.split(':')[-1]
    attrsA = attrsd.to_a.collect { |name, value| "#{name}=\"#{value}\"" }
    attrsS = ' ' + attrsA.join(' ')
    return handle_data("<#{tag}#{attrsS}>", false)
  end

  # match namespaces
  if /:/ =~ tag
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix && ! prefix.empty?
    prefix = prefix + '_'
  end
  unprefixed = prefix.nil? || prefix.empty?

  # special hack for better tracking of empty textinput/image elements in illformed feeds
  if unprefixed && ! ['title', 'link', 'description', 'name'].include?(tag)
    @intextinput = false
  end
  if unprefixed && ! ['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?(tag)
    @inimage = false
  end

  # call special handler (if defined) or default handler
  begin
    return send('_start_' + prefix + suffix, attrsD)
  rescue NoMethodError
    return push(prefix + suffix, true)
  end
end # End unknown_starttag
|
|
207
|
+
|
|
208
|
+
# Handles the end of any element.
#
# Dispatches to a `_end_<prefix_><name>` handler (falling back to #pop when
# none exists), re-serialises the closing tag when inside inline XHTML
# content, and restores the xml:base / xml:lang that were in effect before
# the element started.
def unknown_endtag(tag)
  $stderr << "end #{tag}\n" if $debug

  # match namespaces
  if tag.index(':')
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix && ! prefix.empty?
    prefix = prefix + '_'
  end

  # call special handler (if defined) or default handler
  begin
    send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
  rescue NoMethodError
    pop(prefix + suffix)
  end

  # track inline content
  if @incontent != 0 && @contentparams.has_key?('type') && ! ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but it isn't really
    # (same test as in unknown_starttag: only non-XML types are upgraded)
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 && @contentparams['type'] == 'application/xhtml+xml'
    tag = tag.split(':')[-1]
    handle_data("</#{tag}>", false)
  end

  # track xml:base and xml:lang going out of scope
  if @basestack && ! @basestack.empty?
    @basestack.pop
    # an empty or missing parent base leaves the current base in place
    if @basestack[-1] && ! @basestack[-1].empty?
      @baseuri = @basestack[-1]
    end
  end
  if @langstack && ! @langstack.empty?
    @langstack.pop
    # unlike the base, a nil parent language IS restored
    if ! @langstack.empty?
      @lang = @langstack[-1]
    end
  end
end
|
|
252
|
+
|
|
253
|
+
# LooseParser only.
# Called for each character reference, e.g. for '&#160;', ref will be '160'.
#
# References to the five XML special characters (quote, ampersand,
# apostrophe, less-than, greater-than; decimal or hex) are kept as
# references; every other reference is replaced by the UTF-8 encoding of
# its code point. The result is appended to the current element's text.
# +ref+ itself is not modified.
def handle_charref(ref)
  $stderr << "entering handle_charref with #{ref}\n" if $debug
  return if @elementstack.nil? || @elementstack.empty?

  ref = ref.downcase
  chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
  if chars.include?(ref)
    text = '&#' + ref + ';'
  else
    # a leading 'x' marks a hexadecimal reference
    c = ref[0..0] == 'x' ? ref[1..-1].to_i(16) : ref.to_i
    text = [c].pack('U*')
  end
  @elementstack[-1][2] << text
end
|
|
273
|
+
|
|
274
|
+
# LooseParser only.
# Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
#
# The five XML entities (lt, gt, quot, amp, apos) are kept as references;
# any other entity is decoded through HTMLEntities. The result is appended
# to the current element's text. Does nothing outside of any element.
def handle_entityref(ref)
  return if @elementstack.nil? || @elementstack.empty?
  $stderr << "entering handle_entityref with #{ref}\n" if $debug

  reference = "&#{ref};"
  xml_entities = ['lt', 'gt', 'quot', 'amp', 'apos']
  piece = xml_entities.include?(ref) ? reference : HTMLEntities::decode_entities(reference)
  @elementstack[-1][2] << piece
end
|
|
288
|
+
|
|
289
|
+
# Called for each block of plain text, i.e. outside of any tag and not
# containing any character or entity references.
#
# Appends +text+ to the current element's text. When +escape+ is true and
# the current content type is 'application/xhtml+xml', the text is passed
# through String#to_xs first. Does nothing outside of any element.
def handle_data(text, escape=true)
  return if @elementstack.nil? || @elementstack.empty?

  needs_escaping = escape && @contentparams['type'] == 'application/xhtml+xml'
  @elementstack[-1][2] << (needs_escaping ? text.to_xs : text)
end
|
|
298
|
+
|
|
299
|
+
# Called for each comment, e.g. <!-- insert message here -->.
# Comments are ignored.
def handle_comment(comment)
  nil
end

# Called with the text of a processing instruction, which is ignored.
def handle_pi(text)
  nil
end

# Called with the text of a declaration, which is ignored.
def handle_decl(text)
  nil
end
|
|
308
|
+
|
|
309
|
+
# For LooseFeedParser.
# Parses the declaration starting at offset +i+ of @rawdata and returns the
# offset just past it.
#
# A CDATA section has its content (up to ']]>', or to the end of the data
# when unterminated) passed through String#to_xs and handed to handle_data
# without further escaping. Any other declaration is skipped up to and
# including the next '>'.
def parse_declaration(i)
  $stderr << "entering parse_declaration\n" if $debug

  unless @rawdata[i...i+9] == '<![CDATA['
    # nil.to_i is 0, so with no '>' at all this returns 1
    return @rawdata.index(/>/, i).to_i + 1
  end

  content_start = i + 9
  content_end = @rawdata.index(/\]\]>/u, content_start) || @rawdata.length
  handle_data(@rawdata[content_start...content_end].to_xs, false)
  return content_end + 3
end
|
|
322
|
+
|
|
323
|
+
# Returns the lowercase MIME type for +contentType+, expanding the Atom
# shorthands 'text', 'html' and 'xhtml' to their full MIME types.
# The argument is not modified, so frozen strings are accepted.
def mapContentType(contentType)
  normalized = contentType.downcase
  case normalized
  when 'text'
    'text/plain'
  when 'html'
    'text/html'
  when 'xhtml'
    'application/xhtml+xml'
  else
    normalized
  end
end
|
|
335
|
+
|
|
336
|
+
# Records a namespace declaration (+prefix+ is nil for the default
# namespace).
#
# Certain namespace URIs determine the feed version when none has been
# detected yet. Any backend.userland.com RSS namespace is normalised to a
# single URI. Known namespaces map the feed's prefix to this parser's
# standard prefix in @namespacemap; every declaration is recorded in
# @namespacesInUse.
def trackNamespace(prefix, uri)
  loweruri = uri.downcase.strip
  version_unknown = @version.nil? || @version.empty?

  detected = case loweruri
             when 'http://my.netscape.com/rdf/simple/0.9/'
               # only the default namespace identifies RSS 0.90
               prefix.nil? ? 'rss090' : nil
             when 'http://purl.org/rss/1.0/'
               'rss10'
             when 'http://www.w3.org/2005/atom'
               'atom10'
             end

  if detected
    @version = detected if version_unknown
  elsif /backend\.userland\.com\/rss/ =~ loweruri
    # match any backend.userland.com namespace
    uri = 'http://backend.userland.com/rss'
    loweruri = uri
  end

  unless @matchnamespaces.has_key?(loweruri)
    return @namespacesInUse[prefix || ''] = uri
  end

  standard_prefix = @matchnamespaces[loweruri]
  @namespacemap[prefix] = standard_prefix
  @namespacesInUse[standard_prefix] = uri
end
|
|
357
|
+
|
|
358
|
+
# Resolves +uri+ against the current base URI (an empty string when no base
# is set) using urljoin.
def resolveURI(uri)
  base = @baseuri || ''
  urljoin(base, uri)
end
|
|
361
|
+
|
|
362
|
+
# Default entity decoding hook: returns +data+ unchanged. +element+ is
# ignored here.
def decodeEntities(element, data)
  data
end
|
|
365
|
+
|
|
366
|
+
# Opens a new frame on the element stack: the element name, whether text is
# expected for it, and an empty list that collects its text pieces.
def push(element, expectingText)
  frame = [element, expectingText, []]
  @elementstack << frame
end
|
|
369
|
+
|
|
370
|
+
# Closes +element+ and returns its accumulated text.
#
# Returns nil when the element stack is empty or its top frame is not
# +element+. Otherwise the collected pieces are joined (and stripped unless
# +stripWhitespace+ is false). For frames that expect text the value is then
# base64-decoded when flagged, resolved against the base URI, entity-decoded,
# has relative URIs in embedded HTML resolved, is sanitized, converted to
# utf-8, and finally stored in the current entry or feed context.
def pop(element, stripWhitespace=true)
  return if @elementstack.nil? || @elementstack.empty?
  return unless @elementstack[-1][0] == element

  element, expectingText, pieces = @elementstack.pop

  output = pieces.instance_of?(Array) ? pieces.join('') : pieces
  output.strip! if stripWhitespace
  return output unless expectingText

  # decode base64 content (kept only when decoding yields something)
  if @contentparams['base64']
    decoded = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
    output = decoded unless output.empty? || decoded.empty?
  end

  # resolve relative URIs
  if @can_be_relative_uri.include?(element) && output && !output.empty?
    output = resolveURI(output)
  end

  # decode entities within embedded markup
  output = decodeEntities(element, output) unless @contentparams['base64']

  # remove temporary cruft from contentparams
  @contentparams.delete('mode')
  @contentparams.delete('base64')

  # resolve relative URIs within embedded markup
  if @html_types.include?(mapContentType(@contentparams['type'] || 'text/html'))
    if @can_contain_relative_uris.include?(element)
      output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
    end
  end
  # sanitize embedded markup
  if @html_types.include?(mapContentType(@contentparams['type'] || 'text/html'))
    if @can_contain_dangerous_markup.include?(element)
      output = FeedParser.sanitizeHTML(output, @encoding)
    end
  end

  if @encoding && ! @encoding.empty? && @encoding != 'utf-8'
    output = uconvert(output, @encoding, 'utf-8')
    # FIXME everything is turned into utf-8, not unicode; this has not been
    # revisited since REXML stopped being used.
  end

  # categories/tags/keywords/whatever are handled in _end_category
  return output if element == 'category'

  return output if element == 'title' && @has_title

  # Builds a deep copy of the current content parameters carrying +value+.
  detail_with = lambda do |value|
    detail = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
    detail['value'] = value
    detail
  end

  # store output in appropriate place(s)
  if @inentry && ! @insource
    entry = @entries[-1]
    case element
    when 'content'
      entry[element] ||= []
      entry[element] << detail_with.call(output)
    when 'link'
      entry[element] = output
      entry['links'][-1]['href'] = output if output && ! output.empty?
    else
      element = 'summary' if element == 'description'
      entry[element] = output
      entry[element + '_detail'] = detail_with.call(output) if @incontent != 0
    end
  elsif (@infeed || @insource) && ! @intextinput && ! @inimage
    context = getContext()
    element = 'subtitle' if element == 'description'
    context[element] = output
    if element == 'link'
      context['links'][-1]['href'] = output
    elsif @incontent != 0
      context[element + '_detail'] = detail_with.call(output)
    end
  end

  return output
end
|
|
466
|
+
|
|
467
|
+
# Opens a content-bearing element.
#
# Increments the @incontent counter, replaces @contentparams with a fresh
# FeedParserDict holding the mapped content type, the current language and
# the current base URI, records whether the payload is base64, and finally
# pushes +tag+ onto the element stack.
#
# tag                - element name handed to push
# attrsD             - attribute hash of the element; 'type' overrides the default
# defaultContentType - content type used when attrsD has no 'type'
# expectingText      - forwarded unchanged to push
#
# Returns whatever push returns.
def pushContent(tag, attrsD, defaultContentType, expectingText)
  @incontent += 1
  declared_type = attrsD['type'] || defaultContentType
  mapped_type = mapContentType(declared_type)
  @contentparams = FeedParserDict.new({
    'type' => mapped_type,
    'language' => @lang,
    'base' => @baseuri
  })
  # isBase64 inspects the 'type' entry, so it has to run after the dict is built.
  @contentparams['base64'] = isBase64(attrsD, @contentparams)
  push(tag, expectingText)
end
|
|
474
|
+
|
|
475
|
+
# Closes a content-bearing element opened with pushContent.
#
# Pops +tag+, then decrements @incontent and empties @contentparams.
# The pop happens first, while @contentparams is still populated.
#
# Returns the value produced by pop.
def popContent(tag)
  pop(tag).tap do
    @incontent -= 1
    @contentparams.clear
  end
end
|
|
481
|
+
|
|
482
|
+
# Rewrites the namespace prefix of a qualified name using @namespacemap.
#
# The name is split at its first ':'; the part before it is looked up in
# @namespacemap and replaced when an entry exists. Names without a colon
# are returned untouched.
#
# The original sliced the prefix with name[0..colonpos-1]; for a name that
# starts with ':' that is name[0..-1], i.e. the whole string, which turned
# ":about" into ":about:about". Splitting with partition yields the empty
# prefix in that case.
#
# name - String element or attribute name, optionally "prefix:suffix"
#
# Returns the String name with its prefix mapped.
def mapToStandardPrefix(name)
  prefix, colon, suffix = name.partition(':')
  return name if colon.empty?

  "#{@namespacemap[prefix] || prefix}:#{suffix}"
end
|
|
492
|
+
|
|
493
|
+
def getAttribute(attrsD, name)
|
|
494
|
+
return attrsD[mapToStandardPrefix(name)]
|
|
495
|
+
end
|
|
496
|
+
|
|
497
|
+
# Decides whether an element's payload should be treated as base64.
#
# An explicit mode="base64" attribute always wins. Otherwise the content
# counts as base64 unless its type starts with "text/" or ends in "+xml"
# or "/xml". A missing type never matches the pattern, so it also counts
# as base64.
#
# attrsD        - attribute hash of the element
# contentparams - hash whose 'type' entry holds the content type
#
# Returns true or false.
def isBase64(attrsD, contentparams)
  return true if attrsD['mode'] == 'base64'

  textual_at = /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
  textual_at.nil?
end
|
|
504
|
+
|
|
505
|
+
# Normalises the link-bearing attribute of an element to 'href'.
#
# The first truthy value among 'url', 'uri' and 'href' (in that order) is
# stored under 'href', and the 'url'/'uri' keys are removed. When none of
# them is present the hash is left alone.
#
# attrsD - attribute hash; modified in place
#
# Returns the same attrsD object.
def itsAnHrefDamnIt(attrsD)
  location = attrsD['url'] || attrsD['uri'] || attrsD['href']
  return attrsD unless location

  %w[url uri].each { |legacy_key| attrsD.delete(legacy_key) }
  attrsD['href'] = location
  attrsD
end
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
# Stores +value+ under +key+ in the current context unless a truthy value
# is already there. An existing nil or false entry is overwritten.
#
# Returns the value that ends up associated with the key (the existing one
# when it was kept, otherwise +value+).
def _save(key, value)
  target = getContext()
  existing = target[key]
  return existing if existing

  target[key] = value
end
|
|
520
|
+
|
|
521
|
+
# Handles the opening <rss> tag by recording the feed version.
#
# Does nothing when @version is already set to a non-empty string.
# Otherwise the 'version' attribute selects one of the known 0.9x labels,
# any value beginning with "2." yields 'rss20', and everything else
# (including a missing attribute) yields the generic 'rss'.
#
# attrsD - attribute hash of the <rss> element
def _start_rss(attrsD)
  return if @version && !@version.empty?

  declared = attrsD['version'] || ''
  known = {
    '0.91' => 'rss091u',
    '0.92' => 'rss092',
    '0.93' => 'rss093',
    '0.94' => 'rss094'
  }[declared]

  @version =
    if known && !known.empty?
      known
    elsif /^2\./ =~ declared
      'rss20'
    else
      'rss'
    end
end
|
|
540
|
+
|
|
541
|
+
def _start_dlhottitles(attrsD)
|
|
542
|
+
@version = 'hotrss'
|
|
543
|
+
end
|
|
544
|
+
|
|
545
|
+
def _start_channel(attrsD)
|
|
546
|
+
@infeed = true
|
|
547
|
+
_cdf_common(attrsD)
|
|
548
|
+
end
|
|
549
|
+
alias :_start_feedinfo :_start_channel
|
|
550
|
+
|
|
551
|
+
def _cdf_common(attrsD)
|
|
552
|
+
if attrsD.has_key?'lastmod'
|
|
553
|
+
_start_modified({})
|
|
554
|
+
@elementstack[-1][-1] = attrsD['lastmod']
|
|
555
|
+
_end_modified
|
|
556
|
+
end
|
|
557
|
+
if attrsD.has_key?'href'
|
|
558
|
+
_start_link({})
|
|
559
|
+
@elementstack[-1][-1] = attrsD['href']
|
|
560
|
+
_end_link
|
|
561
|
+
end
|
|
562
|
+
end
|
|
563
|
+
|
|
564
|
+
# Handles the opening <feed> tag: marks that we are inside the feed and,
# if no version has been determined yet, derives it from the 'version'
# attribute.
#
# The original looked the attribute up in the version map but then tested
# @version (which is known to be blank at that point) instead of the
# looked-up value, so the map was never used and every feed was labelled
# plain 'atom'. The looked-up value is now used, falling back to 'atom'
# for unknown or missing versions.
#
# attrsD - attribute hash of the <feed> element
def _start_feed(attrsD)
  @infeed = true
  return if @version && !@version.empty?

  versionmap = {
    '0.1' => 'atom01',
    '0.2' => 'atom02',
    '0.3' => 'atom03'
  }
  @version = versionmap[attrsD['version']] || 'atom'
end
|
|
581
|
+
|
|
582
|
+
def _end_channel
|
|
583
|
+
@infeed = false
|
|
584
|
+
end
|
|
585
|
+
alias :_end_feed :_end_channel
|
|
586
|
+
|
|
587
|
+
def _start_image(attrsD)
|
|
588
|
+
@inimage = true
|
|
589
|
+
@has_title = false
|
|
590
|
+
push('image', false)
|
|
591
|
+
context = getContext()
|
|
592
|
+
context['image'] ||= FeedParserDict.new
|
|
593
|
+
end
|
|
594
|
+
|
|
595
|
+
def _end_image
|
|
596
|
+
pop('image')
|
|
597
|
+
@inimage = false
|
|
598
|
+
end
|
|
599
|
+
|
|
600
|
+
def _start_textinput(attrsD)
|
|
601
|
+
@intextinput = true
|
|
602
|
+
@has_title = false
|
|
603
|
+
push('textinput', false)
|
|
604
|
+
context = getContext()
|
|
605
|
+
context['textinput'] ||= FeedParserDict.new
|
|
606
|
+
end
|
|
607
|
+
alias :_start_textInput :_start_textinput
|
|
608
|
+
|
|
609
|
+
def _end_textinput
|
|
610
|
+
pop('textinput')
|
|
611
|
+
@intextinput = false
|
|
612
|
+
end
|
|
613
|
+
alias :_end_textInput :_end_textinput
|
|
614
|
+
|
|
615
|
+
def _start_author(attrsD)
|
|
616
|
+
@inauthor = true
|
|
617
|
+
push('author', true)
|
|
618
|
+
end
|
|
619
|
+
alias :_start_managingeditor :_start_author
|
|
620
|
+
alias :_start_dc_author :_start_author
|
|
621
|
+
alias :_start_dc_creator :_start_author
|
|
622
|
+
alias :_start_itunes_author :_start_author
|
|
623
|
+
|
|
624
|
+
def _end_author
|
|
625
|
+
pop('author')
|
|
626
|
+
@inauthor = false
|
|
627
|
+
_sync_author_detail()
|
|
628
|
+
end
|
|
629
|
+
alias :_end_managingeditor :_end_author
|
|
630
|
+
alias :_end_dc_author :_end_author
|
|
631
|
+
alias :_end_dc_creator :_end_author
|
|
632
|
+
alias :_end_itunes_author :_end_author
|
|
633
|
+
|
|
634
|
+
def _start_itunes_owner(attrsD)
|
|
635
|
+
@inpublisher = true
|
|
636
|
+
push('publisher', false)
|
|
637
|
+
end
|
|
638
|
+
|
|
639
|
+
def _end_itunes_owner
|
|
640
|
+
pop('publisher')
|
|
641
|
+
@inpublisher = false
|
|
642
|
+
_sync_author_detail('publisher')
|
|
643
|
+
end
|
|
644
|
+
|
|
645
|
+
def _start_contributor(attrsD)
|
|
646
|
+
@incontributor = true
|
|
647
|
+
context = getContext()
|
|
648
|
+
context['contributors'] ||= []
|
|
649
|
+
context['contributors'] << FeedParserDict.new
|
|
650
|
+
push('contributor', false)
|
|
651
|
+
end
|
|
652
|
+
|
|
653
|
+
def _end_contributor
|
|
654
|
+
pop('contributor')
|
|
655
|
+
@incontributor = false
|
|
656
|
+
end
|
|
657
|
+
|
|
658
|
+
def _start_dc_contributor(attrsD)
|
|
659
|
+
@incontributor = true
|
|
660
|
+
context = getContext()
|
|
661
|
+
context['contributors'] ||= []
|
|
662
|
+
context['contributors'] << FeedParserDict.new
|
|
663
|
+
push('name', false)
|
|
664
|
+
end
|
|
665
|
+
|
|
666
|
+
def _end_dc_contributor
|
|
667
|
+
_end_name
|
|
668
|
+
@incontributor = false
|
|
669
|
+
end
|
|
670
|
+
|
|
671
|
+
def _start_name(attrsD)
|
|
672
|
+
push('name', false)
|
|
673
|
+
end
|
|
674
|
+
alias :_start_itunes_name :_start_name
|
|
675
|
+
|
|
676
|
+
# Handles a closing <name>-style tag by routing the popped text to whichever
# container is currently open. Precedence: publisher, author, contributor,
# then text input; when none is open the value is dropped.
def _end_name
  person = pop('name')
  return _save_author('name', person, 'publisher') if @inpublisher
  return _save_author('name', person) if @inauthor
  return _save_contributor('name', person) if @incontributor

  getContext()['textinput']['name'] = person if @intextinput
end
|
|
689
|
+
alias :_end_itunes_name :_end_name
|
|
690
|
+
|
|
691
|
+
def _start_width(attrsD)
|
|
692
|
+
push('width', false)
|
|
693
|
+
end
|
|
694
|
+
|
|
695
|
+
# Handles a closing <width> tag. The popped text is converted with to_i and
# stored as the current context's image width, but only while an image
# element is open.
def _end_width
  pixels = pop('width').to_i
  getContext()['image']['width'] = pixels if @inimage
end
|
|
702
|
+
|
|
703
|
+
def _start_height(attrsD)
|
|
704
|
+
push('height', false)
|
|
705
|
+
end
|
|
706
|
+
|
|
707
|
+
# Handles a closing <height> tag. The popped text is converted with to_i and
# stored as the current context's image height, but only while an image
# element is open.
def _end_height
  pixels = pop('height').to_i
  getContext()['image']['height'] = pixels if @inimage
end
|
|
714
|
+
|
|
715
|
+
def _start_url(attrsD)
|
|
716
|
+
push('href', true)
|
|
717
|
+
end
|
|
718
|
+
alias :_start_homepage :_start_url
|
|
719
|
+
alias :_start_uri :_start_url
|
|
720
|
+
|
|
721
|
+
# Handles a closing <url>-style tag by routing the popped value to the open
# container. Precedence: author, contributor, image ('href' entry), then
# text input ('link' entry); otherwise the value is dropped.
def _end_url
  location = pop('href')
  return _save_author('href', location) if @inauthor
  return _save_contributor('href', location) if @incontributor

  slot =
    if @inimage
      %w[image href]
    elsif @intextinput
      %w[textinput link]
    end
  return unless slot

  container, field = slot
  getContext()[container][field] = location
end
|
|
735
|
+
alias :_end_homepage :_end_url
|
|
736
|
+
alias :_end_uri :_end_url
|
|
737
|
+
|
|
738
|
+
def _start_email(attrsD)
|
|
739
|
+
push('email', false)
|
|
740
|
+
end
|
|
741
|
+
alias :_start_itunes_email :_start_email
|
|
742
|
+
|
|
743
|
+
# Handles a closing <email>-style tag by routing the popped address to the
# open container. Precedence: publisher, author, then contributor; when
# none is open the value is dropped.
def _end_email
  address = pop('email')
  return _save_author('email', address, 'publisher') if @inpublisher
  return _save_author('email', address) if @inauthor

  _save_contributor('email', address) if @incontributor
end
|
|
753
|
+
alias :_end_itunes_email :_end_email
|
|
754
|
+
|
|
755
|
+
# Returns the hash that parsed values should currently be written to:
# @sourcedata while inside a source element, otherwise the most recent
# entry while inside an entry, otherwise @feeddata.
def getContext
  return @sourcedata if @insource
  return @entries[-1] if @inentry

  @feeddata
end
|
|
765
|
+
|
|
766
|
+
def _save_author(key, value, prefix='author')
|
|
767
|
+
context = getContext()
|
|
768
|
+
context[prefix + '_detail'] ||= FeedParserDict.new
|
|
769
|
+
context[prefix + '_detail'][key] = value
|
|
770
|
+
_sync_author_detail()
|
|
771
|
+
end
|
|
772
|
+
|
|
773
|
+
def _save_contributor(key, value)
|
|
774
|
+
context = getContext
|
|
775
|
+
context['contributors'] ||= [FeedParserDict.new]
|
|
776
|
+
context['contributors'][-1][key] = value
|
|
777
|
+
end
|
|
778
|
+
|
|
779
|
+
# Keeps context[key] (a display string such as "Name (email)") and
# context["#{key}_detail"] (a dict with 'name' and 'email') consistent.
#
# When a non-empty detail dict exists, the display string is rebuilt from
# it: "name (email)" when both are present, otherwise whichever one is.
# When no detail exists, the display string is scanned for an e-mail
# address; if one is found it is split off, the surrounding parentheses and
# whitespace are removed from the remainder, and both pieces are stored in
# the detail dict.
#
# Fixes relative to the original:
# * the "both present" test checked name.empty? twice and never looked at
#   the e-mail, so an empty e-mail produced "Name ()";
# * a display string without any e-mail address made the match nil and the
#   following emailmatch[1] raised NoMethodError; that case now returns
#   without touching the context.
#
# key - 'author' (default) or another prefix such as 'publisher'
def _sync_author_detail(key='author')
  context = getContext()
  detail_key = "#{key}_detail"
  detail = context[detail_key]

  if detail && !detail.empty?
    name = detail['name']
    email = detail['email']
    has_name = name && !name.empty?
    has_email = email && !email.empty?

    if has_name && has_email
      context[key] = "#{name} (#{email})"
    elsif has_name
      context[key] = name
    elsif has_email
      context[key] = email
    end
  else
    # Work on a copy so the stored display string is not mutated below.
    author = context[key].dup unless context[key].nil?
    return if !author || author.empty?

    emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
    return unless emailmatch

    email = emailmatch[1]
    # Strip the address, then any leftover "()" wrapper and stray parentheses.
    author.gsub!(email, '')
    author.gsub!('()', '')
    author.strip!
    author.gsub!(/^\(/, '')
    author.gsub!(/\)$/, '')
    author.strip!

    context[detail_key] ||= FeedParserDict.new
    context[detail_key]['name'] = author
    context[detail_key]['email'] = email
  end
end
|
|
809
|
+
|
|
810
|
+
def _start_subtitle(attrsD)
|
|
811
|
+
pushContent('subtitle', attrsD, 'text/plain', true)
|
|
812
|
+
end
|
|
813
|
+
alias :_start_tagline :_start_subtitle
|
|
814
|
+
alias :_start_itunes_subtitle :_start_subtitle
|
|
815
|
+
|
|
816
|
+
def _end_subtitle
|
|
817
|
+
popContent('subtitle')
|
|
818
|
+
end
|
|
819
|
+
alias :_end_tagline :_end_subtitle
|
|
820
|
+
alias :_end_itunes_subtitle :_end_subtitle
|
|
821
|
+
|
|
822
|
+
def _start_rights(attrsD)
|
|
823
|
+
pushContent('rights', attrsD, 'text/plain', true)
|
|
824
|
+
end
|
|
825
|
+
alias :_start_dc_rights :_start_rights
|
|
826
|
+
alias :_start_copyright :_start_rights
|
|
827
|
+
|
|
828
|
+
def _end_rights
|
|
829
|
+
popContent('rights')
|
|
830
|
+
end
|
|
831
|
+
alias :_end_dc_rights :_end_rights
|
|
832
|
+
alias :_end_copyright :_end_rights
|
|
833
|
+
|
|
834
|
+
def _start_item(attrsD)
|
|
835
|
+
@entries << FeedParserDict.new
|
|
836
|
+
push('item', false)
|
|
837
|
+
@inentry = true
|
|
838
|
+
@has_title = false
|
|
839
|
+
@guidislink = false
|
|
840
|
+
id = getAttribute(attrsD, 'rdf:about')
|
|
841
|
+
if id && ! id.empty?
|
|
842
|
+
context = getContext()
|
|
843
|
+
context['id'] = id
|
|
844
|
+
end
|
|
845
|
+
_cdf_common(attrsD)
|
|
846
|
+
end
|
|
847
|
+
alias :_start_entry :_start_item
|
|
848
|
+
alias :_start_product :_start_item
|
|
849
|
+
|
|
850
|
+
def _end_item
|
|
851
|
+
pop('item')
|
|
852
|
+
@inentry = false
|
|
853
|
+
end
|
|
854
|
+
alias :_end_entry :_end_item
|
|
855
|
+
|
|
856
|
+
def _start_dc_language(attrsD)
|
|
857
|
+
push('language', true)
|
|
858
|
+
end
|
|
859
|
+
alias :_start_language :_start_dc_language
|
|
860
|
+
|
|
861
|
+
def _end_dc_language
|
|
862
|
+
@lang = pop('language')
|
|
863
|
+
end
|
|
864
|
+
alias :_end_language :_end_dc_language
|
|
865
|
+
|
|
866
|
+
def _start_dc_publisher(attrsD)
|
|
867
|
+
push('publisher', true)
|
|
868
|
+
end
|
|
869
|
+
alias :_start_webmaster :_start_dc_publisher
|
|
870
|
+
|
|
871
|
+
def _end_dc_publisher
|
|
872
|
+
pop('publisher')
|
|
873
|
+
_sync_author_detail('publisher')
|
|
874
|
+
end
|
|
875
|
+
alias :_end_webmaster :_end_dc_publisher
|
|
876
|
+
|
|
877
|
+
def _start_published(attrsD)
|
|
878
|
+
push('published', true)
|
|
879
|
+
end
|
|
880
|
+
alias :_start_dcterms_issued :_start_published
|
|
881
|
+
alias :_start_issued :_start_published
|
|
882
|
+
|
|
883
|
+
def _end_published
|
|
884
|
+
value = pop('published')
|
|
885
|
+
d = parse_date(value)
|
|
886
|
+
_save('published_parsed', extract_tuple(d))
|
|
887
|
+
_save('published_time', d)
|
|
888
|
+
end
|
|
889
|
+
alias :_end_dcterms_issued :_end_published
|
|
890
|
+
alias :_end_issued :_end_published
|
|
891
|
+
|
|
892
|
+
def _start_updated(attrsD)
|
|
893
|
+
push('updated', true)
|
|
894
|
+
end
|
|
895
|
+
alias :_start_modified :_start_updated
|
|
896
|
+
alias :_start_dcterms_modified :_start_updated
|
|
897
|
+
alias :_start_pubdate :_start_updated
|
|
898
|
+
alias :_start_dc_date :_start_updated
|
|
899
|
+
|
|
900
|
+
def _end_updated
|
|
901
|
+
value = pop('updated')
|
|
902
|
+
d = parse_date(value)
|
|
903
|
+
_save('updated_parsed', extract_tuple(d))
|
|
904
|
+
_save('updated_time', d)
|
|
905
|
+
end
|
|
906
|
+
alias :_end_modified :_end_updated
|
|
907
|
+
alias :_end_dcterms_modified :_end_updated
|
|
908
|
+
alias :_end_pubdate :_end_updated
|
|
909
|
+
alias :_end_dc_date :_end_updated
|
|
910
|
+
|
|
911
|
+
def _start_created(attrsD)
|
|
912
|
+
push('created', true)
|
|
913
|
+
end
|
|
914
|
+
alias :_start_dcterms_created :_start_created
|
|
915
|
+
|
|
916
|
+
def _end_created
|
|
917
|
+
value = pop('created')
|
|
918
|
+
d = parse_date(value)
|
|
919
|
+
_save('created_parsed', extract_tuple(d))
|
|
920
|
+
_save('created_time', d)
|
|
921
|
+
end
|
|
922
|
+
alias :_end_dcterms_created :_end_created
|
|
923
|
+
|
|
924
|
+
def _start_expirationdate(attrsD)
|
|
925
|
+
push('expired', true)
|
|
926
|
+
end
|
|
927
|
+
def _end_expirationdate
|
|
928
|
+
d = parse_date(pop('expired'))
|
|
929
|
+
_save('expired_parsed', extract_tuple(d))
|
|
930
|
+
_save('expired_time', d)
|
|
931
|
+
end
|
|
932
|
+
|
|
933
|
+
def _start_cc_license(attrsD)
|
|
934
|
+
push('license', true)
|
|
935
|
+
value = getAttribute(attrsD, 'rdf:resource')
|
|
936
|
+
if value && ! value.empty?
|
|
937
|
+
@elementstack[-1][2] << value
|
|
938
|
+
pop('license')
|
|
939
|
+
end
|
|
940
|
+
end
|
|
941
|
+
|
|
942
|
+
def _start_creativecommons_license(attrsD)
|
|
943
|
+
push('license', true)
|
|
944
|
+
end
|
|
945
|
+
|
|
946
|
+
def _end_creativecommons_license
|
|
947
|
+
pop('license')
|
|
948
|
+
end
|
|
949
|
+
|
|
950
|
+
# Appends a tag to the current context's 'tags' list.
#
# The list is created on first use (even when the call then adds nothing).
# A tag whose term, scheme and label are all nil or empty is ignored, and a
# tag equal to one already in the list is not added again.
#
# term, scheme, label - Strings or nil describing the tag
def addTag(term, scheme, label)
  context = getContext()
  context['tags'] ||= []

  blank = [term, scheme, label].all? { |part| part.nil? || part.empty? }
  return if blank

  tag = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  context['tags'] << tag unless context['tags'].include?(tag)
end
|
|
962
|
+
|
|
963
|
+
def _start_category(attrsD)
|
|
964
|
+
$stderr << "entering _start_category with #{attrsD}\n" if $debug
|
|
965
|
+
|
|
966
|
+
term = attrsD['term']
|
|
967
|
+
scheme = attrsD['scheme'] || attrsD['domain']
|
|
968
|
+
label = attrsD['label']
|
|
969
|
+
addTag(term, scheme, label)
|
|
970
|
+
push('category', true)
|
|
971
|
+
end
|
|
972
|
+
alias :_start_dc_subject :_start_category
|
|
973
|
+
alias :_start_keywords :_start_category
|
|
974
|
+
|
|
975
|
+
def _end_itunes_keywords
|
|
976
|
+
pop('itunes_keywords').split.each do |term|
|
|
977
|
+
addTag(term, 'http://www.itunes.com/', nil)
|
|
978
|
+
end
|
|
979
|
+
end
|
|
980
|
+
|
|
981
|
+
def _start_itunes_category(attrsD)
|
|
982
|
+
addTag(attrsD['text'], 'http://www.itunes.com/', nil)
|
|
983
|
+
push('category', true)
|
|
984
|
+
end
|
|
985
|
+
|
|
986
|
+
# Handles a closing category-style tag.
#
# Empty element text is ignored. Otherwise, if the most recent tag in the
# current context has no term yet, the text becomes its term; in every
# other case a new tag with that term is added.
#
# The original condition ended in ':' (the Ruby 1.8 spelling of `then`),
# which is a syntax error on Ruby 1.9 and later. The duplicated
# nil/empty test on the value is dropped, and a missing 'tags' list now
# falls through to addTag instead of raising.
def _end_category
  value = pop('category')
  return if value.nil? || value.empty?

  tags = getContext()['tags']
  if tags && !tags.empty? && !tags[-1]['term']
    tags[-1]['term'] = value
  else
    addTag(value, nil, nil)
  end
end
|
|
997
|
+
alias :_end_dc_subject :_end_category
|
|
998
|
+
alias :_end_keywords :_end_category
|
|
999
|
+
alias :_end_itunes_category :_end_category
|
|
1000
|
+
|
|
1001
|
+
def _start_cloud(attrsD)
|
|
1002
|
+
getContext()['cloud'] = FeedParserDict.new(attrsD)
|
|
1003
|
+
end
|
|
1004
|
+
|
|
1005
|
+
def _start_link(attrsD)
|
|
1006
|
+
attrsD['rel'] ||= 'alternate'
|
|
1007
|
+
attrsD['type'] ||= 'text/html'
|
|
1008
|
+
attrsD = itsAnHrefDamnIt(attrsD)
|
|
1009
|
+
if attrsD.has_key? 'href'
|
|
1010
|
+
attrsD['href'] = resolveURI(attrsD['href'])
|
|
1011
|
+
end
|
|
1012
|
+
expectingText = @infeed || @inentry || @insource
|
|
1013
|
+
context = getContext()
|
|
1014
|
+
context['links'] ||= []
|
|
1015
|
+
context['links'] << FeedParserDict.new(attrsD)
|
|
1016
|
+
if attrsD['rel'] == 'enclosure'
|
|
1017
|
+
_start_enclosure(attrsD)
|
|
1018
|
+
end
|
|
1019
|
+
if attrsD.has_key? 'href'
|
|
1020
|
+
expectingText = false
|
|
1021
|
+
if (attrsD['rel'] == 'alternate') && @html_types.include?(mapContentType(attrsD['type']))
|
|
1022
|
+
context['link'] = attrsD['href']
|
|
1023
|
+
end
|
|
1024
|
+
else
|
|
1025
|
+
push('link', expectingText)
|
|
1026
|
+
end
|
|
1027
|
+
end
|
|
1028
|
+
alias :_start_producturl :_start_link
|
|
1029
|
+
|
|
1030
|
+
def _end_link
|
|
1031
|
+
value = pop('link')
|
|
1032
|
+
context = getContext()
|
|
1033
|
+
if @intextinput
|
|
1034
|
+
context['textinput']['link'] = value
|
|
1035
|
+
end
|
|
1036
|
+
if @inimage
|
|
1037
|
+
context['image']['link'] = value
|
|
1038
|
+
end
|
|
1039
|
+
end
|
|
1040
|
+
alias :_end_producturl :_end_link
|
|
1041
|
+
|
|
1042
|
+
def _start_guid(attrsD)
|
|
1043
|
+
@guidislink = ((attrsD['ispermalink'] || 'true') == 'true')
|
|
1044
|
+
push('id', true)
|
|
1045
|
+
end
|
|
1046
|
+
|
|
1047
|
+
# Handles a closing <guid> tag.
#
# Saves 'guidislink' as true only when the guid was flagged as a permalink
# (@guidislink) and the current context has no 'link' yet. When the guid is
# flagged as a permalink it is also offered as the 'link' via _save.
#
# The original `if @guidislink:` used the Ruby 1.8 colon-as-`then` form,
# which is a syntax error on Ruby 1.9 and later.
def _end_guid
  value = pop('id')
  _save('guidislink', (@guidislink && !getContext().has_key?('link')))
  # guid acts as link, but only if 'ispermalink' is not present or is 'true',
  # and only if the item doesn't already have a link element
  _save('link', value) if @guidislink
end
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
def _start_title(attrsD)
|
|
1059
|
+
pushContent('title', attrsD, 'text/plain', @infeed || @inentry || @insource)
|
|
1060
|
+
end
|
|
1061
|
+
alias :_start_dc_title :_start_title
|
|
1062
|
+
alias :_start_media_title :_start_title
|
|
1063
|
+
|
|
1064
|
+
# Handles a closing <title> tag. The popped content is additionally copied
# into the open text-input or image container (text input takes precedence),
# and @has_title is set to true afterwards.
def _end_title
  title = popContent('title')
  context = getContext()
  holder =
    if @intextinput
      'textinput'
    elsif @inimage
      'image'
    end
  context[holder]['title'] = title if holder
  @has_title = true
end
|
|
1074
|
+
alias :_end_dc_title :_end_title
|
|
1075
|
+
|
|
1076
|
+
def _end_media_title
|
|
1077
|
+
orig_has_title = @has_title
|
|
1078
|
+
_end_title
|
|
1079
|
+
@has_title = orig_has_title
|
|
1080
|
+
end
|
|
1081
|
+
|
|
1082
|
+
def _start_description(attrsD)
|
|
1083
|
+
context = getContext()
|
|
1084
|
+
if context.has_key?('summary')
|
|
1085
|
+
@summaryKey = 'content'
|
|
1086
|
+
_start_content(attrsD)
|
|
1087
|
+
else
|
|
1088
|
+
pushContent('description', attrsD, 'text/html', @infeed || @inentry || @insource)
|
|
1089
|
+
end
|
|
1090
|
+
end
|
|
1091
|
+
|
|
1092
|
+
def _start_abstract(attrsD)
|
|
1093
|
+
pushContent('description', attrsD, 'text/plain', @infeed || @inentry || @insource)
|
|
1094
|
+
end
|
|
1095
|
+
|
|
1096
|
+
# Handles a closing <description>/<abstract> tag.
#
# If the matching start handler redirected the element to content
# (@summaryKey == 'content'), the content end handler is invoked instead.
# Otherwise the description is popped and, while inside a text input or an
# image, copied into that container. @summaryKey is reset in either case.
#
# The original `elsif @inimage:` used the Ruby 1.8 colon-as-`then` form,
# which is a syntax error on Ruby 1.9 and later.
def _end_description
  if @summaryKey == 'content'
    _end_content()
  else
    value = popContent('description')
    context = getContext()
    if @intextinput
      context['textinput']['description'] = value
    elsif @inimage
      context['image']['description'] = value
    end
  end
  @summaryKey = nil
end
|
|
1110
|
+
alias :_end_abstract :_end_description
|
|
1111
|
+
|
|
1112
|
+
def _start_info(attrsD)
|
|
1113
|
+
pushContent('info', attrsD, 'text/plain', true)
|
|
1114
|
+
end
|
|
1115
|
+
alias :_start_feedburner_browserfriendly :_start_info
|
|
1116
|
+
|
|
1117
|
+
def _end_info
|
|
1118
|
+
popContent('info')
|
|
1119
|
+
end
|
|
1120
|
+
alias :_end_feedburner_browserfriendly :_end_info
|
|
1121
|
+
|
|
1122
|
+
def _start_generator(attrsD)
|
|
1123
|
+
if attrsD && ! attrsD.empty?
|
|
1124
|
+
attrsD = itsAnHrefDamnIt(attrsD)
|
|
1125
|
+
if attrsD.has_key?('href')
|
|
1126
|
+
attrsD['href'] = resolveURI(attrsD['href'])
|
|
1127
|
+
end
|
|
1128
|
+
end
|
|
1129
|
+
getContext()['generator_detail'] = FeedParserDict.new(attrsD)
|
|
1130
|
+
push('generator', true)
|
|
1131
|
+
end
|
|
1132
|
+
|
|
1133
|
+
def _end_generator
|
|
1134
|
+
value = pop('generator')
|
|
1135
|
+
context = getContext()
|
|
1136
|
+
if context.has_key?('generator_detail')
|
|
1137
|
+
context['generator_detail']['name'] = value
|
|
1138
|
+
end
|
|
1139
|
+
end
|
|
1140
|
+
|
|
1141
|
+
def _start_admin_generatoragent(attrsD)
|
|
1142
|
+
push('generator', true)
|
|
1143
|
+
value = getAttribute(attrsD, 'rdf:resource')
|
|
1144
|
+
if value && ! value.empty?
|
|
1145
|
+
@elementstack[-1][2] << value
|
|
1146
|
+
end
|
|
1147
|
+
pop('generator')
|
|
1148
|
+
getContext()['generator_detail'] = FeedParserDict.new({'href' => value})
|
|
1149
|
+
end
|
|
1150
|
+
|
|
1151
|
+
def _start_admin_errorreportsto(attrsD)
|
|
1152
|
+
push('errorreportsto', true)
|
|
1153
|
+
value = getAttribute(attrsD, 'rdf:resource')
|
|
1154
|
+
if value && ! value.empty?
|
|
1155
|
+
@elementstack[-1][2] << value
|
|
1156
|
+
end
|
|
1157
|
+
pop('errorreportsto')
|
|
1158
|
+
end
|
|
1159
|
+
|
|
1160
|
+
def _start_summary(attrsD)
|
|
1161
|
+
context = getContext()
|
|
1162
|
+
if context.has_key?('summary')
|
|
1163
|
+
@summaryKey = 'content'
|
|
1164
|
+
_start_content(attrsD)
|
|
1165
|
+
else
|
|
1166
|
+
@summaryKey = 'summary'
|
|
1167
|
+
pushContent(@summaryKey, attrsD, 'text/plain', true)
|
|
1168
|
+
end
|
|
1169
|
+
end
|
|
1170
|
+
alias :_start_itunes_summary :_start_summary
|
|
1171
|
+
|
|
1172
|
+
# Handles a closing <summary> tag.
#
# If the matching start handler redirected the element to content
# (@summaryKey == 'content'), the content end handler is invoked; otherwise
# the element recorded in @summaryKey (or 'summary' when unset) is popped.
# @summaryKey is reset afterwards.
#
# The original `if @summaryKey == 'content':` used the Ruby 1.8
# colon-as-`then` form, which is a syntax error on Ruby 1.9 and later.
def _end_summary
  if @summaryKey == 'content'
    _end_content()
  else
    popContent(@summaryKey || 'summary')
  end
  @summaryKey = nil
end
|
|
1180
|
+
alias :_end_itunes_summary :_end_summary
|
|
1181
|
+
|
|
1182
|
+
def _start_enclosure(attrsD)
|
|
1183
|
+
attrsD = itsAnHrefDamnIt(attrsD)
|
|
1184
|
+
getContext()['enclosures'] ||= []
|
|
1185
|
+
getContext()['enclosures'] << FeedParserDict.new(attrsD)
|
|
1186
|
+
href = attrsD['href']
|
|
1187
|
+
if href && ! href.empty?
|
|
1188
|
+
context = getContext()
|
|
1189
|
+
if ! context['id']
|
|
1190
|
+
context['id'] = href
|
|
1191
|
+
end
|
|
1192
|
+
end
|
|
1193
|
+
end
|
|
1194
|
+
alias :_start_media_content :_start_enclosure
|
|
1195
|
+
alias :_start_media_thumbnail :_start_enclosure
|
|
1196
|
+
|
|
1197
|
+
def _start_source(attrsD)
|
|
1198
|
+
@insource = true
|
|
1199
|
+
@has_title = false
|
|
1200
|
+
end
|
|
1201
|
+
|
|
1202
|
+
def _end_source
|
|
1203
|
+
@insource = false
|
|
1204
|
+
getContext()['source'] = Marshal.load(Marshal.dump(@sourcedata))
|
|
1205
|
+
@sourcedata.clear()
|
|
1206
|
+
end
|
|
1207
|
+
|
|
1208
|
+
# Handles an opening <content> tag: starts a 'content' element with a
# default type of text/plain and records a non-empty 'src' attribute in
# @contentparams.
#
# The original `if src && ! src.empty?:` used the Ruby 1.8 colon-as-`then`
# form, which is a syntax error on Ruby 1.9 and later.
#
# attrsD - attribute hash of the element
def _start_content(attrsD)
  pushContent('content', attrsD, 'text/plain', true)
  src = attrsD['src']
  @contentparams['src'] = src if src && !src.empty?
  # NOTE(review): pushContent already pushes 'content'; this second push is
  # kept exactly as in the original -- confirm it is intentional.
  push('content', true)
end
|
|
1216
|
+
|
|
1217
|
+
def _start_prodlink(attrsD)
|
|
1218
|
+
pushContent('content', attrsD, 'text/html', true)
|
|
1219
|
+
end
|
|
1220
|
+
|
|
1221
|
+
def _start_body(attrsD)
|
|
1222
|
+
pushContent('content', attrsD, 'application/xhtml+xml', true)
|
|
1223
|
+
end
|
|
1224
|
+
alias :_start_xhtml_body :_start_body
|
|
1225
|
+
|
|
1226
|
+
def _start_content_encoded(attrsD)
|
|
1227
|
+
pushContent('content', attrsD, 'text/html', true)
|
|
1228
|
+
end
|
|
1229
|
+
alias :_start_fullitem :_start_content_encoded
|
|
1230
|
+
|
|
1231
|
+
# Handles a closing content-style tag. When the content type is text/plain
# or one of @html_types, the popped value is also offered as 'description'
# via _save.
def _end_content
  # Decide before popping: popContent empties @contentparams.
  content_type = mapContentType(@contentparams['type'])
  copyable_types = ['text/plain'] + @html_types
  copy_to_description = copyable_types.include?(content_type)

  body = popContent('content')
  _save('description', body) if copy_to_description
end
|
|
1238
|
+
alias :_end_body :_end_content
|
|
1239
|
+
alias :_end_xhtml_body :_end_content
|
|
1240
|
+
alias :_end_content_encoded :_end_content
|
|
1241
|
+
alias :_end_fullitem :_end_content
|
|
1242
|
+
alias :_end_prodlink :_end_content
|
|
1243
|
+
|
|
1244
|
+
def _start_itunes_image(attrsD)
|
|
1245
|
+
push('itunes_image', false)
|
|
1246
|
+
getContext()['image'] = FeedParserDict.new({'href' => attrsD['href']})
|
|
1247
|
+
end
|
|
1248
|
+
alias :_start_itunes_link :_start_itunes_image
|
|
1249
|
+
|
|
1250
|
+
# Handles a closing <itunes:block> tag: stores true in the current context
# under 'itunes_block' when the popped text is exactly 'yes', false otherwise.
def _end_itunes_block
  raw = pop('itunes_block', false)
  getContext()['itunes_block'] = (raw == 'yes')
end
|
|
1254
|
+
|
|
1255
|
+
# Handles a closing <itunes:explicit> tag: stores true in the current context
# under 'itunes_explicit' when the popped text, lower-cased, is 'yes', and
# false otherwise.
def _end_itunes_explicit
  raw = pop('itunes_explicit', false)
  getContext()['itunes_explicit'] = (raw.downcase == 'yes')
end
|
|
1259
|
+
|
|
1260
|
+
end # End FeedParserMixin
|
|
1261
|
+
end
|
|
1262
|
+
|
|
1263
|
+
# Resolves +uri+ against +base+.
#
# Surplus slashes directly after a "scheme://" prefix in +uri+ are removed
# first. If +base+ parses as an absolute URI and the cleaned +uri+ parses as
# a relative one, the two are joined with Addressable and the joined string
# is post-processed with the gsub below; in every other case (including any
# parse failure) the cleaned +uri+ is returned as is.
#
# base - String base URI
# uri  - String URI to resolve
#
# Returns a String.
def urljoin(base, uri)
  cleaned = uri.sub(/^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u, '\1\3')

  parsed_base = Addressable::URI.parse(base) rescue nil
  return cleaned unless parsed_base && parsed_base.absolute?

  parsed_uri = Addressable::URI.parse(cleaned) rescue nil
  return cleaned unless parsed_uri && parsed_uri.relative?

  # ForgivingURI.join does the wrong thing. What the hell.
  Addressable::URI.join(base, cleaned).to_s.gsub(/[^:]\/{2,}/, '')
end
|