rfeedparser 0.9.8 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/ruby
2
+
3
+
4
+ module FeedParser
5
+ class StrictFeedParser < XML::SAX::HandlerBase # expat
6
+ include FeedParserMixin
7
+
8
+ attr_accessor :bozo, :entries, :feeddata, :exc
9
+ def initialize(baseuri, baselang, encoding)
10
+ $stderr << "trying StrictFeedParser\n" if $debug
11
+ startup(baseuri, baselang, encoding)
12
+ @bozo = false
13
+ @exc = nil
14
+ super()
15
+ end
16
+
17
+ def getPos
18
+ [@locator.getSystemId, @locator.getLineNumber]
19
+ end
20
+
21
# Converts a SAX-style attribute list into an array of [name, value] pairs.
#
# attrs must respond to getLength, getName(index) and getValue(index).
# Returns the pairs in index order; an attribute list of length 0 yields [].
def getAttrs(attrs)
  # Exclusive range: valid indices run from 0 to getLength - 1. The former
  # inclusive range asked the attribute list for one index past the end.
  (0...attrs.getLength).map do |i|
    [attrs.getName(i), attrs.getValue(i)]
  end
end
28
+
29
+ def setDocumentLocator(loc)
30
+ @locator = loc
31
+ end
32
+
33
+ def startDoctypeDecl(name, pub_sys, long_name, uri)
34
+ #Nothing is done here. What could we do that is neat and useful?
35
+ end
36
+
37
+ def startNamespaceDecl(prefix, uri)
38
+ trackNamespace(prefix, uri)
39
+ end
40
+
41
+ def endNamespaceDecl(prefix)
42
+ end
43
+
44
# Start-tag callback: normalises the element name and forwards it, together
# with attrs, to unknown_starttag.
#
# name is split on its first ';' into a namespace URI (optional) and a local
# name. The URI is lower-cased, looked up in @matchnamespaces, and any
# non-empty prefix found there is prepended as "prefix:local". The final name
# is lower-cased before being passed on.
def startElement(name, attrs)
  name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
  namespaceuri = ($2 || '').downcase
  name = $3
  if /backend\.userland\.com\/rss/ =~ namespaceuri
    # match any backend.userland.com namespace
    namespaceuri = 'http://backend.userland.com/rss'
  end
  prefix = @matchnamespaces[namespaceuri]
  # No need to raise UndeclaredNamespace, Expat does that for us with
  # "unbound prefix (XMLParserError)"
  name = prefix + ':' + name if prefix and not prefix.empty?
  unknown_starttag(name.downcase, attrs)
end
61
+
62
+ def character(text, start, length)
63
+ #handle_data(CGI.unescapeHTML(text))
64
+ handle_data(text)
65
+ end
66
+ # expat provides "character" not "characters"!
67
+ alias :characters :character # Just in case.
68
+
69
+ def startCdata(content)
70
+ handle_data(content)
71
+ end
72
+
73
# End-tag callback: normalises the element name the same way startElement
# does and forwards it to unknown_endtag.
#
# name is split on its first ';' into a namespace URI (optional) and a local
# name. The URI is lower-cased, looked up in @matchnamespaces, and any
# non-empty prefix found there is prepended as "prefix:local". The result is
# lower-cased. The caller's string is not modified.
#
# Previously the prefixed name was computed but discarded, and the raw
# "uri;local" string was forwarded, so end tags did not line up with the
# names reported for start tags.
def endElement(name)
  name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
  namespaceuri = ($2 || '').downcase
  localname = $3
  if /backend\.userland\.com\/rss/ =~ namespaceuri
    # match any backend.userland.com namespace, as startElement does
    namespaceuri = 'http://backend.userland.com/rss'
  end
  prefix = @matchnamespaces[namespaceuri]
  localname = prefix + ':' + localname if prefix and not prefix.empty?
  unknown_endtag(localname.downcase)
end
83
+
84
+ def comment(comment)
85
+ handle_comment(comment)
86
+ end
87
+
88
+ def entityDecl(*foo)
89
+ end
90
+
91
+ def unparsedEntityDecl(*foo)
92
+ end
93
+ def error(exc)
94
+ @bozo = true
95
+ @exc = exc
96
+ end
97
+
98
+ def fatalError(exc)
99
+ error(exc)
100
+ raise exc
101
+ end
102
+ end
103
+
104
+ class LooseFeedParser < BetterSGMLParser
105
+ include FeedParserMixin
106
+ # We write the methods that were in BaseHTMLProcessor in the python code
107
+ # in here directly. We do this because if we inherited from
108
+ # BaseHTMLProcessor but then included from FeedParserMixin, the methods
109
+ # of Mixin would overwrite the methods we inherited from
110
+ # BaseHTMLProcessor. This is exactly the opposite of what we want to
111
+ # happen!
112
+
113
+ attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse
114
+
115
+ Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
116
+ 'img', 'input', 'isindex', 'link', 'meta', 'param']
117
+ New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/
118
+ alias :sgml_feed :feed # feed needs to mapped to feeddata, not the SGMLParser method feed. I think.
119
+ def feed
120
+ @feeddata
121
+ end
122
+ def feed=(data)
123
+ @feeddata = data
124
+ end
125
+
126
+ def initialize(baseuri, baselang, encoding)
127
+ startup(baseuri, baselang, encoding)
128
+ super() # Keep the parentheses! No touchy.
129
+ end
130
+
131
+ def reset
132
+ @pieces = []
133
+ super
134
+ end
135
+
136
# Pre-processes a feed document and hands it to the SGML parser through
# sgml_feed (the alias of the inherited feed method).
#
# data is modified in place:
# * "<!" that does not start a DOCTYPE, comment or marked section is escaped
#   to "&lt;!";
# * attribute-less self-closing tags such as <foo/> are expanded to
#   <foo></foo> unless the tag is listed in Elements_No_End_Tag;
# * &#39; becomes ' and &#34; becomes " (the latter used to be turned into a
#   single quote).
# If @encoding is set and non-empty the text is then passed through uconvert.
def parse(data)
  data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
  data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
    # Drop the leading "<" and trailing "/>" to get the bare tag name.
    clean = tag[1..-3].strip
    if Elements_No_End_Tag.include?(clean)
      tag
    else
      '<' + clean + '></' + clean + '>'
    end
  end

  data.gsub!(/&#39;/, "'")
  data.gsub!(/&#34;/, '"')
  if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
    data = uconvert(data, 'utf-8', @encoding)
  end
  sgml_feed(data) # see the alias in the class body
end
154
+
155
+
156
# Normalises and, for non-XML content, unescapes the five XML special
# characters in data. data is modified in place and also returned.
#
# Step 1 always runs: decimal/hex references for < > & " ' are rewritten to
# their named entities.
# Step 2 runs only when @contentparams has a 'type' that does not end in
# "xml": the named entities are replaced by the literal characters.
#
# element is accepted for interface compatibility and is not used here.
def decodeEntities(element, data)
  [
    ['&#60;', '&lt;'],   ['&#x3c;', '&lt;'],
    ['&#62;', '&gt;'],   ['&#x3e;', '&gt;'],
    ['&#38;', '&amp;'],  ['&#x26;', '&amp;'],
    ['&#34;', '&quot;'], ['&#x22;', '&quot;'],
    ['&#39;', '&apos;'], ['&#x27;', '&apos;']
  ].each { |reference, entity| data.gsub!(reference, entity) }

  if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
    data.gsub!('&lt;', '<')
    data.gsub!('&gt;', '>')
    data.gsub!('&quot;', '"')
    data.gsub!('&apos;', "'")
    # "&amp;" must be unescaped last; doing it earlier would turn an escaped
    # "&amp;quot;" into "&quot;" and then into a quote (decoded twice).
    data.gsub!('&amp;', '&')
  end
  return data
end
176
+ end
177
+ end
@@ -0,0 +1,207 @@
1
+ #!/usr/bin/ruby
2
+ gem 'hpricot', ">=0.5"
3
+ require 'hpricot'
4
+ # This used to be based on Michael Moen's Hpricot#scrub, but that seems to
5
+ # have only been part of its evolution. Hpricot#scrub is cool code, though.
6
+ # http://underpantsgnome.com/2007/01/20/hpricot-scrub
7
+ module Hpricot
8
+ Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
9
+ 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
10
+ 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
11
+ 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
12
+ 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
13
+ 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
14
+ 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
15
+ 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
16
+ 'ul', 'var'
17
+ ]
18
+
19
+ Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
20
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
21
+ 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
22
+ 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
23
+ 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
24
+ 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
25
+ 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
26
+ 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
27
+ 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
28
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
29
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
30
+ ]
31
+
32
+ Unacceptable_Elements_With_End_Tag = ['script', 'applet']
33
+
34
+ Acceptable_Css_Properties = ['azimuth', 'background-color',
35
+ 'border-bottom-color', 'border-collapse', 'border-color',
36
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
37
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
38
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
39
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
40
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
41
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
42
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
43
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
44
+ 'white-space', 'width'
45
+ ]
46
+
47
+ # survey of common keywords found in feeds
48
+ Acceptable_Css_Keywords = ['auto', 'aqua', 'black', 'block', 'blue',
49
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
50
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
51
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
52
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
53
+ 'transparent', 'underline', 'white', 'yellow'
54
+ ]
55
+
56
+ Mathml_Elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
57
+ 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
58
+ 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
59
+ 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
60
+ 'munderover', 'none'
61
+ ]
62
+
63
+ Mathml_Attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
64
+ 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
65
+ 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
66
+ 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
67
+ 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
68
+ 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
69
+ 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
70
+ 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
71
+ 'xlink:type', 'xmlns', 'xmlns:xlink'
72
+ ]
73
+
74
+ # svgtiny - foreignObject + linearGradient + radialGradient + stop
75
+ Svg_Elements = ['a', 'animate', 'animateColor', 'animateMotion',
76
+ 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
77
+ 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
78
+ 'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path',
79
+ 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
80
+ 'switch', 'text', 'title', 'use'
81
+ ]
82
+
83
+ # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
84
+ Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
85
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
86
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
87
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
88
+ 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
89
+ 'font-size', 'font-stretch', 'font-style', 'font-variant',
90
+ 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
91
+ 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
92
+ 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
93
+ 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
94
+ 'origin', 'overline-position', 'overline-thickness', 'panose-1',
95
+ 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
96
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
97
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
98
+ 'stop-color', 'stop-opacity', 'strikethrough-position',
99
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
100
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
101
+ 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
102
+ 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
103
+ 'underline-position', 'underline-thickness', 'unicode',
104
+ 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
105
+ 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
106
+ 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
107
+ 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
108
+ 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
109
+ ]
110
+
111
+ Svg_Attr_Map = nil
112
+ Svg_Elem_Map = nil
113
+
114
+ Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
115
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
116
+ 'stroke-opacity'
117
+ ]
118
+
119
# Outside "compatible" mode, build a per-element table of permitted
# attributes: every MathML element maps to Mathml_Attributes and every SVG
# element maps to Svg_Attributes. An element name present in both lists
# (e.g. 'none' is not, but any future overlap) ends up with the SVG list
# because the SVG pass runs last.
#
# The lists are read from the constants defined above in this module; the
# class variables formerly referenced here were never assigned.
unless $compatible
  @@acceptable_tag_specific_attributes = {}
  Mathml_Elements.each { |e| @@acceptable_tag_specific_attributes[e] = Mathml_Attributes }
  Svg_Elements.each { |e| @@acceptable_tag_specific_attributes[e] = Svg_Attributes }
end
124
+
125
+ class Elements
126
+ def strip_attributes(safe=[])
127
+ each { |x| x.strip_attributes(safe) }
128
+ end
129
+
130
+ def strip_style(ok_props=[], ok_keywords=[]) # NOTE unused so far.
131
+ each { |x| x.strip_style(ok_props, ok_keywords) }
132
+ end
133
+ end
134
+
135
+ class Text
136
+ def strip_attributes(foo)
137
+ end
138
+ end
139
+ class Comment
140
+ def strip_attributes(foo)
141
+ end
142
+ end
143
+ class BogusETag
144
+ def strip_attributes(foo)
145
+ end
146
+ end
147
+
148
class Elem
  # Removes every attribute of this element whose name is not whitelisted.
  #
  # safe - optional list of attribute names to keep. When it is nil or empty
  #        the module-level Acceptable_Attributes list is used instead.
  #
  # The parameter is optional so both call styles used in this file work:
  # a bare e.strip_attributes and Elements#strip_attributes, which passes
  # its safe list through.
  def strip_attributes(safe = nil)
    return if attributes.nil?
    allowed = (safe.nil? || safe.empty?) ? Acceptable_Attributes : safe
    # Collect the names first so attributes are not removed from the
    # collection while it is being iterated.
    attributes.map { |atr| atr[0] }.each do |attr_name|
      remove_attribute(attr_name) unless allowed.include?(attr_name)
    end
  end
end
159
+ end
160
+
161
+ module FeedParserUtilities
162
+ class SanitizerDoc < Hpricot::Doc
163
+
164
+ def scrub
165
+ traverse_all_element do |e|
166
+ if e.elem?
167
+ if Acceptable_Elements.include?e.name
168
+ e.strip_attributes
169
+ else
170
+ if Unacceptable_Elements_With_End_Tag.include?e.name
171
+ e.inner_html = ''
172
+ end
173
+ e.swap(SanitizerDoc.new(e.children).scrub.to_html) # The important part
174
+ end
175
+ elsif e.doctype?
176
+ e.parent.children.delete(e)
177
+ elsif e.text?
178
+ ets = e.to_s
179
+ ets.gsub!(/&#39;/, "'")
180
+ ets.gsub!(/&#34;/, '"')
181
+ ets.gsub!(/\r/,'')
182
+ e.swap(ets)
183
+ else
184
+ end
185
+ end
186
+
187
+ unless $compatible # FIXME nonworking
188
+ # yes, that '/' should be there. It's a search method. See the Hpricot docs.
189
+ (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
190
+ end
191
+ return self
192
+ end
193
+ end
194
+
195
+ def SanitizerDoc(html)
196
+ SanitizerDoc.new(Hpricot.make(html))
197
+ end
198
+ module_function(:SanitizerDoc)
199
+
200
# Returns a sanitized copy of html.
#
# "<!" sequences that do not begin a DOCTYPE, a comment or a marked section
# are escaped to "&lt;!" first. The DOCTYPE test is case-insensitive, the
# same as the equivalent substitution in LooseFeedParser#parse, so a
# lowercase "<!doctype" is left alone. The text is then wrapped with
# SanitizerDoc, scrubbed, serialised with to_html and stripped of
# surrounding whitespace.
#
# encoding is accepted for interface compatibility and is currently unused.
def sanitizeHTML(html, encoding)
  # FIXME Tidy not yet supported
  escaped = html.gsub(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
  SanitizerDoc(escaped).scrub.to_html.strip
end
207
+ end
@@ -0,0 +1,408 @@
1
+ #!/usr/bin/ruby
2
+ require 'time'
3
+
4
+ # This sucks, but I haven't figured out a better way of getting the namespaces right.
5
+ module FeedParserMixin
6
+ # ISO-8601 date parsing routines written by Fazal Majid.
7
+ # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
8
+ # parser is beyond the scope of feedparser and the current Time.iso8601
9
+ # method does not work.
10
+ # A single regular expression cannot parse ISO 8601 date formats into groups
11
+ # as the standard is highly irregular (for instance is 030104 2003-01-04 or
12
+ # 0301-04-01), so we use templates instead.
13
+ # Please note the order in templates is significant because we need a
14
+ # greedy match.
15
# Parses a variety of ISO-8601-compatible formats such as "20040105" or
# "2004-01-05T12:34:56+09:00" and returns a UTC Time.
#
# Returns nil when none of the date templates matches at the start of
# dateString. Fields missing from the input default as follows: year and
# month to the current UTC year/month; day to 1 when a century, year or
# month was given, otherwise to the current UTC day; time fields to 0.
# A two-digit year is placed in the current century. Invalid field values
# (e.g. an out-of-range ordinal) raise, which parse_date rescues.
def _parse_date_iso8601(dateString)
  # Each entry is [date pattern, names of its capture groups]. The order is
  # significant: the first template that matches wins.
  date_templates = [
    ['(\d{4})-?([01]\d)-?([0123]\d)', ['year', 'month', 'day']],
    ['(\d{4})-([01]\d)',              ['year', 'month']],
    ['(\d{4})-?([0123]\d\d)',         ['year', 'ordinal']],
    ['(\d\d)-?([01]\d)-?([0123]\d)',  ['year', 'month', 'day']],
    ['(\d\d)-?([0123]\d\d)',          ['year', 'ordinal']],
    ['(\d{4})',                       ['year']],
    ['-(\d\d)-?([01]\d)',             ['year', 'month']],
    ['-([0123]\d\d)',                 ['ordinal']],
    ['-(\d\d)',                       ['year']],
    ['--([01]\d)-?([0123]\d)',        ['month', 'day']],
    ['--([01]\d)',                    ['month']],
    ['---([0123]\d)',                 ['day']],
    ['(\d\d$)',                       ['century']],
    ['',                              []]
  ]
  # Optional time-of-day suffix. The outer group is non-capturing so that the
  # six captures line up one-to-one with time_fields.
  time_pattern = '(?:T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
  time_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']

  m = nil
  param_keys = []
  date_templates.each do |pattern, fields|
    $stderr << "Trying iso8601 regexp: #{pattern + time_pattern}\n" if $debug
    param_keys = fields + time_fields
    # Every template is anchored at the start of the string.
    m = dateString.match(Regexp.new('\A' + pattern + time_pattern))
    break if m
  end
  # The empty template always matches; an empty match means "not a date".
  return if m.nil? or m.end(0).zero?

  params = {}
  param_keys.each_with_index { |key, i| params[key] = m[i + 1] }

  now = Time.now.utc
  ordinal = params['ordinal'].nil? ? nil : params['ordinal'].to_i

  year = params['year']
  if year.nil? or year.empty?
    year = now.year
  elsif year.length == 2
    # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
    year = 100 * (now.year / 100) + year.to_i
  else
    year = year.to_i
  end

  month = params['month']
  if month.nil? or month.empty?
    # An ordinal (day of year) determines both month and day.
    month = ordinal ? DateTime.ordinal(year, ordinal).month : now.month
  else
    month = month.to_i
  end

  day = params['day']
  if day.nil? or day.empty?
    if ordinal
      day = DateTime.ordinal(year, ordinal).day
    elsif params['century'] or params['year'] or params['month']
      day = 1
    else
      day = now.day
    end
  else
    day = day.to_i
  end

  # special case of the century - is the first year of the 21st century
  # 2000 or 2001 ? The debate goes on...
  year = (params['century'].to_i - 1) * 100 + 1 if params.has_key? 'century'

  # in ISO 8601 most fields are optional; nil.to_i is 0
  hour = params['hour'].to_i
  minute = params['minute'].to_i
  second = params['second'].to_i

  stamp = Time.utc(year, month, day, hour, minute, second)

  # Apply the zone offset with Time arithmetic so that the adjustment may
  # cross day, month and year boundaries.
  tz = params['tz']
  if tz and not tz.empty? and tz != 'Z'
    offset = params['tzhour'].to_i * 3600 + params['tzmin'].to_i * 60
    # A negative zone is behind UTC, so UTC = local + offset, and vice versa.
    stamp = (tz[0, 1] == '-') ? stamp + offset : stamp - offset
  end
  return stamp
end
139
+
140
+ def _parse_date_onblog(dateString)
141
+ # Parse a string according to the OnBlog 8-bit date format
142
+ # 8-bit date handling routes written by ytrewq1
143
+ korean_year = u("년") # b3e2 in euc-kr
144
+ korean_month = u("월") # bff9 in euc-kr
145
+ korean_day = u("일") # c0cf in euc-kr
146
+
147
+
148
+ korean_onblog_date_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
149
+
150
+
151
+ m = korean_onblog_date_re.match(dateString)
152
+ return unless m
153
+ w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
154
+
155
+ $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
156
+ return _parse_date_w3dtf(w3dtfdate)
157
+ end
158
+
159
# Parse a string according to the Nate 8-bit date format, e.g.
# "2004-05-25 <AM|PM marker> 11:23:17", and delegate to _parse_date_w3dtf
# with a fixed +09:00 offset. Returns nil when dateString does not match.
# 8-bit date handling routes written by ytrewq1
#
# The 12-hour clock is converted properly: PM adds 12 only to hours below
# 12, and 12 AM becomes hour 0. Hour, minute and second are zero-padded to
# two digits because the pattern accepts shorter fields.
def _parse_date_nate(dateString)
  korean_am = u("오전") # bfc0 c0fc in euc-kr
  korean_pm = u("오후") # bfc0 c8c4 in euc-kr

  korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
  m = korean_nate_date_re.match(dateString)
  return unless m

  hour = m[5].to_i
  if m[4] == korean_pm
    hour += 12 if hour < 12
  elsif hour == 12
    hour = 0
  end

  hour, minute, second = [hour, m[6], m[7]].map { |part| part.to_s.rjust(2, '0') }
  w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{minute}:#{second}+09:00"
  $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
  return _parse_date_w3dtf(w3dtfdate)
end
178
+
179
+ def _parse_date_mssql(dateString)
180
+ mssql_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
181
+
182
+ m = mssql_date_re.match(dateString)
183
+ return unless m
184
+ w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
185
+ $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
186
+ return _parse_date_w3dtf(w3dtfdate)
187
+ end
188
+
189
# Parse a string according to a Greek 8-bit date format: an RFC 822 shaped
# date whose weekday and month abbreviations are Greek. The abbreviations
# are translated to English and the result is handed to _parse_date_rfc822.
#
# Returns nil when dateString does not have the expected shape or when the
# weekday or month abbreviation is not in the tables below.
def _parse_date_greek(dateString)
  # Unicode strings for Greek date strings
  greek_months = {
    u("Ιαν") => u("Jan"),  # c9e1ed in iso-8859-7
    u("Φεβ") => u("Feb"),  # d6e5e2 in iso-8859-7
    u("Μάώ") => u("Mar"),  # ccdcfe in iso-8859-7
    u("Μαώ") => u("Mar"),  # cce1fe in iso-8859-7
    u("Απρ") => u("Apr"),  # c1f0f1 in iso-8859-7
    u("Μάι") => u("May"),  # ccdce9 in iso-8859-7
    u("Μαϊ") => u("May"),  # cce1fa in iso-8859-7
    u("Μαι") => u("May"),  # cce1e9 in iso-8859-7
    u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
    u("Ιον") => u("Jun"),  # c9efed in iso-8859-7
    u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
    u("Ιολ") => u("Jul"),  # c9f9eb in iso-8859-7
    u("Αύγ") => u("Aug"),  # c1fde3 in iso-8859-7
    u("Αυγ") => u("Aug"),  # c1f5e3 in iso-8859-7
    u("Σεπ") => u("Sep"),  # d3e5f0 in iso-8859-7
    u("Οκτ") => u("Oct"),  # cfeaf4 in iso-8859-7
    u("Νοέ") => u("Nov"),  # cdefdd in iso-8859-7
    u("Νοε") => u("Nov"),  # cdefe5 in iso-8859-7
    u("Δεκ") => u("Dec"),  # c4e5ea in iso-8859-7
  }

  greek_wdays = {
    u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
    u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
    u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
    u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
    u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
    u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
    u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
  }

  greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/

  m = greek_date_format.match(dateString)
  return unless m

  # Hash#[] returns nil for an unknown key rather than raising, so the
  # lookups are checked explicitly.
  wday = greek_wdays[m[1]]
  month = greek_months[m[3]]
  return nil if wday.nil? or month.nil?

  rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
  $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
  return _parse_date_rfc822(rfc822date)
end
238
+
239
# Parse a string according to a Hungarian 8-bit date format, i.e.
# "YYYY-<Hungarian month name>-DThh:mm<+|-hh:mm>", and delegate to
# _parse_date_w3dtf. Day and hour are zero-padded to two digits and the
# seconds are fixed at 00.
#
# Returns nil when dateString does not have the expected shape or when the
# month name is not in the table below.
def _parse_date_hungarian(dateString)
  hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
  m = hungarian_date_format_re.match(dateString)
  return unless m

  # Unicode strings for Hungarian date strings. The historical spellings
  # "februári" and "máujus" are kept for compatibility; the standard
  # spellings "február" and "május" are accepted as well.
  hungarian_months = {
    u("január") => u("01"),    # e1 in iso-8859-2
    u("februári") => u("02"),  # e1 in iso-8859-2
    u("február") => u("02"),
    u("március") => u("03"),   # e1 in iso-8859-2
    u("április") => u("04"),   # e1 in iso-8859-2
    u("máujus") => u("05"),    # e1 in iso-8859-2
    u("május") => u("05"),
    u("június") => u("06"),    # fa in iso-8859-2
    u("július") => u("07"),    # fa in iso-8859-2
    u("augusztus") => u("08"),
    u("szeptember") => u("09"),
    u("október") => u("10"),   # f3 in iso-8859-2
    u("november") => u("11"),
    u("december") => u("12"),
  }

  # Hash#[] returns nil for an unknown key rather than raising, so the
  # lookup is checked explicitly.
  month = hungarian_months[m[2]]
  return if month.nil?
  day = m[3].rjust(2, '0')
  hour = m[4].rjust(2, '0')

  w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
  $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
  return _parse_date_w3dtf(w3dtfdate)
end
272
+
273
# Splits num into the part that fits inside modulus and the overflow.
# Returns a two-element array: [num % modulus, num / modulus].
def rollover(num, modulus)
  remainder = num % modulus
  carry = num / modulus
  [remainder, carry]
end
276
+
277
# Returns num / modulus, unless that quotient is zero, in which case num
# itself is returned unchanged.
def set_self(num, modulus)
  quotient = num / modulus
  quotient == 0 ? num : quotient
end
284
+ # W3DTF-style date parsing
285
+ # FIXME shouldn't it be "W3CDTF"?
286
# Parses a W3C date-time string (YYYY[-MM[-DD[Thh:mm:ss(Z|+hh:mm|-hh:mm)]]])
# and returns the corresponding UTC Time, or nil when dateString does not
# start with a four-digit year.
#
# Missing month/day default to 1, as do fields given as zero. Out-of-range
# values roll over instead of failing: 61 seconds carries into the minute,
# hour 25 into the next day, day 31 of a 30-day month into the next month,
# and a month above 12 into the following year.
def _parse_date_w3dtf(dateString)
  # Ruby's Time docs claim w3cdtf is an alias for iso8601 which is an alias
  # for xmlschema, but those parsers reject the out-of-range values that
  # this method deliberately tolerates.
  m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)
  return nil if m.nil?

  # Year, month and day: nil or zero becomes 1.
  year, month, day = m[1..3].map { |s| n = s.to_i; n.zero? ? 1 : n }
  # Hour, minute and second: nil becomes 0.
  hour, minute, second = m[4..6].map { |s| s.to_i }
  zone = m[7]

  # Fold a month above 12 into the year (the pattern allows up to 19).
  if month > 12
    year += (month - 1) / 12
    month = (month - 1) % 12 + 1
  end

  # Start from the first of the month and add everything else as seconds;
  # Time arithmetic then performs every rollover, including month lengths
  # and leap years.
  local = Time.utc(year, month, 1) +
          (day - 1) * 86_400 + hour * 3600 + minute * 60 + second

  # A zone of +hh:mm is ahead of UTC, so UTC = local - offset.
  offset = zone ? Time.zone_offset(zone) : 0
  return local - offset
end
333
+
334
+ def _parse_date_rfc822(dateString)
335
+ # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
336
+ # These first few lines are to fix up the stupid proprietary format from Disney
337
+ unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
338
+ 'CT' => 'CST', 'MT' => 'MST',
339
+ 'PT' => 'PST'
340
+ }
341
+
342
+ mon = dateString.split[2]
343
+ if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
344
+ dateString.sub!(mon,mon[0..2])
345
+ end
346
+ if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
347
+ dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
348
+ end
349
+ # Okay, the Disney date format should be fixed up now.
350
+ rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
351
+ if rfc.to_a.length > 1 and rfc.to_a.include? nil
352
+ dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
353
+ hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
354
+ tz ||= "GMT"
355
+ end
356
+ asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
357
+ if asctime_match.to_a.length > 1
358
+ # Month-abbr dayofmonth hour:minute:second year
359
+ dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
360
+ day.to_s.rjust(2,'0')
361
+ end
362
+ if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
363
+ ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
364
+ else
365
+ ds = dateString
366
+ end
367
+ t = Time.rfc2822(ds).utc
368
+ return t
369
+ end
370
+
371
+ def _parse_date_perforce(aDateString) # FIXME not in 4.1?
372
+ # Parse a date in yyyy/mm/dd hh:mm:ss TTT format
373
+ # Note that there is a day of the week at the beginning
374
+ # Ex. Fri, 2006/09/15 08:19:53 EDT
375
+ return Time.parse(aDateString).utc
376
+ end
377
+
378
# Converts a Time-like object into a nine-element array laid out as
# [year, month, mday, hour, min, sec, weekday, yday, isdst].
#
# weekday is shifted so that Monday is 0 and Sunday is 6 (Ruby's wday has
# Sunday as 0). isdst is reported as 1 or 0. The first eight fields are
# coerced with to_i.
#
# NOTE leave the error handling to parse_date: a nil argument raises here.
def extract_tuple(atime)
  fields = [atime.year, atime.month, atime.mday, atime.hour,
            atime.min, atime.sec, (atime.wday - 1) % 7, atime.yday]
  # map returns the converted array; the former map! on a slice converted a
  # temporary copy and threw it away.
  fields.map { |value| value.to_i } + [atime.isdst ? 1 : 0]
end
389
+
390
# Tries each handler named in @date_handlers, in order, on dateString and
# returns the extract_tuple result of the first one that succeeds. Returns
# nil when every handler fails.
#
# A handler "fails" by raising a StandardError, either itself or indirectly
# by returning nil, which makes extract_tuple raise. Only StandardError is
# rescued, so interrupts, exit requests and similar are not swallowed.
def parse_date(dateString)
  @date_handlers.each do |handler|
    begin
      $stderr << "Trying date_handler #{handler}\n" if $debug
      return extract_tuple(send(handler, dateString))
    rescue StandardError => e
      $stderr << "#{handler} raised #{e}\n" if $debug
    end
  end
  return nil
end
402
+ end
403
+
404
+ module FeedParserUtilities
405
# Converts a Python-style time tuple, whose first six elements are
# [year, month, day, hour, minute, second], into a UTC Time. Any further
# elements are ignored.
def py2rtime(pytuple)
  # Time.utc takes the fields as separate arguments, hence the splat.
  Time.utc(*pytuple[0..5])
end
408
+ end