jgre-rfeedparser 0.9.961

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,212 @@
1
+ #!/usr/bin/env ruby
2
+ # This used to be based on Michael Moen's Hpricot#scrub, but that seems to
3
+ # have only been part of its evolution. Hpricot#scrub is cool code, though.
4
+ # http://underpantsgnome.com/2007/01/20/hpricot-scrub
5
# Whitelists and attribute-stripping helpers used to sanitize HTML parsed by
# Hpricot. The element/attribute/CSS lists are plain Arrays of Strings that
# are consulted with +include?+.
module Hpricot
  # HTML elements that survive sanitizing.
  Acceptable_Elements = %w[
    a abbr acronym address area b
    big blockquote br button caption center cite
    code col colgroup dd del dfn dir div dl dt
    em fieldset font form h1 h2 h3 h4 h5 h6
    hr i img input ins kbd label legend li map
    menu ol optgroup option p pre q s samp
    select small span strike strong sub sup table
    tbody td textarea tfoot th thead tr tt u
    ul var
  ]

  # HTML attributes that survive sanitizing.
  Acceptable_Attributes = %w[
    abbr accept accept-charset accesskey
    action align alt axis border cellpadding
    cellspacing char charoff charset checked cite class
    clear cols colspan color compact coords datetime
    dir disabled enctype for frame headers height
    href hreflang hspace id ismap label lang
    longdesc maxlength media method multiple name
    nohref noshade nowrap prompt readonly rel rev
    rows rowspan rules scope selected shape size
    span src start summary tabindex target title
    type usemap valign value vspace width xml:lang
  ]

  # Elements that are dropped together with everything they contain.
  Unacceptable_Elements_With_End_Tag = %w[script applet]

  Acceptable_Css_Properties = %w[
    azimuth background-color
    border-bottom-color border-collapse border-color
    border-left-color border-right-color border-top-color clear
    color cursor direction display elevation float font
    font-family font-size font-style font-variant font-weight
    height letter-spacing line-height overflow pause
    pause-after pause-before pitch pitch-range richness
    speak speak-header speak-numeral speak-punctuation
    speech-rate stress text-align text-decoration text-indent
    unicode-bidi vertical-align voice-family volume
    white-space width
  ]

  # survey of common keywords found in feeds
  Acceptable_Css_Keywords = %w[
    auto aqua black block blue
    bold both bottom brown center collapse dashed
    dotted fuchsia gray green !important italic left
    lime maroon medium none navy normal nowrap olive
    pointer purple red right solid silver teal top
    transparent underline white yellow
  ]

  Mathml_Elements = %w[
    maction math merror mfrac mi
    mmultiscripts mn mo mover mpadded mphantom
    mprescripts mroot mrow mspace msqrt mstyle msub
    msubsup msup mtable mtd mtext mtr munder
    munderover none
  ]

  # NOTE: the repeated entries are kept exactly as they were; they are
  # harmless for membership tests.
  Mathml_Attributes = %w[
    actiontype align columnalign columnalign
    columnalign columnlines columnspacing columnspan depth
    display displaystyle equalcolumns equalrows fence
    fontstyle fontweight frame height linethickness lspace
    mathbackground mathcolor mathvariant mathvariant maxsize
    minsize other rowalign rowalign rowalign rowlines
    rowspacing rowspan rspace scriptlevel selection
    separator stretchy width width xlink:href xlink:show
    xlink:type xmlns xmlns:xlink
  ]

  # svgtiny - foreignObject + linearGradient + radialGradient + stop
  Svg_Elements = %w[
    a animate animateColor animateMotion
    animateTransform circle defs desc ellipse font-face
    font-face-name font-face-src g glyph hkern image
    linearGradient line metadata missing-glyph mpath path
    polygon polyline radialGradient rect set stop svg
    switch text title use
  ]

  # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
  Svg_Attributes = %w[
    accent-height accumulate additive alphabetic
    arabic-form ascent attributeName attributeType
    baseProfile bbox begin by calcMode cap-height
    class color color-rendering content cx cy d
    descent display dur end fill fill-rule font-family
    font-size font-stretch font-style font-variant
    font-weight from fx fy g1 g2 glyph-name
    gradientUnits hanging height horiz-adv-x horiz-origin-x
    id ideographic k keyPoints keySplines keyTimes
    lang mathematical max min name offset opacity
    origin overline-position overline-thickness panose-1
    path pathLength points preserveAspectRatio r
    repeatCount repeatDur requiredExtensions requiredFeatures
    restart rotate rx ry slope stemh stemv
    stop-color stop-opacity strikethrough-position
    strikethrough-thickness stroke stroke-dasharray
    stroke-dashoffset stroke-linecap stroke-linejoin
    stroke-miterlimit stroke-width systemLanguage target
    text-anchor to transform type u1 u2
    underline-position underline-thickness unicode
    unicode-range units-per-em values version viewBox
    visibility width widths x x-height x1 x2
    xlink:actuate xlink:arcrole xlink:href xlink:role
    xlink:show xlink:title xlink:type xml:base xml:lang
    xml:space xmlns xmlns:xlink y y1 y2 zoomAndPan
  ]

  Svg_Attr_Map = nil
  Svg_Elem_Map = nil

  Acceptable_Svg_Properties = %w[
    fill fill-opacity fill-rule
    stroke stroke-width stroke-linecap stroke-linejoin
    stroke-opacity
  ]

  # Maps every MathML/SVG element name to the attribute list allowed on it.
  # Only built when the global $compatible flag is not set.
  # The lists live in the constants above; the previous code read class
  # variables (@@mathml_elements etc.) that were never assigned, which raised
  # NameError as soon as this file was loaded with $compatible unset.
  unless $compatible
    @@acceptable_tag_specific_attributes = {}
    Mathml_Elements.each { |e| @@acceptable_tag_specific_attributes[e] = Mathml_Attributes }
    Svg_Elements.each { |e| @@acceptable_tag_specific_attributes[e] = Svg_Attributes }
  end

  class Elements
    # Calls strip_attributes(safe) on every member. With the default empty
    # list, no attribute is considered safe.
    def strip_attributes(safe=[])
      each { |x| x.strip_attributes(safe) }
    end

    # Calls strip_style on every member.
    # NOTE unused so far; no strip_style is defined on the node classes in
    # this file.
    def strip_style(ok_props=[], ok_keywords=[])
      each { |x| x.strip_style(ok_props, ok_keywords) }
    end
  end

  # Text, comment and bogus end-tag nodes carry no attributes, so stripping
  # is a no-op. The argument is optional so these nodes can be called the
  # same way as Elem#strip_attributes.
  class Text
    def strip_attributes(safe = nil)
    end
  end

  class Comment
    def strip_attributes(safe = nil)
    end
  end

  class BogusETag
    def strip_attributes(safe = nil)
    end
  end

  class Elem
    # Replaces raw_attributes with only those entries whose name is listed
    # in +safe+. Called without an argument it filters against
    # Acceptable_Attributes, as before. Accepting the list also makes
    # Elements#strip_attributes(safe) work, which previously raised
    # ArgumentError because this method took no parameters.
    # Does nothing when +attributes+ is nil.
    def strip_attributes(safe = Acceptable_Attributes)
      return if attributes.nil?
      self.raw_attributes = raw_attributes.reject { |name, _value| !safe.include?(name) }
    end
  end
end
156
+
157
+ module FeedParserUtilities
158
# An Hpricot document that can reduce itself to whitelisted markup.
class SanitizerDoc < Hpricot::Doc

  # Returns the document's children as one sanitized HTML String.
  #
  # * Elements named in Hpricot::Acceptable_Elements are kept; their
  #   attributes are filtered with strip_attributes and their content is
  #   scrubbed recursively.
  # * Elements named in Hpricot::Unacceptable_Elements_With_End_Tag are
  #   dropped together with their content.
  # * Any other element is replaced by its scrubbed content.
  # * Doctype nodes are dropped; text nodes are kept (see below).
  # * Node kinds not covered above yield nil and are removed by +compact+.
  #
  # The whitelist constants are referenced through Hpricot:: because they
  # are defined in the Hpricot module, which is neither a lexical parent
  # nor an ancestor of this class.
  def scrub
    others = children.map do |e|
      if e.elem?
        if Hpricot::Acceptable_Elements.include?(e.name)
          e.strip_attributes
          e.inner_html = SanitizerDoc.new(e.children).scrub
          e
        elsif Hpricot::Unacceptable_Elements_With_End_Tag.include?(e.name)
          nil
        else
          SanitizerDoc.new(e.children).scrub # The important part
        end
      elsif e.doctype?
        nil
      elsif e.text?
        # Undo the numeric escaping of quotes and drop carriage returns.
        ets = e.to_html
        ets.gsub!(/&#39;/, "'")
        ets.gsub!(/&#34;/, '"')
        ets.gsub!(/\r/,'')
        ets
      end
    end

    unless $compatible # FIXME nonworking
      # NOTE(review): nothing visible in this file defines `tag` or assigns
      # @config, so this branch presumably raises whenever $compatible is
      # unset. Left untouched until the intended behaviour is confirmed.
      # yes, that '/' should be there. It's a search method. See the Hpricot docs.
      (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
    end
    return others.compact.join
  end
end
199
+
200
# Convenience constructor: runs +html+ through Hpricot.make and wraps the
# result in a SanitizerDoc. Exposed as a module function.
def SanitizerDoc(html)
  parsed = Hpricot.make(html)
  SanitizerDoc.new(parsed)
end
module_function :SanitizerDoc
204
+
205
# Escapes every "<!" that does not begin a DOCTYPE, a comment ("<!--") or a
# "<![" section by rewriting it to "&lt;!", and returns the new String.
# +encoding+ is accepted but not used here.
# The Hpricot-based scrubbing below is still disabled.
def sanitizeHTML(html,encoding)
  # FIXME Tidy not yet supported
  stray_declaration = /<!((?!DOCTYPE|--|\[))/
  html = html.gsub(stray_declaration, '&lt;!\1')
  #h = SanitizerDoc(html)
  #h = h.scrub
  #return h.strip
end
212
+ end
@@ -0,0 +1,408 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require 'time'
5
+
6
+ module FeedParser
7
+
8
+ class FeedTimeParser
9
# Names of the parsing methods tried by parse_date, in this order.
@@date_handlers = [
  :parse_date_rfc822,
  :parse_date_hungarian,
  :parse_date_greek,
  :parse_date_mssql,
  :parse_date_nate,
  :parse_date_onblog,
  :parse_date_w3dtf,
  :parse_date_iso8601
]
13
+ class << self
14
+ # ISO-8601 date parsing routines written by Fazal Majid.
15
+ # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
16
+ # parser is beyond the scope of feedparser and the current Time.iso8601
17
+ # method does not work.
18
+ # A single regular expression cannot parse ISO 8601 date formats into groups
19
+ # as the standard is highly irregular (for instance is 030104 2003-01-04 or
20
+ # 0301-04-01), so we use templates instead.
21
+ # Please note the order in templates is significant because we need a
22
+ # greedy match.
23
# Parse a variety of ISO-8601-compatible formats like 20040105 or
# 2004-01-05T10:20:30+02:00.
#
# dateString - the String to parse.
#
# Returns a UTC Time, or nil when nothing at the start of the string looks
# like an ISO 8601 date or time. Raises ArgumentError when the matched
# fields do not form a valid date (for example ordinal day 000).
#
# Missing fields are filled in as follows: a missing year or month falls
# back to the current UTC year/month (or is derived from the ordinal day),
# and a missing day becomes 1 when a year, month or century was given and
# today's day otherwise.
#
# Fixes relative to the previous version:
# * the optional time suffix had an extra capturing group, so hour, minute,
#   second and the timezone fields were all shifted by one position;
# * the timezone offset was applied to the day and month slots instead of
#   hours and minutes; it is now applied as seconds to the finished Time,
#   which also handles crossing midnight;
# * the first template required a dash before the day, so "20040105" was
#   read as ordinal day 010;
# * every template is anchored at the start of the string, so free text
#   that merely contains or ends with digits is no longer accepted.
def parse_date_iso8601(dateString)
  # Order is significant: the first template that matches wins.
  templates = [
    ['(\d{4})-?([01]\d)-?([0123]\d)', %w[year month day]],
    ['(\d{4})-([01]\d)',              %w[year month]],
    ['(\d{4})-?([0123]\d\d)',         %w[year ordinal]],
    ['(\d\d)-?([01]\d)-?([0123]\d)',  %w[year month day]],
    ['(\d\d)-?([0123]\d\d)',          %w[year ordinal]],
    ['(\d{4})',                       %w[year]],
    ['-(\d\d)-?([01]\d)',             %w[year month]],
    ['-([0123]\d\d)',                 %w[ordinal]],
    ['-(\d\d)',                       %w[year]],
    ['--([01]\d)-?([0123]\d)',        %w[month day]],
    ['--([01]\d)',                    %w[month]],
    ['---([0123]\d)',                 %w[day]],
    ['(\d\d$)',                       %w[century]],
    ['',                              []]
  ]
  # The outer group is non-capturing so the captures line up with time_fields.
  time_suffix = '(?:T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
  time_fields = %w[hour minute second tz tzhour tzmin]

  m = nil
  param_keys = []
  templates.each do |pattern, fields|
    source = '\A' + pattern + time_suffix
    $stderr << "Trying iso8601 regexp: #{source}\n" if $debug
    m = dateString.match(Regexp.new(source))
    next if m.nil?
    param_keys = fields + time_fields
    break
  end
  # The empty template always matches; an empty match means "no date here".
  return nil if m.nil? || m[0].empty?

  params = Hash[param_keys.zip(m.captures)]
  now = Time.now.utc

  ordinal = params['ordinal'].to_i unless params['ordinal'].nil?

  year = params['year']
  if year.nil? || year.empty?
    year = now.year
  elsif year.length == 2
    # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
    year = 100 * (now.year / 100) + year.to_i
  else
    year = year.to_i
  end

  month = params['month']
  if month.nil? || month.empty?
    # An ordinal (day-of-year) date determines the month.
    month = ordinal ? Date.ordinal(year, ordinal).month : now.month
  end
  month = month.to_i

  day = params['day']
  if day.nil? || day.empty?
    if ordinal
      day = Date.ordinal(year, ordinal).day
    elsif params['century'] || params['year'] || params['month']
      day = 1
    else
      day = now.day
    end
  else
    day = day.to_i
  end

  # special case of the century - is the first year of the 21st century
  # 2000 or 2001 ? The debate goes on...
  if params.has_key? 'century'
    year = (params['century'].to_i - 1) * 100 + 1
  end

  # in ISO 8601 most fields are optional; nil.to_i is 0
  hour = params['hour'].to_i
  minute = params['minute'].to_i
  second = params['second'].to_i

  local = Time.utc(year, month, day, hour, minute, second)
  tz = params['tz']
  return local if tz.nil? || tz.empty? || tz == 'Z'

  offset = params['tzhour'].to_i * 3600 + params['tzmin'].to_i * 60
  case tz[0, 1]
  when '-' then local + offset # zone is behind UTC
  when '+' then local - offset # zone is ahead of UTC
  end
end
145
+
146
# Parse a string according to the OnBlog 8-bit date format
# (year/month/day each followed by its Korean marker, then hh:mm:ss).
# The fields are re-assembled as a W3DTF string with a +09:00 offset and
# handed to parse_date_w3dtf, whose result is returned.
# A string that does not match raises NoMethodError (nil match); parse_date
# relies on that to move on to the next handler.
# 8-bit date handling routes written by ytrewq1
def parse_date_onblog(dateString)
  # u() is defined elsewhere; presumably yields the Unicode form - confirm.
  year_marker = u("년") # b3e2 in euc-kr
  month_marker = u("월") # bff9 in euc-kr
  day_marker = u("일") # c0cf in euc-kr

  pattern = /(\d{4})#{year_marker}\s+(\d{2})#{month_marker}\s+(\d{2})#{day_marker}\s+(\d{2}):(\d{2}):(\d{2})/
  found = pattern.match(dateString)

  ymd = found[1..3].join('-')
  hms = found[4..6].join(':')
  w3dtfdate = "#{ymd}T#{hms}+09:00"

  $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
  parse_date_w3dtf(w3dtfdate)
end
164
+
165
# Parse a string according to the Nate 8-bit date format
# ("YYYY-MM-DD <AM|PM marker> h:m:s" with Korean AM/PM markers).
# The fields are re-assembled as a W3DTF string with a +09:00 offset and
# handed to parse_date_w3dtf, whose result is returned.
#
# Raises ArgumentError when the string is not in this format, so that
# parse_date moves on to the next handler (it previously failed with
# NoMethodError on the nil match).
#
# Fixes relative to the previous version:
# * 12-hour clock conversion: 12 PM used to become hour 24 and 12 AM stayed
#   12; the hour is now reduced modulo 12 before the PM offset is added;
# * hour, minute and second are zero-padded to two digits, since the
#   pattern accepts shorter fields.
# 8-bit date handling routes written by ytrewq1
def parse_date_nate(dateString)
  # u() is defined elsewhere; presumably yields the Unicode form - confirm.
  korean_am = u("오전") # bfc0 c0fc in euc-kr
  korean_pm = u("오후") # bfc0 c8c4 in euc-kr

  korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
  m = korean_nate_date_re.match(dateString)
  raise ArgumentError, "not a Nate date: #{dateString.inspect}" if m.nil?

  hour = m[5].to_i % 12
  hour += 12 if m[4] == korean_pm
  time = [hour, m[6], m[7]].map { |part| part.to_s.rjust(2, '0') }.join(':')

  w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{time}+09:00"
  $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
  return parse_date_w3dtf(w3dtfdate)
end
184
+
185
# Parse a "YYYY-MM-DD hh:mm:ss[.fraction]" timestamp (MS SQL style).
# The fraction is discarded; the fields are re-assembled as a W3DTF string
# with a fixed +09:00 offset and handed to parse_date_w3dtf, whose result
# is returned.
# A string that does not match raises NoMethodError (nil match); parse_date
# relies on that to move on to the next handler.
def parse_date_mssql(dateString)
  pattern = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
  found = pattern.match(dateString)

  ymd = found[1..3].join('-')
  hms = found[4..6].join(':')
  w3dtfdate = "#{ymd}T#{hms}+09:00"

  $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
  parse_date_w3dtf(w3dtfdate)
end
194
+
195
# Parse a string according to a Greek 8-bit date format:
# "<weekday>, DD <month> YYYY hh:mm:ss <zone>" with Greek abbreviations.
# Weekday and month are translated to their English abbreviations and the
# resulting RFC 822 string is handed to parse_date_rfc822, whose result is
# returned. Unknown abbreviations translate to nothing (empty text in the
# rebuilt string). A string that does not match the overall shape raises
# NoMethodError (nil match); parse_date relies on that to move on.
def parse_date_greek(dateString)
  # Unicode strings for Greek date strings
  # u() is defined elsewhere; presumably yields the Unicode form - confirm.
  month_names = {
    u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
    u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
    u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
    u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
    u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
    u("Μάι") => u("May"), # ccdce9 in iso-8859-7
    u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
    u("Μαι") => u("May"), # cce1e9 in iso-8859-7
    u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
    u("Ιον") => u("Jun"), # c9efed in iso-8859-7
    u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
    u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
    u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
    u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
    u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
    u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
    u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
    u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
    u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
  }

  weekday_names = {
    u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
    u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
    u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
    u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
    u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
    u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
    u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
  }

  pattern = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
  found = pattern.match(dateString)

  english_wday = weekday_names[found[1]]
  english_month = month_names[found[3]]
  clock = found[5..7].join(':')

  rfc822date = "#{english_wday}, #{found[2]} #{english_month} #{found[4]} #{clock} #{found[8]}"
  $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
  parse_date_rfc822(rfc822date)
end
241
+
242
# Parse a string according to a Hungarian 8-bit date format:
# "YYYY-<month name>-DDThh:mm(+|-)hh:mm" with the month spelled out in
# Hungarian. The month is translated to its number, day/hour/zone hour are
# zero-padded, ":00" seconds are added, and the resulting W3DTF string is
# handed to parse_date_w3dtf, whose result is returned.
#
# Raises ArgumentError when the string does not have this shape or the
# month is not one of the listed Hungarian names, so that parse_date moves
# on to the next handler.
#
# Fix relative to the previous version: an unknown month used to be
# interpolated as an empty string ("2004--25T..."), which parse_date_w3dtf
# then read as a bare year. Because the pattern also matches plain W3DTF
# dates without seconds (e.g. "2004-05-25T10:20+02:00"), such dates came
# back as 1 January of that year.
def parse_date_hungarian(dateString)
  hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
  m = hungarian_date_format_re.match(dateString)
  raise ArgumentError, "not a Hungarian date: #{dateString.inspect}" if m.nil?

  # Unicode strings for Hungarian date strings
  # u() is defined elsewhere; presumably yields the Unicode form - confirm.
  hungarian_months = {
    u("január") => u("01"), # e1 in iso-8859-2
    u("februári") => u("02"), # e1 in iso-8859-2
    u("március") => u("03"), # e1 in iso-8859-2
    u("április") => u("04"), # e1 in iso-8859-2
    u("máujus") => u("05"), # e1 in iso-8859-2
    u("június") => u("06"), # fa in iso-8859-2
    u("július") => u("07"), # fa in iso-8859-2
    u("augusztus") => u("08"),
    u("szeptember") => u("09"),
    u("október") => u("10"), # f3 in iso-8859-2
    u("november") => u("11"),
    u("december") => u("12"),
  }
  month = hungarian_months[m[2]]
  raise ArgumentError, "unknown Hungarian month: #{m[2].inspect}" if month.nil?

  day = m[3].rjust(2,'0')
  hour = m[4].rjust(2,'0')
  # m[7] is the sign, m[8] is "h:mm" or "hh:mm"; pad to "hh:mm".
  zone = m[7] + m[8].rjust(5, '0')

  w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{zone}"
  $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
  return parse_date_w3dtf(w3dtfdate)
end
270
+
271
# Splits +num+ into what remains within one cycle of +modulus+ and how many
# whole cycles overflow.
# Returns [num % modulus, num / modulus], e.g. rollover(61, 60) => [1, 1].
def rollover(num, modulus)
  remainder = num % modulus
  carry = num / modulus
  [remainder, carry]
end
274
+
275
# Returns num / modulus, unless that quotient equals 0, in which case +num+
# itself is returned.
# e.g. set_self(13, 12) => 1, set_self(5, 12) => 5.
def set_self(num, modulus)
  quotient = num / modulus
  quotient == 0 ? num : quotient
end
282
+
283
+ # W3DTF-style date parsing
284
# Parse a W3DTF-style date: "YYYY[-MM[-DD[Thh:mm:ss[.frac](Z|+hh:mm|-hh:mm)]]]"
# (the dashes are optional). Returns a UTC Time.
#
# Out-of-range seconds, minutes and hours (61 seconds, 25 hours, ...) are
# carried over into the next larger unit, and an overflowing day or month
# rolls into the following month or year.
# A missing or zero year, month or day is treated as 1.
# A string that does not start with four digits raises NoMethodError (nil
# match); parse_date relies on that to move on to the next handler.
#
# Ruby's Time docs claim w3cdtf is an alias for iso8601 which is an alias
# for xmlschema, but those do not handle the rollover cases above.
def parse_date_w3dtf(dateString)
  m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)(?:\.\d+)?([+-]\d\d:\d\d|Z))?)?)?)?/)

  # Year, month and day: nil or zero becomes 1.
  year, month, day = m[1..3].map do |part|
    number = part.to_i
    number == 0 ? 1 : number
  end
  # Hour, minute and second: nil becomes 0.
  hour, minute, second = m[4..6].map { |part| part.to_i }
  # The timezone stays a String (or nil when absent).
  zone = m[-1]

  # Rollover times. 0 minutes and 61 seconds -> 1 minute and 1 second
  carry, second = second.divmod(60)
  minute += carry
  carry, minute = minute.divmod(60)
  hour += carry
  carry, hour = hour.divmod(24)
  day += carry

  if month > 12
    extra_years, month = month.divmod(12)
    month = 12 if month == 0
    year += extra_years
  end

  # Time.days_in_month is not part of the standard library; it has to be
  # provided elsewhere (presumably ActiveSupport - confirm).
  month_length = Time.days_in_month(month, year)
  while day > month_length
    day -= month_length
    month += 1
    if month > 12
      year += 1
      month = set_self(month, 12)
    end
    month_length = Time.days_in_month(month, year)
  end

  # Invert the sign of the offset: the fields above are local to the given
  # zone, so getting back to UTC means moving the opposite way.
  if zone
    case zone
    when /^-/ then zone[0] = '+'
    when /^\+/ then zone[0] = '-'
    end
  end

  Time.utc(year, month, day, hour, minute, second) + Time.zone_offset(zone || "UTC")
end
331
+
332
+ def parse_date_rfc822(dateString)
333
+ # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
334
+ # These first few lines are to fix up the stupid proprietary format from Disney
335
+ unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
336
+ 'CT' => 'CST', 'MT' => 'MST',
337
+ 'PT' => 'PST'
338
+ }
339
+
340
+ mon = dateString.split[2]
341
+ if mon.length > 3 && Time::RFC2822_MONTH_NAME.include?(mon[0..2])
342
+ dateString.sub!(mon,mon[0..2])
343
+ end
344
+ if dateString[-3..-1] != "GMT" && unknown_timezones[dateString[-2..-1]]
345
+ dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
346
+ end
347
+
348
+ # Okay, the Disney date format should be fixed up now.
349
+ rfc_tz = '([A-Za-z]{3}|[\+\-]?\d\d\d\d)'
350
+ rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? #{rfc_tz})?/)
351
+
352
+ if rfc.to_a.length > 1 && rfc.to_a.include?(nil)
353
+ dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
354
+ hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
355
+ tz ||= "GMT"
356
+ end
357
+
358
+ asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
359
+ if asctime_match.to_a.length > 1
360
+ # Month-abbr dayofmonth hour:minute:second year
361
+ dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
362
+ day.to_s.rjust(2,'0')
363
+ end
364
+
365
+ if (rfc.to_a.length > 1 && rfc.to_a.include?(nil)) || asctime_match.to_a.length > 1
366
+ ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
367
+ else
368
+ ds = dateString
369
+ end
370
+ t = Time.rfc2822(ds).utc
371
+ return t
372
+ end
373
+
374
# Parse a date in yyyy/mm/dd hh:mm:ss TTT format and return it as UTC.
# Note that there is a day of the week at the beginning
# Ex. Fri, 2006/09/15 08:19:53 EDT
# Parsing is delegated entirely to Time.parse.
# FIXME not in 4.1?
def parse_date_perforce(aDateString)
  parsed = Time.parse(aDateString)
  parsed.utc
end
380
+
381
# Flattens a time object into a 9-element Array:
#   [year, month, mday, hour, min, sec, weekday, yday, isdst]
# where weekday is (wday - 1) % 7, i.e. Monday is 0 and Sunday is 6, and
# isdst is 1 or 0.
# Returns nil when +atime+ is nil or false.
# NOTE leave the error handling to parse_date
#
# The previous version also ran map! over t[0..-2]; that operated on a
# copy of the slice and had no effect, so it has been removed.
def extract_tuple(atime)
  return unless atime
  [atime.year, atime.month, atime.mday, atime.hour,
   atime.min, atime.sec, (atime.wday - 1) % 7, atime.yday,
   atime.isdst ? 1 : 0]
end
393
+
394
# Tries every method named in @@date_handlers, in order, on +dateString+
# and returns the first non-nil result. A handler that raises is skipped
# (the error is written to $stderr when $debug is set).
# Returns nil when no handler produces a result.
#
# A handler that returned nil used to end the search immediately; it is
# now treated like a failed handler so the remaining ones still get a
# chance.
def parse_date(dateString)
  @@date_handlers.each do |handler|
    begin
      $stderr << "Trying date_handler #{handler}\n" if $debug
      datething = send(handler,dateString)
      return datething if datething
    rescue => e
      $stderr << "#{handler} raised #{e}\n" if $debug
    end
  end
  return nil
end
406
+ end
407
+ end
408
+ end