rfeedparser 0.9.931 → 0.9.940

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. data/lib/rfeedparser.rb +143 -58
  2. data/lib/rfeedparser/aliases.rb +1 -1
  3. data/lib/rfeedparser/better_attributelist.rb +11 -11
  4. data/lib/rfeedparser/better_sgmlparser.rb +1 -1
  5. data/lib/rfeedparser/encoding_helpers.rb +120 -127
  6. data/lib/rfeedparser/feedparserdict.rb +30 -20
  7. data/lib/rfeedparser/forgiving_uri.rb +9 -7
  8. data/lib/rfeedparser/markup_helpers.rb +11 -14
  9. data/lib/rfeedparser/parser_mixin.rb +16 -11
  10. data/lib/rfeedparser/parsers.rb +1 -2
  11. data/lib/rfeedparser/scrub.rb +95 -90
  12. data/lib/rfeedparser/time_helpers.rb +379 -379
  13. data/lib/rfeedparser/utilities.rb +23 -0
  14. data/tests/rfeedparser_test_helper.rb +262 -0
  15. data/tests/rfeedparserserver.rb +3 -109
  16. data/tests/rfeedparsertest.rb +6 -165
  17. data/tests/rfponly/http/200.xml +30 -0
  18. data/tests/rfponly/http/220.xml +28 -0
  19. data/tests/rfponly/http/300.xml +8 -0
  20. data/tests/rfponly/http/300.xml_redirect +25 -0
  21. data/tests/rfponly/http/301.xml +8 -0
  22. data/tests/rfponly/http/301.xml_redirect +25 -0
  23. data/tests/rfponly/http/302.xml +8 -0
  24. data/tests/rfponly/http/302.xml_redirect +25 -0
  25. data/tests/rfponly/http/307.xml +8 -0
  26. data/tests/rfponly/http/307.xml_redirect +25 -0
  27. data/tests/rfponly/http/320.xml +8 -0
  28. data/tests/rfponly/http/320.xml_redirect +25 -0
  29. data/tests/rfponly/http/400.xml +7 -0
  30. data/tests/rfponly/http/404.xml +7 -0
  31. data/tests/rfponly/http/410.xml +7 -0
  32. data/tests/rfponly/http/420.xml +7 -0
  33. data/tests/rfponly/http/500.xml +7 -0
  34. data/tests/rfponly/http/520.xml +7 -0
  35. data/tests/rfponly/http/etag.xml +28 -0
  36. data/tests/rfponly/http/lastmodified.xml +29 -0
  37. data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
  38. data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
  39. data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
  40. metadata +31 -3
@@ -1,406 +1,406 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
  require 'time'
3
3
 
4
- # This sucks, but I haven't figured out a better way of getting the namespaces right.
5
- module FeedParserMixin
6
- # ISO-8601 date parsing routines written by Fazal Majid.
7
- # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
8
- # parser is beyond the scope of feedparser and the current Time.iso8601
9
- # method does not work.
10
- # A single regular expression cannot parse ISO 8601 date formats into groups
11
- # as the standard is highly irregular (for instance is 030104 2003-01-04 or
12
- # 0301-04-01), so we use templates instead.
13
- # Please note the order in templates is significant because we need a
14
- # greedy match.
15
- def _parse_date_iso8601(dateString)
16
- # Parse a variety of ISO-8601-compatible formats like 20040105
17
-
18
- # What I'm about to show you may be the ugliest code in all of
19
- # rfeedparser.
20
- # FIXME The century regexp maybe not work ('\d\d$' says "two numbers at
21
- # end of line" but we then attach more of a regexp.
22
- iso8601_regexps = [ '^(\d{4})-?([01]\d)-([0123]\d)',
23
- '^(\d{4})-([01]\d)',
24
- '^(\d{4})-?([0123]\d\d)',
25
- '^(\d\d)-?([01]\d)-?([0123]\d)',
26
- '^(\d\d)-?([0123]\d\d)',
27
- '^(\d{4})',
28
- '-(\d\d)-?([01]\d)',
29
- '-([0123]\d\d)',
30
- '-(\d\d)',
31
- '--([01]\d)-?([0123]\d)',
32
- '--([01]\d)',
33
- '---([0123]\d)',
34
- '(\d\d$)',
35
- '' ]
36
- iso8601_values = { '^(\d{4})-?([01]\d)-([0123]\d)' => ['year', 'month', 'day'],
37
- '^(\d{4})-([01]\d)' => ['year','month'],
38
- '^(\d{4})-?([0123]\d\d)' => ['year', 'ordinal'],
39
- '^(\d\d)-?([01]\d)-?([0123]\d)' => ['year','month','day'],
40
- '^(\d\d)-?([0123]\d\d)' => ['year','ordinal'],
41
- '^(\d{4})' => ['year'],
42
- '-(\d\d)-?([01]\d)' => ['year','month'],
43
- '-([0123]\d\d)' => ['ordinal'],
44
- '-(\d\d)' => ['year'],
45
- '--([01]\d)-?([0123]\d)' => ['month','day'],
46
- '--([01]\d)' => ['month'],
47
- '---([0123]\d)' => ['day'],
48
- '(\d\d$)' => ['century'],
49
- '' => [] }
50
- add_to_all = '(T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
51
- add_to_all_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']
52
- # NOTE We use '(?:' to prevent grouping of optional matches (ones trailed
53
- # by '?'). The second ':' *are* matched.
54
- m = nil
55
- param_keys = []
56
- iso8601_regexps.each do |s|
57
- $stderr << "Trying iso8601 regexp: #{s+add_to_all}\n" if $debug
58
- param_keys = iso8601_values[s] + add_to_all_fields
59
- m = dateString.match(Regexp.new(s+add_to_all))
60
- break if m
61
- end
62
- return if m.nil? or (m.begin(0).zero? and m.end(0).zero?)
63
-
64
- param_values = m.to_a
65
- param_values = param_values[1..-1]
66
- params = {}
67
- param_keys.each_with_index do |key,i|
68
- params[key] = param_values[i]
69
- end
4
+ module FeedParser
70
5
 
71
- ordinal = params['ordinal'].to_i unless params['ordinal'].nil?
72
- year = params['year'] || '--'
73
- if year.nil? or year.empty? or year == '--' # FIXME When could the regexp ever return a year equal to '--'?
74
- year = Time.now.utc.year
75
- elsif year.length == 2
76
- # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
77
- year = 100 * (Time.now.utc.year / 100) + year.to_i
78
- else
79
- year = year.to_i
80
- end
6
+ class FeedTimeParser
7
+ @@date_handlers = [:parse_date_rfc822,
8
+ :parse_date_hungarian, :parse_date_greek,:parse_date_mssql,
9
+ :parse_date_nate,:parse_date_onblog,:parse_date_w3dtf,:parse_date_iso8601
10
+ ]
11
+ class << self
12
+ # ISO-8601 date parsing routines written by Fazal Majid.
13
+ # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
14
+ # parser is beyond the scope of feedparser and the current Time.iso8601
15
+ # method does not work.
16
+ # A single regular expression cannot parse ISO 8601 date formats into groups
17
+ # as the standard is highly irregular (for instance is 030104 2003-01-04 or
18
+ # 0301-04-01), so we use templates instead.
19
+ # Please note the order in templates is significant because we need a
20
+ # greedy match.
21
+ def parse_date_iso8601(dateString)
22
+ # Parse a variety of ISO-8601-compatible formats like 20040105
23
+
24
+ # What I'm about to show you may be the ugliest code in all of
25
+ # rfeedparser.
26
+ # FIXME The century regexp maybe not work ('\d\d$' says "two numbers at
27
+ # end of line" but we then attach more of a regexp.
28
+ iso8601_regexps = [ '^(\d{4})-?([01]\d)-([0123]\d)',
29
+ '^(\d{4})-([01]\d)',
30
+ '^(\d{4})-?([0123]\d\d)',
31
+ '^(\d\d)-?([01]\d)-?([0123]\d)',
32
+ '^(\d\d)-?([0123]\d\d)',
33
+ '^(\d{4})',
34
+ '-(\d\d)-?([01]\d)',
35
+ '-([0123]\d\d)',
36
+ '-(\d\d)',
37
+ '--([01]\d)-?([0123]\d)',
38
+ '--([01]\d)',
39
+ '---([0123]\d)',
40
+ '(\d\d$)',
41
+ '' ]
42
+ iso8601_values = { '^(\d{4})-?([01]\d)-([0123]\d)' => ['year', 'month', 'day'],
43
+ '^(\d{4})-([01]\d)' => ['year','month'],
44
+ '^(\d{4})-?([0123]\d\d)' => ['year', 'ordinal'],
45
+ '^(\d\d)-?([01]\d)-?([0123]\d)' => ['year','month','day'],
46
+ '^(\d\d)-?([0123]\d\d)' => ['year','ordinal'],
47
+ '^(\d{4})' => ['year'],
48
+ '-(\d\d)-?([01]\d)' => ['year','month'],
49
+ '-([0123]\d\d)' => ['ordinal'],
50
+ '-(\d\d)' => ['year'],
51
+ '--([01]\d)-?([0123]\d)' => ['month','day'],
52
+ '--([01]\d)' => ['month'],
53
+ '---([0123]\d)' => ['day'],
54
+ '(\d\d$)' => ['century'],
55
+ '' => [] }
56
+ add_to_all = '(T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
57
+ add_to_all_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']
58
+ # NOTE We use '(?:' to prevent grouping of optional matches (ones trailed
59
+ # by '?'). The second ':' *are* matched.
60
+ m = nil
61
+ param_keys = []
62
+ iso8601_regexps.each do |s|
63
+ $stderr << "Trying iso8601 regexp: #{s+add_to_all}\n" if $debug
64
+ param_keys = iso8601_values[s] + add_to_all_fields
65
+ m = dateString.match(Regexp.new(s+add_to_all))
66
+ break if m
67
+ end
68
+ return if m.nil? or (m.begin(0).zero? and m.end(0).zero?)
69
+
70
+ param_values = m.to_a
71
+ param_values = param_values[1..-1]
72
+ params = {}
73
+ param_keys.each_with_index do |key,i|
74
+ params[key] = param_values[i]
75
+ end
76
+
77
+ ordinal = params['ordinal'].to_i unless params['ordinal'].nil?
78
+ year = params['year'] || '--'
79
+ if year.nil? or year.empty? or year == '--' # FIXME When could the regexp ever return a year equal to '--'?
80
+ year = Time.now.utc.year
81
+ elsif year.length == 2
82
+ # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
83
+ year = 100 * (Time.now.utc.year / 100) + year.to_i
84
+ else
85
+ year = year.to_i
86
+ end
87
+
88
+ month = params['month'] || '-'
89
+ if month.nil? or month.empty? or month == '-'
90
+ # ordinals are NOT normalized by mktime, we simulate them
91
+ # by setting month=1, day=ordinal
92
+ if ordinal
93
+ month = DateTime.ordinal(year,ordinal).month
94
+ else
95
+ month = Time.now.utc.month
96
+ end
97
+ end
98
+ month = month.to_i unless month.nil?
99
+ day = params['day']
100
+ if day.nil? or day.empty?
101
+ # see above
102
+ if ordinal
103
+ day = DateTime.ordinal(year,ordinal).day
104
+ elsif params['century'] or params['year'] or params['month']
105
+ day = 1
106
+ else
107
+ day = Time.now.utc.day
108
+ end
109
+ else
110
+ day = day.to_i
111
+ end
112
+ # special case of the century - is the first year of the 21st century
113
+ # 2000 or 2001 ? The debate goes on...
114
+ if params.has_key? 'century'
115
+ year = (params['century'].to_i - 1) * 100 + 1
116
+ end
117
+ # in ISO 8601 most fields are optional
118
+ hour = params['hour'].to_i
119
+ minute = params['minute'].to_i
120
+ second = params['second'].to_i
121
+ weekday = nil
122
+ # daylight savings is complex, but not needed for feedparser's purposes
123
+ # as time zones, if specified, include mention of whether it is active
124
+ # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
125
+ # and most implementations have DST bugs
126
+ tm = [second, minute, hour, day, month, year, nil, ordinal, false, nil]
127
+ tz = params['tz']
128
+ if tz and not tz.empty? and tz != 'Z'
129
+ # FIXME does this cross over days?
130
+ if tz[0] == '-'
131
+ tm[3] += params['tzhour'].to_i
132
+ tm[4] += params['tzmin'].to_i
133
+ elsif tz[0] == '+'
134
+ tm[3] -= params['tzhour'].to_i
135
+ tm[4] -= params['tzmin'].to_i
136
+ else
137
+ return nil
138
+ end
139
+ end
140
+ return Time.utc(*tm) # Magic!
81
141
 
82
- month = params['month'] || '-'
83
- if month.nil? or month.empty? or month == '-'
84
- # ordinals are NOT normalized by mktime, we simulate them
85
- # by setting month=1, day=ordinal
86
- if ordinal
87
- month = DateTime.ordinal(year,ordinal).month
88
- else
89
- month = Time.now.utc.month
90
- end
91
- end
92
- month = month.to_i unless month.nil?
93
- day = params['day']
94
- if day.nil? or day.empty?
95
- # see above
96
- if ordinal
97
- day = DateTime.ordinal(year,ordinal).day
98
- elsif params['century'] or params['year'] or params['month']
99
- day = 1
100
- else
101
- day = Time.now.utc.day
102
142
  end
103
- else
104
- day = day.to_i
105
- end
106
- # special case of the century - is the first year of the 21st century
107
- # 2000 or 2001 ? The debate goes on...
108
- if params.has_key? 'century'
109
- year = (params['century'].to_i - 1) * 100 + 1
110
- end
111
- # in ISO 8601 most fields are optional
112
- hour = params['hour'].to_i
113
- minute = params['minute'].to_i
114
- second = params['second'].to_i
115
- weekday = nil
116
- # daylight savings is complex, but not needed for feedparser's purposes
117
- # as time zones, if specified, include mention of whether it is active
118
- # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
119
- # and most implementations have DST bugs
120
- tm = [second, minute, hour, day, month, year, nil, ordinal, false, nil]
121
- tz = params['tz']
122
- if tz and not tz.empty? and tz != 'Z'
123
- # FIXME does this cross over days?
124
- if tz[0] == '-'
125
- tm[3] += params['tzhour'].to_i
126
- tm[4] += params['tzmin'].to_i
127
- elsif tz[0] == '+'
128
- tm[3] -= params['tzhour'].to_i
129
- tm[4] -= params['tzmin'].to_i
130
- else
131
- return nil
132
- end
133
- end
134
- return Time.utc(*tm) # Magic!
135
-
136
- end
137
143
 
138
- def _parse_date_onblog(dateString)
139
- # Parse a string according to the OnBlog 8-bit date format
140
- # 8-bit date handling routes written by ytrewq1
141
- korean_year = u("년") # b3e2 in euc-kr
142
- korean_month = u("월") # bff9 in euc-kr
143
- korean_day = u("일") # c0cf in euc-kr
144
+ def parse_date_onblog(dateString)
145
+ # Parse a string according to the OnBlog 8-bit date format
146
+ # 8-bit date handling routes written by ytrewq1
147
+ korean_year = u("년") # b3e2 in euc-kr
148
+ korean_month = u("월") # bff9 in euc-kr
149
+ korean_day = u("일") # c0cf in euc-kr
144
150
 
145
151
 
146
- korean_onblog_date_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
152
+ korean_onblog_date_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
147
153
 
148
154
 
149
- m = korean_onblog_date_re.match(dateString)
150
- return unless m
151
- w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
155
+ m = korean_onblog_date_re.match(dateString)
156
+
157
+ w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
152
158
 
153
- $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
154
- return _parse_date_w3dtf(w3dtfdate)
155
- end
156
-
157
- def _parse_date_nate(dateString)
158
- # Parse a string according to the Nate 8-bit date format
159
- # 8-bit date handling routes written by ytrewq1
160
- korean_am = u("오전") # bfc0 c0fc in euc-kr
161
- korean_pm = u("오후") # bfc0 c8c4 in euc-kr
162
-
163
- korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
164
- m = korean_nate_date_re.match(dateString)
165
- return unless m
166
- hour = m[5].to_i
167
- ampm = m[4]
168
- if ampm == korean_pm
169
- hour += 12
170
- end
171
- hour = hour.to_s.rjust(2,'0')
172
- w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{m[6]}:#{m[7]}+09:00"
173
- $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
174
- return _parse_date_w3dtf(w3dtfdate)
175
- end
176
-
177
- def _parse_date_mssql(dateString)
178
- mssql_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
179
-
180
- m = mssql_date_re.match(dateString)
181
- return unless m
182
- w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
183
- $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
184
- return _parse_date_w3dtf(w3dtfdate)
185
- end
186
-
187
- def _parse_date_greek(dateString)
188
- # Parse a string according to a Greek 8-bit date format
189
- # Unicode strings for Greek date strings
190
- greek_months = {
191
- u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
192
- u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
193
- u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
194
- u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
195
- u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
196
- u("Μάι") => u("May"), # ccdce9 in iso-8859-7
197
- u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
198
- u("Μαι") => u("May"), # cce1e9 in iso-8859-7
199
- u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
200
- u("Ιον") => u("Jun"), # c9efed in iso-8859-7
201
- u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
202
- u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
203
- u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
204
- u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
205
- u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
206
- u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
207
- u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
208
- u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
209
- u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
210
- }
211
-
212
- greek_wdays = {
213
- u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
214
- u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
215
- u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
216
- u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
217
- u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
218
- u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
219
- u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
220
- }
221
-
222
- greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
223
-
224
- m = greek_date_format.match(dateString)
225
- return unless m
226
- begin
227
- wday = greek_wdays[m[1]]
228
- month = greek_months[m[3]]
229
- rescue
230
- return nil
231
- end
232
- rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
233
- $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
234
- return _parse_date_rfc822(rfc822date)
235
- end
159
+ $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
160
+ return parse_date_w3dtf(w3dtfdate)
161
+ end
236
162
 
237
- def _parse_date_hungarian(dateString)
238
- # Parse a string according to a Hungarian 8-bit date format.
239
- hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
240
- m = hungarian_date_format_re.match(dateString)
241
- return unless m
242
-
243
- # Unicode strings for Hungarian date strings
244
- hungarian_months = {
245
- u("január") => u("01"), # e1 in iso-8859-2
246
- u("februári") => u("02"), # e1 in iso-8859-2
247
- u("március") => u("03"), # e1 in iso-8859-2
248
- u("április") => u("04"), # e1 in iso-8859-2
249
- u("máujus") => u("05"), # e1 in iso-8859-2
250
- u("június") => u("06"), # fa in iso-8859-2
251
- u("július") => u("07"), # fa in iso-8859-2
252
- u("augusztus") => u("08"),
253
- u("szeptember") => u("09"),
254
- u("október") => u("10"), # f3 in iso-8859-2
255
- u("november") => u("11"),
256
- u("december") => u("12"),
257
- }
258
- begin
259
- month = hungarian_months[m[2]]
260
- day = m[3].rjust(2,'0')
261
- hour = m[4].rjust(2,'0')
262
- rescue
263
- return
264
- end
163
+ def parse_date_nate(dateString)
164
+ # Parse a string according to the Nate 8-bit date format
165
+ # 8-bit date handling routes written by ytrewq1
166
+ korean_am = u("오전") # bfc0 c0fc in euc-kr
167
+ korean_pm = u("오후") # bfc0 c8c4 in euc-kr
168
+
169
+ korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
170
+ m = korean_nate_date_re.match(dateString)
171
+
172
+ hour = m[5].to_i
173
+ ampm = m[4]
174
+ if ampm == korean_pm
175
+ hour += 12
176
+ end
177
+ hour = hour.to_s.rjust(2,'0')
178
+ w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{m[6]}:#{m[7]}+09:00"
179
+ $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
180
+ return parse_date_w3dtf(w3dtfdate)
181
+ end
265
182
 
266
- w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
267
- $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
268
- return _parse_date_w3dtf(w3dtfdate)
269
- end
183
+ def parse_date_mssql(dateString)
184
+ mssql_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
270
185
 
271
- def rollover(num, modulus)
272
- return num % modulus, num / modulus
273
- end
186
+ m = mssql_date_re.match(dateString)
187
+
188
+ w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
189
+ $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
190
+ return parse_date_w3dtf(w3dtfdate)
191
+ end
274
192
 
275
- def set_self(num, modulus)
276
- r = num / modulus
277
- if r == 0
278
- return num
279
- end
280
- return r
281
- end
282
- # W3DTF-style date parsing
283
- # FIXME shouldn't it be "W3CDTF"?
284
- def _parse_date_w3dtf(dateString)
285
- # Ruby's Time docs claim w3cdtf is an alias for iso8601 which is an alias for xmlschema
286
- # Whatever it is, it doesn't work. This has been fixed in Ruby 1.9 and
287
- # in Ruby on Rails, but not really. They don't fix the 25 hour or 61 minute or 61 second rollover and fail in other ways.
288
-
289
- m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)
290
-
291
- w3 = m[1..3].map{|s| s=s.to_i; s += 1 if s == 0;s} # Map the year, month and day to integers and, if they were nil, set them to 1
292
- w3 += m[4..6].map{|s| s.to_i} # Map the hour, minute and second to integers
293
- w3 << m[-1] # Leave the timezone as a String
294
-
295
- # FIXME this next bit needs some serious refactoring
296
- # Rollover times. 0 minutes and 61 seconds -> 1 minute and 1 second
297
- w3[5],r = rollover(w3[5], 60) # rollover seconds
298
- w3[4] += r
299
- w3[4],r = rollover(w3[4], 60) # rollover minutes
300
- w3[3] += r
301
- w3[3],r = rollover(w3[3], 24) # rollover hours
302
-
303
- w3[2] = w3[2] + r
304
- if w3[1] > 12
305
- w3[1],r = rollover(w3[1],12)
306
- w3[1] = 12 if w3[1] == 0
307
- w3[0] += r
308
- end
193
+ def parse_date_greek(dateString)
194
+ # Parse a string according to a Greek 8-bit date format
195
+ # Unicode strings for Greek date strings
196
+ greek_months = {
197
+ u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
198
+ u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
199
+ u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
200
+ u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
201
+ u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
202
+ u("Μάι") => u("May"), # ccdce9 in iso-8859-7
203
+ u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
204
+ u("Μαι") => u("May"), # cce1e9 in iso-8859-7
205
+ u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
206
+ u("Ιον") => u("Jun"), # c9efed in iso-8859-7
207
+ u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
208
+ u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
209
+ u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
210
+ u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
211
+ u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
212
+ u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
213
+ u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
214
+ u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
215
+ u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
216
+ }
217
+
218
+ greek_wdays = {
219
+ u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
220
+ u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
221
+ u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
222
+ u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
223
+ u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
224
+ u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
225
+ u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
226
+ }
227
+
228
+ greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
229
+
230
+ m = greek_date_format.match(dateString)
231
+
232
+ wday = greek_wdays[m[1]]
233
+ month = greek_months[m[3]]
234
+
235
+ rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
236
+ $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
237
+ return parse_date_rfc822(rfc822date)
238
+ end
309
239
 
310
- num_days = Time.days_in_month(w3[1], w3[0])
311
- while w3[2] > num_days
312
- w3[2] -= num_days
313
- w3[1] += 1
314
- if w3[1] > 12
315
- w3[0] += 1
316
- w3[1] = set_self(w3[1], 12)
240
+ def parse_date_hungarian(dateString)
241
+ # Parse a string according to a Hungarian 8-bit date format.
242
+ hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
243
+ m = hungarian_date_format_re.match(dateString)
244
+
245
+ # Unicode strings for Hungarian date strings
246
+ hungarian_months = {
247
+ u("január") => u("01"), # e1 in iso-8859-2
248
+ u("februári") => u("02"), # e1 in iso-8859-2
249
+ u("március") => u("03"), # e1 in iso-8859-2
250
+ u("április") => u("04"), # e1 in iso-8859-2
251
+ u("máujus") => u("05"), # e1 in iso-8859-2
252
+ u("június") => u("06"), # fa in iso-8859-2
253
+ u("július") => u("07"), # fa in iso-8859-2
254
+ u("augusztus") => u("08"),
255
+ u("szeptember") => u("09"),
256
+ u("október") => u("10"), # f3 in iso-8859-2
257
+ u("november") => u("11"),
258
+ u("december") => u("12"),
259
+ }
260
+ month = hungarian_months[m[2]]
261
+ day = m[3].rjust(2,'0')
262
+ hour = m[4].rjust(2,'0')
263
+
264
+ w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
265
+ $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
266
+ return parse_date_w3dtf(w3dtfdate)
317
267
  end
318
- num_days = Time.days_in_month(w3[1], w3[0])
319
- end
320
268
 
269
+ def rollover(num, modulus)
270
+ return num % modulus, num / modulus
271
+ end
321
272
 
322
- unless w3[6].class != String
323
- if /^-/ =~ w3[6] # Zone offset goes backwards
324
- w3[6][0] = '+'
325
- elsif /^\+/ =~ w3[6]
326
- w3[6][0] = '-'
273
+ def set_self(num, modulus)
274
+ r = num / modulus
275
+ if r == 0
276
+ return num
277
+ end
278
+ return r
279
+ end
280
+ # W3DTF-style date parsing
281
+ # FIXME shouldn't it be "W3CDTF"?
282
+ def parse_date_w3dtf(dateString)
283
+ # Ruby's Time docs claim w3cdtf is an alias for iso8601 which is an alias for xmlschema
284
+ # Whatever it is, it doesn't work. This has been fixed in Ruby 1.9 and
285
+ # in Ruby on Rails, but not really. They don't fix the 25 hour or 61 minute or 61 second rollover and fail in other ways.
286
+
287
+ m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)
288
+
289
+ w3 = m[1..3].map{|s| s=s.to_i; s += 1 if s == 0;s} # Map the year, month and day to integers and, if they were nil, set them to 1
290
+ w3 += m[4..6].map{|s| s.to_i} # Map the hour, minute and second to integers
291
+ w3 << m[-1] # Leave the timezone as a String
292
+
293
+ # FIXME this next bit needs some serious refactoring
294
+ # Rollover times. 0 minutes and 61 seconds -> 1 minute and 1 second
295
+ w3[5],r = rollover(w3[5], 60) # rollover seconds
296
+ w3[4] += r
297
+ w3[4],r = rollover(w3[4], 60) # rollover minutes
298
+ w3[3] += r
299
+ w3[3],r = rollover(w3[3], 24) # rollover hours
300
+
301
+ w3[2] = w3[2] + r
302
+ if w3[1] > 12
303
+ w3[1],r = rollover(w3[1],12)
304
+ w3[1] = 12 if w3[1] == 0
305
+ w3[0] += r
306
+ end
307
+
308
+ num_days = Time.days_in_month(w3[1], w3[0])
309
+ while w3[2] > num_days
310
+ w3[2] -= num_days
311
+ w3[1] += 1
312
+ if w3[1] > 12
313
+ w3[0] += 1
314
+ w3[1] = set_self(w3[1], 12)
315
+ end
316
+ num_days = Time.days_in_month(w3[1], w3[0])
317
+ end
318
+
319
+
320
+ unless w3[6].class != String
321
+ if /^-/ =~ w3[6] # Zone offset goes backwards
322
+ w3[6][0] = '+'
323
+ elsif /^\+/ =~ w3[6]
324
+ w3[6][0] = '-'
325
+ end
326
+ end
327
+ return Time.utc(w3[0], w3[1], w3[2] , w3[3], w3[4], w3[5])+Time.zone_offset(w3[6] || "UTC")
327
328
  end
328
- end
329
- return Time.utc(w3[0], w3[1], w3[2] , w3[3], w3[4], w3[5])+Time.zone_offset(w3[6] || "UTC")
330
- end
331
329
 
332
- def _parse_date_rfc822(dateString)
333
- # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
334
- # These first few lines are to fix up the stupid proprietary format from Disney
335
- unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
336
- 'CT' => 'CST', 'MT' => 'MST',
337
- 'PT' => 'PST'
338
- }
339
-
340
- mon = dateString.split[2]
341
- if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
342
- dateString.sub!(mon,mon[0..2])
343
- end
344
- if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
345
- dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
346
- end
347
- # Okay, the Disney date format should be fixed up now.
348
- rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
349
- if rfc.to_a.length > 1 and rfc.to_a.include? nil
350
- dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
351
- hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
352
- tz ||= "GMT"
353
- end
354
- asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
355
- if asctime_match.to_a.length > 1
356
- # Month-abbr dayofmonth hour:minute:second year
357
- dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
358
- day.to_s.rjust(2,'0')
359
- end
360
- if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
361
- ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
362
- else
363
- ds = dateString
364
- end
365
- t = Time.rfc2822(ds).utc
366
- return t
367
- end
330
+ def parse_date_rfc822(dateString)
331
+ # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
332
+ # These first few lines are to fix up the stupid proprietary format from Disney
333
+ unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
334
+ 'CT' => 'CST', 'MT' => 'MST',
335
+ 'PT' => 'PST'
336
+ }
337
+
338
+ mon = dateString.split[2]
339
+ if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
340
+ dateString.sub!(mon,mon[0..2])
341
+ end
342
+ if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
343
+ dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
344
+ end
345
+
346
+ # Okay, the Disney date format should be fixed up now.
347
+ rfc_tz = '([A-Za-z]{3}|[\+\-]?\d\d\d\d)'
348
+ rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? #{rfc_tz})?/)
349
+
350
+ if rfc.to_a.length > 1 and rfc.to_a.include? nil
351
+ dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
352
+ hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
353
+ tz ||= "GMT"
354
+ end
355
+
356
+ asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
357
+ if asctime_match.to_a.length > 1
358
+ # Month-abbr dayofmonth hour:minute:second year
359
+ dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
360
+ day.to_s.rjust(2,'0')
361
+ end
362
+
363
+ if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
364
+ ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
365
+ else
366
+ ds = dateString
367
+ end
368
+ t = Time.rfc2822(ds).utc
369
+ return t
370
+ end
368
371
 
369
- def _parse_date_perforce(aDateString) # FIXME not in 4.1?
370
- # Parse a date in yyyy/mm/dd hh:mm:ss TTT format
371
- # Note that there is a day of the week at the beginning
372
- # Ex. Fri, 2006/09/15 08:19:53 EDT
373
- return Time.parse(aDateString).utc
374
- end
372
+ def parse_date_perforce(aDateString) # FIXME not in 4.1?
373
+ # Parse a date in yyyy/mm/dd hh:mm:ss TTT format
374
+ # Note that there is a day of the week at the beginning
375
+ # Ex. Fri, 2006/09/15 08:19:53 EDT
376
+ return Time.parse(aDateString).utc
377
+ end
375
378
 
376
- def extract_tuple(atime)
377
- # NOTE leave the error handling to parse_date
378
- t = [atime.year, atime.month, atime.mday, atime.hour,
379
- atime.min, atime.sec, (atime.wday-1) % 7, atime.yday,
380
- atime.isdst
381
- ]
382
- # yay for modulus! yaaaaaay! its 530 am and i should be sleeping! yaay!
383
- t[0..-2].map!{|s| s.to_i}
384
- t[-1] = t[-1] ? 1 : 0
385
- return t
386
- end
379
+ def extract_tuple(atime)
380
+ return unless atime
381
+ # NOTE leave the error handling to parse_date
382
+ t = [atime.year, atime.month, atime.mday, atime.hour,
383
+ atime.min, atime.sec, (atime.wday-1) % 7, atime.yday,
384
+ atime.isdst
385
+ ]
386
+ # yay for modulus! yaaaaaay! its 530 am and i should be sleeping! yaay!
387
+ t[0..-2].map!{|s| s.to_i}
388
+ t[-1] = t[-1] ? 1 : 0
389
+ return t
390
+ end
387
391
 
388
- def parse_date(dateString)
389
- @date_handlers.each do |handler|
390
- begin
391
- $stderr << "Trying date_handler #{handler}\n" if $debug
392
- datething = extract_tuple(send(handler,dateString))
393
- return datething
394
- rescue Exception => e
395
- $stderr << "#{handler} raised #{e}\n" if $debug
392
+ def parse_date(dateString)
393
+ @@date_handlers.each do |handler|
394
+ begin
395
+ $stderr << "Trying date_handler #{handler}\n" if $debug
396
+ datething = send(handler,dateString)
397
+ return datething
398
+ rescue Exception => e
399
+ $stderr << "#{handler} raised #{e}\n" if $debug
400
+ end
401
+ end
402
+ return nil
396
403
  end
397
404
  end
398
- return nil
399
- end
400
- end
401
-
402
- module FeedParserUtilities
403
- def py2rtime(pytuple)
404
- return Time.utc(*pytuple[0..5]) unless pytuple.blank?
405
405
  end
406
- end
406
+ end