rfeedparser 0.9.8 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/ruby
2
+
3
+
4
# Raised by BetterSGMLParser#error when the parser reaches a state it
# cannot handle. Derives from StandardError (not Exception) so that an
# ordinary `rescue` / `rescue => e` clause is able to catch it.
class BetterSGMLParserError < StandardError; end
5
# SGML/HTML tokenizer built on HTML::SGMLParser with the scanning
# regexps and the main loop rewritten to follow feedparser.py / Python's
# sgmllib. It relies on an +index_match(string, regexp, offset)+ helper
# being available on the instance (it is not defined in this class).
class BetterSGMLParser < HTML::SGMLParser
  # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
  # This makes things work.
  Interesting = /[&<]/u
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|![^<>]*)?', 64)
  # 64 is the unicode flag

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
  Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u

  # NOTE(review): both shorthand-tag patterns begin with a literal "'",
  # so they can never match at a "<". parse_starttag's shorthand branch
  # is therefore effectively disabled. Left untouched on purpose:
  # dropping the quote from Shorttagopen would make it match every
  # ordinary start tag.
  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
  Endtagopen = /<\//u # Matching the Python SGMLParser
  Endbracket = /[<>]/u
  Declopen = /<!/u
  Piopenbegin = /^<\?/u
  Piclose = />/u

  Commentopen = /<!--/u
  Commentclose = /--\s*>/u
  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
                            '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
                            64)
  Endtagfind = /\s*\/\s*>/u

  # Pass-through constructor; +verbose+ is handed to the superclass.
  def initialize(verbose=false)
    super(verbose)
  end

  # Pass-through to the superclass' feed.
  def feed(*args)
    super(*args)
  end

  # Main scanning loop. Consumes as much of @rawdata as can be
  # tokenized, dispatching to handle_data / parse_* / handle_*ref, and
  # stores the unconsumed tail back into @rawdata.
  #
  # _end:: when true, whatever is left over after scanning is flushed
  #        through handle_data instead of being kept for a later call.
  def goahead(_end)
    rawdata = @rawdata # woo, utf-8 magic
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        # Everything that remains is plain character data.
        handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
        i = n
        break
      end

      # Emit the text up to the next "<" or "&" as data.
      j = rawdata.index(Interesting, i)
      j = n unless j
      handle_data(rawdata[i...j]) if i < j
      i = j
      break if (i == n)

      if rawdata[i..i] == '<'
        if rawdata.index(Starttagopen,i) == i
          if @literal
            handle_data(rawdata[i..i])
            i = i+1
            next
          end
          k = parse_starttag(i)
          break unless k   # incomplete start tag: wait for more input
          i = k
          next
        end
        if rawdata.index(Endtagopen,i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false
          next
        end
        if @literal
          if n > (i+1)
            handle_data("<")
            i = i+1
          else
            # incomplete
            break
          end
          next
        end
        if rawdata.index(Commentopen,i) == i
          k = parse_comment(i)
          break unless k
          i = k
          next
        end
        # Piopenbegin is anchored with ^, so it only matches a "<?" that
        # sits at the start of a line.
        if rawdata.index(Piopenbegin,i) == i
          k = parse_pi(i)
          break unless k
          i += k           # parse_pi returns a length, not an index
          next
        end
        if rawdata.index(Declopen,i) == i
          # This is some sort of declaration; in "HTML as
          # deployed," this should only be the document type
          # declaration ("<!DOCTYPE html...>").
          k = parse_declaration(i)
          break unless k
          i = k
          next
        end
      elsif rawdata[i..i] == '&'
        if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
          handle_data(rawdata[i..i])
          i += 1
          next
        end

        # Character references are tried before entity references.
        ni,match = index_match(rawdata, Charref, i)
        if ni and ni == i
          handle_charref(match[1])
          i += match[0].length
          # The regexp consumes one terminating character; give it back
          # unless it was the ";" that belongs to the reference.
          i -= 1 unless rawdata[i-1..i-1] == ";"
          next
        end
        ni,match = index_match(rawdata, Entityref, i)
        if ni and ni == i
          handle_entityref(match[1])
          i += match[0].length
          i -= 1 unless rawdata[i-1..i-1] == ";"
          next
        end
      else
        error('neither < nor & ??')
      end

      # We get here only if incomplete matches but nothing else.
      # The match has to start at the current position i (the original
      # compared against 0, which was only right when i happened to be 0).
      ni,match = index_match(rawdata,Incomplete,i)
      unless ni and ni == i
        handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
        i += 1
        next
      end
      j = ni + match[0].length
      break if j == n # Really incomplete
      handle_data(rawdata[i...j])
      i = j
    end # end while

    if _end and i < n
      handle_data(rawdata[i...n])
      i = n
    end

    @rawdata = rawdata[i..-1]
    # @offset += i # FIXME BUGME another unused variable in SGMLParser?
  end


  # Internal -- parse a processing instruction starting at index i.
  # Calls handle_pi with the text between "<?" and the closing ">".
  # Returns the LENGTH of the instruction (including the closing ">"),
  # or nil if it is not terminated yet.
  def parse_pi(i)
    rawdata = @rawdata
    if rawdata[i...i+2] != '<?'
      error("unexpected call to parse_pi()")
    end
    ni,match = index_match(rawdata,Piclose,i+2)
    return nil unless match
    j = ni
    handle_pi(rawdata[i+2...j])
    j = (j + match[0].length)
    return j-i
  end

  # Internal -- parse a comment starting at index i. Calls
  # handle_comment with the comment body and returns the index just
  # past the closing "-->", or nil if the comment is not terminated yet.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i...i+4] != "<!--"
      error("unexpected call to parse_comment()")
    end
    # Start looking after the opening "<!--" so its own dashes cannot be
    # taken for the closing delimiter (e.g. in "<!-->").
    ni,match = index_match(rawdata, Commentclose,i+4)
    return nil unless match
    handle_comment(rawdata[i+4...ni])
    return ni+match[0].length # Index just past the closing comment tag
  end


  # Internal -- parse a start tag beginning at index i. Records the raw
  # tag text in @_starttag_text, collects [name, value] attribute pairs
  # (names lower-cased) and hands them to finish_starttag. Returns the
  # index following the tag, or nil if the tag is incomplete.
  def parse_starttag(i)
    @_starttag_text = nil
    start_pos = i
    rawdata = @rawdata
    ni,match = index_match(rawdata,Shorttagopen,i)
    if ni == i
      # SGML shorthand: <tag/data/ == <tag>data</tag>
      # XXX Can data contain &... (entity or char refs)?
      # XXX Can data contain < or > (tag characters)?
      # XXX Can there be whitespace before the first /?
      # NOTE(review): see the note on Shorttagopen; this branch looks
      # unreachable and is kept as it was.
      k,match = index_match(rawdata,Shorttag,i)
      return nil unless match
      tag, data = match[1], match[2]
      @_starttag_text = "<#{tag}/"
      tag.downcase!
      second_end = rawdata.index(Shorttagopen,k)
      finish_shorttag(tag, data)
      @_starttag_text = rawdata[start_pos...second_end+1]
      return k
    end

    j = rawdata.index(Endbracket, i+1)
    return nil unless j
    attrsd = []
    if rawdata[i...i+2] == '<>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      ni,match = index_match(rawdata,Tagfind,i+1)
      unless match
        error('unexpected call to parse_starttag')
      end
      k = ni+match[0].length+1
      tag = match[0].downcase
      @lasttag = tag
    end

    while k < j
      break if rawdata.index(Endtagfind, k) == k
      ni,match = index_match(rawdata,Attrfind,k)
      break unless ni
      matched_length = match[0].length
      attrname, rest, attrvalue = match[1],match[2],match[3]
      if rest.nil? or rest.empty?
        attrvalue = '' # was: = attrname # Why the change?
      elsif ["'", '"'].include?(attrvalue[0..0]) and attrvalue[0..0] == attrvalue[-1..-1]
        # Strip matching surrounding quotes. One-character String slices
        # are compared so the test behaves the same whether indexing a
        # String yields an Integer (Ruby 1.8) or a String (1.9+).
        attrvalue = attrvalue[1...-1]
      end
      attrsd << [attrname.downcase, attrvalue]
      k += matched_length
    end
    if rawdata[j..j] == ">"
      j += 1
    end
    @_starttag_text = rawdata[start_pos...j]
    finish_starttag(tag, attrsd)
    return j
  end

  # Internal -- parse an end tag beginning at index i ("</..."). Passes
  # the stripped, lower-cased tag name to finish_endtag and returns the
  # index following the tag, or nil if no closing bracket was found.
  def parse_endtag(i)
    rawdata = @rawdata
    j, match = index_match(rawdata, /[<>]/,i+1)
    return nil unless j
    tag = rawdata[i+2...j].strip.downcase
    if rawdata[j..j] == ">"
      j += 1
    end
    finish_endtag(tag)
    return j
  end

  # Return processed HTML as a single string (the elements of @pieces,
  # each converted with to_s and concatenated).
  def output
    return @pieces.map{|p| p.to_s}.join
  end

  # Raises BetterSGMLParserError carrying +message+.
  def error(message)
    raise BetterSGMLParserError.new(message)
  end

  # Hook for processing instructions; does nothing here.
  def handle_pi(text)
  end

  # Hook for declarations; does nothing here.
  def handle_decl(text)
  end
end
263
+
264
+
@@ -0,0 +1,257 @@
1
+ #!/usr/bin/ruby
2
+
3
# Helper methods for character-encoding detection/conversion and for
# regexp scanning. Conversion goes through Iconv and the
# Encoding_Aliases table, neither of which is defined in this module.
module FeedParserUtilities

  # Takes a single string and converts it from the encoding in
  # from_encoding to the encoding named 'unicode'.
  def unicode(data, from_encoding)
    uconvert(data, from_encoding, 'unicode')
  end

  # Converts +data+ from +from_encoding+ to +to_encoding+ (default
  # 'utf-8') with Iconv. Both names are first looked up in
  # Encoding_Aliases; a name without an alias is used unchanged.
  # Errors raised by Iconv are not rescued here.
  def uconvert(data, from_encoding, to_encoding = 'utf-8')
    from_encoding = Encoding_Aliases[from_encoding] || from_encoding
    to_encoding = Encoding_Aliases[to_encoding] || to_encoding
    Iconv.iconv(to_encoding, from_encoding, data)[0]
  end

  # Finds the first match of +regexp+ in +stri+ at or after +offset+.
  #
  # Returns [index, MatchData], or [nil, nil] when nothing matches.
  # The MatchData is the one produced by the index search itself, so
  # the two values always describe the same match and no substring has
  # to be copied and matched a second time.
  def index_match(stri,regexp, offset)
    i = stri.index(regexp, offset)

    return nil, nil unless i

    return i, Regexp.last_match
  end

  # Converts +s+ from "ebcdic-cp-be" to "iso88591" via Iconv.
  def _ebcdic_to_ascii(s)
    return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
  end

  # Get the character encoding of the XML document.
  #
  # feed::     object whose +meta+ returns the HTTP headers (indexed by
  #            'content-type'); if that access raises NoMethodError the
  #            document is treated as having no HTTP information.
  # xml_data:: the raw document as a string.
  #
  # Returns five values: true_encoding, http_encoding, xml_encoding,
  # sniffed_xml_encoding, acceptable_content_type.
  def getCharacterEncoding(feed, xml_data)
    $stderr << "In getCharacterEncoding\n" if $debug
    sniffed_xml_encoding = nil
    xml_encoding = nil
    true_encoding = nil
    begin
      http_headers = feed.meta
      http_content_type = feed.meta['content-type'].split(';')[0]
      encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
      http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
      http_encoding = nil if http_encoding.empty?
      # FIXME Open-Uri returns iso8859-1 if there is no charset header,
      # but that doesn't pass the tests. Open-Uri claims its following
      # the right RFC. Are they wrong or do we need to change the tests?
    rescue NoMethodError
      # No usable headers (e.g. feed has no #meta, or there is no
      # content-type entry to split).
      http_headers = {}
      http_content_type = nil
      http_encoding = nil
    end
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    begin
      if xml_data[0..3] == "\x4c\x6f\xa7\x94"
        # EBCDIC
        xml_data = _ebcdic_to_ascii(xml_data)
      elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
        # UTF-16BE
        sniffed_xml_encoding = 'utf-16be'
        xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
      elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
        # UTF-16BE with BOM
        sniffed_xml_encoding = 'utf-16be'
        xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
      elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
        # UTF-16LE
        sniffed_xml_encoding = 'utf-16le'
        xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
      elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
        # UTF-16LE with BOM
        sniffed_xml_encoding = 'utf-16le'
        xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
      elsif xml_data[0..3] == "\x00\x00\x00\x3c"
        # UTF-32BE
        sniffed_xml_encoding = 'utf-32be'
        xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
      elsif xml_data[0..3] == "\x3c\x00\x00\x00"
        # UTF-32LE
        sniffed_xml_encoding = 'utf-32le'
        xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
      elsif xml_data[0..3] == "\x00\x00\xfe\xff"
        # UTF-32BE with BOM
        sniffed_xml_encoding = 'utf-32be'
        xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
      elsif xml_data[0..3] == "\xff\xfe\x00\x00"
        # UTF-32LE with BOM
        sniffed_xml_encoding = 'utf-32le'
        xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
      elsif xml_data[0..2] == "\xef\xbb\xbf"
        # UTF-8 with BOM
        sniffed_xml_encoding = 'utf-8'
        xml_data = xml_data[3..-1]
      else
        # ASCII-compatible
      end
      xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
    rescue
      # Any failure while sniffing/converting means "no declaration found".
      xml_encoding_match = nil
    end
    if xml_encoding_match
      xml_encoding = xml_encoding_match[1].downcase
      # Declared names that only say "some multi-byte Unicode form"; the
      # sniffed encoding is more specific, so it wins over these.
      xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
      if sniffed_xml_encoding and xencodings.include?(xml_encoding)
        xml_encoding = sniffed_xml_encoding
      end
    end

    acceptable_content_type = false
    application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
    text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

    if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
      acceptable_content_type = true
      true_encoding = http_encoding || xml_encoding || 'utf-8'
    elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
      acceptable_content_type = true
      true_encoding = http_encoding || 'us-ascii'
    elsif /^text\// =~ http_content_type
      true_encoding = http_encoding || 'us-ascii'
    elsif http_headers and not http_headers.empty? and not http_headers.has_key?('content-type')
      # NOTE(review): this branch looks unreachable for Hash-like
      # headers -- a missing 'content-type' makes the header parsing
      # above raise NoMethodError, which resets http_headers to {}.
      # Behaviour intentionally left as it was; confirm before changing.
      true_encoding = xml_encoding || 'iso-8859-1'
    else
      true_encoding = xml_encoding || 'utf-8'
    end
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
  end

  # Changes an XML data stream on the fly to specify a new encoding.
  #
  # data is a raw sequence of bytes (not Unicode) that is presumed to be
  # in %encoding already.
  # encoding is a string recognized by encodings.aliases.
  #
  # A leading byte-order mark overrides +encoding+ and is stripped
  # before conversion. Returns the UTF-8 text with its XML declaration
  # replaced by (or, if absent, prefixed with) a utf-8 declaration.
  # Conversion errors from uconvert propagate to the caller.
  def toUTF8(data, encoding)
    $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
    # NOTE we must use double quotes when dealing with \x encodings!
    if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
      if $debug
        $stderr << "stripping BOM\n"
        if encoding != 'utf-16be'
          $stderr << "string utf-16be instead\n"
        end
      end
      encoding = 'utf-16be'
      data = data[2..-1]
    elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
      if $debug
        $stderr << "stripping BOM\n"
        $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
      end
      encoding = 'utf-16le'
      data = data[2..-1]
    elsif (data[0..2] == "\xef\xbb\xbf")
      if $debug
        $stderr << "stripping BOM\n"
        $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
      end
      encoding = 'utf-8'
      data = data[3..-1]
    elsif (data[0..3] == "\x00\x00\xfe\xff")
      if $debug
        $stderr << "stripping BOM\n"
        if encoding != 'utf-32be'
          $stderr << "trying utf-32be instead\n"
        end
      end
      encoding = 'utf-32be'
      data = data[4..-1]
    elsif (data[0..3] == "\xff\xfe\x00\x00")
      if $debug
        $stderr << "stripping BOM\n"
        if encoding != 'utf-32le'
          $stderr << "trying utf-32le instead\n"
        end
      end
      encoding = 'utf-32le'
      data = data[4..-1]
    end
    # Deliberately not rescued: swallowing the error here left newdata
    # nil, logged a bogus success message and then failed further down
    # with an unrelated TypeError.
    newdata = uconvert(data, encoding, 'utf-8')
    $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
    declmatch = /^<\?xml[^>]*?>/
    newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
    if declmatch =~ newdata
      newdata.sub!(declmatch, newdecl)
    else
      newdata = newdecl + "\n" + newdata
    end
    return newdata
  end

end
196
+
197
+ # http://intertwingly.net/stories/2005/09/28/xchar.rb
198
# Lookup tables used by Integer#xchr / String#to_xs to produce
# XML-safe text. All tables are frozen: they are shared constants.
module XChar
  # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
  # Maps byte values 128..159 to the code points listed beside them.
  CP1252 = {
    128 => 8364, # euro sign
    130 => 8218, # single low-9 quotation mark
    131 =>  402, # latin small letter f with hook
    132 => 8222, # double low-9 quotation mark
    133 => 8230, # horizontal ellipsis
    134 => 8224, # dagger
    135 => 8225, # double dagger
    136 =>  710, # modifier letter circumflex accent
    137 => 8240, # per mille sign
    138 =>  352, # latin capital letter s with caron
    139 => 8249, # single left-pointing angle quotation mark
    140 =>  338, # latin capital ligature oe
    142 =>  381, # latin capital letter z with caron
    145 => 8216, # left single quotation mark
    146 => 8217, # right single quotation mark
    147 => 8220, # left double quotation mark
    148 => 8221, # right double quotation mark
    149 => 8226, # bullet
    150 => 8211, # en dash
    151 => 8212, # em dash
    152 =>  732, # small tilde
    153 => 8482, # trade mark sign
    154 =>  353, # latin small letter s with caron
    155 => 8250, # single right-pointing angle quotation mark
    156 =>  339, # latin small ligature oe
    158 =>  382, # latin small letter z with caron
    159 =>  376  # latin capital letter y with diaeresis
  }.freeze

  # http://www.w3.org/TR/REC-xml/#dt-chardata
  PREDEFINED = {
    38 => '&amp;', # ampersand
    60 => '&lt;',  # left angle bracket
    62 => '&gt;'   # right angle bracket
  }.freeze

  # http://www.w3.org/TR/REC-xml/#charsets
  VALID = [[0x9, 0xA, 0xD].freeze, (0x20..0xD7FF),
           (0xE000..0xFFFD), (0x10000..0x10FFFF)].freeze
end

# Defined on Integer rather than Fixnum: Fixnum inherits from Integer on
# old Rubies and no longer exists on current ones.
class Integer
  # xml escaped version of chr
  #
  # The receiver is first remapped through XChar::CP1252; a value
  # outside every XChar::VALID range becomes 42 ("*"). The result is
  # the predefined entity for & < >, the plain character below 128, or
  # a decimal character reference otherwise.
  def xchr
    n = XChar::CP1252[self] || self
    n = 42 unless XChar::VALID.find {|range| range.include? n}
    XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
  end
end

class String
  alias :old_index :index

  # Returns the string with every character passed through
  # Integer#xchr. The string is unpacked as UTF-8 code points first; if
  # that raises, it is unpacked byte by byte instead.
  def to_xs
    unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
  rescue
    unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
  end
end
257
+