UnderpantsGnome-rfeedparser 0.9.960

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/env ruby
2
+
3
+
4
# Error type raised by BetterSGMLParser#error when the parser is driven into
# an unexpected state.
class BetterSGMLParserError < StandardError
end
5
# SGML/HTML tokenizer built on HTML::SGMLParser whose regular expressions and
# main scanning loop follow the ones used by feedparser.py.
#
# NOTE(review): this class calls index_match, handle_data, handle_charref,
# handle_entityref, handle_comment, finish_starttag, finish_endtag,
# finish_shorttag and parse_declaration, and reads the Starttagopen constant,
# none of which are defined here. They are presumably supplied by
# HTML::SGMLParser and by a mixin providing index_match -- confirm.
class BetterSGMLParser < HTML::SGMLParser
  # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
  # This makes things work.
  Interesting = /[&<]/u
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|![^<>]*)?', 64)
  # 64 is the unicode flag

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
  Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u

  # NOTE(review): both short-tag patterns begin with a literal apostrophe, so
  # they cannot match at a position that holds '<'. That keeps the SGML
  # shorthand branch of parse_starttag from ever running. Left untouched on
  # purpose: that branch returns its own start index, so enabling it as-is
  # would not advance the parser.
  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
  Endtagopen = /<\//u # Changed the RegExps to match the Python SGMLParser
  Endbracket = /[<>]/u
  Declopen = /<!/u
  Piopenbegin = /^<\?/u
  Piclose = />/u

  Commentopen = /<!--/u
  Commentclose = /--\s*>/u
  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
                            '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?', # '
                            64)
  Endtagfind = /\s*\/\s*>/u

  # Pass-through constructor; verbose defaults to false.
  def initialize(verbose=false)
    super(verbose)
  end

  # Pass-through to the superclass implementation.
  def feed(*args)
    super(*args)
  end

  # Scans @rawdata, dispatching text, tags, comments, processing
  # instructions, declarations and character/entity references to the
  # matching handler, then stores whatever could not be consumed back in
  # @rawdata.
  #
  # _end:: when truthy, any unconsumed tail is flushed through handle_data
  #        instead of being kept for a later call.
  def goahead(_end)
    rawdata = @rawdata # woo, utf-8 magic
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        # handle_data_range does nothing more than set a "Range" that is never used. wtf?
        handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
        i = n
        break
      end

      # Emit the plain text that precedes the next '&' or '<'.
      j = rawdata.index(Interesting, i)
      j = n unless j
      handle_data(rawdata[i...j]) if i < j
      i = j
      break if i == n

      if rawdata[i..i] == '<' # Yeah, ugly, but I prefer it to rawdata[i] == ?<
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i..i])
            i = i + 1
            next
          end
          k = parse_starttag(i)
          break unless k # start tag not complete yet
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false
          next
        end
        if @literal
          if n > (i + 1)
            handle_data("<")
            i = i + 1
          else
            # incomplete
            break
          end
          next
        end
        if rawdata.index(Commentopen, i) == i
          k = parse_comment(i)
          break unless k
          i = k
          next
        end
        # Piopenbegin is anchored with ^, so it only matches at a line start.
        if rawdata.index(Piopenbegin, i) == i
          k = parse_pi(i)
          break unless k
          i += k # parse_pi returns a length, not an absolute index
          next
        end
        if rawdata.index(Declopen, i) == i
          # This is some sort of declaration; in "HTML as
          # deployed," this should only be the document type
          # declaration ("<!DOCTYPE html...>").
          k = parse_declaration(i)
          break unless k
          i = k
          next
        end
      elsif rawdata[i..i] == '&'
        if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
          handle_data(rawdata[i..i])
          i += 1
          next
        end

        # the Char must come first as its #=~ method is the only one that is UTF-8 safe
        ni, match = index_match(rawdata, Charref, i)
        if ni && ni == i
          handle_charref(match[1])
          i += match[0].length
          # The pattern consumes one terminator character; give it back
          # unless it was the ';' that belongs to the reference.
          i -= 1 unless rawdata[i-1..i-1] == ";"
          next
        end
        ni, match = index_match(rawdata, Entityref, i)
        if ni && ni == i
          handle_entityref(match[1])
          i += match[0].length
          i -= 1 unless rawdata[i-1..i-1] == ";"
          next
        end
      else
        error('neither < nor & ??')
      end

      # We get here only if incomplete matches but nothing else.
      # index_match reports an absolute index, so the match has to be
      # compared against the current position i (comparing against 0 only
      # ever succeeded at the very start of the buffer).
      ni, match = index_match(rawdata, Incomplete, i)
      unless ni && ni == i
        handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
        i += 1
        next
      end
      j = ni + match[0].length
      break if j == n # Really incomplete
      handle_data(rawdata[i...j])
      i = j
    end # end while

    if _end && i < n
      handle_data(rawdata[i...n])
      i = n
    end

    @rawdata = rawdata[i..-1]
    # @offset += i # FIXME BUGME another unused variable in SGMLParser?
  end

  # Internal -- parse the processing instruction starting at index i.
  # Calls handle_pi with the text between "<?" and the closing ">".
  # Returns the number of characters consumed (counted from i), or nil when
  # no closing ">" is present yet.
  def parse_pi(i)
    rawdata = @rawdata
    if rawdata[i...i+2] != '<?'
      error("unexpected call to parse_pi()")
    end
    ni, match = index_match(rawdata, Piclose, i+2)
    return nil unless match
    j = ni
    handle_pi(rawdata[i+2...j])
    j = (j + match[0].length)
    return j-i
  end

  # Internal -- parse the comment starting at index i.
  # Calls handle_comment with the comment text. Returns the absolute index
  # just past the comment terminator, or nil when no terminator is present.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i...i+4] != "<!--"
      error("unexpected call to parse_comment()")
    end
    ni, match = index_match(rawdata, Commentclose, i)
    return nil unless match
    handle_comment(rawdata[i+4..(ni-1)])
    return ni+match[0].length
  end

  # Internal -- parse the start tag beginning at index i.
  # Collects [name, value] attribute pairs (names downcased), records the raw
  # tag text in @_starttag_text and hands tag and attributes to
  # finish_starttag. Returns the absolute index just past the tag, or nil when
  # the tag is not complete yet.
  def parse_starttag(i)
    @_starttag_text = nil
    start_pos = i
    rawdata = @rawdata
    ni, match = index_match(rawdata, Shorttagopen, i)
    if ni == i
      # SGML shorthand: <tag/data/ == <tag>data</tag>
      # XXX Can data contain &... (entity or char refs)?
      # XXX Can data contain < or > (tag characters)?
      # XXX Can there be whitespace before the first /?
      k, match = index_match(rawdata, Shorttag, i)
      return nil unless match
      tag, data = match[1], match[2]
      @_starttag_text = "<#{tag}/"
      tag.downcase!
      second_end = rawdata.index(Shorttagopen, k)
      finish_shorttag(tag, data)
      @_starttag_text = rawdata[start_pos...second_end+1]
      return k
    end

    j = rawdata.index(Endbracket, i+1)
    return nil unless j
    attrsd = []
    if rawdata[i...i+2] == '<>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      ni, match = index_match(rawdata, Tagfind, i+1)
      unless match
        error('unexpected call to parse_starttag')
      end
      k = ni+match[0].length+1
      tag = match[0].downcase
      @lasttag = tag
    end

    while k < j
      break if rawdata.index(Endtagfind, k) == k
      ni, match = index_match(rawdata, Attrfind, k)
      break unless ni
      matched_length = match[0].length
      attrname, rest, attrvalue = match[1], match[2], match[3]
      if rest.nil? || rest.empty?
        attrvalue = '' # was: = attrname # Why the change?
      elsif (attrvalue[0..0] == "'" && attrvalue[-1..-1] == "'") ||
            (attrvalue[0..0] == '"' && attrvalue[-1..-1] == '"')
        # Compare one-character string slices for both quote styles so the
        # check does not depend on what ?' and String#[] return.
        attrvalue = attrvalue[1...-1]
      end
      attrsd << [attrname.downcase, attrvalue]
      k += matched_length
    end
    if rawdata[j..j] == ">"
      j += 1
    end
    @_starttag_text = rawdata[start_pos...j]
    finish_starttag(tag, attrsd)
    return j
  end

  # Internal -- parse the end tag beginning at index i.
  # Passes the stripped, downcased tag name to finish_endtag. Returns the
  # absolute index just past the tag, or nil when neither '<' nor '>' follows.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(/[<>]/, i+1)
    return nil unless j
    tag = rawdata[i+2...j].strip.downcase
    if rawdata[j..j] == ">"
      j += 1
    end
    finish_endtag(tag)
    return j
  end

  # Return processed HTML as a single string.
  # NOTE(review): @pieces is not assigned anywhere in this class; presumably
  # a subclass or handler fills it -- confirm.
  def output
    return @pieces.map { |piece| piece.to_s }.join
  end

  # Raises BetterSGMLParserError carrying the given message.
  def error(message)
    raise BetterSGMLParserError, message
  end

  # Processing instructions are ignored.
  def handle_pi(text)
  end

  # Declarations are ignored.
  def handle_decl(text)
  end
end
263
+
264
+
@@ -0,0 +1,260 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Helper methods for character-encoding detection and conversion plus a small
# regexp utility, meant to be mixed into the feed parser classes.
#
# NOTE(review): uconvert relies on Iconv and on an Encoding_Aliases table that
# are not defined in this module; both must be provided by the surrounding
# program.
module FeedParserUtilities

  # Takes a single string and converts it from the encoding in
  # from_encoding to unicode.
  def unicode(data, from_encoding)
    uconvert(data, from_encoding, 'unicode')
  end

  # Converts data from from_encoding to to_encoding (default 'utf-8') via
  # Iconv. Either encoding name is first looked up in Encoding_Aliases and
  # used unchanged when no alias exists.
  def uconvert(data, from_encoding, to_encoding = 'utf-8')
    from_encoding = Encoding_Aliases[from_encoding] || from_encoding
    to_encoding = Encoding_Aliases[to_encoding] || to_encoding
    Iconv.iconv(to_encoding, from_encoding, data)[0]
  end

  # Finds the first match of regexp in stri at or after offset.
  # Returns [index, match_data], or [nil, nil] when there is no match.
  # The MatchData comes from re-matching the substring that starts at the
  # found index, so its offsets are relative to that substring.
  def index_match(stri, regexp, offset)
    i = stri.index(regexp, offset)

    return nil, nil unless i

    full = stri[i..-1].match(regexp)
    return i, full
  end

  # Converts EBCDIC (cp500) data to iso-8859-1 via Iconv.
  def _ebcdic_to_ascii(s)
    Iconv.iconv("iso-8859-1", "cp500", s)[0]
  end

  # Get the character encoding of the XML document.
  #
  # http_headers:: hash-like object with a 'content-type' entry; nil is
  #                treated like a response without headers.
  # xml_data::     the raw document text.
  #
  # Returns [true_encoding, http_encoding, xml_encoding,
  # sniffed_xml_encoding, acceptable_content_type].
  #
  # NOTE(review): the byte-prefix comparisons below assume String#[] and
  # String#== operate on raw bytes (Ruby 1.8 semantics) -- confirm before
  # relying on BOM sniffing under encoding-aware Ruby versions.
  def getCharacterEncoding(http_headers, xml_data)
    $stderr << "In getCharacterEncoding\n" if $debug
    sniffed_xml_encoding = nil
    xml_encoding = nil
    true_encoding = nil

    # Guard against nil headers; the content-type branch further down already
    # allows for them.
    content_type_header = http_headers ? http_headers['content-type'] : nil
    http_content_type, charset = content_type_header.to_s.split(';',2)

    encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
    http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]

    http_encoding = nil if http_encoding && http_encoding.empty?
    # FIXME Open-Uri returns iso8859-1 if there is no charset header,
    # but that doesn't pass the tests. Open-Uri claims its following
    # the right RFC. Are they wrong or do we need to change the tests?

    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    begin
      if xml_data[0..3] == "\x4c\x6f\xa7\x94"
        # EBCDIC
        xml_data = _ebcdic_to_ascii(xml_data)
      elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
        # UTF-16BE
        sniffed_xml_encoding = 'utf-16be'
        xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
      elsif xml_data.size >= 4 && xml_data[0..1] == "\xfe\xff" && xml_data[2..3] != "\x00\x00"
        # UTF-16BE with BOM
        sniffed_xml_encoding = 'utf-16be'
        xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
      elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
        # UTF-16LE
        sniffed_xml_encoding = 'utf-16le'
        xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
      elsif xml_data.size >= 4 && xml_data[0..1] == "\xff\xfe" && xml_data[2..3] != "\x00\x00"
        # UTF-16LE with BOM
        sniffed_xml_encoding = 'utf-16le'
        xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
      elsif xml_data[0..3] == "\x00\x00\x00\x3c"
        # UTF-32BE
        sniffed_xml_encoding = 'utf-32be'
        xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
      elsif xml_data[0..3] == "\x3c\x00\x00\x00"
        # UTF-32LE
        sniffed_xml_encoding = 'utf-32le'
        xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
      elsif xml_data[0..3] == "\x00\x00\xfe\xff"
        # UTF-32BE with BOM
        sniffed_xml_encoding = 'utf-32be'
        xml_data = uconvert(xml_data[4..-1], 'utf-32be', 'utf-8')
      elsif xml_data[0..3] == "\xff\xfe\x00\x00"
        # UTF-32LE with BOM
        sniffed_xml_encoding = 'utf-32le'
        xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
      elsif xml_data[0..2] == "\xef\xbb\xbf"
        # UTF-8 with BOM
        sniffed_xml_encoding = 'utf-8'
        xml_data = xml_data[3..-1]
      else
        # ASCII-compatible
      end
      # \A rather than ^: the declaration only counts at the very start of
      # the document, not at the start of some later line.
      xml_encoding_match = /\A<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
    rescue
      # Best effort: a failed conversion just means no declared encoding.
      xml_encoding_match = nil
    end

    if xml_encoding_match
      xml_encoding = xml_encoding_match[1].downcase
      xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
      # A generic multi-byte label is replaced by the byte order we sniffed.
      if sniffed_xml_encoding && xencodings.include?(xml_encoding)
        xml_encoding = sniffed_xml_encoding
      end
    end

    acceptable_content_type = false
    application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
    text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

    if application_content_types.include?(http_content_type) || (/^application\// =~ http_content_type && /\+xml$/ =~ http_content_type)
      acceptable_content_type = true
      true_encoding = http_encoding || xml_encoding || 'utf-8'
    elsif text_content_types.include?(http_content_type) || (/^text\// =~ http_content_type && /\+xml$/ =~ http_content_type)
      acceptable_content_type = true
      true_encoding = http_encoding || 'us-ascii'
    elsif /^text\// =~ http_content_type
      true_encoding = http_encoding || 'us-ascii'
    elsif http_headers && !http_headers.empty? && !http_headers['content-type']
      true_encoding = xml_encoding || 'iso-8859-1'
    else
      true_encoding = xml_encoding || 'utf-8'
    end
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
  end

  # Converts data to UTF-8 and makes sure the result starts with an XML
  # declaration that names utf-8.
  #
  # A leading byte-order mark overrides the encoding argument and is removed
  # before conversion. Errors raised by uconvert propagate to the caller.
  # Returns the converted string.
  def toUTF8(data, encoding)
    $stderr << "entering self.toUTF8, trying encoding #{encoding}\n" if $debug
    # NOTE we must use double quotes when dealing with \x encodings!
    if (data.size >= 4 && data[0..1] == "\xfe\xff" && data[2..3] != "\x00\x00")
      if $debug
        $stderr << "stripping BOM\n"
        $stderr << "trying utf-16be instead\n" if encoding != 'utf-16be'
      end
      encoding = 'utf-16be'
      data = data[2..-1]
    elsif (data.size >= 4 && data[0..1] == "\xff\xfe" && data[2..3] != "\x00\x00")
      if $debug
        $stderr << "stripping BOM\n"
        $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
      end
      encoding = 'utf-16le'
      data = data[2..-1]
    elsif (data[0..2] == "\xef\xbb\xbf")
      if $debug
        $stderr << "stripping BOM\n"
        $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
      end
      encoding = 'utf-8'
      data = data[3..-1]
    elsif (data[0..3] == "\x00\x00\xfe\xff")
      if $debug
        $stderr << "stripping BOM\n"
        $stderr << "trying utf-32be instead\n" if encoding != 'utf-32be'
      end
      encoding = 'utf-32be'
      data = data[4..-1]
    elsif (data[0..3] == "\xff\xfe\x00\x00")
      if $debug
        $stderr << "stripping BOM\n"
        $stderr << "trying utf-32le instead\n" if encoding != 'utf-32le'
      end
      encoding = 'utf-32le'
      data = data[4..-1]
    end

    newdata = uconvert(data, encoding, 'utf-8')
    $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug

    # \A rather than ^: only a declaration at the very start of the document
    # may be replaced; a "<?xml..." at the start of a later line is content.
    declmatch = /\A<\?xml[^>]*?>/
    newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"

    if declmatch =~ newdata
      newdata.sub!(declmatch, newdecl)
    else
      newdata = newdecl + "\n" + newdata
    end

    newdata
  end

end
191
+
192
# XML character escaping helpers, defined only when Builder has not already
# supplied its own XChar implementation.
unless defined?(Builder::XChar)
  # http://intertwingly.net/stories/2005/09/28/xchar.rb
  module XChar
    # Maps Windows-1252 code points 128..159 to their Unicode equivalents.
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    CP1252 = {
      128 => 8364, # euro sign
      130 => 8218, # single low-9 quotation mark
      131 =>  402, # latin small letter f with hook
      132 => 8222, # double low-9 quotation mark
      133 => 8230, # horizontal ellipsis
      134 => 8224, # dagger
      135 => 8225, # double dagger
      136 =>  710, # modifier letter circumflex accent
      137 => 8240, # per mille sign
      138 =>  352, # latin capital letter s with caron
      139 => 8249, # single left-pointing angle quotation mark
      140 =>  338, # latin capital ligature oe
      142 =>  381, # latin capital letter z with caron
      145 => 8216, # left single quotation mark
      146 => 8217, # right single quotation mark
      147 => 8220, # left double quotation mark
      148 => 8221, # right double quotation mark
      149 => 8226, # bullet
      150 => 8211, # en dash
      151 => 8212, # em dash
      152 =>  732, # small tilde
      153 => 8482, # trade mark sign
      154 =>  353, # latin small letter s with caron
      155 => 8250, # single right-pointing angle quotation mark
      156 =>  339, # latin small ligature oe
      158 =>  382, # latin small letter z with caron
      159 =>  376  # latin capital letter y with diaeresis
    }

    # Characters that must always be written as entities.
    # http://www.w3.org/TR/REC-xml/#dt-chardata
    PREDEFINED = {
      38 => '&amp;', # ampersand
      60 => '&lt;',  # left angle bracket
      62 => '&gt;'   # right angle bracket
    }

    # Code points allowed in XML documents.
    # http://www.w3.org/TR/REC-xml/#charsets
    VALID = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ]
  end

  # Reopen Integer rather than Fixnum: Fixnum is a subclass of Integer on old
  # Rubies and no longer exists as a constant on current ones, so Integer is
  # the one class that receives xchr everywhere.
  class Integer
    # xml escaped version of chr
    #
    # Treats the receiver as a code point: CP1252 values are remapped first,
    # then valid XML characters become a predefined entity, a plain ASCII
    # character, or a numeric character reference. Anything outside
    # XChar::VALID becomes '*'.
    def xchr
      n = XChar::CP1252[self] || self

      case n
      when *XChar::VALID
        XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
      else
        '*'
      end
    end
  end

  class String
    # Returns an XML-escaped copy of the string. The bytes are decoded as
    # UTF-8 first; if that raises, each byte is escaped individually.
    def to_xs
      unpack('U*').map { |n| n.xchr }.join # ASCII, UTF-8
    rescue
      unpack('C*').map { |n| n.xchr }.join # ISO-8859-1, WIN-1252
    end
  end
end