jgre-rfeedparser 0.9.961

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/env ruby
2
+
3
+
4
# Error type raised by BetterSGMLParser#error.
class BetterSGMLParserError < StandardError
end
5
+ class BetterSGMLParser < HTML::SGMLParser
6
# Scanner patterns used by goahead and the parse_* methods.
#
# Tagfind and Charref were replaced with the expressions found in
# feedparser.py; according to the original author this is what makes
# things work.  The patterns were changed to match the Python SGMLParser.

# Characters that end a run of plain text.
Interesting = /[&<]/u

# A reference or tag that may still be missing its tail.
# The trailing 64 is the unicode flag.
Incomplete = Regexp.compile(
  '&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|![^<>]*)?',
  64
)

# Entity and character references, including the character that ends them.
Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u

# SGML short tags (<tag/data/).
# NOTE(review): both patterns start with a literal apostrophe, which looks
# like a leftover from the Python string literals -- confirm before relying
# on short-tag parsing.
Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u

# Tag, declaration and processing-instruction delimiters.
Endtagopen = /<\//u
Endbracket = /[<>]/u
Declopen = /<!/u
Piopenbegin = /^<\?/u
Piclose = />/u

# Comment delimiters.
Commentopen = /<!--/u
Commentclose = /--\s*>/u

# Tag names, attributes and the "/>" that ends a self-closing tag.
Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
Attrfind = Regexp.compile(
  '\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' +
  '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
  64
)
Endtagfind = /\s*\/\s*>/u
30
# Creates the parser; +verbose+ is passed unchanged to the superclass
# constructor.
def initialize(verbose = false)
  super
end
33
# Forwards every argument unchanged to the superclass implementation of feed.
def feed(*args)
  super
end
36
+
37
# Internal -- consume as much of @rawdata as can be parsed right now.
#
# Walks the buffer from its start.  Plain text goes to handle_data; markup is
# dispatched to parse_starttag, parse_endtag, parse_comment, parse_pi and
# parse_declaration; references go to handle_charref / handle_entityref.
# Scanning stops as soon as a construct turns out to be incomplete, and the
# unconsumed tail is stored back into @rawdata.
#
# _end:: when true, whatever could not be parsed is flushed through
#        handle_data instead of being kept for the next call.
def goahead(_end)
  rawdata = @rawdata # woo, utf-8 magic
  i = 0
  n = rawdata.length
  while i < n
    if @nomoretags
      # Tag parsing is switched off: the rest of the buffer is plain data.
      handle_data(rawdata[i...n]) # i...n means "from i up to, not including, n"
      i = n
      break
    end

    # Everything up to the next '&' or '<' is plain text.
    j = rawdata.index(Interesting, i)
    j = n unless j
    handle_data(rawdata[i...j]) if i < j
    i = j
    break if i == n

    if rawdata[i..i] == '<'
      if rawdata.index(Starttagopen, i) == i
        if @literal
          handle_data(rawdata[i..i])
          i = i + 1
          next
        end
        k = parse_starttag(i)
        break unless k
        i = k
        next
      end
      if rawdata.index(Endtagopen, i) == i
        k = parse_endtag(i)
        break unless k
        i = k
        @literal = false
        next
      end
      if @literal
        if n > (i + 1)
          handle_data("<")
          i = i + 1
        else
          # incomplete
          break
        end
        next
      end
      if rawdata.index(Commentopen, i) == i
        k = parse_comment(i)
        break unless k
        i = k
        next
      end
      if rawdata.index(Piopenbegin, i) == i
        # parse_pi returns the length of the instruction, not an end index.
        k = parse_pi(i)
        break unless k
        i += k
        next
      end
      if rawdata.index(Declopen, i) == i
        # This is some sort of declaration; in "HTML as deployed," this
        # should only be the document type declaration ("<!DOCTYPE html...>").
        k = parse_declaration(i)
        break unless k
        i = k
        next
      end
    elsif rawdata[i..i] == '&'
      if @literal
        handle_data(rawdata[i..i])
        i += 1
        next
      end

      # Charref is tried before Entityref (kept from the original port).
      ni, match = index_match(rawdata, Charref, i)
      if ni && ni == i
        handle_charref(match[1])
        i += match[0].length
        # The pattern also swallows the terminating character; give it back
        # unless it was the ';' that belongs to the reference.
        i -= 1 unless rawdata[(i - 1)..(i - 1)] == ";"
        next
      end
      ni, match = index_match(rawdata, Entityref, i)
      if ni && ni == i
        handle_entityref(match[1])
        i += match[0].length
        i -= 1 unless rawdata[(i - 1)..(i - 1)] == ";"
        next
      end
    else
      error('neither < nor & ??')
    end

    # We get here only if nothing above recognised the construct.  The
    # match must start at the current position i; comparing against 0 (as
    # the code used to) made every unfinished reference or tag after the
    # first character of the buffer come out as plain text instead of
    # being kept until more input arrives.
    ni, match = index_match(rawdata, Incomplete, i)
    unless ni && ni == i
      handle_data(rawdata[i...(i + 1)])
      i += 1
      next
    end
    j = ni + match[0].length
    break if j == n # Really incomplete
    handle_data(rawdata[i...j])
    i = j
  end # end while

  if _end && i < n
    handle_data(rawdata[i...n])
    i = n
  end

  @rawdata = rawdata[i..-1]
  # @offset += i # FIXME BUGME another unused variable in SGMLParser?
end
150
+
151
+
152
# Internal -- parse the processing instruction that starts at index +i+.
#
# The text between "<?" and the terminating ">" is passed to handle_pi.
# Returns the length of the whole instruction (the caller adds it to its
# position), or nil when the terminator has not been seen yet.  Reports
# through error() when +i+ does not point at "<?".
def parse_pi(i)
  buffer = @rawdata
  error("unexpected call to parse_pi()") unless buffer[i, 2] == '<?'

  close_at, closer = index_match(buffer, Piclose, i + 2)
  return nil unless closer

  handle_pi(buffer[(i + 2)...close_at])
  close_at + closer[0].length - i
end
165
+
166
# Internal -- parse the comment that starts at index +i+ of @rawdata.
#
# The text between "<!--" and the terminator matched by Commentclose is
# passed to handle_comment.  Returns the index just past the terminator, or
# nil when the comment is not terminated yet.  Reports through error() when
# +i+ does not point at "<!--".
def parse_comment(i)
  rawdata = @rawdata
  if rawdata[i...(i + 4)] != "<!--"
    error("unexpected call to parse_comment()")
  end

  # Look for the terminator only after the opener.  Searching from i let the
  # dashes of "<!--" itself take part in the match, so "<!-->" was treated
  # as a complete, empty comment.
  ni, match = index_match(rawdata, Commentclose, i + 4)
  return nil unless match

  handle_comment(rawdata[(i + 4)...ni])
  ni + match[0].length # absolute index just past the closing "-->"
end
176
+
177
+
178
# Internal -- parse the start tag that begins at index +i+ of @rawdata.
#
# The tag name is lower-cased and remembered in @lasttag ("<>" reuses the
# remembered name).  Attributes are collected as [lower-cased name, value]
# pairs; an attribute without "=value" gets the empty string, and matching
# single or double quotes around a value are removed.  The raw tag text is
# stored in @_starttag_text and the result is handed to finish_starttag.
#
# Returns the index just past the tag, or nil when the closing bracket has
# not arrived yet.
def parse_starttag(i)
  @_starttag_text = nil
  start_pos = i
  rawdata = @rawdata

  ni, match = index_match(rawdata, Shorttagopen, i)
  if ni == i
    # SGML shorthand: <tag/data/ == <tag>data</tag>
    # XXX Can data contain &... (entity or char refs)?
    # XXX Can data contain < or > (tag characters)?
    # XXX Can there be whitespace before the first /?
    # NOTE(review): Shorttagopen as defined in this class starts with an
    # apostrophe, so this branch is not expected to fire for a '<' at i.
    k, match = index_match(rawdata, Shorttag, i)
    return nil unless match && k == i
    tag, data = match[1], match[2]
    @_starttag_text = "<#{tag}/"
    tag.downcase!
    finish_shorttag(tag, data)
    # index_match re-matches on the tail of the buffer, so the MatchData
    # offsets are relative to k.
    @_starttag_text = rawdata[start_pos...(k + match.end(1) + 1)]
    # Return the end of the short tag; returning its start would make the
    # caller parse the same position again.
    return k + match[0].length
  end

  j = rawdata.index(Endbracket, i + 1)
  return nil unless j
  attrsd = []
  if rawdata[i...(i + 2)] == '<>'
    # SGML shorthand: <> == <last open tag seen>
    k = j
    tag = @lasttag
  else
    ni, match = index_match(rawdata, Tagfind, i + 1)
    error('unexpected call to parse_starttag') unless match
    # Skips the delimiter that follows the tag name as well.
    k = ni + match[0].length + 1
    tag = match[0].downcase
    @lasttag = tag
  end

  while k < j
    break if rawdata.index(Endtagfind, k) == k
    ni, match = index_match(rawdata, Attrfind, k)
    # The search is not anchored, so make sure the attribute starts inside
    # this tag; otherwise a word from the following text would be picked up.
    break unless ni && ni < j
    attrname, rest, attrvalue = match[1], match[2], match[3]
    if rest.nil? || rest.empty?
      attrvalue = '' # was: = attrname # Why the change?
    else
      # One-character slices compare as strings on every Ruby version
      # (character literals and String#[] with an Integer do not).
      first_char = attrvalue[0..0]
      last_char = attrvalue[-1..-1]
      if first_char == last_char && (first_char == "'" || first_char == '"')
        attrvalue = attrvalue[1...-1]
      end
    end
    attrsd << [attrname.downcase, attrvalue]
    # Continue right after this match, wherever it started.
    k = ni + match[0].length
  end

  j += 1 if rawdata[j..j] == ">"
  @_starttag_text = rawdata[start_pos...j]
  finish_starttag(tag, attrsd)
  return j
end
237
+
238
# Internal -- parse the end tag that starts at index +i+ of @rawdata.
#
# The text between "</" and the next '<' or '>' is stripped, lower-cased and
# passed to finish_endtag.  Returns the index after the tag (past the '>'
# when there is one), or nil when no bracket follows yet.
def parse_endtag(i)
  buffer = @rawdata
  bracket_at = buffer.index(/[<>]/, i + 1)
  return nil unless bracket_at

  name = buffer[(i + 2)...bracket_at].strip.downcase
  bracket_at += 1 if buffer[bracket_at, 1] == ">"
  finish_endtag(name)
  bracket_at
end
249
+
250
# Returns the processed HTML as a single string: every entry of @pieces is
# converted with to_s and the results are concatenated.
def output
  rendered = @pieces.collect { |piece| piece.to_s }
  rendered.join
end
254
+
255
# Reports a parse problem by raising BetterSGMLParserError with +message+.
def error(message)
  raise BetterSGMLParserError, message
end
258
# Hook that parse_pi calls with the text of a processing instruction.
# Does nothing here.
def handle_pi(text)
  nil
end
260
# Hook for declarations; takes the declaration text and does nothing here.
def handle_decl(text)
  nil
end
262
+ end
263
+
264
+
@@ -0,0 +1,260 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module FeedParserUtilities
4
+
5
# Converts a single string from +from_encoding+ to the 'unicode' target
# encoding by delegating to uconvert.
def unicode(data, from_encoding)
  target_encoding = 'unicode'
  uconvert(data, from_encoding, target_encoding)
end
10
+
11
# Converts +data+ from +from_encoding+ to +to_encoding+ (utf-8 by default)
# with Iconv.  Both encoding names are first looked up in Encoding_Aliases;
# names without an alias are used as given.  Returns the converted string.
def uconvert(data, from_encoding, to_encoding = 'utf-8')
  source = Encoding_Aliases[from_encoding] || from_encoding
  target = Encoding_Aliases[to_encoding] || to_encoding
  Iconv.iconv(target, source, data).first
end
16
+
17
# Finds the first match of +regexp+ in +stri+ at or after +offset+.
#
# Returns [index, match]: the index where the match starts and the MatchData
# obtained by matching again on the part of the string that begins there
# (so its offsets are relative to that index).  Returns [nil, nil] when
# nothing matches.
def index_match(stri, regexp, offset)
  found_at = stri.index(regexp, offset)
  if found_at
    [found_at, stri[found_at..-1].match(regexp)]
  else
    [nil, nil]
  end
end
25
+
26
# Converts +s+ from cp500 to iso-8859-1 with Iconv and returns the result.
def _ebcdic_to_ascii(s)
  converted = Iconv.iconv("iso-8859-1", "cp500", s)
  converted[0]
end
29
+
30
# Get the character encoding of the XML document.
#
# http_headers:: hash-like object with the response headers (only
#                'content-type' is read); may be nil.
# xml_data::     the raw document.
#
# Returns five values:
#   true_encoding           - the encoding chosen from the content type,
#                             the HTTP charset and the XML declaration
#   http_encoding           - charset parameter of the content-type header, or nil
#   xml_encoding            - lower-cased encoding named in the XML declaration, or nil
#   sniffed_xml_encoding    - encoding detected from the leading bytes, or nil
#   acceptable_content_type - true for XML application/text content types
def getCharacterEncoding(http_headers, xml_data)
  $stderr << "In getCharacterEncoding\n" if $debug
  sniffed_xml_encoding = nil
  xml_encoding = nil
  true_encoding = nil

  # The headers may be missing altogether (the branch further down already
  # allows for that), so do not index into nil.
  content_type_header = http_headers ? http_headers['content-type'] : nil
  http_content_type, charset = content_type_header.to_s.split(';', 2)
  # "text/xml ; charset=..." must still compare equal to "text/xml".
  http_content_type = http_content_type.strip if http_content_type

  encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
  http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]

  http_encoding = nil if http_encoding && http_encoding.empty?
  # FIXME Open-Uri returns iso8859-1 if there is no charset header,
  # but that doesn't pass the tests. Open-Uri claims its following
  # the right RFC. Are they wrong or do we need to change the tests?

  # Must sniff for non-ASCII-compatible character encodings before
  # searching for XML declaration.  This heuristic is defined in
  # section F of the XML specification:
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
  begin
    if xml_data[0..3] == "\x4c\x6f\xa7\x94"
      # EBCDIC
      xml_data = _ebcdic_to_ascii(xml_data)
    elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
      # UTF-16BE
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xfe\xff" && xml_data[2..3] != "\x00\x00"
      # UTF-16BE with BOM
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
      # UTF-16LE
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xff\xfe" && xml_data[2..3] != "\x00\x00"
      # UTF-16LE with BOM
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\x00\x3c"
      # UTF-32BE
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x00\x00"
      # UTF-32LE
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\xfe\xff"
      # UTF-32BE with BOM
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
    elsif xml_data[0..3] == "\xff\xfe\x00\x00"
      # UTF-32LE with BOM
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
    elsif xml_data[0..2] == "\xef\xbb\xbf"
      # UTF-8 with BOM
      sniffed_xml_encoding = 'utf-8'
      xml_data = xml_data[3..-1]
    else
      # ASCII-compatible
    end
    # Only a declaration at the very start of the document counts.  In Ruby
    # "^" also matches after every newline, which let a processing
    # instruction on a later line supply the encoding; \A does not.
    xml_encoding_match = /\A<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
  rescue
    # Best effort: if sniffing or conversion fails, carry on without an
    # XML-declared encoding.
    xml_encoding_match = nil
  end

  if xml_encoding_match
    xml_encoding = xml_encoding_match[1].downcase
    # For these multi-byte names the sniffed byte order is used instead of
    # the declared name.
    xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
    if sniffed_xml_encoding && xencodings.include?(xml_encoding)
      xml_encoding = sniffed_xml_encoding
    end
  end

  acceptable_content_type = false
  application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
  text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

  if application_content_types.include?(http_content_type) || (/^application\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || xml_encoding || 'utf-8'
  elsif text_content_types.include?(http_content_type) || (/^text\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || 'us-ascii'
  elsif /^text\// =~ http_content_type
    true_encoding = http_encoding || 'us-ascii'
  elsif http_headers && !http_headers.empty? && !content_type_header
    # Headers were sent, but without a content type.
    true_encoding = xml_encoding || 'iso-8859-1'
  else
    true_encoding = xml_encoding || 'utf-8'
  end
  return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
end
125
+
126
# Converts +data+ to UTF-8 and makes sure it starts with an XML declaration
# that says so.
#
# data::     the raw document.
# encoding:: the encoding to convert from.  A byte order mark at the start
#            of +data+ overrides it and is removed before conversion.
#
# Returns the converted document.  An XML declaration at the very start is
# replaced by one naming utf-8; if there is none, one is put in front.
# Errors raised by uconvert are passed on to the caller.
def toUTF8(data, encoding)
  $stderr << "entering self.toUTF8, trying encoding #{encoding}\n" if $debug

  # NOTE we must use double quotes when dealing with \x encodings!
  # Each entry yields the encoding implied by the BOM and the BOM's size.
  bom_encoding, bom_size =
    if data.size >= 4 && data[0..1] == "\xfe\xff" && data[2..3] != "\x00\x00"
      ['utf-16be', 2]
    elsif data.size >= 4 && data[0..1] == "\xff\xfe" && data[2..3] != "\x00\x00"
      ['utf-16le', 2]
    elsif data[0..2] == "\xef\xbb\xbf"
      ['utf-8', 3]
    elsif data[0..3] == "\x00\x00\xfe\xff"
      ['utf-32be', 4]
    elsif data[0..3] == "\xff\xfe\x00\x00"
      ['utf-32le', 4]
    end

  if bom_encoding
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying #{bom_encoding} instead\n" if encoding != bom_encoding
    end
    encoding = bom_encoding
    data = data[bom_size..-1]
  end

  newdata = uconvert(data, encoding, 'utf-8')
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug

  # \A, not ^: in Ruby "^" also matches after every newline, so a "<?xml"
  # at the start of a later line used to be rewritten and no declaration
  # was put in front of the document.
  declmatch = /\A<\?xml[^>]*?>/
  newdecl = "<?xml version='1.0' encoding='utf-8'?>"

  if declmatch =~ newdata
    newdata = newdata.sub(declmatch, newdecl)
  else
    newdata = newdecl + "\n" + newdata
  end

  newdata
end
189
+
190
+ end
191
+
192
# XML character escaping, defined only when Builder has not already
# provided Builder::XChar.
unless defined?(Builder::XChar)
  # http://intertwingly.net/stories/2005/09/28/xchar.rb
  module XChar
    # Maps code points 128-159 to the Unicode characters they most likely
    # stand for.
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    CP1252 = {
      128 => 8364, # euro sign
      130 => 8218, # single low-9 quotation mark
      131 =>  402, # latin small letter f with hook
      132 => 8222, # double low-9 quotation mark
      133 => 8230, # horizontal ellipsis
      134 => 8224, # dagger
      135 => 8225, # double dagger
      136 =>  710, # modifier letter circumflex accent
      137 => 8240, # per mille sign
      138 =>  352, # latin capital letter s with caron
      139 => 8249, # single left-pointing angle quotation mark
      140 =>  338, # latin capital ligature oe
      142 =>  381, # latin capital letter z with caron
      145 => 8216, # left single quotation mark
      146 => 8217, # right single quotation mark
      147 => 8220, # left double quotation mark
      148 => 8221, # right double quotation mark
      149 => 8226, # bullet
      150 => 8211, # en dash
      151 => 8212, # em dash
      152 =>  732, # small tilde
      153 => 8482, # trade mark sign
      154 =>  353, # latin small letter s with caron
      155 => 8250, # single right-pointing angle quotation mark
      156 =>  339, # latin small ligature oe
      158 =>  382, # latin small letter z with caron
      159 =>  376  # latin capital letter y with diaeresis
    }.freeze

    # Characters that must always be written as entities.
    # http://www.w3.org/TR/REC-xml/#dt-chardata
    PREDEFINED = {
      38 => '&amp;', # ampersand
      60 => '&lt;',  # left angle bracket
      62 => '&gt;'   # right angle bracket
    }.freeze

    # Code points that are allowed in an XML document.
    # http://www.w3.org/TR/REC-xml/#charsets
    VALID = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ].freeze
  end

  # Defined on Integer rather than Fixnum: Fixnum is a subclass of Integer
  # on old Rubies and no longer exists on Ruby 3.2+, where reopening it
  # would only create an unrelated empty class.
  class Integer
    # XML escaped version of chr.
    #
    # Returns the predefined entity for & < >, the character itself for
    # other valid code points below 128, a numeric character reference for
    # valid code points from 128 up (after the CP1252 mapping), and '*' for
    # code points that are not valid in XML.
    def xchr
      n = XChar::CP1252[self] || self

      case n when *XChar::VALID
        XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
      else
        '*'
      end
    end
  end

  class String
    # XML escaped version of the string.  The string is read as UTF-8; if
    # that fails it is read byte by byte instead.
    def to_xs
      unpack('U*').map { |n| n.xchr }.join # ASCII, UTF-8
    rescue
      unpack('C*').map { |n| n.xchr }.join # ISO-8859-1, WIN-1252
    end
  end
end