rfeedparser 0.9.9 → 0.9.85

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,264 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
-
4
# Raised by BetterSGMLParser#error when the parser meets markup it cannot
# handle.  It derives from StandardError (not Exception) so that a plain
# `rescue` clause catches it; code that rescues BetterSGMLParserError or
# Exception explicitly keeps working unchanged.
class BetterSGMLParserError < StandardError; end
5
- class BetterSGMLParser < HTML::SGMLParser
6
- # Regexps used by the scanner below. Tagfind and Charref were replaced with the ones in feedparser.py.
7
- # According to the original author, that replacement is what makes things work.
8
- Interesting = /[&<]/u # goahead treats everything up to the next '&' or '<' as plain data
9
- Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|![^<>]*)?', 64)
10
- # 64 is the unicode flag (NOTE(review): presumably the Ruby 1.8 UTF-8 kcode bit; confirm its meaning on the target Ruby)
11
-
12
- Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
13
- Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
14
-
15
- Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u # NOTE(review): the leading ' looks like a stray quote; a match can then never start on a '<' -- confirm before changing
16
- Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u # NOTE(review): same leading ' as Shorttagopen
17
- Endtagopen = /<\//u # Matching the Python SGMLParser
18
- Endbracket = /[<>]/u # parse_starttag uses this to find where a start tag stops
19
- Declopen = /<!/u
20
- Piopenbegin = /^<\?/u # anchored with ^, so it only matches "<?" at the start of a line
21
- Piclose = />/u
22
-
23
- Commentopen = /<!--/u
24
- Commentclose = /--\s*>/u
25
- Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
26
- Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
27
- '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
28
- 64)
29
- Endtagfind = /\s*\/\s*>/u # parse_starttag stops reading attributes when this matches at the cursor
30
# Builds the parser; the +verbose+ flag is handed to the parent class
# constructor unchanged.
def initialize(verbose=false)
  super
end
33
# Passes every argument straight through to the parent class's feed.
def feed(*args)
  super
end
36
-
37
# Main scanning loop.  Walks @rawdata, sending plain text to handle_data and
# dispatching markup to parse_starttag / parse_endtag / parse_comment /
# parse_pi / parse_declaration and references to handle_charref /
# handle_entityref.
#
# _end - when true, whatever is still unparsed once the loop stops is
#        flushed through handle_data instead of being kept.
#
# Anything that could not be consumed (for example a tag or reference that
# is still incomplete at the tail of the buffer) is left in @rawdata so a
# later call can retry once more input has been appended.
def goahead(_end)
  rawdata = @rawdata # woo, utf-8 magic
  i = 0
  n = rawdata.length
  while i < n
    if @nomoretags
      # Everything that is left is plain data.
      handle_data(rawdata[i...n])
      i = n
      break
    end

    # Emit the run of ordinary text up to the next '&' or '<'.
    j = rawdata.index(Interesting, i) || n
    handle_data(rawdata[i...j]) if i < j
    i = j
    break if i == n

    if rawdata[i..i] == '<'
      if rawdata.index(Starttagopen, i) == i
        if @literal
          handle_data(rawdata[i..i])
          i += 1
          next
        end
        k = parse_starttag(i)
        break unless k # incomplete start tag: wait for more input
        i = k
        next
      end
      if rawdata.index(Endtagopen, i) == i
        k = parse_endtag(i)
        break unless k
        i = k
        @literal = false
        next
      end
      if @literal
        if n > (i + 1)
          handle_data("<")
          i += 1
        else
          # incomplete
          break
        end
        next
      end
      if rawdata.index(Commentopen, i) == i
        k = parse_comment(i)
        break unless k
        i = k
        next
      end
      if rawdata.index(Piopenbegin, i) == i
        # parse_pi returns a length, not an absolute position.
        k = parse_pi(i)
        break unless k
        i += k
        next
      end
      if rawdata.index(Declopen, i) == i
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        k = parse_declaration(i)
        break unless k
        i = k
        next
      end
    elsif rawdata[i..i] == '&'
      if @literal
        handle_data(rawdata[i..i])
        i += 1
        next
      end

      # The Charref check must come first (see the original author's note
      # about it being the UTF-8 safe one).
      ni, match = index_match(rawdata, Charref, i)
      if ni == i
        handle_charref(match[1])
        i += match[0].length
        # The regexp consumes one terminating character; give it back
        # unless it was the ';' that belongs to the reference.
        i -= 1 unless rawdata[i-1..i-1] == ";"
        next
      end
      ni, match = index_match(rawdata, Entityref, i)
      if ni == i
        handle_entityref(match[1])
        i += match[0].length
        i -= 1 unless rawdata[i-1..i-1] == ";"
        next
      end
    else
      error('neither < nor & ??')
    end

    # We get here only if incomplete matches but nothing else.
    # The match has to start at the cursor (i), not at offset 0 of the
    # buffer: index_match searches forward from i, so comparing against 0
    # only ever succeeded for a construct at the very start of the buffer.
    ni, match = index_match(rawdata, Incomplete, i)
    unless ni == i
      handle_data(rawdata[i...i+1])
      i += 1
      next
    end
    j = ni + match[0].length
    break if j == n # Really incomplete
    handle_data(rawdata[i...j])
    i = j
  end # end while

  if _end && i < n
    handle_data(rawdata[i...n])
    i = n
  end

  @rawdata = rawdata[i..-1]
end
150
-
151
-
152
- # Internal -- parse processing instr; return its length (from i to just past the closing '>'), or nil if not terminated
153
# Parses the processing instruction that starts at offset +i+ of @rawdata.
# The text between "<?" and the first ">" goes to handle_pi.  Returns the
# number of characters from +i+ to just past that ">", or nil when no ">"
# is found.
def parse_pi(i)
  buffer = @rawdata
  error("unexpected call to parse_pi()") unless buffer[i...(i + 2)] == '<?'

  close_at, closer = index_match(buffer, Piclose, i + 2)
  return nil if closer.nil?

  handle_pi(buffer[(i + 2)...close_at])
  (close_at + closer[0].length) - i
end
165
-
166
# Parses the comment that starts at offset +i+ of @rawdata.  The text after
# "<!--" and before the closing sequence goes to handle_comment.  Returns
# the offset just past the closing sequence, or nil when none is found.
def parse_comment(i)
  buffer = @rawdata
  error("unexpected call to parse_comment()") unless buffer[i...(i + 4)] == "<!--"

  close_at, closer = index_match(buffer, Commentclose, i)
  return nil if closer.nil?

  handle_comment(buffer[(i + 4)..(close_at - 1)])
  close_at + closer[0].length
end
176
-
177
-
178
# Parses the start tag that begins at offset +i+ of @rawdata.
#
# Collects the tag name (lower-cased) and its attributes as
# [name, value] pairs, stores the raw tag text in @_starttag_text, and
# reports the tag through finish_starttag.  Returns the offset just past the
# tag, or nil when the tag is not complete yet.
#
# Attribute scanning is confined to the tag: a candidate attribute must
# begin before the closing bracket, and the cursor advances to the real end
# of each match.  (Previously a match found further along the buffer could
# be recorded as an attribute, and the cursor could stop short and re-read
# the tail of a value as another attribute.)
def parse_starttag(i)
  @_starttag_text = nil
  start_pos = i
  rawdata = @rawdata
  ni, match = index_match(rawdata, Shorttagopen, i)
  if ni == i
    # SGML shorthand: <tag/data/ == <tag>data</tag>
    # XXX Can data contain &... (entity or char refs)?
    # XXX Can data contain < or > (tag characters)?
    # XXX Can there be whitespace before the first /?
    k, match = index_match(rawdata, Shorttag, i)
    return nil unless match
    tag, data = match[1], match[2]
    @_starttag_text = "<#{tag}/"
    tag.downcase!
    second_end = rawdata.index(Shorttagopen, k)
    finish_shorttag(tag, data)
    @_starttag_text = rawdata[start_pos...second_end+1]
    return k
  end

  j = rawdata.index(Endbracket, i + 1)
  return nil unless j
  attrsd = []
  if rawdata[i...i+2] == '<>'
    # SGML shorthand: <> == <last open tag seen>
    k = j
    tag = @lasttag
  else
    ni, match = index_match(rawdata, Tagfind, i + 1)
    error('unexpected call to parse_starttag') unless match
    k = ni + match[0].length + 1
    tag = match[0].downcase
    @lasttag = tag
  end

  while k < j
    break if rawdata.index(Endtagfind, k) == k
    ni, match = index_match(rawdata, Attrfind, k)
    # Stop when nothing is left, or when the next candidate lies at or
    # beyond the closing bracket and so belongs to the following text.
    break unless ni && ni < j
    attrname, rest, attrvalue = match[1], match[2], match[3]
    if rest.nil? || rest.empty?
      attrvalue = '' # was: = attrname # Why the change?
    elsif (attrvalue[0..0] == "'" && attrvalue[-1..-1] == "'") ||
          (attrvalue[0..0] == '"' && attrvalue[-1..-1] == '"')
      # Strip one pair of matching quotes; compared as one-character
      # strings so the test does not depend on what ?' evaluates to.
      attrvalue = attrvalue[1...-1]
    end
    attrsd << [attrname.downcase, attrvalue]
    # Advance from where the match really started, so junk skipped before
    # the attribute does not leave the cursor inside it.
    k = ni + match[0].length
  end
  j += 1 if rawdata[j..j] == ">"
  @_starttag_text = rawdata[start_pos...j]
  finish_starttag(tag, attrsd)
  return j
end
237
-
238
# Parses the end tag that starts at offset +i+ of @rawdata.  The name is the
# text after "</" up to the next '<' or '>', stripped and lower-cased, and is
# reported through finish_endtag.  Returns the offset where scanning should
# resume (past the '>' when there is one), or nil when no bracket follows.
def parse_endtag(i)
  buffer = @rawdata
  stop, = index_match(buffer, /[<>]/, i + 1)
  return nil if stop.nil?

  name = buffer[(i + 2)...stop].strip.downcase
  stop += 1 if buffer[stop..stop] == ">"
  finish_endtag(name)
  stop
end
249
-
250
# Returns the processed HTML as one string: every entry of @pieces is
# converted with to_s and the results are concatenated.
def output
  @pieces.collect { |piece| piece.to_s }.join
end
254
-
255
# Aborts parsing by raising BetterSGMLParserError carrying +message+.
def error(message)
  raise BetterSGMLParserError, message
end
258
- def handle_pi(text) # no-op hook: parse_pi passes the text between "<?" and ">" here and it is discarded
259
- end
260
- def handle_decl(text) # no-op hook; NOTE(review): presumably called by the parent parser's declaration handling -- no caller is visible in this file
261
- end
262
- end
263
-
264
-
@@ -1,257 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- module FeedParserUtilities
4
-
5
# Converts a single string from +from_encoding+ to the 'unicode' target
# encoding by delegating to uconvert.
def unicode(data, from_encoding)
  target = 'unicode'
  uconvert(data, from_encoding, target)
end
10
-
11
# Converts +data+ from +from_encoding+ to +to_encoding+ (default 'utf-8')
# with Iconv.  Both names are first looked up in Encoding_Aliases; a name
# without an alias is used as given.  Returns the converted string.
def uconvert(data, from_encoding, to_encoding = 'utf-8')
  source = Encoding_Aliases[from_encoding] || from_encoding
  target = Encoding_Aliases[to_encoding] || to_encoding
  converted = Iconv.iconv(target, source, data)
  converted[0]
end
16
-
17
# Finds the first match of +regexp+ in +stri+ at or after +offset+.
# Returns [index, match_data], where match_data comes from matching the
# regexp against the substring starting at that index.  Returns [nil, nil]
# when there is no match.
def index_match(stri, regexp, offset)
  position = stri.index(regexp, offset)
  if position.nil?
    return nil, nil
  end

  [position, stri[position..-1].match(regexp)]
end
25
-
26
# Converts +s+ from "ebcdic-cp-be" to "iso88591" with Iconv and returns the
# converted string.
def _ebcdic_to_ascii(s)
  converted = Iconv.iconv("iso88591", "ebcdic-cp-be", s)
  converted[0]
end
29
-
30
# Works out the character encoding of an XML document.
#
# feed     - object whose #meta returns the HTTP headers (a hash-like with a
#            'content-type' entry); anything without a usable #meta is
#            treated as having no headers.
# xml_data - the raw document.
#
# Returns [true_encoding, http_encoding, xml_encoding,
# sniffed_xml_encoding, acceptable_content_type]:
#   true_encoding           - the encoding to trust, chosen from the values below
#   http_encoding           - charset parameter of the Content-Type header, or nil
#   xml_encoding            - encoding named in the XML declaration (lower-cased), or nil
#   sniffed_xml_encoding    - encoding detected from the leading bytes, or nil
#   acceptable_content_type - true when the Content-Type is an XML media type
def getCharacterEncoding(feed, xml_data)
  $stderr << "In getCharacterEncoding\n" if $debug
  sniffed_xml_encoding = nil
  xml_encoding = nil
  true_encoding = nil
  begin
    http_headers = feed.meta
    http_content_type = feed.meta['content-type'].split(';')[0]
    # Take only the charset value itself: optional opening quote, then
    # everything up to a quote, ';' or whitespace.  Parameter names are
    # matched case-insensitively, and parameters that follow charset
    # (e.g. "; type=feed") are no longer swallowed into the value.
    charset_match = /charset\s*=\s*["']?([^"';\s]+)/i.match(feed.meta['content-type'].to_s)
    http_encoding = charset_match ? charset_match[1] : nil
    # FIXME Open-Uri returns iso8859-1 if there is no charset header,
    # but that doesn't pass the tests. Open-Uri claims its following
    # the right RFC. Are they wrong or do we need to change the tests?
  rescue NoMethodError
    # No #meta, or no content-type entry: behave as if there were no headers.
    http_headers = {}
    http_content_type = nil
    http_encoding = nil
  end

  # Must sniff for non-ASCII-compatible character encodings before
  # searching for XML declaration.  This heuristic is defined in
  # section F of the XML specification:
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
  begin
    if xml_data[0..3] == "\x4c\x6f\xa7\x94"
      # EBCDIC
      xml_data = _ebcdic_to_ascii(xml_data)
    elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
      # UTF-16BE
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xfe\xff" && xml_data[2..3] != "\x00\x00"
      # UTF-16BE with BOM
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
      # UTF-16LE
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xff\xfe" && xml_data[2..3] != "\x00\x00"
      # UTF-16LE with BOM (the [2..3] guard keeps the UTF-32LE BOM out of this branch)
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\x00\x3c"
      # UTF-32BE
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x00\x00"
      # UTF-32LE
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\xfe\xff"
      # UTF-32BE with BOM
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data[4..-1], 'utf-32be', 'utf-8')
    elsif xml_data[0..3] == "\xff\xfe\x00\x00"
      # UTF-32LE with BOM
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
    elsif xml_data[0..2] == "\xef\xbb\xbf"
      # UTF-8 with BOM
      sniffed_xml_encoding = 'utf-8'
      xml_data = xml_data[3..-1]
    else
      # ASCII-compatible
    end
    xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
  rescue
    # A failed conversion simply means no declaration could be read.
    xml_encoding_match = nil
  end

  if xml_encoding_match
    xml_encoding = xml_encoding_match[1].downcase
    # Names that only say "some multi-byte Unicode form": prefer what the
    # leading bytes actually showed.
    xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
    if sniffed_xml_encoding && xencodings.include?(xml_encoding)
      xml_encoding = sniffed_xml_encoding
    end
  end

  acceptable_content_type = false
  application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
  text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

  if application_content_types.include?(http_content_type) || (/^application\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || xml_encoding || 'utf-8'
  elsif text_content_types.include?(http_content_type) || (/^text\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || 'us-ascii'
  elsif /^text\// =~ http_content_type
    true_encoding = http_encoding || 'us-ascii'
  elsif http_headers && !http_headers.empty? && !http_headers.has_key?('content-type')
    # NOTE(review): when #meta is a plain Hash, a missing content-type raises
    # NoMethodError above and resets http_headers to {}, so this branch
    # looks unreachable -- confirm the intended default before changing it.
    true_encoding = xml_encoding || 'iso-8859-1'
  else
    true_encoding = xml_encoding || 'utf-8'
  end
  return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
end
128
-
129
# Changes an XML data stream on the fly to specify a new encoding.
#
# data     - a raw sequence of bytes (not Unicode) that is presumed to be in
#            +encoding+ already.
# encoding - a string recognized by the encoding aliases.
#
# A leading byte-order mark overrides +encoding+ and is stripped.  The data
# is then converted to UTF-8 with uconvert, and its XML declaration is
# replaced (or one is prepended) so that it names utf-8.
#
# Returns the converted document.  Whatever uconvert raises for data that
# cannot be converted is passed on to the caller; it used to be swallowed
# here, after which the method failed with an unrelated TypeError on nil.
def toUTF8(data, encoding)
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
  # NOTE we must use double quotes when dealing with \x encodings!
  if data.size >= 4 && data[0..1] == "\xfe\xff" && data[2..3] != "\x00\x00"
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-16be instead\n" if encoding != 'utf-16be'
    end
    encoding = 'utf-16be'
    data = data[2..-1]
  elsif data.size >= 4 && data[0..1] == "\xff\xfe" && data[2..3] != "\x00\x00"
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
    end
    encoding = 'utf-16le'
    data = data[2..-1]
  elsif data[0..2] == "\xef\xbb\xbf"
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
    end
    encoding = 'utf-8'
    data = data[3..-1]
  elsif data[0..3] == "\x00\x00\xfe\xff"
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-32be instead\n" if encoding != 'utf-32be'
    end
    encoding = 'utf-32be'
    data = data[4..-1]
  elsif data[0..3] == "\xff\xfe\x00\x00"
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-32le instead\n" if encoding != 'utf-32le'
    end
    encoding = 'utf-32le'
    data = data[4..-1]
  end

  # Let a failed conversion surface as-is, so the success message below is
  # only ever printed for data that really was converted.
  newdata = uconvert(data, encoding, 'utf-8')
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug

  declmatch = /^<\?xml[^>]*?>/
  newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
  if declmatch =~ newdata
    newdata = newdata.sub(declmatch, newdecl)
  else
    newdata = newdecl + "\n" + newdata
  end
  return newdata
end
194
-
195
- end
196
-
197
- # http://intertwingly.net/stories/2005/09/28/xchar.rb
198
# Lookup tables used when escaping characters for XML output.
# All three tables are frozen: they are shared, read-only data.
module XChar
  # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
  # Maps the Windows-1252 code points 128..159 to their Unicode code points.
  CP1252 = {
    128 => 8364, # euro sign
    130 => 8218, # single low-9 quotation mark
    131 => 402,  # latin small letter f with hook
    132 => 8222, # double low-9 quotation mark
    133 => 8230, # horizontal ellipsis
    134 => 8224, # dagger
    135 => 8225, # double dagger
    136 => 710,  # modifier letter circumflex accent
    137 => 8240, # per mille sign
    138 => 352,  # latin capital letter s with caron
    139 => 8249, # single left-pointing angle quotation mark
    140 => 338,  # latin capital ligature oe
    142 => 381,  # latin capital letter z with caron
    145 => 8216, # left single quotation mark
    146 => 8217, # right single quotation mark
    147 => 8220, # left double quotation mark
    148 => 8221, # right double quotation mark
    149 => 8226, # bullet
    150 => 8211, # en dash
    151 => 8212, # em dash
    152 => 732,  # small tilde
    153 => 8482, # trade mark sign
    154 => 353,  # latin small letter s with caron
    155 => 8250, # single right-pointing angle quotation mark
    156 => 339,  # latin small ligature oe
    158 => 382,  # latin small letter z with caron
    159 => 376   # latin capital letter y with diaeresis
  }.freeze

  # http://www.w3.org/TR/REC-xml/#dt-chardata
  # Code points that must always be written as an entity.
  PREDEFINED = {
    38 => '&amp;', # ampersand
    60 => '&lt;',  # left angle bracket
    62 => '&gt;'   # right angle bracket
  }.freeze

  # http://www.w3.org/TR/REC-xml/#charsets
  # Code points that are legal in an XML document.
  VALID = [[0x9, 0xA, 0xD].freeze, (0x20..0xD7FF),
           (0xE000..0xFFFD), (0x10000..0x10FFFF)].freeze
end
239
-
240
# Defined on Integer, not Fixnum.  Fixnum is a subclass of Integer on old
# Rubies, so existing callers are unaffected; on Rubies where Fixnum no
# longer exists, reopening it would create an unrelated class and integers
# would never get this method.
class Integer
  # xml escaped version of chr
  #
  # Treats the receiver as a character code: Windows-1252 codes are mapped
  # to Unicode first, codes that are not legal in XML become '*' (42), the
  # predefined characters & < > become their entities, other codes below
  # 128 become the character itself, and everything else becomes a numeric
  # character reference.
  def xchr
    n = XChar::CP1252[self] || self
    n = 42 unless XChar::VALID.any? { |range| range.include?(n) }
    XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
  end
end
248
-
249
class String
  # Keeps the stock String#index reachable under the name old_index.
  alias_method :old_index, :index

  # XML-escaped copy of the string: every character code is passed through
  # xchr and the pieces are joined.  The string is read as UTF-8 first; if
  # that raises, it is read byte by byte instead.
  def to_xs
    begin
      unpack('U*').collect { |codepoint| codepoint.xchr }.join # ASCII, UTF-8
    rescue
      unpack('C*').collect { |byte| byte.xchr }.join # ISO-8859-1, WIN-1252
    end
  end
end
257
-