rfeedparser 0.9.92 → 0.9.93

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
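Almost every hunk below is a re-indentation pass: the paired -/+ lines appear to differ only in leading whitespace, which this plain-text rendering collapses. The functional changes in 0.9.93 are the Version constant bump, a new furi.strip! at the top of FeedParser.parse, the corrected proposed_encdoing typo in the windows-1252 encoding fallback, and two reworded comments in BetterSGMLParser.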
@@ -56,38 +56,38 @@ include FeedParserUtilities


  module FeedParser
- Version = "0.9.92"
+ Version = "0.9.93"

  License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:

- * Redistributions of source code must retain the above copyright notice,
+ * Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
+ * Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE."""
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE."""

  Author = "Jeff Hodges <http://somethingsimilar.com>"
  Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
  Contributors = [ "Jason Diamond <http://injektilo.org/>",
- "John Beimler <http://john.beimler.org/>",
- "Fazal Majid <http://www.majid.info/mylos/weblog/>",
- "Aaron Swartz <http://aaronsw.com/>",
- "Kevin Marks <http://epeus.blogspot.com/>"
+ "John Beimler <http://john.beimler.org/>",
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+ "Aaron Swartz <http://aaronsw.com/>",
+ "Kevin Marks <http://epeus.blogspot.com/>"
  ]
  # HTTP "User-Agent" header to send to servers when downloading feeds.
  # If you are embedding feedparser in a larger application, you should
@@ -123,25 +123,26 @@ POSSIBILITY OF SUCH DAMAGE."""


  SUPPORTED_VERSIONS = {'' => 'unknown',
- 'rss090' => 'RSS 0.90',
- 'rss091n' => 'RSS 0.91 (Netscape)',
- 'rss091u' => 'RSS 0.91 (Userland)',
- 'rss092' => 'RSS 0.92',
- 'rss093' => 'RSS 0.93',
- 'rss094' => 'RSS 0.94',
- 'rss20' => 'RSS 2.0',
- 'rss10' => 'RSS 1.0',
- 'rss' => 'RSS (unknown version)',
- 'atom01' => 'Atom 0.1',
- 'atom02' => 'Atom 0.2',
- 'atom03' => 'Atom 0.3',
- 'atom10' => 'Atom 1.0',
- 'atom' => 'Atom (unknown version)',
- 'cdf' => 'CDF',
- 'hotrss' => 'Hot RSS'
+ 'rss090' => 'RSS 0.90',
+ 'rss091n' => 'RSS 0.91 (Netscape)',
+ 'rss091u' => 'RSS 0.91 (Userland)',
+ 'rss092' => 'RSS 0.92',
+ 'rss093' => 'RSS 0.93',
+ 'rss094' => 'RSS 0.94',
+ 'rss20' => 'RSS 2.0',
+ 'rss10' => 'RSS 1.0',
+ 'rss' => 'RSS (unknown version)',
+ 'atom01' => 'Atom 0.1',
+ 'atom02' => 'Atom 0.2',
+ 'atom03' => 'Atom 0.3',
+ 'atom10' => 'Atom 1.0',
+ 'atom' => 'Atom (unknown version)',
+ 'cdf' => 'CDF',
+ 'hotrss' => 'Hot RSS'
  }
-
+
  def parse(furi, options = {})
+ furi.strip!
  # Parse a feed from a URL, file, stream or string
  $compatible = options[:compatible].nil? ? $compatible : options[:compatible]# Use the default compatibility if compatible is nil
  strictklass = options[:strict] || StrictFeedParser
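The one behavioral change in this hunk is the new furi.strip!, which destructively trims surrounding whitespace from the caller's string before parse decides whether it names a URL, a file, or raw feed text. A minimal sketch of why that matters, using only the standard-library URI module (illustrative; this is not rfeedparser's own dispatch code):

    require 'uri'

    furi = "  http://example.com/feed.xml\n"  # pasted with stray whitespace
    begin
      URI.parse(furi)                         # the raw string is not a valid URI
    rescue URI::InvalidURIError => e
      warn "rejected: #{e.message}"
    end

    furi.strip!                               # what 0.9.93 now does first
    URI.parse(furi).host                      # => "example.com"

Because strip! mutates its receiver, the caller's string object is modified as a side effect, and a frozen, whitespace-padded argument would raise; a non-destructive furi = furi.strip would avoid both.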
@@ -189,27 +190,27 @@ POSSIBILITY OF SUCH DAMAGE."""
  end
  begin
  if f.meta
- result['etag'] = options[:etag] || f.meta['etag']
- result['modified'] = options[:modified] || f.last_modified
- result['url'] = f.base_uri.to_s
- result['status'] = f.status[0] || 200
- result['headers'] = f.meta
- result['headers']['content-location'] ||= options[:content_location] unless options[:content_location].nil?
- result['headers']['content-language'] ||= options[:content_language] unless options[:content_language].nil?
- result['headers']['content-type'] ||= options[:content_type] unless options[:content_type].nil?
+ result['etag'] = options[:etag] || f.meta['etag']
+ result['modified'] = options[:modified] || f.last_modified
+ result['url'] = f.base_uri.to_s
+ result['status'] = f.status[0] || 200
+ result['headers'] = f.meta
+ result['headers']['content-location'] ||= options[:content_location] unless options[:content_location].nil?
+ result['headers']['content-language'] ||= options[:content_language] unless options[:content_language].nil?
+ result['headers']['content-type'] ||= options[:content_type] unless options[:content_type].nil?
  end
  rescue NoMethodError
  result['headers'] = {}
  result['etag'] = result['headers']['etag'] = options[:etag] unless options[:etag].nil?
  result['modified'] = result['headers']['last-modified'] = options[:modified] unless options[:modified].nil?
  unless options[:content_location].nil?
- result['headers']['content-location'] = options[:content_location]
+ result['headers']['content-location'] = options[:content_location]
  end
  unless options[:content_language].nil?
- result['headers']['content-language'] = options[:content_language]
+ result['headers']['content-language'] = options[:content_language]
  end
  unless options[:content_type].nil?
- result['headers']['content-type'] = options[:content_type]
+ result['headers']['content-type'] = options[:content_type]
  end
  end

@@ -221,13 +222,13 @@ POSSIBILITY OF SUCH DAMAGE."""
  # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
  http_headers = result['headers']
  result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
- self.getCharacterEncoding(f,data)
+ self.getCharacterEncoding(f,data)

  if not http_headers.empty? and not acceptable_content_type
  if http_headers.has_key?('content-type')
- bozo_message = "#{http_headers['content-type']} is not an XML media type"
+ bozo_message = "#{http_headers['content-type']} is not an XML media type"
  else
- bozo_message = 'no Content-type specified'
+ bozo_message = 'no Content-type specified'
  end
  result['bozo'] = true
  result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
@@ -260,21 +261,21 @@ POSSIBILITY OF SUCH DAMAGE."""
  next if tried_encodings.include? proposed_encoding
  tried_encodings << proposed_encoding
  begin
- data = self.toUTF8(data, proposed_encoding)
- known_encoding = use_strict_parser = true
- break
+ data = self.toUTF8(data, proposed_encoding)
+ known_encoding = use_strict_parser = true
+ break
  rescue
  end
  end
  # if no luck and we have auto-detection library, try that
  if not known_encoding and $chardet
  begin
- proposed_encoding = CharDet.detect(data)['encoding']
- if proposed_encoding and not tried_encodings.include?proposed_encoding
- tried_encodings << proposed_encoding
- data = self.toUTF8(data, proposed_encoding)
- known_encoding = use_strict_parser = true
- end
+ proposed_encoding = CharDet.detect(data)['encoding']
+ if proposed_encoding and not tried_encodings.include?proposed_encoding
+ tried_encodings << proposed_encoding
+ data = self.toUTF8(data, proposed_encoding)
+ known_encoding = use_strict_parser = true
+ end
  rescue
  end
  end
@@ -284,24 +285,24 @@ POSSIBILITY OF SUCH DAMAGE."""
  # if still no luck and we haven't tried utf-8 yet, try that
  if not known_encoding and not tried_encodings.include?'utf-8'
  begin
- proposed_encoding = 'utf-8'
- tried_encodings << proposed_encoding
- data = self.toUTF8(data, proposed_encoding)
- known_encoding = use_strict_parser = true
+ proposed_encoding = 'utf-8'
+ tried_encodings << proposed_encoding
+ data = self.toUTF8(data, proposed_encoding)
+ known_encoding = use_strict_parser = true
  rescue
  end
  end
  # if still no luck and we haven't tried windows-1252 yet, try that
  if not known_encoding and not tried_encodings.include?'windows-1252'
  begin
- proposed_encdoing = 'windows-1252'
- tried_encodings << proposed_encoding
- data = self.toUTF8(data, proposed_encoding)
- known_encoding = use_strict_parser = true
+ proposed_encoding = 'windows-1252'
+ tried_encodings << proposed_encoding
+ data = self.toUTF8(data, proposed_encoding)
+ known_encoding = use_strict_parser = true
  rescue
  end
  end
-
+
  # NOTE this isn't in FeedParser.py 4.1
  # if still no luck and we haven't tried iso-8859-2 yet, try that.
  #if not known_encoding and not tried_encodings.include?'iso-8859-2'
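Besides re-indentation, the windows-1252 hunk fixes a real bug: on the - side the branch assigns to the misspelled proposed_encdoing, so toUTF8(data, proposed_encoding) ran with whatever value an earlier attempt had left behind. The overall strategy across these hunks is: try each declared encoding, then chardet's guess, then UTF-8, then windows-1252, converting everything to UTF-8 and stopping at the first decode that succeeds. A self-contained sketch of that chain using Ruby 1.9+ encodings (rfeedparser itself converts through its Iconv-based toUTF8 helper):

    # Try candidate encodings in order; return the first clean UTF-8 conversion.
    def guess_to_utf8(data, candidates)
      tried = []
      candidates.compact.each do |enc|
        next if tried.include?(enc)
        tried << enc
        s = data.dup.force_encoding(enc)
        next unless s.valid_encoding?   # bytes are not legal in this encoding
        begin
          return s.encode('UTF-8')
        rescue EncodingError
          # valid bytes but unmappable characters; try the next candidate
        end
      end
      nil  # mirrors known_encoding staying false: caller degrades to the loose parser
    end

    latin1 = "caf\xE9".b                              # not valid UTF-8
    guess_to_utf8(latin1, ['UTF-8', 'Windows-1252'])  # => "café"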
@@ -338,15 +339,15 @@ POSSIBILITY OF SUCH DAMAGE."""
  inputdata = XML::SAX::InputSource.new('parsedfeed')
  inputdata.setByteStream(StringIO.new(data))
  begin
- saxparser.parse(inputdata)
+ saxparser.parse(inputdata)
  rescue Exception => parseerr # resparse
- if $debug
- $stderr << "xml parsing failed\n"
- $stderr << parseerr.to_s+"\n" # Hrmph.
- end
- result['bozo'] = true
- result['bozo_exception'] = feedparser.exc || e
- use_strict_parser = false
+ if $debug
+ $stderr << "xml parsing failed\n"
+ $stderr << parseerr.to_s+"\n" # Hrmph.
+ end
+ result['bozo'] = true
+ result['bozo_exception'] = feedparser.exc || e
+ use_strict_parser = false
  end
  end
  if not use_strict_parser
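One payload oddity survives on both sides of this hunk: the rescue binds the exception as parseerr, yet the handler assigns result['bozo_exception'] = feedparser.exc || e. No e is in scope there, so whenever feedparser.exc is nil the bare e resolves as a method call and raises NameError; presumably parseerr (or the parser's stored exception) was intended.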
@@ -378,22 +379,22 @@ class TextSerializer < Serializer
  return if (node.nil? or node.empty?)
  if node.methods.include?'keys'
  node.keys.sort.each do |key|
- next if ['description','link'].include? key
- next if node.has_key? k+'_detail'
- next if node.has_key? k+'_parsed'
- writer(stream,node[k], prefix+k+'.')
+ next if ['description','link'].include? key
+ next if node.has_key? k+'_detail'
+ next if node.has_key? k+'_parsed'
+ writer(stream,node[k], prefix+k+'.')
  end
  elsif node.class == Array
  node.each_with_index do |thing, index|
- writer(stream, thing, prefix[0..-2] + '[' + index.to_s + '].')
+ writer(stream, thing, prefix[0..-2] + '[' + index.to_s + '].')
  end
  else
  begin
- s = u(node.to_s)
- stream << prefix[0..-2]
- stream << '='
- stream << s
- stream << "\n"
+ s = u(node.to_s)
+ stream << prefix[0..-2]
+ stream << '='
+ stream << s
+ stream << "\n"
  rescue
  end
  end
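Both sides of this serializer hunk carry the same latent bug: the block binds key, but the body filters and recurses on an undefined k, so any hash node with keys beyond 'description' and 'link' raises NameError at the has_key? check. A hypothetical corrected loop (same names as the hunk; writer is the serializer's own recursive helper):

    node.keys.sort.each do |key|
      next if ['description', 'link'].include?(key)
      next if node.has_key?(key + '_detail')   # skip keys mirrored as *_detail
      next if node.has_key?(key + '_parsed')   # ...and as *_parsed
      writer(stream, node[key], prefix + key + '.')
    end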
@@ -422,49 +423,49 @@ if $0 == __FILE__
  opts.banner
  opts.separator ""
  opts.on("-A", "--user-agent [AGENT]",
- "User-Agent for HTTP URLs") {|agent|
+ "User-Agent for HTTP URLs") {|agent|
  options.agent = agent
  }

  opts.on("-e", "--referrer [URL]",
- "Referrer for HTTP URLs") {|referrer|
+ "Referrer for HTTP URLs") {|referrer|
  options.referrer = referrer
  }

  opts.on("-t", "--etag [TAG]",
- "ETag/If-None-Match for HTTP URLs") {|etag|
+ "ETag/If-None-Match for HTTP URLs") {|etag|
  options.etag = etag
  }

  opts.on("-m", "--last-modified [DATE]",
- "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
+ "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
  options.modified = modified
  }

  opts.on("-f", "--format [FORMAT]", [:text, :pprint],
- "output resutls in FORMAT (text, pprint)") {|format|
+ "output resutls in FORMAT (text, pprint)") {|format|
  options.format = format
  }

  opts.on("-v", "--[no-]verbose",
- "write debugging information to stderr") {|v|
+ "write debugging information to stderr") {|v|
  options.verbose = v
  }

  opts.on("-c", "--[no-]compatible",
- "strip element attributes like feedparser.py 4.1 (default)") {|comp|
+ "strip element attributes like feedparser.py 4.1 (default)") {|comp|
  options.compatible = comp
  }
  opts.on("-l", "--content-location [LOCATION]",
- "default Content-Location HTTP header") {|loc|
+ "default Content-Location HTTP header") {|loc|
  options.content_location = loc
  }
  opts.on("-a", "--content-language [LANG]",
- "default Content-Language HTTP header") {|lang|
+ "default Content-Language HTTP header") {|lang|
  options.content_language = lang
  }
  opts.on("-t", "--content-type [TYPE]",
- "default Content-type HTTP header") {|ctype|
+ "default Content-type HTTP header") {|ctype|
  options.ctype = ctype
  }
  end
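Unchanged by this release, but worth flagging: the short switch -t is registered twice, first for --etag and again for --content-type (the -f description's "resutls" typo also survives). With stock OptionParser the later registration appears to shadow the earlier one for the short form, while both long options stay distinct. A self-contained sketch of that behavior (hypothetical option set):

    require 'optparse'
    require 'ostruct'

    options = OpenStruct.new
    OptionParser.new do |opts|
      opts.on("-t", "--etag [TAG]")          { |tag|   options.etag  = tag }
      opts.on("-t", "--content-type [TYPE]") { |ctype| options.ctype = ctype }
    end.parse!(["-t", "text/xml"])

    p options.etag   # => nil
    p options.ctype  # => "text/xml" (the second -t registration wins)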
@@ -482,14 +483,14 @@ if $0 == __FILE__
  unless args.nil?
  args.each do |url| # opts.parse! removes everything but the urls from the command line
  results = FeedParser.parse(url, :etag => options.etag,
- :modified => options.modified,
- :agent => options.agent,
- :referrer => options.referrer,
- :content_location => options.content_location,
- :content_language => options.content_language,
- :content_type => options.ctype
- )
- serializer.new(results).write($stdout)
+ :modified => options.modified,
+ :agent => options.agent,
+ :referrer => options.referrer,
+ :content_location => options.content_location,
+ :content_language => options.content_language,
+ :content_type => options.ctype
+ )
+ serializer.new(results).write($stdout)
  end
  end
  end
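For orientation, a hypothetical invocation of this command-line block, using the options defined above (the script name is assumed):

    ruby rfeedparser.rb --format text --etag '"1edc-52a"' http://example.com/atom.xml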
@@ -14,7 +14,7 @@ class BetterSGMLParser < HTML::SGMLParser

  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
- Endtagopen = /<\//u # Matching the Python SGMLParser
+ Endtagopen = /<\//u # Changed the RegExps to match the Python SGMLParser
  Endbracket = /[<>]/u
  Declopen = /<!/u
  Piopenbegin = /^<\?/u
@@ -24,8 +24,8 @@ class BetterSGMLParser < HTML::SGMLParser
  Commentclose = /--\s*>/u
  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
- '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
- 64)
+ '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
+ 64)
  Endtagfind = /\s*\/\s*>/u
  def initialize(verbose=false)
  super(verbose)
@@ -40,98 +40,98 @@ class BetterSGMLParser < HTML::SGMLParser
  n = rawdata.length
  while i < n
  if @nomoretags
- # handle_data_range does nothing more than set a "Range" that is never used. wtf?
- handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
- i = n
- break
+ # handle_data_range does nothing more than set a "Range" that is never used. wtf?
+ handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
+ i = n
+ break
  end
  j = rawdata.index(Interesting, i)
  j = n unless j
  handle_data(rawdata[i...j]) if i < j
  i = j
  break if (i == n)
- if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
- if rawdata.index(Starttagopen,i) == i
- if @literal
- handle_data(rawdata[i..i])
- i = i+1
- next
- end
- k = parse_starttag(i)
- break unless k
- i = k
- next
- end
- if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
- k = parse_endtag(i)
- break unless k
- i = k
- @literal = false
- next
- end
- if @literal
- if n > (i+1)
- handle_data("<")
- i = i+1
- else
- #incomplete
- break
- end
- next
- end
- if rawdata.index(Commentopen,i) == i
- k = parse_comment(i)
- break unless k
- i = k
- next
- end
- if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
- k = parse_pi(i)
- break unless k
- i += k
- next
- end
- if rawdata.index(Declopen,i) == i
- # This is some sort of declaration; in "HTML as
- # deployed," this should only be the document type
- # declaration ("<!DOCTYPE html...>").
- k = parse_declaration(i)
- break unless k
- i = k
- next
- end
+ if rawdata[i..i] == '<' # Yeah, ugly, but I prefer it to rawdata[i] == ?<
+ if rawdata.index(Starttagopen,i) == i
+ if @literal
+ handle_data(rawdata[i..i])
+ i = i+1
+ next
+ end
+ k = parse_starttag(i)
+ break unless k
+ i = k
+ next
+ end
+ if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
+ k = parse_endtag(i)
+ break unless k
+ i = k
+ @literal = false
+ next
+ end
+ if @literal
+ if n > (i+1)
+ handle_data("<")
+ i = i+1
+ else
+ #incomplete
+ break
+ end
+ next
+ end
+ if rawdata.index(Commentopen,i) == i
+ k = parse_comment(i)
+ break unless k
+ i = k
+ next
+ end
+ if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
+ k = parse_pi(i)
+ break unless k
+ i += k
+ next
+ end
+ if rawdata.index(Declopen,i) == i
+ # This is some sort of declaration; in "HTML as
+ # deployed," this should only be the document type
+ # declaration ("<!DOCTYPE html...>").
+ k = parse_declaration(i)
+ break unless k
+ i = k
+ next
+ end
  elsif rawdata[i..i] == '&'
- if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
- handle_data(rawdata[i..i])
- i += 1
- next
- end
+ if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
+ handle_data(rawdata[i..i])
+ i += 1
+ next
+ end

- # the Char must come first as its #=~ method is the only one that is UTF-8 safe
- ni,match = index_match(rawdata, Charref, i)
- if ni and ni == i # See? Ugly
- handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
- i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
- i -= 1 unless rawdata[i-1..i-1] == ";"
- next
- end
- ni,match = index_match(rawdata, Entityref, i)
- if ni and ni == i
- handle_entityref(match[1])
- i += match[0].length
- i -= 1 unless rawdata[i-1..i-1] == ";"
- next
- end
+ # the Char must come first as its #=~ method is the only one that is UTF-8 safe
+ ni,match = index_match(rawdata, Charref, i)
+ if ni and ni == i # See? Ugly
+ handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
+ i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
+ i -= 1 unless rawdata[i-1..i-1] == ";"
+ next
+ end
+ ni,match = index_match(rawdata, Entityref, i)
+ if ni and ni == i
+ handle_entityref(match[1])
+ i += match[0].length
+ i -= 1 unless rawdata[i-1..i-1] == ";"
+ next
+ end
  else
- error('neither < nor & ??')
+ error('neither < nor & ??')
  end
  # We get here only if incomplete matches but
  # nothing else
  ni,match = index_match(rawdata,Incomplete,i)
  unless ni and ni == 0
- handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
- i += 1
- next
+ handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
+ i += 1
+ next
  end
  j = ni + match[0].length
  break if j == n # Really incomplete
@@ -206,7 +206,7 @@ class BetterSGMLParser < HTML::SGMLParser
  else
  ni,match = index_match(rawdata,Tagfind,i+1)
  unless match
- error('unexpected call to parse_starttag')
+ error('unexpected call to parse_starttag')
  end
  k = ni+match[0].length+1
  tag = match[0].downcase
@@ -220,9 +220,9 @@ class BetterSGMLParser < HTML::SGMLParser
  matched_length = match[0].length
  attrname, rest, attrvalue = match[1],match[2],match[3]
  if rest.nil? or rest.empty?
- attrvalue = '' # was: = attrname # Why the change?
+ attrvalue = '' # was: = attrname # Why the change?
  elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
- attrvalue = attrvalue[1...-1]
+ attrvalue = attrvalue[1...-1]
  end
  attrsd << [attrname.downcase, attrvalue]
  k += matched_length
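A final payload note, unchanged by this release: as extracted here, the dequoting condition mixes Ruby 1.8 character literals (?' is a Fixnum) with one-character strings (attrvalue[0..0]), so the single-quote disjunct compares Fixnums to Strings and can never be true; only double-quoted values get their quotes stripped. A consistent form (hypothetical; under 1.8 semantics str[0] is a Fixnum while str[0..0] is a String):

    attrvalue = "'feed'"
    if attrvalue.length >= 2 &&
       ["'", '"'].include?(attrvalue[0..0]) &&
       attrvalue[0..0] == attrvalue[-1..-1]
      attrvalue = attrvalue[1...-1]   # drop the matching surrounding quotes
    end
    attrvalue  # => "feed"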