rfeedparser 0.9.92 → 0.9.93

Sign up to get free protection for your applications and to get access to all the features.
@@ -56,38 +56,38 @@ include FeedParserUtilities
56
56
 
57
57
 
58
58
  module FeedParser
59
- Version = "0.9.92"
59
+ Version = "0.9.93"
60
60
 
61
61
  License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
62
62
 
63
- Redistribution and use in source and binary forms, with or without modification,
64
- are permitted provided that the following conditions are met:
63
+ Redistribution and use in source and binary forms, with or without modification,
64
+ are permitted provided that the following conditions are met:
65
65
 
66
- * Redistributions of source code must retain the above copyright notice,
66
+ * Redistributions of source code must retain the above copyright notice,
67
67
  this list of conditions and the following disclaimer.
68
- * Redistributions in binary form must reproduce the above copyright notice,
68
+ * Redistributions in binary form must reproduce the above copyright notice,
69
69
  this list of conditions and the following disclaimer in the documentation
70
70
  and/or other materials provided with the distribution.
71
71
 
72
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
73
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
74
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
75
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
76
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
77
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
78
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
79
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
80
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82
- POSSIBILITY OF SUCH DAMAGE."""
72
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
73
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
74
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
75
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
76
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
77
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
78
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
79
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
80
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82
+ POSSIBILITY OF SUCH DAMAGE."""
83
83
 
84
84
  Author = "Jeff Hodges <http://somethingsimilar.com>"
85
85
  Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
86
86
  Contributors = [ "Jason Diamond <http://injektilo.org/>",
87
- "John Beimler <http://john.beimler.org/>",
88
- "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89
- "Aaron Swartz <http://aaronsw.com/>",
90
- "Kevin Marks <http://epeus.blogspot.com/>"
87
+ "John Beimler <http://john.beimler.org/>",
88
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89
+ "Aaron Swartz <http://aaronsw.com/>",
90
+ "Kevin Marks <http://epeus.blogspot.com/>"
91
91
  ]
92
92
  # HTTP "User-Agent" header to send to servers when downloading feeds.
93
93
  # If you are embedding feedparser in a larger application, you should
@@ -123,25 +123,26 @@ POSSIBILITY OF SUCH DAMAGE."""
123
123
 
124
124
 
125
125
  SUPPORTED_VERSIONS = {'' => 'unknown',
126
- 'rss090' => 'RSS 0.90',
127
- 'rss091n' => 'RSS 0.91 (Netscape)',
128
- 'rss091u' => 'RSS 0.91 (Userland)',
129
- 'rss092' => 'RSS 0.92',
130
- 'rss093' => 'RSS 0.93',
131
- 'rss094' => 'RSS 0.94',
132
- 'rss20' => 'RSS 2.0',
133
- 'rss10' => 'RSS 1.0',
134
- 'rss' => 'RSS (unknown version)',
135
- 'atom01' => 'Atom 0.1',
136
- 'atom02' => 'Atom 0.2',
137
- 'atom03' => 'Atom 0.3',
138
- 'atom10' => 'Atom 1.0',
139
- 'atom' => 'Atom (unknown version)',
140
- 'cdf' => 'CDF',
141
- 'hotrss' => 'Hot RSS'
126
+ 'rss090' => 'RSS 0.90',
127
+ 'rss091n' => 'RSS 0.91 (Netscape)',
128
+ 'rss091u' => 'RSS 0.91 (Userland)',
129
+ 'rss092' => 'RSS 0.92',
130
+ 'rss093' => 'RSS 0.93',
131
+ 'rss094' => 'RSS 0.94',
132
+ 'rss20' => 'RSS 2.0',
133
+ 'rss10' => 'RSS 1.0',
134
+ 'rss' => 'RSS (unknown version)',
135
+ 'atom01' => 'Atom 0.1',
136
+ 'atom02' => 'Atom 0.2',
137
+ 'atom03' => 'Atom 0.3',
138
+ 'atom10' => 'Atom 1.0',
139
+ 'atom' => 'Atom (unknown version)',
140
+ 'cdf' => 'CDF',
141
+ 'hotrss' => 'Hot RSS'
142
142
  }
143
-
143
+
144
144
  def parse(furi, options = {})
145
+ furi.strip!
145
146
  # Parse a feed from a URL, file, stream or string
146
147
  $compatible = options[:compatible].nil? ? $compatible : options[:compatible]# Use the default compatibility if compatible is nil
147
148
  strictklass = options[:strict] || StrictFeedParser
@@ -189,27 +190,27 @@ POSSIBILITY OF SUCH DAMAGE."""
189
190
  end
190
191
  begin
191
192
  if f.meta
192
- result['etag'] = options[:etag] || f.meta['etag']
193
- result['modified'] = options[:modified] || f.last_modified
194
- result['url'] = f.base_uri.to_s
195
- result['status'] = f.status[0] || 200
196
- result['headers'] = f.meta
197
- result['headers']['content-location'] ||= options[:content_location] unless options[:content_location].nil?
198
- result['headers']['content-language'] ||= options[:content_language] unless options[:content_language].nil?
199
- result['headers']['content-type'] ||= options[:content_type] unless options[:content_type].nil?
193
+ result['etag'] = options[:etag] || f.meta['etag']
194
+ result['modified'] = options[:modified] || f.last_modified
195
+ result['url'] = f.base_uri.to_s
196
+ result['status'] = f.status[0] || 200
197
+ result['headers'] = f.meta
198
+ result['headers']['content-location'] ||= options[:content_location] unless options[:content_location].nil?
199
+ result['headers']['content-language'] ||= options[:content_language] unless options[:content_language].nil?
200
+ result['headers']['content-type'] ||= options[:content_type] unless options[:content_type].nil?
200
201
  end
201
202
  rescue NoMethodError
202
203
  result['headers'] = {}
203
204
  result['etag'] = result['headers']['etag'] = options[:etag] unless options[:etag].nil?
204
205
  result['modified'] = result['headers']['last-modified'] = options[:modified] unless options[:modified].nil?
205
206
  unless options[:content_location].nil?
206
- result['headers']['content-location'] = options[:content_location]
207
+ result['headers']['content-location'] = options[:content_location]
207
208
  end
208
209
  unless options[:content_language].nil?
209
- result['headers']['content-language'] = options[:content_language]
210
+ result['headers']['content-language'] = options[:content_language]
210
211
  end
211
212
  unless options[:content_type].nil?
212
- result['headers']['content-type'] = options[:content_type]
213
+ result['headers']['content-type'] = options[:content_type]
213
214
  end
214
215
  end
215
216
 
@@ -221,13 +222,13 @@ POSSIBILITY OF SUCH DAMAGE."""
221
222
  # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
222
223
  http_headers = result['headers']
223
224
  result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
224
- self.getCharacterEncoding(f,data)
225
+ self.getCharacterEncoding(f,data)
225
226
 
226
227
  if not http_headers.empty? and not acceptable_content_type
227
228
  if http_headers.has_key?('content-type')
228
- bozo_message = "#{http_headers['content-type']} is not an XML media type"
229
+ bozo_message = "#{http_headers['content-type']} is not an XML media type"
229
230
  else
230
- bozo_message = 'no Content-type specified'
231
+ bozo_message = 'no Content-type specified'
231
232
  end
232
233
  result['bozo'] = true
233
234
  result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
@@ -260,21 +261,21 @@ POSSIBILITY OF SUCH DAMAGE."""
260
261
  next if tried_encodings.include? proposed_encoding
261
262
  tried_encodings << proposed_encoding
262
263
  begin
263
- data = self.toUTF8(data, proposed_encoding)
264
- known_encoding = use_strict_parser = true
265
- break
264
+ data = self.toUTF8(data, proposed_encoding)
265
+ known_encoding = use_strict_parser = true
266
+ break
266
267
  rescue
267
268
  end
268
269
  end
269
270
  # if no luck and we have an auto-detection library, try that
270
271
  if not known_encoding and $chardet
271
272
  begin
272
- proposed_encoding = CharDet.detect(data)['encoding']
273
- if proposed_encoding and not tried_encodings.include?proposed_encoding
274
- tried_encodings << proposed_encoding
275
- data = self.toUTF8(data, proposed_encoding)
276
- known_encoding = use_strict_parser = true
277
- end
273
+ proposed_encoding = CharDet.detect(data)['encoding']
274
+ if proposed_encoding and not tried_encodings.include?proposed_encoding
275
+ tried_encodings << proposed_encoding
276
+ data = self.toUTF8(data, proposed_encoding)
277
+ known_encoding = use_strict_parser = true
278
+ end
278
279
  rescue
279
280
  end
280
281
  end
@@ -284,24 +285,24 @@ POSSIBILITY OF SUCH DAMAGE."""
284
285
  # if still no luck and we haven't tried utf-8 yet, try that
285
286
  if not known_encoding and not tried_encodings.include?'utf-8'
286
287
  begin
287
- proposed_encoding = 'utf-8'
288
- tried_encodings << proposed_encoding
289
- data = self.toUTF8(data, proposed_encoding)
290
- known_encoding = use_strict_parser = true
288
+ proposed_encoding = 'utf-8'
289
+ tried_encodings << proposed_encoding
290
+ data = self.toUTF8(data, proposed_encoding)
291
+ known_encoding = use_strict_parser = true
291
292
  rescue
292
293
  end
293
294
  end
294
295
  # if still no luck and we haven't tried windows-1252 yet, try that
295
296
  if not known_encoding and not tried_encodings.include?'windows-1252'
296
297
  begin
297
- proposed_encdoing = 'windows-1252'
298
- tried_encodings << proposed_encoding
299
- data = self.toUTF8(data, proposed_encoding)
300
- known_encoding = use_strict_parser = true
298
+ proposed_encoding = 'windows-1252'
299
+ tried_encodings << proposed_encoding
300
+ data = self.toUTF8(data, proposed_encoding)
301
+ known_encoding = use_strict_parser = true
301
302
  rescue
302
303
  end
303
304
  end
304
-
305
+
305
306
  # NOTE this isn't in FeedParser.py 4.1
306
307
  # if still no luck and we haven't tried iso-8859-2 yet, try that.
307
308
  #if not known_encoding and not tried_encodings.include?'iso-8859-2'
@@ -338,15 +339,15 @@ POSSIBILITY OF SUCH DAMAGE."""
338
339
  inputdata = XML::SAX::InputSource.new('parsedfeed')
339
340
  inputdata.setByteStream(StringIO.new(data))
340
341
  begin
341
- saxparser.parse(inputdata)
342
+ saxparser.parse(inputdata)
342
343
  rescue Exception => parseerr # resparse
343
- if $debug
344
- $stderr << "xml parsing failed\n"
345
- $stderr << parseerr.to_s+"\n" # Hrmph.
346
- end
347
- result['bozo'] = true
348
- result['bozo_exception'] = feedparser.exc || e
349
- use_strict_parser = false
344
+ if $debug
345
+ $stderr << "xml parsing failed\n"
346
+ $stderr << parseerr.to_s+"\n" # Hrmph.
347
+ end
348
+ result['bozo'] = true
349
+ result['bozo_exception'] = feedparser.exc || e
350
+ use_strict_parser = false
350
351
  end
351
352
  end
352
353
  if not use_strict_parser
@@ -378,22 +379,22 @@ class TextSerializer < Serializer
378
379
  return if (node.nil? or node.empty?)
379
380
  if node.methods.include?'keys'
380
381
  node.keys.sort.each do |key|
381
- next if ['description','link'].include? key
382
- next if node.has_key? k+'_detail'
383
- next if node.has_key? k+'_parsed'
384
- writer(stream,node[k], prefix+k+'.')
382
+ next if ['description','link'].include? key
383
+ next if node.has_key? k+'_detail'
384
+ next if node.has_key? k+'_parsed'
385
+ writer(stream,node[k], prefix+k+'.')
385
386
  end
386
387
  elsif node.class == Array
387
388
  node.each_with_index do |thing, index|
388
- writer(stream, thing, prefix[0..-2] + '[' + index.to_s + '].')
389
+ writer(stream, thing, prefix[0..-2] + '[' + index.to_s + '].')
389
390
  end
390
391
  else
391
392
  begin
392
- s = u(node.to_s)
393
- stream << prefix[0..-2]
394
- stream << '='
395
- stream << s
396
- stream << "\n"
393
+ s = u(node.to_s)
394
+ stream << prefix[0..-2]
395
+ stream << '='
396
+ stream << s
397
+ stream << "\n"
397
398
  rescue
398
399
  end
399
400
  end
@@ -422,49 +423,49 @@ if $0 == __FILE__
422
423
  opts.banner
423
424
  opts.separator ""
424
425
  opts.on("-A", "--user-agent [AGENT]",
425
- "User-Agent for HTTP URLs") {|agent|
426
+ "User-Agent for HTTP URLs") {|agent|
426
427
  options.agent = agent
427
428
  }
428
429
 
429
430
  opts.on("-e", "--referrer [URL]",
430
- "Referrer for HTTP URLs") {|referrer|
431
+ "Referrer for HTTP URLs") {|referrer|
431
432
  options.referrer = referrer
432
433
  }
433
434
 
434
435
  opts.on("-t", "--etag [TAG]",
435
- "ETag/If-None-Match for HTTP URLs") {|etag|
436
+ "ETag/If-None-Match for HTTP URLs") {|etag|
436
437
  options.etag = etag
437
438
  }
438
439
 
439
440
  opts.on("-m", "--last-modified [DATE]",
440
- "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
441
+ "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
441
442
  options.modified = modified
442
443
  }
443
444
 
444
445
  opts.on("-f", "--format [FORMAT]", [:text, :pprint],
445
- "output resutls in FORMAT (text, pprint)") {|format|
446
+ "output resutls in FORMAT (text, pprint)") {|format|
446
447
  options.format = format
447
448
  }
448
449
 
449
450
  opts.on("-v", "--[no-]verbose",
450
- "write debugging information to stderr") {|v|
451
+ "write debugging information to stderr") {|v|
451
452
  options.verbose = v
452
453
  }
453
454
 
454
455
  opts.on("-c", "--[no-]compatible",
455
- "strip element attributes like feedparser.py 4.1 (default)") {|comp|
456
+ "strip element attributes like feedparser.py 4.1 (default)") {|comp|
456
457
  options.compatible = comp
457
458
  }
458
459
  opts.on("-l", "--content-location [LOCATION]",
459
- "default Content-Location HTTP header") {|loc|
460
+ "default Content-Location HTTP header") {|loc|
460
461
  options.content_location = loc
461
462
  }
462
463
  opts.on("-a", "--content-language [LANG]",
463
- "default Content-Language HTTP header") {|lang|
464
+ "default Content-Language HTTP header") {|lang|
464
465
  options.content_language = lang
465
466
  }
466
467
  opts.on("-t", "--content-type [TYPE]",
467
- "default Content-type HTTP header") {|ctype|
468
+ "default Content-type HTTP header") {|ctype|
468
469
  options.ctype = ctype
469
470
  }
470
471
  end
@@ -482,14 +483,14 @@ if $0 == __FILE__
482
483
  unless args.nil?
483
484
  args.each do |url| # opts.parse! removes everything but the urls from the command line
484
485
  results = FeedParser.parse(url, :etag => options.etag,
485
- :modified => options.modified,
486
- :agent => options.agent,
487
- :referrer => options.referrer,
488
- :content_location => options.content_location,
489
- :content_language => options.content_language,
490
- :content_type => options.ctype
491
- )
492
- serializer.new(results).write($stdout)
486
+ :modified => options.modified,
487
+ :agent => options.agent,
488
+ :referrer => options.referrer,
489
+ :content_location => options.content_location,
490
+ :content_language => options.content_language,
491
+ :content_type => options.ctype
492
+ )
493
+ serializer.new(results).write($stdout)
493
494
  end
494
495
  end
495
496
  end
@@ -14,7 +14,7 @@ class BetterSGMLParser < HTML::SGMLParser
14
14
 
15
15
  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
16
16
  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
17
- Endtagopen = /<\//u # Matching the Python SGMLParser
17
+ Endtagopen = /<\//u # Changed the RegExps to match the Python SGMLParser
18
18
  Endbracket = /[<>]/u
19
19
  Declopen = /<!/u
20
20
  Piopenbegin = /^<\?/u
@@ -24,8 +24,8 @@ class BetterSGMLParser < HTML::SGMLParser
24
24
  Commentclose = /--\s*>/u
25
25
  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
26
26
  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
27
- '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
28
- 64)
27
+ '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
28
+ 64)
29
29
  Endtagfind = /\s*\/\s*>/u
30
30
  def initialize(verbose=false)
31
31
  super(verbose)
@@ -40,98 +40,98 @@ class BetterSGMLParser < HTML::SGMLParser
40
40
  n = rawdata.length
41
41
  while i < n
42
42
  if @nomoretags
43
- # handle_data_range does nothing more than set a "Range" that is never used. wtf?
44
- handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
45
- i = n
46
- break
43
+ # handle_data_range does nothing more than set a "Range" that is never used. wtf?
44
+ handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
45
+ i = n
46
+ break
47
47
  end
48
48
  j = rawdata.index(Interesting, i)
49
49
  j = n unless j
50
50
  handle_data(rawdata[i...j]) if i < j
51
51
  i = j
52
52
  break if (i == n)
53
- if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
54
- if rawdata.index(Starttagopen,i) == i
55
- if @literal
56
- handle_data(rawdata[i..i])
57
- i = i+1
58
- next
59
- end
60
- k = parse_starttag(i)
61
- break unless k
62
- i = k
63
- next
64
- end
65
- if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
66
- k = parse_endtag(i)
67
- break unless k
68
- i = k
69
- @literal = false
70
- next
71
- end
72
- if @literal
73
- if n > (i+1)
74
- handle_data("<")
75
- i = i+1
76
- else
77
- #incomplete
78
- break
79
- end
80
- next
81
- end
82
- if rawdata.index(Commentopen,i) == i
83
- k = parse_comment(i)
84
- break unless k
85
- i = k
86
- next
87
- end
88
- if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
89
- k = parse_pi(i)
90
- break unless k
91
- i += k
92
- next
93
- end
94
- if rawdata.index(Declopen,i) == i
95
- # This is some sort of declaration; in "HTML as
96
- # deployed," this should only be the document type
97
- # declaration ("<!DOCTYPE html...>").
98
- k = parse_declaration(i)
99
- break unless k
100
- i = k
101
- next
102
- end
53
+ if rawdata[i..i] == '<' # Yeah, ugly, but I prefer it to rawdata[i] == ?<
54
+ if rawdata.index(Starttagopen,i) == i
55
+ if @literal
56
+ handle_data(rawdata[i..i])
57
+ i = i+1
58
+ next
59
+ end
60
+ k = parse_starttag(i)
61
+ break unless k
62
+ i = k
63
+ next
64
+ end
65
+ if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
66
+ k = parse_endtag(i)
67
+ break unless k
68
+ i = k
69
+ @literal = false
70
+ next
71
+ end
72
+ if @literal
73
+ if n > (i+1)
74
+ handle_data("<")
75
+ i = i+1
76
+ else
77
+ #incomplete
78
+ break
79
+ end
80
+ next
81
+ end
82
+ if rawdata.index(Commentopen,i) == i
83
+ k = parse_comment(i)
84
+ break unless k
85
+ i = k
86
+ next
87
+ end
88
+ if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
89
+ k = parse_pi(i)
90
+ break unless k
91
+ i += k
92
+ next
93
+ end
94
+ if rawdata.index(Declopen,i) == i
95
+ # This is some sort of declaration; in "HTML as
96
+ # deployed," this should only be the document type
97
+ # declaration ("<!DOCTYPE html...>").
98
+ k = parse_declaration(i)
99
+ break unless k
100
+ i = k
101
+ next
102
+ end
103
103
  elsif rawdata[i..i] == '&'
104
- if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
105
- handle_data(rawdata[i..i])
106
- i += 1
107
- next
108
- end
104
+ if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
105
+ handle_data(rawdata[i..i])
106
+ i += 1
107
+ next
108
+ end
109
109
 
110
- # the Char must come first as its #=~ method is the only one that is UTF-8 safe
111
- ni,match = index_match(rawdata, Charref, i)
112
- if ni and ni == i # See? Ugly
113
- handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
114
- i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
115
- i -= 1 unless rawdata[i-1..i-1] == ";"
116
- next
117
- end
118
- ni,match = index_match(rawdata, Entityref, i)
119
- if ni and ni == i
120
- handle_entityref(match[1])
121
- i += match[0].length
122
- i -= 1 unless rawdata[i-1..i-1] == ";"
123
- next
124
- end
110
+ # the Char must come first as its #=~ method is the only one that is UTF-8 safe
111
+ ni,match = index_match(rawdata, Charref, i)
112
+ if ni and ni == i # See? Ugly
113
+ handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
114
+ i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
115
+ i -= 1 unless rawdata[i-1..i-1] == ";"
116
+ next
117
+ end
118
+ ni,match = index_match(rawdata, Entityref, i)
119
+ if ni and ni == i
120
+ handle_entityref(match[1])
121
+ i += match[0].length
122
+ i -= 1 unless rawdata[i-1..i-1] == ";"
123
+ next
124
+ end
125
125
  else
126
- error('neither < nor & ??')
126
+ error('neither < nor & ??')
127
127
  end
128
128
  # We get here only if incomplete matches but
129
129
  # nothing else
130
130
  ni,match = index_match(rawdata,Incomplete,i)
131
131
  unless ni and ni == 0
132
- handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
133
- i += 1
134
- next
132
+ handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
133
+ i += 1
134
+ next
135
135
  end
136
136
  j = ni + match[0].length
137
137
  break if j == n # Really incomplete
@@ -206,7 +206,7 @@ class BetterSGMLParser < HTML::SGMLParser
206
206
  else
207
207
  ni,match = index_match(rawdata,Tagfind,i+1)
208
208
  unless match
209
- error('unexpected call to parse_starttag')
209
+ error('unexpected call to parse_starttag')
210
210
  end
211
211
  k = ni+match[0].length+1
212
212
  tag = match[0].downcase
@@ -220,9 +220,9 @@ class BetterSGMLParser < HTML::SGMLParser
220
220
  matched_length = match[0].length
221
221
  attrname, rest, attrvalue = match[1],match[2],match[3]
222
222
  if rest.nil? or rest.empty?
223
- attrvalue = '' # was: = attrname # Why the change?
223
+ attrvalue = '' # was: = attrname # Why the change?
224
224
  elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
225
- attrvalue = attrvalue[1...-1]
225
+ attrvalue = attrvalue[1...-1]
226
226
  end
227
227
  attrsd << [attrname.downcase, attrvalue]
228
228
  k += matched_length