rfeedparser 0.9.92 → 0.9.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rfeedparser.rb +106 -105
- data/lib/rfeedparser/better_sgmlparser.rb +84 -84
- data/lib/rfeedparser/encoding_helpers.rb +4 -3
- data/lib/rfeedparser/parser_mixin.rb +121 -118
- data/lib/rfeedparser/parsers.rb +31 -30
- data/lib/rfeedparser/scrub.rb +1 -1
- data/lib/rfeedparser/time_helpers.rb +52 -54
- data/tests/rfponly/wellformed/mrss/mrss_media_content.xml +20 -0
- data/tests/rfponly/wellformed/mrss/mrss_thumbnail.xml +21 -0
- metadata +10 -5
data/lib/rfeedparser.rb
CHANGED
@@ -56,38 +56,38 @@ include FeedParserUtilities
|
|
56
56
|
|
57
57
|
|
58
58
|
module FeedParser
|
59
|
-
Version = "0.9.92"
|
59
|
+
Version = "0.9.93"
|
60
60
|
|
61
61
|
License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
62
62
|
|
63
|
-
Redistribution and use in source and binary forms, with or without modification,
|
64
|
-
are permitted provided that the following conditions are met:
|
63
|
+
Redistribution and use in source and binary forms, with or without modification,
|
64
|
+
are permitted provided that the following conditions are met:
|
65
65
|
|
66
|
-
* Redistributions of source code must retain the above copyright notice,
|
66
|
+
* Redistributions of source code must retain the above copyright notice,
|
67
67
|
this list of conditions and the following disclaimer.
|
68
|
-
* Redistributions in binary form must reproduce the above copyright notice,
|
68
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
69
69
|
this list of conditions and the following disclaimer in the documentation
|
70
70
|
and/or other materials provided with the distribution.
|
71
71
|
|
72
|
-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
73
|
-
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
74
|
-
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
75
|
-
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
76
|
-
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
77
|
-
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
78
|
-
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
79
|
-
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
80
|
-
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
81
|
-
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
82
|
-
POSSIBILITY OF SUCH DAMAGE."""
|
72
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
73
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
74
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
75
|
+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
76
|
+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
77
|
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
78
|
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
79
|
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
80
|
+
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
81
|
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
82
|
+
POSSIBILITY OF SUCH DAMAGE."""
|
83
83
|
|
84
84
|
Author = "Jeff Hodges <http://somethingsimilar.com>"
|
85
85
|
Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
|
86
86
|
Contributors = [ "Jason Diamond <http://injektilo.org/>",
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
87
|
+
"John Beimler <http://john.beimler.org/>",
|
88
|
+
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
|
89
|
+
"Aaron Swartz <http://aaronsw.com/>",
|
90
|
+
"Kevin Marks <http://epeus.blogspot.com/>"
|
91
91
|
]
|
92
92
|
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
93
93
|
# If you are embedding feedparser in a larger application, you should
|
@@ -123,25 +123,26 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
123
123
|
|
124
124
|
|
125
125
|
SUPPORTED_VERSIONS = {'' => 'unknown',
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
126
|
+
'rss090' => 'RSS 0.90',
|
127
|
+
'rss091n' => 'RSS 0.91 (Netscape)',
|
128
|
+
'rss091u' => 'RSS 0.91 (Userland)',
|
129
|
+
'rss092' => 'RSS 0.92',
|
130
|
+
'rss093' => 'RSS 0.93',
|
131
|
+
'rss094' => 'RSS 0.94',
|
132
|
+
'rss20' => 'RSS 2.0',
|
133
|
+
'rss10' => 'RSS 1.0',
|
134
|
+
'rss' => 'RSS (unknown version)',
|
135
|
+
'atom01' => 'Atom 0.1',
|
136
|
+
'atom02' => 'Atom 0.2',
|
137
|
+
'atom03' => 'Atom 0.3',
|
138
|
+
'atom10' => 'Atom 1.0',
|
139
|
+
'atom' => 'Atom (unknown version)',
|
140
|
+
'cdf' => 'CDF',
|
141
|
+
'hotrss' => 'Hot RSS'
|
142
142
|
}
|
143
|
-
|
143
|
+
|
144
144
|
def parse(furi, options = {})
|
145
|
+
furi.strip!
|
145
146
|
# Parse a feed from a URL, file, stream or string
|
146
147
|
$compatible = options[:compatible].nil? ? $compatible : options[:compatible]# Use the default compatibility if compatible is nil
|
147
148
|
strictklass = options[:strict] || StrictFeedParser
|
@@ -189,27 +190,27 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
189
190
|
end
|
190
191
|
begin
|
191
192
|
if f.meta
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
193
|
+
result['etag'] = options[:etag] || f.meta['etag']
|
194
|
+
result['modified'] = options[:modified] || f.last_modified
|
195
|
+
result['url'] = f.base_uri.to_s
|
196
|
+
result['status'] = f.status[0] || 200
|
197
|
+
result['headers'] = f.meta
|
198
|
+
result['headers']['content-location'] ||= options[:content_location] unless options[:content_location].nil?
|
199
|
+
result['headers']['content-language'] ||= options[:content_language] unless options[:content_language].nil?
|
200
|
+
result['headers']['content-type'] ||= options[:content_type] unless options[:content_type].nil?
|
200
201
|
end
|
201
202
|
rescue NoMethodError
|
202
203
|
result['headers'] = {}
|
203
204
|
result['etag'] = result['headers']['etag'] = options[:etag] unless options[:etag].nil?
|
204
205
|
result['modified'] = result['headers']['last-modified'] = options[:modified] unless options[:modified].nil?
|
205
206
|
unless options[:content_location].nil?
|
206
|
-
|
207
|
+
result['headers']['content-location'] = options[:content_location]
|
207
208
|
end
|
208
209
|
unless options[:content_language].nil?
|
209
|
-
|
210
|
+
result['headers']['content-language'] = options[:content_language]
|
210
211
|
end
|
211
212
|
unless options[:content_type].nil?
|
212
|
-
|
213
|
+
result['headers']['content-type'] = options[:content_type]
|
213
214
|
end
|
214
215
|
end
|
215
216
|
|
@@ -221,13 +222,13 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
221
222
|
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
|
222
223
|
http_headers = result['headers']
|
223
224
|
result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
|
224
|
-
|
225
|
+
self.getCharacterEncoding(f,data)
|
225
226
|
|
226
227
|
if not http_headers.empty? and not acceptable_content_type
|
227
228
|
if http_headers.has_key?('content-type')
|
228
|
-
|
229
|
+
bozo_message = "#{http_headers['content-type']} is not an XML media type"
|
229
230
|
else
|
230
|
-
|
231
|
+
bozo_message = 'no Content-type specified'
|
231
232
|
end
|
232
233
|
result['bozo'] = true
|
233
234
|
result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
|
@@ -260,21 +261,21 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
260
261
|
next if tried_encodings.include? proposed_encoding
|
261
262
|
tried_encodings << proposed_encoding
|
262
263
|
begin
|
263
|
-
|
264
|
-
|
265
|
-
|
264
|
+
data = self.toUTF8(data, proposed_encoding)
|
265
|
+
known_encoding = use_strict_parser = true
|
266
|
+
break
|
266
267
|
rescue
|
267
268
|
end
|
268
269
|
end
|
269
270
|
# if no luck and we have auto-detection library, try that
|
270
271
|
if not known_encoding and $chardet
|
271
272
|
begin
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
273
|
+
proposed_encoding = CharDet.detect(data)['encoding']
|
274
|
+
if proposed_encoding and not tried_encodings.include?proposed_encoding
|
275
|
+
tried_encodings << proposed_encoding
|
276
|
+
data = self.toUTF8(data, proposed_encoding)
|
277
|
+
known_encoding = use_strict_parser = true
|
278
|
+
end
|
278
279
|
rescue
|
279
280
|
end
|
280
281
|
end
|
@@ -284,24 +285,24 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
284
285
|
# if still no luck and we haven't tried utf-8 yet, try that
|
285
286
|
if not known_encoding and not tried_encodings.include?'utf-8'
|
286
287
|
begin
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
288
|
+
proposed_encoding = 'utf-8'
|
289
|
+
tried_encodings << proposed_encoding
|
290
|
+
data = self.toUTF8(data, proposed_encoding)
|
291
|
+
known_encoding = use_strict_parser = true
|
291
292
|
rescue
|
292
293
|
end
|
293
294
|
end
|
294
295
|
# if still no luck and we haven't tried windows-1252 yet, try that
|
295
296
|
if not known_encoding and not tried_encodings.include?'windows-1252'
|
296
297
|
begin
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
298
|
+
proposed_encoding = 'windows-1252'
|
299
|
+
tried_encodings << proposed_encoding
|
300
|
+
data = self.toUTF8(data, proposed_encoding)
|
301
|
+
known_encoding = use_strict_parser = true
|
301
302
|
rescue
|
302
303
|
end
|
303
304
|
end
|
304
|
-
|
305
|
+
|
305
306
|
# NOTE this isn't in FeedParser.py 4.1
|
306
307
|
# if still no luck and we haven't tried iso-8859-2 yet, try that.
|
307
308
|
#if not known_encoding and not tried_encodings.include?'iso-8859-2'
|
@@ -338,15 +339,15 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
338
339
|
inputdata = XML::SAX::InputSource.new('parsedfeed')
|
339
340
|
inputdata.setByteStream(StringIO.new(data))
|
340
341
|
begin
|
341
|
-
|
342
|
+
saxparser.parse(inputdata)
|
342
343
|
rescue Exception => parseerr # resparse
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
344
|
+
if $debug
|
345
|
+
$stderr << "xml parsing failed\n"
|
346
|
+
$stderr << parseerr.to_s+"\n" # Hrmph.
|
347
|
+
end
|
348
|
+
result['bozo'] = true
|
349
|
+
result['bozo_exception'] = feedparser.exc || e
|
350
|
+
use_strict_parser = false
|
350
351
|
end
|
351
352
|
end
|
352
353
|
if not use_strict_parser
|
@@ -378,22 +379,22 @@ class TextSerializer < Serializer
|
|
378
379
|
return if (node.nil? or node.empty?)
|
379
380
|
if node.methods.include?'keys'
|
380
381
|
node.keys.sort.each do |key|
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
382
|
+
next if ['description','link'].include? key
|
383
|
+
next if node.has_key? k+'_detail'
|
384
|
+
next if node.has_key? k+'_parsed'
|
385
|
+
writer(stream,node[k], prefix+k+'.')
|
385
386
|
end
|
386
387
|
elsif node.class == Array
|
387
388
|
node.each_with_index do |thing, index|
|
388
|
-
|
389
|
+
writer(stream, thing, prefix[0..-2] + '[' + index.to_s + '].')
|
389
390
|
end
|
390
391
|
else
|
391
392
|
begin
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
393
|
+
s = u(node.to_s)
|
394
|
+
stream << prefix[0..-2]
|
395
|
+
stream << '='
|
396
|
+
stream << s
|
397
|
+
stream << "\n"
|
397
398
|
rescue
|
398
399
|
end
|
399
400
|
end
|
@@ -422,49 +423,49 @@ if $0 == __FILE__
|
|
422
423
|
opts.banner
|
423
424
|
opts.separator ""
|
424
425
|
opts.on("-A", "--user-agent [AGENT]",
|
425
|
-
|
426
|
+
"User-Agent for HTTP URLs") {|agent|
|
426
427
|
options.agent = agent
|
427
428
|
}
|
428
429
|
|
429
430
|
opts.on("-e", "--referrer [URL]",
|
430
|
-
|
431
|
+
"Referrer for HTTP URLs") {|referrer|
|
431
432
|
options.referrer = referrer
|
432
433
|
}
|
433
434
|
|
434
435
|
opts.on("-t", "--etag [TAG]",
|
435
|
-
|
436
|
+
"ETag/If-None-Match for HTTP URLs") {|etag|
|
436
437
|
options.etag = etag
|
437
438
|
}
|
438
439
|
|
439
440
|
opts.on("-m", "--last-modified [DATE]",
|
440
|
-
|
441
|
+
"Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
|
441
442
|
options.modified = modified
|
442
443
|
}
|
443
444
|
|
444
445
|
opts.on("-f", "--format [FORMAT]", [:text, :pprint],
|
445
|
-
|
446
|
+
"output resutls in FORMAT (text, pprint)") {|format|
|
446
447
|
options.format = format
|
447
448
|
}
|
448
449
|
|
449
450
|
opts.on("-v", "--[no-]verbose",
|
450
|
-
|
451
|
+
"write debugging information to stderr") {|v|
|
451
452
|
options.verbose = v
|
452
453
|
}
|
453
454
|
|
454
455
|
opts.on("-c", "--[no-]compatible",
|
455
|
-
|
456
|
+
"strip element attributes like feedparser.py 4.1 (default)") {|comp|
|
456
457
|
options.compatible = comp
|
457
458
|
}
|
458
459
|
opts.on("-l", "--content-location [LOCATION]",
|
459
|
-
|
460
|
+
"default Content-Location HTTP header") {|loc|
|
460
461
|
options.content_location = loc
|
461
462
|
}
|
462
463
|
opts.on("-a", "--content-language [LANG]",
|
463
|
-
|
464
|
+
"default Content-Language HTTP header") {|lang|
|
464
465
|
options.content_language = lang
|
465
466
|
}
|
466
467
|
opts.on("-t", "--content-type [TYPE]",
|
467
|
-
|
468
|
+
"default Content-type HTTP header") {|ctype|
|
468
469
|
options.ctype = ctype
|
469
470
|
}
|
470
471
|
end
|
@@ -482,14 +483,14 @@ if $0 == __FILE__
|
|
482
483
|
unless args.nil?
|
483
484
|
args.each do |url| # opts.parse! removes everything but the urls from the command line
|
484
485
|
results = FeedParser.parse(url, :etag => options.etag,
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
486
|
+
:modified => options.modified,
|
487
|
+
:agent => options.agent,
|
488
|
+
:referrer => options.referrer,
|
489
|
+
:content_location => options.content_location,
|
490
|
+
:content_language => options.content_language,
|
491
|
+
:content_type => options.ctype
|
492
|
+
)
|
493
|
+
serializer.new(results).write($stdout)
|
493
494
|
end
|
494
495
|
end
|
495
496
|
end
|
@@ -14,7 +14,7 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
14
14
|
|
15
15
|
Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
|
16
16
|
Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
|
17
|
-
Endtagopen = /<\//u #
|
17
|
+
Endtagopen = /<\//u # Changed the RegExps to match the Python SGMLParser
|
18
18
|
Endbracket = /[<>]/u
|
19
19
|
Declopen = /<!/u
|
20
20
|
Piopenbegin = /^<\?/u
|
@@ -24,8 +24,8 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
24
24
|
Commentclose = /--\s*>/u
|
25
25
|
Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
|
26
26
|
Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
|
27
|
-
|
28
|
-
|
27
|
+
'(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
|
28
|
+
64)
|
29
29
|
Endtagfind = /\s*\/\s*>/u
|
30
30
|
def initialize(verbose=false)
|
31
31
|
super(verbose)
|
@@ -40,98 +40,98 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
40
40
|
n = rawdata.length
|
41
41
|
while i < n
|
42
42
|
if @nomoretags
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
43
|
+
# handle_data_range does nothing more than set a "Range" that is never used. wtf?
|
44
|
+
handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
|
45
|
+
i = n
|
46
|
+
break
|
47
47
|
end
|
48
48
|
j = rawdata.index(Interesting, i)
|
49
49
|
j = n unless j
|
50
50
|
handle_data(rawdata[i...j]) if i < j
|
51
51
|
i = j
|
52
52
|
break if (i == n)
|
53
|
-
if rawdata[i..i] == '<' #
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
53
|
+
if rawdata[i..i] == '<' # Yeah, ugly, but I prefer it to rawdata[i] == ?<
|
54
|
+
if rawdata.index(Starttagopen,i) == i
|
55
|
+
if @literal
|
56
|
+
handle_data(rawdata[i..i])
|
57
|
+
i = i+1
|
58
|
+
next
|
59
|
+
end
|
60
|
+
k = parse_starttag(i)
|
61
|
+
break unless k
|
62
|
+
i = k
|
63
|
+
next
|
64
|
+
end
|
65
|
+
if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
|
66
|
+
k = parse_endtag(i)
|
67
|
+
break unless k
|
68
|
+
i = k
|
69
|
+
@literal = false
|
70
|
+
next
|
71
|
+
end
|
72
|
+
if @literal
|
73
|
+
if n > (i+1)
|
74
|
+
handle_data("<")
|
75
|
+
i = i+1
|
76
|
+
else
|
77
|
+
#incomplete
|
78
|
+
break
|
79
|
+
end
|
80
|
+
next
|
81
|
+
end
|
82
|
+
if rawdata.index(Commentopen,i) == i
|
83
|
+
k = parse_comment(i)
|
84
|
+
break unless k
|
85
|
+
i = k
|
86
|
+
next
|
87
|
+
end
|
88
|
+
if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
|
89
|
+
k = parse_pi(i)
|
90
|
+
break unless k
|
91
|
+
i += k
|
92
|
+
next
|
93
|
+
end
|
94
|
+
if rawdata.index(Declopen,i) == i
|
95
|
+
# This is some sort of declaration; in "HTML as
|
96
|
+
# deployed," this should only be the document type
|
97
|
+
# declaration ("<!DOCTYPE html...>").
|
98
|
+
k = parse_declaration(i)
|
99
|
+
break unless k
|
100
|
+
i = k
|
101
|
+
next
|
102
|
+
end
|
103
103
|
elsif rawdata[i..i] == '&'
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
104
|
+
if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
|
105
|
+
handle_data(rawdata[i..i])
|
106
|
+
i += 1
|
107
|
+
next
|
108
|
+
end
|
109
109
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
110
|
+
# the Char must come first as its #=~ method is the only one that is UTF-8 safe
|
111
|
+
ni,match = index_match(rawdata, Charref, i)
|
112
|
+
if ni and ni == i # See? Ugly
|
113
|
+
handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
|
114
|
+
i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
|
115
|
+
i -= 1 unless rawdata[i-1..i-1] == ";"
|
116
|
+
next
|
117
|
+
end
|
118
|
+
ni,match = index_match(rawdata, Entityref, i)
|
119
|
+
if ni and ni == i
|
120
|
+
handle_entityref(match[1])
|
121
|
+
i += match[0].length
|
122
|
+
i -= 1 unless rawdata[i-1..i-1] == ";"
|
123
|
+
next
|
124
|
+
end
|
125
125
|
else
|
126
|
-
|
126
|
+
error('neither < nor & ??')
|
127
127
|
end
|
128
128
|
# We get here only if incomplete matches but
|
129
129
|
# nothing else
|
130
130
|
ni,match = index_match(rawdata,Incomplete,i)
|
131
131
|
unless ni and ni == 0
|
132
|
-
|
133
|
-
|
134
|
-
|
132
|
+
handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
|
133
|
+
i += 1
|
134
|
+
next
|
135
135
|
end
|
136
136
|
j = ni + match[0].length
|
137
137
|
break if j == n # Really incomplete
|
@@ -206,7 +206,7 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
206
206
|
else
|
207
207
|
ni,match = index_match(rawdata,Tagfind,i+1)
|
208
208
|
unless match
|
209
|
-
|
209
|
+
error('unexpected call to parse_starttag')
|
210
210
|
end
|
211
211
|
k = ni+match[0].length+1
|
212
212
|
tag = match[0].downcase
|
@@ -220,9 +220,9 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
220
220
|
matched_length = match[0].length
|
221
221
|
attrname, rest, attrvalue = match[1],match[2],match[3]
|
222
222
|
if rest.nil? or rest.empty?
|
223
|
-
|
223
|
+
attrvalue = '' # was: = attrname # Why the change?
|
224
224
|
elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
|
225
|
-
|
225
|
+
attrvalue = attrvalue[1...-1]
|
226
226
|
end
|
227
227
|
attrsd << [attrname.downcase, attrvalue]
|
228
228
|
k += matched_length
|