rfeedparser 0.9.92 → 0.9.93
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rfeedparser.rb +106 -105
- data/lib/rfeedparser/better_sgmlparser.rb +84 -84
- data/lib/rfeedparser/encoding_helpers.rb +4 -3
- data/lib/rfeedparser/parser_mixin.rb +121 -118
- data/lib/rfeedparser/parsers.rb +31 -30
- data/lib/rfeedparser/scrub.rb +1 -1
- data/lib/rfeedparser/time_helpers.rb +52 -54
- data/tests/rfponly/wellformed/mrss/mrss_media_content.xml +20 -0
- data/tests/rfponly/wellformed/mrss/mrss_thumbnail.xml +21 -0
- metadata +10 -5
data/lib/rfeedparser.rb
CHANGED
@@ -56,38 +56,38 @@ include FeedParserUtilities
|
|
56
56
|
|
57
57
|
|
58
58
|
module FeedParser
|
59
|
-
Version = "0.9.92"
|
59
|
+
Version = "0.9.93"
|
60
60
|
|
61
61
|
License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
62
62
|
|
63
|
-
Redistribution and use in source and binary forms, with or without modification,
|
64
|
-
are permitted provided that the following conditions are met:
|
63
|
+
Redistribution and use in source and binary forms, with or without modification,
|
64
|
+
are permitted provided that the following conditions are met:
|
65
65
|
|
66
|
-
* Redistributions of source code must retain the above copyright notice,
|
66
|
+
* Redistributions of source code must retain the above copyright notice,
|
67
67
|
this list of conditions and the following disclaimer.
|
68
|
-
* Redistributions in binary form must reproduce the above copyright notice,
|
68
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
69
69
|
this list of conditions and the following disclaimer in the documentation
|
70
70
|
and/or other materials provided with the distribution.
|
71
71
|
|
72
|
-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
73
|
-
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
74
|
-
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
75
|
-
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
76
|
-
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
77
|
-
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
78
|
-
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
79
|
-
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
80
|
-
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
81
|
-
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
82
|
-
POSSIBILITY OF SUCH DAMAGE."""
|
72
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
73
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
74
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
75
|
+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
76
|
+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
77
|
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
78
|
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
79
|
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
80
|
+
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
81
|
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
82
|
+
POSSIBILITY OF SUCH DAMAGE."""
|
83
83
|
|
84
84
|
Author = "Jeff Hodges <http://somethingsimilar.com>"
|
85
85
|
Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
|
86
86
|
Contributors = [ "Jason Diamond <http://injektilo.org/>",
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
87
|
+
"John Beimler <http://john.beimler.org/>",
|
88
|
+
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
|
89
|
+
"Aaron Swartz <http://aaronsw.com/>",
|
90
|
+
"Kevin Marks <http://epeus.blogspot.com/>"
|
91
91
|
]
|
92
92
|
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
93
93
|
# If you are embedding feedparser in a larger application, you should
|
@@ -123,25 +123,26 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
123
123
|
|
124
124
|
|
125
125
|
SUPPORTED_VERSIONS = {'' => 'unknown',
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
126
|
+
'rss090' => 'RSS 0.90',
|
127
|
+
'rss091n' => 'RSS 0.91 (Netscape)',
|
128
|
+
'rss091u' => 'RSS 0.91 (Userland)',
|
129
|
+
'rss092' => 'RSS 0.92',
|
130
|
+
'rss093' => 'RSS 0.93',
|
131
|
+
'rss094' => 'RSS 0.94',
|
132
|
+
'rss20' => 'RSS 2.0',
|
133
|
+
'rss10' => 'RSS 1.0',
|
134
|
+
'rss' => 'RSS (unknown version)',
|
135
|
+
'atom01' => 'Atom 0.1',
|
136
|
+
'atom02' => 'Atom 0.2',
|
137
|
+
'atom03' => 'Atom 0.3',
|
138
|
+
'atom10' => 'Atom 1.0',
|
139
|
+
'atom' => 'Atom (unknown version)',
|
140
|
+
'cdf' => 'CDF',
|
141
|
+
'hotrss' => 'Hot RSS'
|
142
142
|
}
|
143
|
-
|
143
|
+
|
144
144
|
def parse(furi, options = {})
|
145
|
+
furi.strip!
|
145
146
|
# Parse a feed from a URL, file, stream or string
|
146
147
|
$compatible = options[:compatible].nil? ? $compatible : options[:compatible]# Use the default compatibility if compatible is nil
|
147
148
|
strictklass = options[:strict] || StrictFeedParser
|
@@ -189,27 +190,27 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
189
190
|
end
|
190
191
|
begin
|
191
192
|
if f.meta
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
193
|
+
result['etag'] = options[:etag] || f.meta['etag']
|
194
|
+
result['modified'] = options[:modified] || f.last_modified
|
195
|
+
result['url'] = f.base_uri.to_s
|
196
|
+
result['status'] = f.status[0] || 200
|
197
|
+
result['headers'] = f.meta
|
198
|
+
result['headers']['content-location'] ||= options[:content_location] unless options[:content_location].nil?
|
199
|
+
result['headers']['content-language'] ||= options[:content_language] unless options[:content_language].nil?
|
200
|
+
result['headers']['content-type'] ||= options[:content_type] unless options[:content_type].nil?
|
200
201
|
end
|
201
202
|
rescue NoMethodError
|
202
203
|
result['headers'] = {}
|
203
204
|
result['etag'] = result['headers']['etag'] = options[:etag] unless options[:etag].nil?
|
204
205
|
result['modified'] = result['headers']['last-modified'] = options[:modified] unless options[:modified].nil?
|
205
206
|
unless options[:content_location].nil?
|
206
|
-
|
207
|
+
result['headers']['content-location'] = options[:content_location]
|
207
208
|
end
|
208
209
|
unless options[:content_language].nil?
|
209
|
-
|
210
|
+
result['headers']['content-language'] = options[:content_language]
|
210
211
|
end
|
211
212
|
unless options[:content_type].nil?
|
212
|
-
|
213
|
+
result['headers']['content-type'] = options[:content_type]
|
213
214
|
end
|
214
215
|
end
|
215
216
|
|
@@ -221,13 +222,13 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
221
222
|
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
|
222
223
|
http_headers = result['headers']
|
223
224
|
result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
|
224
|
-
|
225
|
+
self.getCharacterEncoding(f,data)
|
225
226
|
|
226
227
|
if not http_headers.empty? and not acceptable_content_type
|
227
228
|
if http_headers.has_key?('content-type')
|
228
|
-
|
229
|
+
bozo_message = "#{http_headers['content-type']} is not an XML media type"
|
229
230
|
else
|
230
|
-
|
231
|
+
bozo_message = 'no Content-type specified'
|
231
232
|
end
|
232
233
|
result['bozo'] = true
|
233
234
|
result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
|
@@ -260,21 +261,21 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
260
261
|
next if tried_encodings.include? proposed_encoding
|
261
262
|
tried_encodings << proposed_encoding
|
262
263
|
begin
|
263
|
-
|
264
|
-
|
265
|
-
|
264
|
+
data = self.toUTF8(data, proposed_encoding)
|
265
|
+
known_encoding = use_strict_parser = true
|
266
|
+
break
|
266
267
|
rescue
|
267
268
|
end
|
268
269
|
end
|
269
270
|
# if no luck and we have auto-detection library, try that
|
270
271
|
if not known_encoding and $chardet
|
271
272
|
begin
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
273
|
+
proposed_encoding = CharDet.detect(data)['encoding']
|
274
|
+
if proposed_encoding and not tried_encodings.include?proposed_encoding
|
275
|
+
tried_encodings << proposed_encoding
|
276
|
+
data = self.toUTF8(data, proposed_encoding)
|
277
|
+
known_encoding = use_strict_parser = true
|
278
|
+
end
|
278
279
|
rescue
|
279
280
|
end
|
280
281
|
end
|
@@ -284,24 +285,24 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
284
285
|
# if still no luck and we haven't tried utf-8 yet, try that
|
285
286
|
if not known_encoding and not tried_encodings.include?'utf-8'
|
286
287
|
begin
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
288
|
+
proposed_encoding = 'utf-8'
|
289
|
+
tried_encodings << proposed_encoding
|
290
|
+
data = self.toUTF8(data, proposed_encoding)
|
291
|
+
known_encoding = use_strict_parser = true
|
291
292
|
rescue
|
292
293
|
end
|
293
294
|
end
|
294
295
|
# if still no luck and we haven't tried windows-1252 yet, try that
|
295
296
|
if not known_encoding and not tried_encodings.include?'windows-1252'
|
296
297
|
begin
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
298
|
+
proposed_encoding = 'windows-1252'
|
299
|
+
tried_encodings << proposed_encoding
|
300
|
+
data = self.toUTF8(data, proposed_encoding)
|
301
|
+
known_encoding = use_strict_parser = true
|
301
302
|
rescue
|
302
303
|
end
|
303
304
|
end
|
304
|
-
|
305
|
+
|
305
306
|
# NOTE this isn't in FeedParser.py 4.1
|
306
307
|
# if still no luck and we haven't tried iso-8859-2 yet, try that.
|
307
308
|
#if not known_encoding and not tried_encodings.include?'iso-8859-2'
|
@@ -338,15 +339,15 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
338
339
|
inputdata = XML::SAX::InputSource.new('parsedfeed')
|
339
340
|
inputdata.setByteStream(StringIO.new(data))
|
340
341
|
begin
|
341
|
-
|
342
|
+
saxparser.parse(inputdata)
|
342
343
|
rescue Exception => parseerr # resparse
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
344
|
+
if $debug
|
345
|
+
$stderr << "xml parsing failed\n"
|
346
|
+
$stderr << parseerr.to_s+"\n" # Hrmph.
|
347
|
+
end
|
348
|
+
result['bozo'] = true
|
349
|
+
result['bozo_exception'] = feedparser.exc || e
|
350
|
+
use_strict_parser = false
|
350
351
|
end
|
351
352
|
end
|
352
353
|
if not use_strict_parser
|
@@ -378,22 +379,22 @@ class TextSerializer < Serializer
|
|
378
379
|
return if (node.nil? or node.empty?)
|
379
380
|
if node.methods.include?'keys'
|
380
381
|
node.keys.sort.each do |key|
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
382
|
+
next if ['description','link'].include? key
|
383
|
+
next if node.has_key? k+'_detail'
|
384
|
+
next if node.has_key? k+'_parsed'
|
385
|
+
writer(stream,node[k], prefix+k+'.')
|
385
386
|
end
|
386
387
|
elsif node.class == Array
|
387
388
|
node.each_with_index do |thing, index|
|
388
|
-
|
389
|
+
writer(stream, thing, prefix[0..-2] + '[' + index.to_s + '].')
|
389
390
|
end
|
390
391
|
else
|
391
392
|
begin
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
393
|
+
s = u(node.to_s)
|
394
|
+
stream << prefix[0..-2]
|
395
|
+
stream << '='
|
396
|
+
stream << s
|
397
|
+
stream << "\n"
|
397
398
|
rescue
|
398
399
|
end
|
399
400
|
end
|
@@ -422,49 +423,49 @@ if $0 == __FILE__
|
|
422
423
|
opts.banner
|
423
424
|
opts.separator ""
|
424
425
|
opts.on("-A", "--user-agent [AGENT]",
|
425
|
-
|
426
|
+
"User-Agent for HTTP URLs") {|agent|
|
426
427
|
options.agent = agent
|
427
428
|
}
|
428
429
|
|
429
430
|
opts.on("-e", "--referrer [URL]",
|
430
|
-
|
431
|
+
"Referrer for HTTP URLs") {|referrer|
|
431
432
|
options.referrer = referrer
|
432
433
|
}
|
433
434
|
|
434
435
|
opts.on("-t", "--etag [TAG]",
|
435
|
-
|
436
|
+
"ETag/If-None-Match for HTTP URLs") {|etag|
|
436
437
|
options.etag = etag
|
437
438
|
}
|
438
439
|
|
439
440
|
opts.on("-m", "--last-modified [DATE]",
|
440
|
-
|
441
|
+
"Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
|
441
442
|
options.modified = modified
|
442
443
|
}
|
443
444
|
|
444
445
|
opts.on("-f", "--format [FORMAT]", [:text, :pprint],
|
445
|
-
|
446
|
+
"output resutls in FORMAT (text, pprint)") {|format|
|
446
447
|
options.format = format
|
447
448
|
}
|
448
449
|
|
449
450
|
opts.on("-v", "--[no-]verbose",
|
450
|
-
|
451
|
+
"write debugging information to stderr") {|v|
|
451
452
|
options.verbose = v
|
452
453
|
}
|
453
454
|
|
454
455
|
opts.on("-c", "--[no-]compatible",
|
455
|
-
|
456
|
+
"strip element attributes like feedparser.py 4.1 (default)") {|comp|
|
456
457
|
options.compatible = comp
|
457
458
|
}
|
458
459
|
opts.on("-l", "--content-location [LOCATION]",
|
459
|
-
|
460
|
+
"default Content-Location HTTP header") {|loc|
|
460
461
|
options.content_location = loc
|
461
462
|
}
|
462
463
|
opts.on("-a", "--content-language [LANG]",
|
463
|
-
|
464
|
+
"default Content-Language HTTP header") {|lang|
|
464
465
|
options.content_language = lang
|
465
466
|
}
|
466
467
|
opts.on("-t", "--content-type [TYPE]",
|
467
|
-
|
468
|
+
"default Content-type HTTP header") {|ctype|
|
468
469
|
options.ctype = ctype
|
469
470
|
}
|
470
471
|
end
|
@@ -482,14 +483,14 @@ if $0 == __FILE__
|
|
482
483
|
unless args.nil?
|
483
484
|
args.each do |url| # opts.parse! removes everything but the urls from the command line
|
484
485
|
results = FeedParser.parse(url, :etag => options.etag,
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
486
|
+
:modified => options.modified,
|
487
|
+
:agent => options.agent,
|
488
|
+
:referrer => options.referrer,
|
489
|
+
:content_location => options.content_location,
|
490
|
+
:content_language => options.content_language,
|
491
|
+
:content_type => options.ctype
|
492
|
+
)
|
493
|
+
serializer.new(results).write($stdout)
|
493
494
|
end
|
494
495
|
end
|
495
496
|
end
|
@@ -14,7 +14,7 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
14
14
|
|
15
15
|
Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
|
16
16
|
Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
|
17
|
-
Endtagopen = /<\//u #
|
17
|
+
Endtagopen = /<\//u # Changed the RegExps to match the Python SGMLParser
|
18
18
|
Endbracket = /[<>]/u
|
19
19
|
Declopen = /<!/u
|
20
20
|
Piopenbegin = /^<\?/u
|
@@ -24,8 +24,8 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
24
24
|
Commentclose = /--\s*>/u
|
25
25
|
Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
|
26
26
|
Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
|
27
|
-
|
28
|
-
|
27
|
+
'(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
|
28
|
+
64)
|
29
29
|
Endtagfind = /\s*\/\s*>/u
|
30
30
|
def initialize(verbose=false)
|
31
31
|
super(verbose)
|
@@ -40,98 +40,98 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
40
40
|
n = rawdata.length
|
41
41
|
while i < n
|
42
42
|
if @nomoretags
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
43
|
+
# handle_data_range does nothing more than set a "Range" that is never used. wtf?
|
44
|
+
handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
|
45
|
+
i = n
|
46
|
+
break
|
47
47
|
end
|
48
48
|
j = rawdata.index(Interesting, i)
|
49
49
|
j = n unless j
|
50
50
|
handle_data(rawdata[i...j]) if i < j
|
51
51
|
i = j
|
52
52
|
break if (i == n)
|
53
|
-
if rawdata[i..i] == '<' #
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
53
|
+
if rawdata[i..i] == '<' # Yeah, ugly, but I prefer it to rawdata[i] == ?<
|
54
|
+
if rawdata.index(Starttagopen,i) == i
|
55
|
+
if @literal
|
56
|
+
handle_data(rawdata[i..i])
|
57
|
+
i = i+1
|
58
|
+
next
|
59
|
+
end
|
60
|
+
k = parse_starttag(i)
|
61
|
+
break unless k
|
62
|
+
i = k
|
63
|
+
next
|
64
|
+
end
|
65
|
+
if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
|
66
|
+
k = parse_endtag(i)
|
67
|
+
break unless k
|
68
|
+
i = k
|
69
|
+
@literal = false
|
70
|
+
next
|
71
|
+
end
|
72
|
+
if @literal
|
73
|
+
if n > (i+1)
|
74
|
+
handle_data("<")
|
75
|
+
i = i+1
|
76
|
+
else
|
77
|
+
#incomplete
|
78
|
+
break
|
79
|
+
end
|
80
|
+
next
|
81
|
+
end
|
82
|
+
if rawdata.index(Commentopen,i) == i
|
83
|
+
k = parse_comment(i)
|
84
|
+
break unless k
|
85
|
+
i = k
|
86
|
+
next
|
87
|
+
end
|
88
|
+
if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
|
89
|
+
k = parse_pi(i)
|
90
|
+
break unless k
|
91
|
+
i += k
|
92
|
+
next
|
93
|
+
end
|
94
|
+
if rawdata.index(Declopen,i) == i
|
95
|
+
# This is some sort of declaration; in "HTML as
|
96
|
+
# deployed," this should only be the document type
|
97
|
+
# declaration ("<!DOCTYPE html...>").
|
98
|
+
k = parse_declaration(i)
|
99
|
+
break unless k
|
100
|
+
i = k
|
101
|
+
next
|
102
|
+
end
|
103
103
|
elsif rawdata[i..i] == '&'
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
104
|
+
if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
|
105
|
+
handle_data(rawdata[i..i])
|
106
|
+
i += 1
|
107
|
+
next
|
108
|
+
end
|
109
109
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
110
|
+
# the Char must come first as its #=~ method is the only one that is UTF-8 safe
|
111
|
+
ni,match = index_match(rawdata, Charref, i)
|
112
|
+
if ni and ni == i # See? Ugly
|
113
|
+
handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
|
114
|
+
i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
|
115
|
+
i -= 1 unless rawdata[i-1..i-1] == ";"
|
116
|
+
next
|
117
|
+
end
|
118
|
+
ni,match = index_match(rawdata, Entityref, i)
|
119
|
+
if ni and ni == i
|
120
|
+
handle_entityref(match[1])
|
121
|
+
i += match[0].length
|
122
|
+
i -= 1 unless rawdata[i-1..i-1] == ";"
|
123
|
+
next
|
124
|
+
end
|
125
125
|
else
|
126
|
-
|
126
|
+
error('neither < nor & ??')
|
127
127
|
end
|
128
128
|
# We get here only if incomplete matches but
|
129
129
|
# nothing else
|
130
130
|
ni,match = index_match(rawdata,Incomplete,i)
|
131
131
|
unless ni and ni == 0
|
132
|
-
|
133
|
-
|
134
|
-
|
132
|
+
handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
|
133
|
+
i += 1
|
134
|
+
next
|
135
135
|
end
|
136
136
|
j = ni + match[0].length
|
137
137
|
break if j == n # Really incomplete
|
@@ -206,7 +206,7 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
206
206
|
else
|
207
207
|
ni,match = index_match(rawdata,Tagfind,i+1)
|
208
208
|
unless match
|
209
|
-
|
209
|
+
error('unexpected call to parse_starttag')
|
210
210
|
end
|
211
211
|
k = ni+match[0].length+1
|
212
212
|
tag = match[0].downcase
|
@@ -220,9 +220,9 @@ class BetterSGMLParser < HTML::SGMLParser
|
|
220
220
|
matched_length = match[0].length
|
221
221
|
attrname, rest, attrvalue = match[1],match[2],match[3]
|
222
222
|
if rest.nil? or rest.empty?
|
223
|
-
|
223
|
+
attrvalue = '' # was: = attrname # Why the change?
|
224
224
|
elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
|
225
|
-
|
225
|
+
attrvalue = attrvalue[1...-1]
|
226
226
|
end
|
227
227
|
attrsd << [attrname.downcase, attrvalue]
|
228
228
|
k += matched_length
|