htmltokenizer 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +63 -0
- data/lib/html/htmltokenizer.rb +355 -0
- data/test/htmltokenizer_test.rb +92 -0
- metadata +41 -0
data/README
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
htmltokenizer README
|
2
|
+
============
|
3
|
+
|
4
|
+
htmltokenizer is a port of the idea behind Perl's HTML::TokeParser::Simple.
|
5
|
+
The basic concept is that it treats a web page as a series of tokens, which
|
6
|
+
are either text, html tags, or html comments. This class provides a way
|
7
|
+
of getting these tokens in sequence, either one at a time regardless of
|
8
|
+
type, or by choosing a list of interesting tags.
|
9
|
+
|
10
|
+
Requirements
|
11
|
+
------------
|
12
|
+
|
13
|
+
* ruby
|
14
|
+
|
15
|
+
Install
|
16
|
+
-------
|
17
|
+
|
18
|
+
Decompress the archive and enter its top directory.
|
19
|
+
Then type:
|
20
|
+
|
21
|
+
$ ruby install.rb config
|
22
|
+
$ ruby install.rb setup
|
23
|
+
$ su -c "ruby install.rb install"
|
24
|
+
|
25
|
+
or
|
26
|
+
|
27
|
+
$ ruby install.rb config
|
28
|
+
$ ruby install.rb setup
|
29
|
+
$ sudo ruby install.rb install
|
30
|
+
|
31
|
+
You can also install files into your favorite directory
|
32
|
+
by supplying install.rb some options. Try "ruby install.rb --help".
|
33
|
+
|
34
|
+
Usage
|
35
|
+
-----
|
36
|
+
|
37
|
+
require 'html/htmltokenizer'
|
38
|
+
|
39
|
+
page = getSomePageFromTheInternetAsAString()
|
40
|
+
|
41
|
+
tokenizer = HTMLTokenizer.new(page)
|
42
|
+
|
43
|
+
while token = tokenizer.getTag('a', 'font', '/tr', 'div')
|
44
|
+
if 'div' == token.tag_name
|
45
|
+
if 'headlinesheader' == token.attr_hash['class']
|
46
|
+
puts "Header is: " + tokenizer.getTrimmedText('/div')
|
47
|
+
else
|
48
|
+
tokenizer.getTag('/div')
|
49
|
+
token = tokenizer.getTag('a')
|
50
|
+
if token.attr_hash['href']
|
51
|
+
puts "Found a link after a div going to #{token.attr_hash['href']}"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
License
|
58
|
+
-------
|
59
|
+
|
60
|
+
Ruby's license, see http://www.ruby-lang.org/en/LICENSE.txt
|
61
|
+
|
62
|
+
|
63
|
+
Ben Giddings <bg-rubyraa@infofiend.com>
|
@@ -0,0 +1,355 @@
|
|
1
|
+
# = HTMLTokenizer
|
2
|
+
#
|
3
|
+
# Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
|
4
|
+
# Copyright:: Copyright (c) 2004 Ben Giddings
|
5
|
+
# License:: Distributes under the same terms as Ruby
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# This is a partial port of the functionality behind Perl's TokeParser
|
9
|
+
# Provided a page it progressively returns tokens from that page
|
10
|
+
#
|
11
|
+
# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
|
12
|
+
|
13
|
+
#
|
14
|
+
# A class to tokenize HTML.
|
15
|
+
#
|
16
|
+
# Example:
|
17
|
+
#
|
18
|
+
# page = "<HTML>
|
19
|
+
# <HEAD>
|
20
|
+
# <TITLE>This is the title</TITLE>
|
21
|
+
# </HEAD>
|
22
|
+
# <!-- Here comes the <a href=\"missing.link\">blah</a>
|
23
|
+
# comment body
|
24
|
+
# -->
|
25
|
+
# <BODY>
|
26
|
+
# <H1>This is the header</H1>
|
27
|
+
# <P>
|
28
|
+
# This is the paragraph, it contains
|
29
|
+
# <a href=\"link.html\">links</a>,
|
30
|
+
# <img src=\"blah.gif\" optional alt='images
|
31
|
+
# are
|
32
|
+
# really cool'>. Ok, here is some more text and
|
33
|
+
# <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
|
34
|
+
# </P>
|
35
|
+
# </body>
|
36
|
+
# </HTML>
|
37
|
+
# "
|
38
|
+
# toke = HTMLTokenizer.new(page)
|
39
|
+
#
|
40
|
+
# assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
|
41
|
+
# assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
|
42
|
+
# assert("links" == toke.getTrimmedText)
|
43
|
+
# assert(toke.getTag("IMG", "A").attr_hash['optional'])
|
44
|
+
# assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
|
45
|
+
#
|
46
|
+
# Tokenizes an HTML page into a stream of HTMLText, HTMLTag and
# HTMLComment tokens, consumed one at a time from a cursor position.
class HTMLTokenizer
  @@version = 1.0

  # Get version of HTMLTokenizer lib
  def self.version
    @@version
  end

  # The full page text this tokenizer walks over (read-only).
  attr_reader :page

  # Create a new tokenizer, based on the content, used as a string.
  def initialize(content)
    @page = content.to_s
    @cur_pos = 0
  end

  # Reset the parser, setting the current position back to the start.
  def reset
    @cur_pos = 0
  end

  # Look at the next token, but don't actually grab it.
  # Returns nil once the whole page has been consumed.
  # NOTE(review): the ?< comparison relies on ?< and String#[]
  # agreeing (both Integer on Ruby 1.8, both one-char String on
  # 1.9+), so it works on either — keep the character-literal form.
  def peekNextToken
    if @cur_pos == @page.length then return nil end

    if ?< == @page[@cur_pos]
      # Next token is a tag of some kind
      if '!--' == @page[(@cur_pos + 1), 3]
        # Token is a comment
        tag_end = @page.index('-->', (@cur_pos + 1))
        if tag_end.nil?
          raise "No end found to started comment:\n#{@page[@cur_pos,80]}"
        end
        # +2 keeps the trailing '-->' inside the token's raw text
        HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
      else
        # Token is a html tag
        tag_end = @page.index('>', (@cur_pos + 1))
        if tag_end.nil?
          raise "No end found to started tag:\n#{@page[@cur_pos,80]}"
        end
        HTMLTag.new(@page[@cur_pos .. tag_end])
      end
    else
      # Next token is text: everything up to the next '<', or to the
      # end of the page (index -1) when no further tag exists
      text_end = @page.index('<', @cur_pos)
      text_end = text_end.nil? ? -1 : (text_end - 1)
      HTMLText.new(@page[@cur_pos .. text_end])
    end
  end

  # Get the next token and advance the cursor past it.
  # Returns an instance of
  # * HTMLText
  # * HTMLComment
  # * HTMLTag
  # or nil at the end of the page.
  def getNextToken
    token = peekNextToken
    if token
      @cur_pos += token.raw.length
    end
    return token
  end

  # Get a tag from the specified set of desired tags.
  # For example:
  # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
  # Will return the next header tag encountered.  With no arguments
  # any tag matches; returns nil when no matching tag remains.
  def getTag(*sought_tags)
    # tag_name is always stored lowercased, so normalize the wanted list
    sought_tags.collect! {|elm| elm.downcase}

    while (tag = getNextToken)
      if tag.kind_of?(HTMLTag) and
        (0 == sought_tags.length or sought_tags.include?(tag.tag_name))
        break
      end
    end
    tag
  end

  # Get all the text between the current position and the next tag
  # (if no argument is given) or up to, not including, a specific
  # later tag.  In the latter case each token's text is appended
  # followed by a single space.
  # NOTE(review): with until_tag nil and the cursor already at the
  # end of the page, getNextToken returns nil and this raises
  # NoMethodError — confirm callers never do that before changing it.
  def getText(until_tag = nil)
    if until_tag.nil?
      if ?< == @page[@cur_pos]
        # Next token is a tag, not text
        ""
      else
        # Next token is text
        getNextToken.text
      end
    else
      ret_str = ""

      while (tag = peekNextToken)
        if tag.kind_of?(HTMLTag) and tag.tag_name == until_tag
          break
        end

        if ("" != tag.text)
          ret_str << (tag.text + " ")
        end
        getNextToken
      end

      ret_str
    end
  end

  # Like getText, but squeeze all whitespace, getting rid of
  # leading and trailing whitespace, and squeezing multiple
  # spaces into a single space.
  def getTrimmedText(until_tag = nil)
    getText(until_tag).strip.gsub(/\s+/m, " ")
  end

end
|
168
|
+
|
169
|
+
# The parent class for all three types of HTML tokens
|
170
|
+
# Base class for the three token types produced by HTMLTokenizer.
class HTMLToken
  attr_accessor :raw

  # Remember the raw source text this token was built from.
  def initialize(text)
    @raw = text
  end

  # A token renders as exactly the source text it was created from.
  def to_s
    @raw
  end

  # Plain tokens carry no displayable text; subclasses override this.
  def text
    ""
  end

  # The token's text with leading/trailing whitespace removed and
  # internal runs of whitespace collapsed to single spaces.
  def trimmed_text
    text.strip.gsub(/\s+/m, " ")
  end

  # A token equals anything whose string form matches its raw source.
  def ==(other)
    @raw == other.to_s
  end
end
|
197
|
+
|
198
|
+
# Class representing text that isn't inside a tag
|
199
|
+
# Token for a run of page text that is outside any tag or comment.
class HTMLText < HTMLToken
  # For plain text the displayable text is the raw source itself.
  def text
    @raw
  end
end
|
204
|
+
|
205
|
+
# Class representing an HTML comment
|
206
|
+
# Token for an HTML comment (<!-- ... -->).
class HTMLComment < HTMLToken
  # The comment body with the delimiters and surrounding whitespace removed.
  attr_accessor :contents

  # Parse +text+, which must be a complete comment including the
  # <!-- and --> delimiters; raises otherwise.
  def initialize(text)
    super(text)
    captures = text.scan(/^<!--\s*(.*?)\s*-->$/m)
    raise "Text passed to HTMLComment.initialize is not a comment" if captures[0].nil?
    @contents = captures[0][0]
  end
end
|
218
|
+
|
219
|
+
# Class representing an HTML tag
|
220
|
+
# Token for an HTML tag: an opening tag with optional attributes, or a
# closing tag whose tag_name carries a leading '/'.
class HTMLTag < HTMLToken
  attr_reader :end_tag, :tag_name

  # Parse +text+, which must be a complete tag from '<' to '>'.
  # Raises when the delimiters are missing or no tag name is found.
  def initialize(text)
    super(text)
    if ?< != text[0] or ?> != text[-1]
      # BUGFIX: message previously claimed "HTMLComment.initialize is
      # not a comment" (copy-paste from HTMLComment)
      raise "Text passed to HTMLTag.initialize is not a tag"
    end

    @attr_hash = Hash.new
    @raw = text

    # The first run of word / ':' / '-' characters is the tag name
    tag_name = text.scan(/[\w:-]+/)[0]
    if tag_name.nil?
      # BUGFIX: old message interpolated the nil tag_name itself,
      # which rendered as "tag is nil: " — show the offending text
      raise "Error, no tag name found in: #{text}"
    end

    if ?/ == text[1]
      # It's an end tag
      @end_tag = true
      @tag_name = '/' + tag_name.downcase
    else
      @end_tag = false
      @tag_name = tag_name.downcase
    end

    # Attribute parsing is deferred until attr_hash is first called
    @hashed = false
  end

  # Retrieve a hash of all the tag's attributes (names lowercased).
  # Quoted values lose their quotes; valueless ("boolean") attributes
  # map to their own name.  Lazily computed on first access, so tags
  # whose attributes are never inspected parse faster; end tags
  # always yield an empty hash.
  def attr_hash
    # Lazy initialize == don't build the hash until it's needed
    if !@hashed
      if !@end_tag
        # Everything between the tag name and the closing '>'
        attr_arr = @raw.scan(/<[\w:-]+\s+(.*)>/m)[0]
        if attr_arr.kind_of?(Array)
          # Attributes found, parse them: a name optionally followed
          # by ="..." / ='...' / a bare unquoted value
          attrs = attr_arr[0]
          attr_arr = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
          attr_arr.each {
            |item|
            val = if item[1].nil?
                    # Valueless attribute: map it to its own name
                    item[0]
                  elsif '"'[0] == item[1][0] or '\''[0] == item[1][0]
                    # Strip the enclosing quotes
                    item[1][1 .. -2]
                  else
                    item[1]
                  end
            @attr_hash[item[0].downcase] = val
          }
        end
      end
      @hashed = true
    end

    @attr_hash
  end

  # Get the 'alt' text for a tag, if it exists, or an empty string otherwise
  def text
    if !end_tag
      case tag_name
      when 'img', 'applet'
        if !attr_hash['alt'].nil?
          return attr_hash['alt']
        end
      end
    end
    return ''
  end
end
|
302
|
+
|
303
|
+
# Self-test suite, run only when this file is executed directly.
if $0 == __FILE__
  require 'test/unit'

  class TC_TestHTMLTokenizer < Test::Unit::TestCase
    # Unquoted attribute values must be captured verbatim
    def test_bad_link
      toke = HTMLTokenizer.new("<p><a href=http://bad.com/link>foo</a></p>")
      assert("http://bad.com/link" == toke.getTag("a").attr_hash['href'])
    end

    # Namespaced tag and attribute names (containing ':') must parse
    def test_namespace
      toke = HTMLTokenizer.new("<f:table xmlns:f=\"http://www.com/foo\">")
      assert("http://www.com/foo" == toke.getTag("f:table").attr_hash['xmlns:f'])
    end

    # Comment delimiters and padding are stripped from #contents
    def test_comment
      toke = HTMLTokenizer.new("<!-- comment on me -->")
      t = toke.getNextToken
      assert(HTMLComment == t.class)
      assert("comment on me" == t.contents)
    end

    # End-to-end walk over a small page: tag selection, equality,
    # trimmed text, boolean attributes and multi-line quoted values
    def test_full
      page = "<HTML>
<HEAD>
<TITLE>This is the title</TITLE>
</HEAD>
<!-- Here comes the <a href=\"missing.link\">blah</a>
comment body
-->
<BODY>
<H1>This is the header</H1>
<P>
This is the paragraph, it contains
<a href=\"link.html\">links</a>,
<img src=\"blah.gif\" optional alt='images
are
really cool'>. Ok, here is some more text and
<A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
</P>
</body>
</HTML>
"
      toke = HTMLTokenizer.new(page)

      assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
      assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
      assert("links" == toke.getTrimmedText)
      assert(toke.getTag("IMG", "A").attr_hash['optional'])
      assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'html/htmltokenizer'

# Regression tests for HTMLTokenizer.
# NOTE(review): this file references Test::Unit::TestCase but never
# requires 'test/unit' itself — confirm the rake/test harness loads
# it before this file runs.
class HtmlTokenizerTest < Test::Unit::TestCase
  def test_right_version
    assert_equal 1.0, HTMLTokenizer.version
  end

  # Attribute names may contain '-' (e.g. http-equiv)
  def test_parses_attributes_with_dash
    html = '<meta http-equiv="content-type" value="text/html">'
    token = HTMLTokenizer.new(html).getNextToken()

    assert_equal HTMLTag, token.class
    assert_equal 2, token.attr_hash.size
    assert_equal true, token.attr_hash.has_key?('value')
    assert_equal true, token.attr_hash.has_key?('http-equiv')
  end

  # Tag names may contain '-' as well
  def test_parses_tags_with_dash
    html = '<a-value>abc</a-value>'
    tokenizer = HTMLTokenizer.new(html)

    assert_equal 'a-value', tokenizer.getNextToken().tag_name
    assert_equal 'abc', tokenizer.getNextToken().text
    assert_equal '/a-value', tokenizer.getNextToken().tag_name
  end

  # Trailing whitespace before '>' must not create phantom attributes
  def test_gets_attributes_from_tags_with_dash_with_space
    html = '<a-value n="2" >abc</a-value>'
    tokenizer = HTMLTokenizer.new(html)

    token = tokenizer.getNextToken()
    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
    assert_equal '2', token.attr_hash['n']
  end

  # Single-quoted values are unquoted just like double-quoted ones
  def test_gets_attributes_from_tags_with_dash_sans_space
    html = '<a-value k=\'3\'>abc</a-value>'
    tokenizer = HTMLTokenizer.new(html)

    token = tokenizer.getNextToken()
    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
    assert_equal '3', token.attr_hash['k']
  end

  # Mixed-case tag names lowercase; attr names may mix ':' and '-'
  def test_gets_dashed_attributes_from_tags_with_dash
    html = '<S-Value p:n-d="2">abc</a-value>'
    tokenizer = HTMLTokenizer.new(html)

    token = tokenizer.getNextToken()
    assert_equal 's-value', token.tag_name
    assert_equal 1, token.attr_hash.size
    assert_equal '2', token.attr_hash['p:n-d']
  end

  # Unquoted values run to the next whitespace or '>'
  def test_reads_attributes_without_quotes
    html = '<a href=http://www.test.com/blank.html>value</a>'
    tokenizer = HTMLTokenizer.new(html)

    token = tokenizer.getNextToken()
    assert_equal 'a', token.tag_name
    assert_equal 'http://www.test.com/blank.html', token.attr_hash['href']
  end

  def test_reads_short_attributes_without_quotes
    html = '<a name=a>value</a>'
    tokenizer = HTMLTokenizer.new(html)

    token = tokenizer.getNextToken()
    assert_equal 'a', token.tag_name
    assert_equal 'a', token.attr_hash['name']
  end

  def test_reads_multiple_short_attributes_without_quotes
    html = '<a name=n target=m href=k>value</a>'
    tokenizer = HTMLTokenizer.new(html)

    token = tokenizer.getNextToken()
    assert_equal 'a', token.tag_name
    assert_equal 'n', token.attr_hash['name']
    assert_equal 'm', token.attr_hash['target']
    assert_equal 'k', token.attr_hash['href']
  end

  # A valueless attribute maps to its own name
  def test_makes_boolean_attribute_values_themselves
    html = '<input type=checked checked>'
    tokenizer = HTMLTokenizer.new(html)

    token = tokenizer.getNextToken()
    assert_equal 'input', token.tag_name
    assert_equal 'checked', token.attr_hash['checked']
  end
end
|
metadata
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.10
|
3
|
+
specification_version: 1
|
4
|
+
name: htmltokenizer
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: "1.0"
|
7
|
+
date: 2005-07-17
|
8
|
+
summary: A class to tokenize HTML.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: bg-rubyforge@infofiend.com
|
12
|
+
homepage: http://htmltokenizer.rubyforge.org/
|
13
|
+
rubyforge_project: htmltokenizer
|
14
|
+
description: "This is a partial port of the functionality behind Perl's TokeParser Provided a
|
15
|
+
page it progressively returns tokens from that page"
|
16
|
+
autorequire:
|
17
|
+
default_executable:
|
18
|
+
bindir: bin
|
19
|
+
has_rdoc: true
|
20
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
21
|
+
requirements:
|
22
|
+
-
|
23
|
+
- ">"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: 0.0.0
|
26
|
+
version:
|
27
|
+
platform: ruby
|
28
|
+
authors:
|
29
|
+
- Ben Giddings
|
30
|
+
files:
|
31
|
+
- lib/html/htmltokenizer.rb
|
32
|
+
- test/htmltokenizer_test.rb
|
33
|
+
- README
|
34
|
+
test_files:
|
35
|
+
- test/htmltokenizer_test.rb
|
36
|
+
rdoc_options: []
|
37
|
+
extra_rdoc_files: []
|
38
|
+
executables: []
|
39
|
+
extensions: []
|
40
|
+
requirements: []
|
41
|
+
dependencies: []
|