htmltokenizer 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,63 @@
1
+ htmltokenizer README
2
+ ============
3
+
4
+ htmltokenizer is a port of the idea behind Perl's HTML::TokeParser::Simple.
5
+ The basic concept is that it treats a web page as a series of tokens, which
6
+ are either text, HTML tags, or HTML comments. This class provides a way
7
+ of getting these tokens in sequence, either one at a time regardless of
8
+ type, or by choosing a list of interesting tags.
9
+
10
+ Requirements
11
+ ------------
12
+
13
+ * ruby
14
+
15
+ Install
16
+ -------
17
+
18
+ Decompress the archive and enter its top directory.
19
+ Then type:
20
+
21
+ $ ruby install.rb config
22
+ $ ruby install.rb setup
23
+ $ su -c "ruby install.rb install"
24
+
25
+ or
26
+
27
+ $ ruby install.rb config
28
+ $ ruby install.rb setup
29
+ $ sudo ruby install.rb install
30
+
31
+ You can also install files into your favorite directory
32
+ by passing options to install.rb. Try "ruby install.rb --help".
33
+
34
+ Usage
35
+ -----
36
+
37
+ require 'html/htmltokenizer'
38
+
39
+ page = getSomePageFromTheInternetAsAString()
40
+
41
+ tokenizer = HTMLTokenizer.new(page)
42
+
43
+ while token = tokenizer.getTag('a', 'font', '/tr', 'div')
44
+ if 'div' == token.tag_name
45
+ if 'headlinesheader' == token.attr_hash['class']
46
+ puts "Header is: " + tokenizer.getTrimmedText('/div')
47
+ else
48
+ tokenizer.getTag('/div')
49
+ token = tokenizer.getTag('a')
50
+ if token.attr_hash['href']
51
+ puts "Found a link after a div going to #{token.attr_hash['href']}"
52
+ end
53
+ end
54
+ end
55
+ end
56
+
57
+ License
58
+ -------
59
+
60
+ Ruby's license, see http://www.ruby-lang.org/en/LICENSE.txt
61
+
62
+
63
+ Ben Giddings <bg-rubyraa@infofiend.com>
@@ -0,0 +1,355 @@
1
+ # = HTMLTokenizer
2
+ #
3
+ # Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
4
+ # Copyright:: Copyright (c) 2004 Ben Giddings
5
+ # License:: Distributes under the same terms as Ruby
6
+ #
7
+ #
8
+ # This is a partial port of the functionality behind Perl's TokeParser.
9
+ # Given a page, it progressively returns the tokens from that page.
10
+ #
11
+ # $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
12
+
13
+ #
14
+ # A class to tokenize HTML.
15
+ #
16
+ # Example:
17
+ #
18
+ # page = "<HTML>
19
+ # <HEAD>
20
+ # <TITLE>This is the title</TITLE>
21
+ # </HEAD>
22
+ # <!-- Here comes the <a href=\"missing.link\">blah</a>
23
+ # comment body
24
+ # -->
25
+ # <BODY>
26
+ # <H1>This is the header</H1>
27
+ # <P>
28
+ # This is the paragraph, it contains
29
+ # <a href=\"link.html\">links</a>,
30
+ # <img src=\"blah.gif\" optional alt='images
31
+ # are
32
+ # really cool'>. Ok, here is some more text and
33
+ # <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
34
+ # </P>
35
+ # </body>
36
+ # </HTML>
37
+ # "
38
+ # toke = HTMLTokenizer.new(page)
39
+ #
40
+ # assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
41
+ # assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
42
+ # assert("links" == toke.getTrimmedText)
43
+ # assert(toke.getTag("IMG", "A").attr_hash['optional'])
44
+ # assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
45
+ #
46
# Splits an HTML page into a sequence of text, tag and comment tokens.
#
# The tokenizer keeps a cursor (@cur_pos) into the page. Every method that
# consumes a token moves the cursor past the raw source of that token.
class HTMLTokenizer
  @@version = 1.0

  # Get version of HTMLTokenizer lib
  def self.version
    @@version
  end

  # The complete page being tokenized, as a String.
  attr_reader :page

  # Create a new tokenizer, based on the content, used as a string.
  def initialize(content)
    @page = content.to_s
    @cur_pos = 0
  end

  # Reset the parser, setting the current position back at the start
  def reset
    @cur_pos = 0
  end

  # Look at the next token, but don't actually grab it.
  #
  # Returns an HTMLComment, HTMLTag or HTMLText, or nil when the cursor is
  # at the end of the page. Raises a RuntimeError when a tag or comment is
  # opened but never closed.
  def peekNextToken
    return nil if @cur_pos >= @page.length

    if '<' == @page[@cur_pos, 1]
      # Next token is a tag of some kind
      if '!--' == @page[(@cur_pos + 1), 3]
        # Token is a comment. Start looking for the terminator after the
        # opening "<!--" so that its own dashes are never taken for "-->".
        tag_end = @page.index('-->', (@cur_pos + 4))
        if tag_end.nil?
          raise "No end found to started comment:\n#{@page[@cur_pos,80]}"
        end
        HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
      else
        # Token is a html tag
        tag_end = @page.index('>', (@cur_pos + 1))
        if tag_end.nil?
          raise "No end found to started tag:\n#{@page[@cur_pos,80]}"
        end
        HTMLTag.new(@page[@cur_pos .. tag_end])
      end
    else
      # Next token is text: everything up to the next '<' or the page end
      text_end = @page.index('<', @cur_pos)
      text_end = text_end.nil? ? -1 : (text_end - 1)
      HTMLText.new(@page[@cur_pos .. text_end])
    end
  end

  # Get the next token and advance past it. Returns an instance of
  # * HTMLText
  # * HTMLComment
  # * HTMLTag
  # or nil when the page is exhausted.
  def getNextToken
    token = peekNextToken
    @cur_pos += token.raw.length if token
    token
  end

  # Get a tag from the specified set of desired tags.
  # For example:
  # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
  # Will return the next header tag encountered.
  #
  # Names are matched case-insensitively; end tags are written with a
  # leading slash ("/div"). With no arguments the next tag of any kind is
  # returned. Returns nil when no matching tag remains.
  def getTag(*sought_tags)
    wanted = sought_tags.flatten.map { |elm| elm.to_s.downcase }

    while (tag = getNextToken)
      next unless tag.kind_of?(HTMLTag)
      break if wanted.empty? || wanted.include?(tag.tag_name)
    end
    tag
  end

  # Get all the text between the current position and the next tag
  # (if no tag is specified) or a specific later tag.
  #
  # Without +until_tag+ an empty string is returned when the next token is
  # not text, including when the page is exhausted. With +until_tag+ the
  # text of every token before that tag is collected, each piece followed
  # by a single space; the terminating tag itself is not consumed.
  def getText(until_tag = nil)
    if until_tag.nil?
      # Nothing left, or the next token is a tag rather than text
      return "" if @cur_pos >= @page.length || '<' == @page[@cur_pos, 1]

      getNextToken.text
    else
      # Tag names are stored lower-cased, so compare in lower case
      stop_tag = until_tag.to_s.downcase
      ret_str = ""

      while (token = peekNextToken)
        break if token.kind_of?(HTMLTag) && token.tag_name == stop_tag

        piece = token.text
        ret_str << (piece + " ") if "" != piece
        # Consume the token that was just peeked without parsing it again
        @cur_pos += token.raw.length
      end

      ret_str
    end
  end

  # Like getText, but squeeze all whitespace, getting rid of
  # leading and trailing whitespace, and squeezing multiple
  # spaces into a single space.
  def getTrimmedText(until_tag = nil)
    getText(until_tag).strip.gsub(/\s+/m, " ")
  end

end
168
+
169
+ # The parent class for all three types of HTML tokens
170
# Base class for every token: it stores the raw source text and supplies
# the default behaviour that subclasses override.
class HTMLToken
  # The exact source text this token was created from.
  attr_accessor :raw

  # Initialize the token based on the raw text
  def initialize(text)
    @raw = text
  end

  # By default, return exactly the string used to create the token
  def to_s
    raw
  end

  # By default tokens have no text representation
  def text
    ""
  end

  # The token's text with leading and trailing whitespace removed and every
  # internal run of whitespace collapsed to a single space.
  def trimmed_text
    text.strip.gsub(/\s+/m, " ")
  end

  # Compare to another object based on the raw source. Anything whose to_s
  # equals this token's raw text is considered equal, so a token can be
  # compared directly against a String.
  def ==(other)
    # nil.to_s is "", which must not make an empty token equal to nil
    return false if other.nil?

    raw == other.to_s
  end
end
197
+
198
+ # Class representing text that isn't inside a tag
199
# A run of plain text that sits outside of any tag or comment.
class HTMLText < HTMLToken
  # The text of a text token is its raw source, unchanged.
  def text
    self.raw
  end
end
204
+
205
+ # Class representing an HTML comment
206
# Token for an HTML comment (<!-- ... -->).
class HTMLComment < HTMLToken
  # The comment body with the delimiters and surrounding whitespace removed.
  attr_accessor :contents

  # Build the comment from its raw source.
  #
  # Raises a RuntimeError unless +text+ as a whole is a comment, i.e. it
  # starts with "<!--" and ends with "-->".
  def initialize(text)
    super(text)
    # \A and \z anchor to the whole string; ^ and $ would also accept a
    # comment that merely occupies one line of a longer string.
    found = /\A<!--\s*(.*?)\s*-->\z/m.match(text)
    if found.nil?
      raise "Text passed to HTMLComment.initialize is not a comment"
    end

    @contents = found[1]
  end
end
218
+
219
+ # Class representing an HTML tag
220
# Token for an HTML start or end tag.
#
# tag_name is always lower case and carries a leading "/" for end tags
# (for example "/div"); end_tag tells the two kinds apart.
class HTMLTag < HTMLToken
  attr_reader :end_tag, :tag_name

  # Build the tag from its raw source, which must start with "<" and end
  # with ">". Raises a RuntimeError for anything else, or when no tag name
  # can be found between the brackets.
  def initialize(text)
    super(text)
    if '<' != text[0, 1] || '>' != text[-1, 1]
      raise "Text passed to HTMLTag.initialize is not a tag"
    end

    @attr_hash = {}

    name = text[/[\w:-]+/]
    if name.nil?
      raise "Error, no tag name found in: #{text}"
    end

    @end_tag = ('/' == text[1, 1])
    @tag_name = @end_tag ? '/' + name.downcase : name.downcase

    # Attributes are parsed on first use, see attr_hash
    @hashed = false
  end

  # Retrieve a hash of all the tag's attributes.
  # Lazily done, so that if you don't look at a tag's attributes
  # things go quicker.
  #
  # Keys are lower-cased attribute names. Values have their enclosing
  # quotes removed; an attribute written without a value maps to its own
  # name as written in the source. End tags always yield an empty hash.
  def attr_hash
    unless @hashed
      parse_attributes unless @end_tag
      @hashed = true
    end

    @attr_hash
  end

  # Get the 'alt' text for an img or applet tag, if it exists, or an
  # empty string otherwise
  def text
    return '' if end_tag

    case tag_name
    when 'img', 'applet'
      attr_hash['alt'] || ''
    else
      ''
    end
  end

  private

  # Fill @attr_hash from the attribute section of the raw tag source.
  def parse_attributes
    # Everything between the tag name and the closing '>'
    attrs = @raw[/<[\w:-]+\s+(.*)>/m, 1]
    return if attrs.nil?

    pairs = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
    pairs.each do |name, value, _unquoted|
      @attr_hash[name.downcase] =
        if value.nil?
          # No value given: the attribute stands for itself
          name
        elsif value.start_with?('"', "'")
          # Strip the enclosing quotes
          value[1 .. -2]
        else
          value
        end
    end
  end
end
302
+
303
# Self-test, run only when this file is executed directly.
if $0 == __FILE__
  require 'test/unit'

  class TC_TestHTMLTokenizer < Test::Unit::TestCase
    # An unquoted attribute value that contains slashes is read in full.
    def test_bad_link
      toke = HTMLTokenizer.new("<p><a href=http://bad.com/link>foo</a></p>")
      assert_equal("http://bad.com/link", toke.getTag("a").attr_hash['href'])
    end

    # Colons are accepted in both tag names and attribute names.
    def test_namespace
      toke = HTMLTokenizer.new("<f:table xmlns:f=\"http://www.com/foo\">")
      assert_equal("http://www.com/foo", toke.getTag("f:table").attr_hash['xmlns:f'])
    end

    # A comment becomes an HTMLComment whose contents are trimmed.
    def test_comment
      toke = HTMLTokenizer.new("<!-- comment on me -->")
      t = toke.getNextToken
      assert_instance_of(HTMLComment, t)
      assert_equal("comment on me", t.contents)
    end

    # Walk a small page: tags inside the comment must be skipped, and tag
    # names must match regardless of case.
    def test_full
      page = "<HTML>
<HEAD>
<TITLE>This is the title</TITLE>
</HEAD>
<!-- Here comes the <a href=\"missing.link\">blah</a>
comment body
-->
<BODY>
<H1>This is the header</H1>
<P>
This is the paragraph, it contains
<a href=\"link.html\">links</a>,
<img src=\"blah.gif\" optional alt='images
are
really cool'>. Ok, here is some more text and
<A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
</P>
</body>
</HTML>
"
      toke = HTMLTokenizer.new(page)

      assert_equal("<h1>", toke.getTag("h1", "h2", "h3").to_s.downcase)
      assert_equal(HTMLTag.new("<a href=\"link.html\">"), toke.getTag("IMG", "A"))
      assert_equal("links", toke.getTrimmedText)
      assert(toke.getTag("IMG", "A").attr_hash['optional'])
      assert_equal("_blank", toke.getTag("IMG", "A").attr_hash['target'])
    end
  end
end
@@ -0,0 +1,92 @@
1
# Test::Unit must be loaded explicitly; without it the class definition
# below fails with a NameError when this file is run on its own.
require 'test/unit'
require 'html/htmltokenizer'

# Tests for tag-name and attribute parsing of the first token of a snippet.
class HtmlTokenizerTest < Test::Unit::TestCase
  def test_right_version
    assert_equal 1.0, HTMLTokenizer.version
  end

  def test_parses_attributes_with_dash
    html = '<meta http-equiv="content-type" value="text/html">'
    token = HTMLTokenizer.new(html).getNextToken

    assert_equal HTMLTag, token.class
    assert_equal 2, token.attr_hash.size
    assert_equal true, token.attr_hash.has_key?('value')
    assert_equal true, token.attr_hash.has_key?('http-equiv')
  end

  def test_parses_tags_with_dash
    html = '<a-value>abc</a-value>'
    tokenizer = HTMLTokenizer.new(html)

    assert_equal 'a-value', tokenizer.getNextToken.tag_name
    assert_equal 'abc', tokenizer.getNextToken.text
    assert_equal '/a-value', tokenizer.getNextToken.tag_name
  end

  def test_gets_attributes_from_tags_with_dash_with_space
    html = '<a-value n="2" >abc</a-value>'
    token = first_token(html)

    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
    assert_equal '2', token.attr_hash['n']
  end

  def test_gets_attributes_from_tags_with_dash_sans_space
    html = '<a-value k=\'3\'>abc</a-value>'
    token = first_token(html)

    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
    assert_equal '3', token.attr_hash['k']
  end

  def test_gets_dashed_attributes_from_tags_with_dash
    html = '<S-Value p:n-d="2">abc</a-value>'
    token = first_token(html)

    assert_equal 's-value', token.tag_name
    assert_equal 1, token.attr_hash.size
    assert_equal '2', token.attr_hash['p:n-d']
  end

  def test_reads_attributes_without_quotes
    html = '<a href=http://www.test.com/blank.html>value</a>'
    token = first_token(html)

    assert_equal 'a', token.tag_name
    assert_equal 'http://www.test.com/blank.html', token.attr_hash['href']
  end

  def test_reads_short_attributes_without_quotes
    html = '<a name=a>value</a>'
    token = first_token(html)

    assert_equal 'a', token.tag_name
    assert_equal 'a', token.attr_hash['name']
  end

  def test_reads_multiple_short_attributes_without_quotes
    html = '<a name=n target=m href=k>value</a>'
    token = first_token(html)

    assert_equal 'a', token.tag_name
    assert_equal 'n', token.attr_hash['name']
    assert_equal 'm', token.attr_hash['target']
    assert_equal 'k', token.attr_hash['href']
  end

  # An attribute written without a value maps to its own name.
  def test_makes_boolean_attribute_values_themselves
    html = '<input type=checked checked>'
    token = first_token(html)

    assert_equal 'input', token.tag_name
    assert_equal 'checked', token.attr_hash['checked']
  end

  private

  # Tokenize +html+ and return its first token.
  def first_token(html)
    HTMLTokenizer.new(html).getNextToken
  end
end
metadata ADDED
@@ -0,0 +1,41 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.10
3
+ specification_version: 1
4
+ name: htmltokenizer
5
+ version: !ruby/object:Gem::Version
6
+ version: "1.0"
7
+ date: 2005-07-17
8
+ summary: A class to tokenize HTML.
9
+ require_paths:
10
+ - lib
11
+ email: bg-rubyforge@infofiend.com
12
+ homepage: http://htmltokenizer.rubyforge.org/
13
+ rubyforge_project: htmltokenizer
14
+ description: "This is a partial port of the functionality behind Perl's TokeParser Provided a
15
+ page it progressively returns tokens from that page"
16
+ autorequire:
17
+ default_executable:
18
+ bindir: bin
19
+ has_rdoc: true
20
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
21
+ requirements:
22
+ -
23
+ - ">"
24
+ - !ruby/object:Gem::Version
25
+ version: 0.0.0
26
+ version:
27
+ platform: ruby
28
+ authors:
29
+ - Ben Giddings
30
+ files:
31
+ - lib/html/htmltokenizer.rb
32
+ - test/htmltokenizer_test.rb
33
+ - README
34
+ test_files:
35
+ - test/htmltokenizer_test.rb
36
+ rdoc_options: []
37
+ extra_rdoc_files: []
38
+ executables: []
39
+ extensions: []
40
+ requirements: []
41
+ dependencies: []