htmltokenizer 1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,63 @@
1
+ htmltokenizer README
2
+ ============
3
+
4
+ htmltokenizer is a port of the idea behind Perl's HTML::TokeParser::Simple.
5
+ The basic concept is that it treats a web page as a series of tokens, which
6
+ are either text, html tags, or html comments. This class provides a way
7
+ of getting these tokens in sequence, either one at a time regardless of
8
+ type, or by choosing a list of interesting tags.
9
+
10
+ Requirements
11
+ ------------
12
+
13
+ * ruby
14
+
15
+ Install
16
+ -------
17
+
18
+ Decompress the archive and enter its top directory.
19
+ Then type:
20
+
21
+ $ ruby install.rb config
22
+ $ ruby install.rb setup
23
+ $ su -c "ruby install.rb install"
24
+
25
+ or
26
+
27
+ $ ruby install.rb config
28
+ $ ruby install.rb setup
29
+ $ sudo ruby install.rb install
30
+
31
+ You can also install files into your favorite directory
32
+ by supplying install.rb some options. Try "ruby install.rb --help".
33
+
34
+ Usage
35
+ -----
36
+
37
+ require 'html/htmltokenizer'
38
+
39
+ page = getSomePageFromTheInternetAsAString()
40
+
41
+ tokenizer = HTMLTokenizer.new(page)
42
+
43
+ while token = tokenizer.getTag('a', 'font', '/tr', 'div')
44
+ if 'div' == token.tag_name
45
+ if 'headlinesheader' == token.attr_hash['class']
46
+ puts "Header is: " + tokenizer.getTrimmedText('/div')
47
+ else
48
+ tokenizer.getTag('/div')
49
+ token = tokenizer.getTag('a')
50
+ if token.attr_hash['href']
51
+ puts "Found a link after a div going to #{token.attr_hash['href']}"
52
+ end
53
+ end
54
+ end
55
+ end
56
+
57
+ License
58
+ -------
59
+
60
+ Ruby's license, see http://www.ruby-lang.org/en/LICENSE.txt
61
+
62
+
63
+ Ben Giddings <bg-rubyraa@infofiend.com>
@@ -0,0 +1,355 @@
1
+ # = HTMLTokenizer
2
+ #
3
+ # Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
4
+ # Copyright:: Copyright (c) 2004 Ben Giddings
5
+ # License:: Distributed under the same terms as Ruby
6
+ #
7
+ #
8
+ # This is a partial port of the functionality behind Perl's TokeParser.
9
+ # Provided a page, it progressively returns tokens from that page.
10
+ #
11
+ # $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
12
+
13
+ #
14
+ # A class to tokenize HTML.
15
+ #
16
+ # Example:
17
+ #
18
+ # page = "<HTML>
19
+ # <HEAD>
20
+ # <TITLE>This is the title</TITLE>
21
+ # </HEAD>
22
+ # <!-- Here comes the <a href=\"missing.link\">blah</a>
23
+ # comment body
24
+ # -->
25
+ # <BODY>
26
+ # <H1>This is the header</H1>
27
+ # <P>
28
+ # This is the paragraph, it contains
29
+ # <a href=\"link.html\">links</a>,
30
+ # <img src=\"blah.gif\" optional alt='images
31
+ # are
32
+ # really cool'>. Ok, here is some more text and
33
+ # <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
34
+ # </P>
35
+ # </body>
36
+ # </HTML>
37
+ # "
38
+ # toke = HTMLTokenizer.new(page)
39
+ #
40
+ # assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
41
+ # assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
42
+ # assert("links" == toke.getTrimmedText)
43
+ # assert(toke.getTag("IMG", "A").attr_hash['optional'])
44
+ # assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
45
+ #
46
class HTMLTokenizer
  @@version = 1.0

  # Get version of HTMLTokenizer lib
  def self.version
    @@version
  end

  # The complete page being tokenized, as a String.
  attr_reader :page

  # Create a new tokenizer over +content+ (converted with +to_s+),
  # positioned at the start of the page.
  def initialize(content)
    @page = content.to_s
    @cur_pos = 0
  end

  # Reset the parser, setting the current position back at the start
  def reset
    @cur_pos = 0
  end

  # Look at the next token, but don't actually grab it.
  #
  # Returns an HTMLText, HTMLComment or HTMLTag built from the page at the
  # current position, or +nil+ when the end of the page has been reached.
  # Raises a RuntimeError if a tag or comment is opened but never closed.
  def peekNextToken
    return nil if @cur_pos >= @page.length

    if @page[@cur_pos, 1] != '<'
      # Next token is text: everything up to the next '<' (or end of page)
      text_end = @page.index('<', @cur_pos) || @page.length
      return HTMLText.new(@page[@cur_pos...text_end])
    end

    if @page[@cur_pos + 1, 3] == '!--'
      # Token is a comment.  Start looking for the terminator after the
      # opening "<!--" so its dashes cannot double as part of "-->".
      tag_end = @page.index('-->', @cur_pos + 4)
      if tag_end.nil?
        raise "No end found to started comment:\n#{@page[@cur_pos,80]}"
      end
      HTMLComment.new(@page[@cur_pos..(tag_end + 2)])
    else
      # Token is a html tag
      tag_end = @page.index('>', @cur_pos + 1)
      if tag_end.nil?
        raise "No end found to started tag:\n#{@page[@cur_pos,80]}"
      end
      HTMLTag.new(@page[@cur_pos..tag_end])
    end
  end

  # Get the next token and advance past it.  Returns an instance of
  # * HTMLText
  # * HTMLComment
  # * HTMLTag
  # or +nil+ at the end of the page.
  def getNextToken
    token = peekNextToken
    @cur_pos += token.raw.length if token
    token
  end

  # Get a tag from the specified set of desired tags.
  # For example:
  # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
  # Will return the next header tag encountered.
  #
  # Tag names are matched case-insensitively; end tags are requested with a
  # leading slash (<tt>"/div"</tt>).  With no arguments the next tag of any
  # kind is returned.  Returns +nil+ if the page ends first.
  def getTag(*sought_tags)
    wanted = sought_tags.map { |name| name.to_s.downcase }

    while (token = getNextToken)
      next unless token.kind_of?(HTMLTag)
      return token if wanted.empty? || wanted.include?(token.tag_name)
    end
    nil
  end

  # Get all the text between the current position and the next tag
  # (if +until_tag+ is not given) or a specific later tag.
  #
  # Without +until_tag+: returns the text token at the current position and
  # advances past it, or returns "" (without advancing) when the next token
  # is a tag or the page is exhausted.
  #
  # With +until_tag+: consumes tokens up to, but not including, the first
  # tag whose name matches +until_tag+ (case-insensitively), and returns the
  # text of every consumed token that has any, each followed by one space.
  def getText(until_tag = nil)
    if until_tag.nil?
      # Nothing to read at end of page, and a tag is not text.
      return "" if @cur_pos >= @page.length || @page[@cur_pos, 1] == '<'
      return getNextToken.text
    end

    # HTMLTag#tag_name is always lower-case, so normalise the target too.
    stop_tag = until_tag.to_s.downcase
    pieces = []

    while (token = peekNextToken)
      break if token.kind_of?(HTMLTag) && token.tag_name == stop_tag

      piece = token.text
      pieces << piece unless piece.empty?
      # Advance past the token we already parsed instead of re-parsing it.
      @cur_pos += token.raw.length
    end

    pieces.map { |piece| piece + " " }.join
  end

  # Like getText, but squeeze all whitespace, getting rid of
  # leading and trailing whitespace, and squeezing multiple
  # spaces into a single space.
  def getTrimmedText(until_tag = nil)
    getText(until_tag).strip.gsub(/\s+/, " ")
  end
end
168
+
169
# The parent class for all three types of HTML tokens
# (HTMLText, HTMLComment and HTMLTag).
class HTMLToken
  # The exact source text this token was created from.
  attr_accessor :raw

  # Initialize the token based on the raw text
  def initialize(text)
    @raw = text
  end

  # By default, return exactly the string used to create the token
  def to_s
    raw
  end

  # By default tokens have no text representation
  def text
    ""
  end

  # The token's text with leading/trailing whitespace removed and every
  # internal run of whitespace collapsed to a single space.
  def trimmed_text
    text.strip.gsub(/\s+/, " ")
  end

  # Compare to another based on the raw source.  +other+ may be a token or
  # anything whose +to_s+ yields the source text (e.g. a String).
  def ==(other)
    raw == other.to_s
  end

  # Strict equality used by Hash, Array#uniq, etc.: same token class and
  # same raw source.  Defined together with #hash so that equal tokens are
  # treated as the same key.
  def eql?(other)
    other.class == self.class && raw.eql?(other.raw)
  end

  # Hash code consistent with #eql?.
  def hash
    [self.class, raw].hash
  end
end
197
+
198
# Token for a run of plain text that sits outside of any tag.
class HTMLText < HTMLToken
  # For a text token the text representation is simply its raw source.
  def text
    self.raw
  end
end
204
+
205
# Class representing an HTML comment
class HTMLComment < HTMLToken
  # The comment body, with the <!-- / --> delimiters and the whitespace
  # immediately inside them removed.
  attr_accessor :contents

  # Build a comment token from +text+, which must be exactly one comment
  # (<tt><!-- ... --></tt>).  Raises a RuntimeError otherwise.
  def initialize(text)
    super(text)
    # \A and \z anchor to the whole string; ^ and $ would match at any line
    # boundary and accept text that merely contains a comment on some line.
    match = /\A<!--\s*(.*?)\s*-->\z/m.match(text)
    if match.nil?
      raise "Text passed to HTMLComment.initialize is not a comment"
    end

    @contents = match[1]
  end
end
218
+
219
# Class representing an HTML tag
class HTMLTag < HTMLToken
  # +end_tag+ is true for closing tags (</foo>).  +tag_name+ is the
  # lower-cased tag name, prefixed with "/" for closing tags.
  attr_reader :end_tag, :tag_name

  # Build a tag token from +text+, which must start with "<", end with ">"
  # and contain a tag name.  Raises a RuntimeError otherwise.
  def initialize(text)
    super(text)
    unless text[0, 1] == '<' && text[-1, 1] == '>'
      raise "Text passed to HTMLTag.initialize is not a tag"
    end

    @attr_hash = {}

    name = text[/[\w:-]+/]
    if name.nil?
      raise "Error, no tag name found in: #{text}"
    end

    @end_tag = (text[1, 1] == '/')
    @tag_name = @end_tag ? '/' + name.downcase : name.downcase

    # Attributes are parsed on first access, see #attr_hash
    @hashed = false
  end

  # Retrieve a hash of all the tag's attributes, keyed by lower-cased
  # attribute name.  Quoted values have their quotes removed; an attribute
  # given without a value maps to its own name as written in the tag.
  # End tags always yield an empty hash.
  #
  # Lazily done, so that if you don't look at a tag's attributes
  # things go quicker
  def attr_hash
    unless @hashed
      parse_attributes unless @end_tag
      @hashed = true
    end

    @attr_hash
  end

  # Get the 'alt' text for an img or applet tag, if it exists, or an empty
  # string otherwise
  def text
    return '' if end_tag
    return '' unless ['img', 'applet'].include?(tag_name)

    attr_hash['alt'] || ''
  end

  private

  # Fill @attr_hash from everything between the tag name and the closing ">".
  def parse_attributes
    attrs = @raw[/<[\w:-]+\s+(.*)>/m, 1]
    return if attrs.nil?

    # Each match is a name, optionally followed by = and a double-quoted,
    # single-quoted or bare value.
    attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|[^"'>][^\s>]*))?/m) do |name, value|
      @attr_hash[name.downcase] =
        if value.nil?
          # Valueless attribute (e.g. "checked"): use the name itself
          name
        elsif value[0, 1] == '"' || value[0, 1] == "'"
          # Strip the enclosing quotes
          value[1..-2]
        else
          value
        end
    end
  end
end
302
+
303
# Self-test: runs only when this file is executed directly.
if $0 == __FILE__
  require 'test/unit'

  class TC_TestHTMLTokenizer < Test::Unit::TestCase
    # Unquoted attribute values are read up to the next whitespace or ">".
    def test_bad_link
      toke = HTMLTokenizer.new("<p><a href=http://bad.com/link>foo</a></p>")
      assert_equal("http://bad.com/link", toke.getTag("a").attr_hash['href'])
    end

    # Colons are allowed in both tag names and attribute names.
    def test_namespace
      toke = HTMLTokenizer.new("<f:table xmlns:f=\"http://www.com/foo\">")
      assert_equal("http://www.com/foo", toke.getTag("f:table").attr_hash['xmlns:f'])
    end

    def test_comment
      toke = HTMLTokenizer.new("<!-- comment on me -->")
      t = toke.getNextToken
      assert_instance_of(HTMLComment, t)
      assert_equal("comment on me", t.contents)
    end

    def test_full
      # Joined with newlines; the trailing "" gives the page a final newline.
      page = [
        '<HTML>',
        '<HEAD>',
        '<TITLE>This is the title</TITLE>',
        '</HEAD>',
        '<!-- Here comes the <a href="missing.link">blah</a>',
        'comment body',
        '-->',
        '<BODY>',
        '<H1>This is the header</H1>',
        '<P>',
        'This is the paragraph, it contains',
        '<a href="link.html">links</a>,',
        "<img src=\"blah.gif\" optional alt='images",
        'are',
        "really cool'>. Ok, here is some more text and",
        '<A href="http://another.link.com/" target="_blank">another link</A>.',
        '</P>',
        '</body>',
        '</HTML>',
        ''
      ].join("\n")
      toke = HTMLTokenizer.new(page)

      assert_equal("<h1>", toke.getTag("h1", "h2", "h3").to_s.downcase)
      assert_equal(HTMLTag.new("<a href=\"link.html\">"), toke.getTag("IMG", "A"))
      assert_equal("links", toke.getTrimmedText)
      assert(toke.getTag("IMG", "A").attr_hash['optional'])
      assert_equal("_blank", toke.getTag("IMG", "A").attr_hash['target'])
    end
  end
end
@@ -0,0 +1,92 @@
1
# Test::Unit must be loaded before Test::Unit::TestCase is referenced below.
require 'test/unit'
require 'html/htmltokenizer'

# Tests for tag-name and attribute parsing of tokens produced by
# HTMLTokenizer.
class HtmlTokenizerTest < Test::Unit::TestCase
  def test_right_version
    assert_equal 1.0, HTMLTokenizer.version
  end

  def test_parses_attributes_with_dash
    html = '<meta http-equiv="content-type" value="text/html">'
    token = HTMLTokenizer.new(html).getNextToken

    assert_equal HTMLTag, token.class
    assert_equal 2, token.attr_hash.size
    assert_equal true, token.attr_hash.has_key?('value')
    assert_equal true, token.attr_hash.has_key?('http-equiv')
  end

  def test_parses_tags_with_dash
    html = '<a-value>abc</a-value>'
    tokenizer = HTMLTokenizer.new(html)

    assert_equal 'a-value', tokenizer.getNextToken.tag_name
    assert_equal 'abc', tokenizer.getNextToken.text
    assert_equal '/a-value', tokenizer.getNextToken.tag_name
  end

  def test_gets_attributes_from_tags_with_dash_with_space
    html = '<a-value n="2" >abc</a-value>'
    token = HTMLTokenizer.new(html).getNextToken

    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
    assert_equal '2', token.attr_hash['n']
  end

  def test_gets_attributes_from_tags_with_dash_sans_space
    html = '<a-value k=\'3\'>abc</a-value>'
    token = HTMLTokenizer.new(html).getNextToken

    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
    assert_equal '3', token.attr_hash['k']
  end

  def test_gets_dashed_attributes_from_tags_with_dash
    html = '<S-Value p:n-d="2">abc</a-value>'
    token = HTMLTokenizer.new(html).getNextToken

    assert_equal 's-value', token.tag_name
    assert_equal 1, token.attr_hash.size
    assert_equal '2', token.attr_hash['p:n-d']
  end

  def test_reads_attributes_without_quotes
    html = '<a href=http://www.test.com/blank.html>value</a>'
    token = HTMLTokenizer.new(html).getNextToken

    assert_equal 'a', token.tag_name
    assert_equal 'http://www.test.com/blank.html', token.attr_hash['href']
  end

  def test_reads_short_attributes_without_quotes
    html = '<a name=a>value</a>'
    token = HTMLTokenizer.new(html).getNextToken

    assert_equal 'a', token.tag_name
    assert_equal 'a', token.attr_hash['name']
  end

  def test_reads_multiple_short_attributes_without_quotes
    html = '<a name=n target=m href=k>value</a>'
    token = HTMLTokenizer.new(html).getNextToken

    assert_equal 'a', token.tag_name
    assert_equal 'n', token.attr_hash['name']
    assert_equal 'm', token.attr_hash['target']
    assert_equal 'k', token.attr_hash['href']
  end

  # An attribute given without a value maps to its own name.
  def test_makes_boolean_attribute_values_themselves
    html = '<input type=checked checked>'
    token = HTMLTokenizer.new(html).getNextToken

    assert_equal 'input', token.tag_name
    assert_equal 'checked', token.attr_hash['checked']
  end
end
metadata ADDED
@@ -0,0 +1,41 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.10
3
+ specification_version: 1
4
+ name: htmltokenizer
5
+ version: !ruby/object:Gem::Version
6
+ version: "1.0"
7
+ date: 2005-07-17
8
+ summary: A class to tokenize HTML.
9
+ require_paths:
10
+ - lib
11
+ email: bg-rubyforge@infofiend.com
12
+ homepage: http://htmltokenizer.rubyforge.org/
13
+ rubyforge_project: htmltokenizer
14
+ description: "This is a partial port of the functionality behind Perl's TokeParser Provided a
15
+ page it progressively returns tokens from that page"
16
+ autorequire:
17
+ default_executable:
18
+ bindir: bin
19
+ has_rdoc: true
20
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
21
+ requirements:
22
+ -
23
+ - ">"
24
+ - !ruby/object:Gem::Version
25
+ version: 0.0.0
26
+ version:
27
+ platform: ruby
28
+ authors:
29
+ - Ben Giddings
30
+ files:
31
+ - lib/html/htmltokenizer.rb
32
+ - test/htmltokenizer_test.rb
33
+ - README
34
+ test_files:
35
+ - test/htmltokenizer_test.rb
36
+ rdoc_options: []
37
+ extra_rdoc_files: []
38
+ executables: []
39
+ extensions: []
40
+ requirements: []
41
+ dependencies: []