RubyGems - htmltokenizer - Versions diffs - 1.0 - Mend

htmltokenizer 1.0

Files changed (4) hide show

data/README ADDED Viewed

@@ -0,0 +1,63 @@
+htmltokenizer README
+============
+  htmltokenizer is a port of the idea behind Perl's HTML::TokeParser::Simple.
+  The basic concept is that it treats a web page as a series of tokens, which
+  are either text, html tags, or html comments.  This class provides a way
+  of getting these tokens in sequence, either one at a time regardless of
+  type, or by choosing a list of interesting tags.
+Requirements
+------------
+  * ruby
+Install
+-------
+  Decompress the archive and enter its top directory.
+  Then type:
+    $ ruby install.rb config
+    $ ruby install.rb setup
+    $ su -c "ruby install.rb install"
+  or
+    $ ruby install.rb config
+    $ ruby install.rb setup
+    $ sudo ruby install.rb install
+  You can also install files into your favorite directory
+  by supplying install.rb some options. Try "ruby install.rb --help".
+Usage
+-----
+require 'html/htmltokenizer'
+page = getSomePageFromTheInternetAsAString()
+tokenizer = HTMLTokenizer.new(page)
+while token = tokenizer.getTag('a', 'font', '/tr', 'div')
+  if 'div' == token.tag_name
+    if 'headlinesheader' == token.attr_hash['class']
+      puts "Header is: " + tokenizer.getTrimmedText('/div')
+    else
+      tokenizer.getTag('/div')
+      token = tokenizer.getTag('a')
+      if token.attr_hash['href']
+        puts "Found a link after a div going to #{token.attr_hash['href']}"
+      end
+    end
+  end
+end
+License
+-------
+  Ruby's license, see http://www.ruby-lang.org/en/LICENSE.txt
+Ben Giddings <bg-rubyraa@infofiend.com>

data/lib/html/htmltokenizer.rb ADDED Viewed

@@ -0,0 +1,355 @@
+# = HTMLTokenizer
+#
+# Author::    Ben Giddings  (mailto:bg-rubyforge@infofiend.com)
+# Copyright:: Copyright (c) 2004 Ben Giddings
+# License::   Distributes under the same terms as Ruby
+#
+#
+# This is a partial port of the functionality behind Perl's TokeParser.
+# Given a page, it progressively returns tokens from that page.
+#
+# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
+#
+# A class to tokenize HTML.
+#
+# Example:
+#
+#   page = "<HTML>
+#   <HEAD>
+#   <TITLE>This is the title</TITLE>
+#   </HEAD>
+#    <!-- Here comes the <a href=\"missing.link\">blah</a>
+#    comment body
+#     -->
+#    <BODY>
+#      <H1>This is the header</H1>
+#      <P>
+#        This is the paragraph, it contains
+#        <a href=\"link.html\">links</a>,
+#        <img src=\"blah.gif\" optional alt='images
+#        are
+#        really cool'>.  Ok, here is some more text and
+#        <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
+#      </P>
+#    </body>
+#    </HTML>
+#    "
+#    toke = HTMLTokenizer.new(page)
+#
+#    assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
+#    assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
+#    assert("links" == toke.getTrimmedText)
+#    assert(toke.getTag("IMG", "A").attr_hash['optional'])
+#    assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
+#
+class HTMLTokenizer
+  @@version = 1.0
+  # Get version of HTMLTokenizer lib
+  def self.version
+    @@version
+  end
+  attr_reader :page
+  # Create a new tokenizer, based on the content, used as a string.
+  def initialize(content)
+    @page = content.to_s
+    @cur_pos = 0
+  end
+  # Reset the parser, setting the current position back at the stop
+  def reset
+    @cur_pos = 0
+  end
+  # Look at the next token, but don't actually grab it
+  def peekNextToken
+    if @cur_pos == @page.length then return nil end
+    if ?< == @page[@cur_pos]
+      # Next token is a tag of some kind
+      if '!--' == @page[(@cur_pos + 1), 3]
+        # Token is a comment
+        tag_end = @page.index('-->', (@cur_pos + 1))
+        if tag_end.nil?
+          raise "No end found to started comment:\n#{@page[@cur_pos,80]}"
+        end
+        # p @page[@cur_pos .. (tag_end+2)]
+        HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
+      else
+        # Token is a html tag
+        tag_end = @page.index('>', (@cur_pos + 1))
+        if tag_end.nil?
+          raise "No end found to started tag:\n#{@page[@cur_pos,80]}"
+        end
+        # p @page[@cur_pos .. tag_end]
+        HTMLTag.new(@page[@cur_pos .. tag_end])
+      end
+    else
+      # Next token is text
+      text_end = @page.index('<', @cur_pos)
+      text_end = text_end.nil? ? -1 : (text_end - 1)
+      # p @page[@cur_pos .. text_end]
+      HTMLText.new(@page[@cur_pos .. text_end])
+    end
+  end
+  # Get the next token, returns an instance of
+  # * HTMLText
+  # * HTMLToken
+  # * HTMLTag
+  def getNextToken
+    token = peekNextToken
+    if token
+      # @page = @page[token.raw.length .. -1]
+      # @page.slice!(0, token.raw.length)
+      @cur_pos += token.raw.length
+    end
+    #p token
+    #print token.raw
+    return token
+  end
+  # Get a tag from the specified set of desired tags.
+  # For example:
+  # <tt>foo =  toke.getTag("h1", "h2", "h3")</tt>
+  # Will return the next header tag encountered.
+  def getTag(*sought_tags)
+    sought_tags.collect! {|elm| elm.downcase}
+    while (tag = getNextToken)
+      if tag.kind_of?(HTMLTag) and
+          (0 == sought_tags.length or sought_tags.include?(tag.tag_name))
+        break
+      end
+    end
+    tag
+  end
+  # Get all the text between the current position and the next tag
+  # (if specified) or a specific later tag
+  def getText(until_tag = nil)
+    if until_tag.nil?
+      if ?< == @page[@cur_pos]
+        # Next token is a tag, not text
+        ""
+      else
+        # Next token is text
+        getNextToken.text
+      end
+    else
+      ret_str = ""
+      while (tag = peekNextToken)
+        if tag.kind_of?(HTMLTag) and tag.tag_name == until_tag
+          break
+        end
+        if ("" != tag.text)
+          ret_str << (tag.text + " ")
+        end
+        getNextToken
+      end
+      ret_str
+    end
+  end
+  # Like getText, but squeeze all whitespace, getting rid of
+  # leading and trailing whitespace, and squeezing multiple
+  # spaces into a single space.
+  def getTrimmedText(until_tag = nil)
+    getText(until_tag).strip.gsub(/\s+/m, " ")
+  end
+end
+# The parent class for all three types of HTML tokens
+class HTMLToken
+  attr_accessor :raw
+  # Initialize the token based on the raw text
+  def initialize(text)
+    @raw = text
+  end
+  # By default, return exactly the string used to create the text
+  def to_s
+    raw
+  end
+  # By default tokens have no text representation
+  def text
+    ""
+  end
+  def trimmed_text
+    text.strip.gsub(/\s+/m, " ")
+  end
+  # Compare to another based on the raw source
+  def ==(other)
+    raw == other.to_s
+  end
+end
+# Class representing text that isn't inside a tag
+class HTMLText < HTMLToken
+  def text
+    raw
+  end
+end
+# Class representing an HTML comment
+class HTMLComment < HTMLToken
+  attr_accessor :contents
+  def initialize(text)
+    super(text)
+    temp_arr = text.scan(/^<!--\s*(.*?)\s*-->$/m)
+    if temp_arr[0].nil?
+      raise "Text passed to HTMLComment.initialize is not a comment"
+    end
+    @contents = temp_arr[0][0]
+  end
+end
+# Class representing an HTML tag
+class HTMLTag < HTMLToken
+  attr_reader :end_tag, :tag_name
+  def initialize(text)
+    super(text)
+    if ?< != text[0] or ?> != text[-1]
+      raise "Text passed to HTMLComment.initialize is not a comment"
+    end
+    @attr_hash = Hash.new
+    @raw = text
+    tag_name = text.scan(/[\w:-]+/)[0]
+    if tag_name.nil?
+      raise "Error, tag is nil: #{tag_name}"
+    end
+    if ?/ == text[1]
+      # It's an end tag
+      @end_tag = true
+      @tag_name = '/' + tag_name.downcase
+    else
+      @end_tag = false
+      @tag_name = tag_name.downcase
+    end
+    @hashed = false
+  end
+  # Retrieve a hash of all the tag's attributes.
+  # Lazily done, so that if you don't look at a tag's attributes
+  # things go quicker
+  def attr_hash
+    # Lazy initialize == don't build the hash until it's needed
+    if !@hashed
+      if !@end_tag
+        # Get the attributes
+        attr_arr = @raw.scan(/<[\w:-]+\s+(.*)>/m)[0]
+        if attr_arr.kind_of?(Array)
+          # Attributes found, parse them
+          attrs = attr_arr[0]
+          attr_arr = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
+          # clean up the array by:
+          # * setting all nil elements to true
+          # * removing enclosing quotes
+          attr_arr.each {
+            |item|
+            val = if item[1].nil?
+                    item[0]
+                  elsif '"'[0] == item[1][0] or '\''[0] == item[1][0]
+                    item[1][1 .. -2]
+                  else
+                    item[1]
+                  end
+            @attr_hash[item[0].downcase] = val
+          }
+        end
+      end
+      @hashed = true
+    end
+    #p self
+    @attr_hash
+  end
+  # Get the 'alt' text for a tag, if it exists, or an empty string otherwise
+  def text
+    if !end_tag
+      case tag_name
+      when 'img'
+        if !attr_hash['alt'].nil?
+          return attr_hash['alt']
+        end
+      when 'applet'
+        if !attr_hash['alt'].nil?
+          return attr_hash['alt']
+        end
+      end
+    end
+    return ''
+  end
+end
+if $0 == __FILE__
+  require 'test/unit'
+  class TC_TestHTMLTokenizer < Test::Unit::TestCase
+    def test_bad_link
+      toke = HTMLTokenizer.new("<p><a href=http://bad.com/link>foo</a></p>")
+      assert("http://bad.com/link" == toke.getTag("a").attr_hash['href'])
+    end
+    def test_namespace
+      toke = HTMLTokenizer.new("<f:table xmlns:f=\"http://www.com/foo\">")
+      assert("http://www.com/foo" == toke.getTag("f:table").attr_hash['xmlns:f'])
+    end
+    def test_comment
+      toke = HTMLTokenizer.new("<!-- comment on me -->")
+      t = toke.getNextToken
+      assert(HTMLComment == t.class)
+      assert("comment on me" == t.contents)
+    end
+    def test_full
+      page = "<HTML>
+<HEAD>
+<TITLE>This is the title</TITLE>
+</HEAD>
+<!-- Here comes the <a href=\"missing.link\">blah</a>
+comment body
+ -->
+<BODY>
+  <H1>This is the header</H1>
+  <P>
+    This is the paragraph, it contains
+    <a href=\"link.html\">links</a>,
+    <img src=\"blah.gif\" optional alt='images
+are
+really cool'>.  Ok, here is some more text and
+    <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
+  </P>
+</body>
+</HTML>
+"
+      toke = HTMLTokenizer.new(page)
+      assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
+      assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
+      assert("links" == toke.getTrimmedText)
+      assert(toke.getTag("IMG", "A").attr_hash['optional'])
+      assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
+    end
+  end
+end

data/test/htmltokenizer_test.rb ADDED Viewed

@@ -0,0 +1,92 @@
+require 'html/htmltokenizer'
+class HtmlTokenizerTest < Test::Unit::TestCase
+  def test_right_version
+    assert_equal 1.0, HTMLTokenizer.version
+  end
+  def test_parses_attributes_with_dash
+    html = '<meta http-equiv="content-type" value="text/html">'
+    token = HTMLTokenizer.new(html).getNextToken()
+    assert_equal HTMLTag, token.class
+    assert_equal 2, token.attr_hash.size
+    assert_equal true, token.attr_hash.has_key?('value')
+    assert_equal true, token.attr_hash.has_key?('http-equiv')
+  end
+  def test_parses_tags_with_dash
+    html = '<a-value>abc</a-value>'
+    tokenizer = HTMLTokenizer.new(html)
+    assert_equal 'a-value', tokenizer.getNextToken().tag_name
+    assert_equal 'abc', tokenizer.getNextToken().text
+    assert_equal '/a-value', tokenizer.getNextToken().tag_name
+  end
+  def test_gets_attributes_from_tags_with_dash_with_space
+    html = '<a-value n="2" >abc</a-value>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
+    assert_equal '2', token.attr_hash['n']
+  end
+  def test_gets_attributes_from_tags_with_dash_sans_space
+    html = '<a-value k=\'3\'>abc</a-value>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
+    assert_equal '3', token.attr_hash['k']
+  end
+  def test_gets_dashed_attributes_from_tags_with_dash
+    html = '<S-Value p:n-d="2">abc</a-value>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 's-value', token.tag_name
+    assert_equal 1, token.attr_hash.size
+    assert_equal '2', token.attr_hash['p:n-d']
+  end
+  def test_reads_attributes_without_quotes
+    html = '<a href=http://www.test.com/blank.html>value</a>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 'a', token.tag_name
+    assert_equal 'http://www.test.com/blank.html', token.attr_hash['href']
+  end
+  def test_reads_short_attributes_without_quotes
+    html = '<a name=a>value</a>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 'a', token.tag_name
+    assert_equal 'a', token.attr_hash['name']
+  end
+  def test_reads_multiple_short_attributes_without_quotes
+    html = '<a name=n target=m href=k>value</a>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 'a', token.tag_name
+    assert_equal 'n', token.attr_hash['name']
+    assert_equal 'm', token.attr_hash['target']
+    assert_equal 'k', token.attr_hash['href']
+  end
+  def test_makes_boolean_attribute_values_themselves
+    html = '<input type=checked checked>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 'input', token.tag_name
+    assert_equal 'checked', token.attr_hash['checked']
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,41 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.8.10
+specification_version: 1
+name: htmltokenizer
+version: !ruby/object:Gem::Version
+  version: "1.0"
+date: 2005-07-17
+summary: A class to tokenize HTML.
+require_paths:
+  - lib
+email: bg-rubyforge@infofiend.com
+homepage: http://htmltokenizer.rubyforge.org/
+rubyforge_project: htmltokenizer
+description: "This is a partial port of the functionality behind Perl's TokeParser Provided a
+  page it progressively returns tokens from that page"
+autorequire:
+default_executable:
+bindir: bin
+has_rdoc: true
+required_ruby_version: !ruby/object:Gem::Version::Requirement
+  requirements:
+    -
+      - ">"
+      - !ruby/object:Gem::Version
+        version: 0.0.0
+  version:
+platform: ruby
+authors:
+  - Ben Giddings
+files:
+  - lib/html/htmltokenizer.rb
+  - test/htmltokenizer_test.rb
+  - README
+test_files:
+  - test/htmltokenizer_test.rb
+rdoc_options: []
+extra_rdoc_files: []
+executables: []
+extensions: []
+requirements: []
+dependencies: []