RubyGems - htmltokenizer - Versions diffs - 1.0 - Mend

htmltokenizer 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

data/README ADDED Viewed

@@ -0,0 +1,63 @@
+htmltokenizer README
+============
+  htmltokenizer is a port of the idea behind Perl's HTML::TokeParser::Simple.
+  The basic concept is that it treats a web page as a series of tokens, which
+  are either text, html tags, or html comments.  This class provides a way
+  of getting these tokens in sequence, either one at a time regardless of
+  type, or by choosing a list of interesting tags.
+Requirements
+------------
+  * ruby
+Install
+-------
+  Decompress the archive and enter its top-level directory.
+  Then type:
+    $ ruby install.rb config
+    $ ruby install.rb setup
+    $ su -c "ruby install.rb install"
+  or
+    $ ruby install.rb config
+    $ ruby install.rb setup
+    $ sudo ruby install.rb install
+  You can also install files into your favorite directory
+  by supplying install.rb some options. Try "ruby install.rb --help".
+Usage
+-----
+require 'html/htmltokenizer'
+page = getSomePageFromTheInternetAsAString()
+tokenizer = HTMLTokenizer.new(page)
+while token = tokenizer.getTag('a', 'font', '/tr', 'div')
+  if 'div' == token.tag_name
+    if 'headlinesheader' == token.attr_hash['class']
+      puts "Header is: " + tokenizer.getTrimmedText('/div')
+    else
+      tokenizer.getTag('/div')
+      token = tokenizer.getTag('a')
+      if token.attr_hash['href']
+        puts "Found a link after a div going to #{token.attr_hash['href']}"
+      end
+    end
+  end
+end
+License
+-------
+  Ruby's license, see http://www.ruby-lang.org/en/LICENSE.txt
+Ben Giddings <bg-rubyraa@infofiend.com>

data/lib/html/htmltokenizer.rb ADDED Viewed

@@ -0,0 +1,355 @@
+# = HTMLTokenizer
+#
+# Author::    Ben Giddings  (mailto:bg-rubyforge@infofiend.com)
+# Copyright:: Copyright (c) 2004 Ben Giddings
+# License::   Distributed under the same terms as Ruby
+#
+#
+# This is a partial port of the functionality behind Perl's TokeParser.
+# Provided a page, it progressively returns tokens from that page.
+#
+# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
+#
+# A class to tokenize HTML.
+#
+# Example:
+#
+#   page = "<HTML>
+#   <HEAD>
+#   <TITLE>This is the title</TITLE>
+#   </HEAD>
+#    <!-- Here comes the <a href=\"missing.link\">blah</a>
+#    comment body
+#     -->
+#    <BODY>
+#      <H1>This is the header</H1>
+#      <P>
+#        This is the paragraph, it contains
+#        <a href=\"link.html\">links</a>,
+#        <img src=\"blah.gif\" optional alt='images
+#        are
+#        really cool'>.  Ok, here is some more text and
+#        <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
+#      </P>
+#    </body>
+#    </HTML>
+#    "
+#    toke = HTMLTokenizer.new(page)
+#
+#    assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
+#    assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
+#    assert("links" == toke.getTrimmedText)
+#    assert(toke.getTag("IMG", "A").attr_hash['optional'])
+#    assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
+#
+class HTMLTokenizer
+  @@version = 1.0
+  # Get version of HTMLTokenizer lib
+  def self.version
+    @@version
+  end
+  attr_reader :page
+  # Create a new tokenizer, based on the content, used as a string.
+  def initialize(content)
+    @page = content.to_s
+    @cur_pos = 0
+  end
+  # Reset the parser, setting the current position back at the stop
+  def reset
+    @cur_pos = 0
+  end
+  # Look at the next token, but don't actually grab it
+  def peekNextToken
+    if @cur_pos == @page.length then return nil end
+    if ?< == @page[@cur_pos]
+      # Next token is a tag of some kind
+      if '!--' == @page[(@cur_pos + 1), 3]
+        # Token is a comment
+        tag_end = @page.index('-->', (@cur_pos + 1))
+        if tag_end.nil?
+          raise "No end found to started comment:\n#{@page[@cur_pos,80]}"
+        end
+        # p @page[@cur_pos .. (tag_end+2)]
+        HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
+      else
+        # Token is a html tag
+        tag_end = @page.index('>', (@cur_pos + 1))
+        if tag_end.nil?
+          raise "No end found to started tag:\n#{@page[@cur_pos,80]}"
+        end
+        # p @page[@cur_pos .. tag_end]
+        HTMLTag.new(@page[@cur_pos .. tag_end])
+      end
+    else
+      # Next token is text
+      text_end = @page.index('<', @cur_pos)
+      text_end = text_end.nil? ? -1 : (text_end - 1)
+      # p @page[@cur_pos .. text_end]
+      HTMLText.new(@page[@cur_pos .. text_end])
+    end
+  end
+  # Get the next token, returns an instance of
+  # * HTMLText
+  # * HTMLToken
+  # * HTMLTag
+  def getNextToken
+    token = peekNextToken
+    if token
+      # @page = @page[token.raw.length .. -1]
+      # @page.slice!(0, token.raw.length)
+      @cur_pos += token.raw.length
+    end
+    #p token
+    #print token.raw
+    return token
+  end
+  # Get a tag from the specified set of desired tags.
+  # For example:
+  # <tt>foo =  toke.getTag("h1", "h2", "h3")</tt>
+  # Will return the next header tag encountered.
+  def getTag(*sought_tags)
+    sought_tags.collect! {|elm| elm.downcase}
+    while (tag = getNextToken)
+      if tag.kind_of?(HTMLTag) and
+          (0 == sought_tags.length or sought_tags.include?(tag.tag_name))
+        break
+      end
+    end
+    tag
+  end
+  # Get all the text between the current position and the next tag
+  # (if specified) or a specific later tag
+  def getText(until_tag = nil)
+    if until_tag.nil?
+      if ?< == @page[@cur_pos]
+        # Next token is a tag, not text
+        ""
+      else
+        # Next token is text
+        getNextToken.text
+      end
+    else
+      ret_str = ""
+      while (tag = peekNextToken)
+        if tag.kind_of?(HTMLTag) and tag.tag_name == until_tag
+          break
+        end
+        if ("" != tag.text)
+          ret_str << (tag.text + " ")
+        end
+        getNextToken
+      end
+      ret_str
+    end
+  end
+  # Like getText, but squeeze all whitespace, getting rid of
+  # leading and trailing whitespace, and squeezing multiple
+  # spaces into a single space.
+  def getTrimmedText(until_tag = nil)
+    getText(until_tag).strip.gsub(/\s+/m, " ")
+  end
+end
+# The parent class for all three types of HTML tokens
+class HTMLToken
+  attr_accessor :raw
+  # Initialize the token based on the raw text
+  def initialize(text)
+    @raw = text
+  end
+  # By default, return exactly the string used to create the text
+  def to_s
+    raw
+  end
+  # By default tokens have no text representation
+  def text
+    ""
+  end
+  def trimmed_text
+    text.strip.gsub(/\s+/m, " ")
+  end
+  # Compare to another based on the raw source
+  def ==(other)
+    raw == other.to_s
+  end
+end
+# Class representing text that isn't inside a tag
+class HTMLText < HTMLToken
+  def text
+    raw
+  end
+end
+# Class representing an HTML comment
+class HTMLComment < HTMLToken
+  attr_accessor :contents
+  def initialize(text)
+    super(text)
+    temp_arr = text.scan(/^<!--\s*(.*?)\s*-->$/m)
+    if temp_arr[0].nil?
+      raise "Text passed to HTMLComment.initialize is not a comment"
+    end
+    @contents = temp_arr[0][0]
+  end
+end
+# Class representing an HTML tag
+class HTMLTag < HTMLToken
+  attr_reader :end_tag, :tag_name
+  def initialize(text)
+    super(text)
+    if ?< != text[0] or ?> != text[-1]
+      raise "Text passed to HTMLComment.initialize is not a comment"
+    end
+    @attr_hash = Hash.new
+    @raw = text
+    tag_name = text.scan(/[\w:-]+/)[0]
+    if tag_name.nil?
+      raise "Error, tag is nil: #{tag_name}"
+    end
+    if ?/ == text[1]
+      # It's an end tag
+      @end_tag = true
+      @tag_name = '/' + tag_name.downcase
+    else
+      @end_tag = false
+      @tag_name = tag_name.downcase
+    end
+    @hashed = false
+  end
+  # Retrieve a hash of all the tag's attributes.
+  # Lazily done, so that if you don't look at a tag's attributes
+  # things go quicker
+  def attr_hash
+    # Lazy initialize == don't build the hash until it's needed
+    if !@hashed
+      if !@end_tag
+        # Get the attributes
+        attr_arr = @raw.scan(/<[\w:-]+\s+(.*)>/m)[0]
+        if attr_arr.kind_of?(Array)
+          # Attributes found, parse them
+          attrs = attr_arr[0]
+          attr_arr = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
+          # clean up the array by:
+          # * setting all nil elements to true
+          # * removing enclosing quotes
+          attr_arr.each {
+            |item|
+            val = if item[1].nil?
+                    item[0]
+                  elsif '"'[0] == item[1][0] or '\''[0] == item[1][0]
+                    item[1][1 .. -2]
+                  else
+                    item[1]
+                  end
+            @attr_hash[item[0].downcase] = val
+          }
+        end
+      end
+      @hashed = true
+    end
+    #p self
+    @attr_hash
+  end
+  # Get the 'alt' text for a tag, if it exists, or an empty string otherwise
+  def text
+    if !end_tag
+      case tag_name
+      when 'img'
+        if !attr_hash['alt'].nil?
+          return attr_hash['alt']
+        end
+      when 'applet'
+        if !attr_hash['alt'].nil?
+          return attr_hash['alt']
+        end
+      end
+    end
+    return ''
+  end
+end
+# Self-test: the cases below are only defined when this file is run
+# directly rather than being required as a library.
+if $0 == __FILE__
+  require 'test/unit'
+  class TC_TestHTMLTokenizer < Test::Unit::TestCase
+    # An unquoted attribute value containing slashes is read in full
+    def test_bad_link
+      toke = HTMLTokenizer.new("<p><a href=http://bad.com/link>foo</a></p>")
+      assert("http://bad.com/link" == toke.getTag("a").attr_hash['href'])
+    end
+    # Colons are accepted in both tag names and attribute names
+    def test_namespace
+      toke = HTMLTokenizer.new("<f:table xmlns:f=\"http://www.com/foo\">")
+      assert("http://www.com/foo" == toke.getTag("f:table").attr_hash['xmlns:f'])
+    end
+    # A comment becomes an HTMLComment whose contents exclude the
+    # delimiters and the whitespace next to them
+    def test_comment
+      toke = HTMLTokenizer.new("<!-- comment on me -->")
+      t = toke.getNextToken
+      assert(HTMLComment == t.class)
+      assert("comment on me" == t.contents)
+    end
+    # Walks a complete page.  The assertions share one tokenizer, so each
+    # depends on the position left behind by the one before it.
+    def test_full
+      page = "<HTML>
+<HEAD>
+<TITLE>This is the title</TITLE>
+</HEAD>
+<!-- Here comes the <a href=\"missing.link\">blah</a>
+comment body
+ -->
+<BODY>
+  <H1>This is the header</H1>
+  <P>
+    This is the paragraph, it contains
+    <a href=\"link.html\">links</a>,
+    <img src=\"blah.gif\" optional alt='images
+are
+really cool'>.  Ok, here is some more text and
+    <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
+  </P>
+</body>
+</HTML>
+"
+      toke = HTMLTokenizer.new(page)
+      assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
+      # The link inside the comment is skipped: the first <a> found is link.html
+      assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
+      assert("links" == toke.getTrimmedText)
+      # The valueless 'optional' attribute on the img tag has a truthy value
+      assert(toke.getTag("IMG", "A").attr_hash['optional'])
+      assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
+    end
+  end
+end

data/test/htmltokenizer_test.rb ADDED Viewed

@@ -0,0 +1,92 @@
+require 'html/htmltokenizer'
+class HtmlTokenizerTest < Test::Unit::TestCase
+  def test_right_version
+    assert_equal 1.0, HTMLTokenizer.version
+  end
+  def test_parses_attributes_with_dash
+    html = '<meta http-equiv="content-type" value="text/html">'
+    token = HTMLTokenizer.new(html).getNextToken()
+    assert_equal HTMLTag, token.class
+    assert_equal 2, token.attr_hash.size
+    assert_equal true, token.attr_hash.has_key?('value')
+    assert_equal true, token.attr_hash.has_key?('http-equiv')
+  end
+  def test_parses_tags_with_dash
+    html = '<a-value>abc</a-value>'
+    tokenizer = HTMLTokenizer.new(html)
+    assert_equal 'a-value', tokenizer.getNextToken().tag_name
+    assert_equal 'abc', tokenizer.getNextToken().text
+    assert_equal '/a-value', tokenizer.getNextToken().tag_name
+  end
+  def test_gets_attributes_from_tags_with_dash_with_space
+    html = '<a-value n="2" >abc</a-value>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
+    assert_equal '2', token.attr_hash['n']
+  end
+  def test_gets_attributes_from_tags_with_dash_sans_space
+    html = '<a-value k=\'3\'>abc</a-value>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 1, token.attr_hash.size, "attributes found: #{token.attr_hash.inspect}"
+    assert_equal '3', token.attr_hash['k']
+  end
+  def test_gets_dashed_attributes_from_tags_with_dash
+    html = '<S-Value p:n-d="2">abc</a-value>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 's-value', token.tag_name
+    assert_equal 1, token.attr_hash.size
+    assert_equal '2', token.attr_hash['p:n-d']
+  end
+  def test_reads_attributes_without_quotes
+    html = '<a href=http://www.test.com/blank.html>value</a>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 'a', token.tag_name
+    assert_equal 'http://www.test.com/blank.html', token.attr_hash['href']
+  end
+  def test_reads_short_attributes_without_quotes
+    html = '<a name=a>value</a>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 'a', token.tag_name
+    assert_equal 'a', token.attr_hash['name']
+  end
+  def test_reads_multiple_short_attributes_without_quotes
+    html = '<a name=n target=m href=k>value</a>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 'a', token.tag_name
+    assert_equal 'n', token.attr_hash['name']
+    assert_equal 'm', token.attr_hash['target']
+    assert_equal 'k', token.attr_hash['href']
+  end
+  def test_makes_boolean_attribute_values_themselves
+    html = '<input type=checked checked>'
+    tokenizer = HTMLTokenizer.new(html)
+    token = tokenizer.getNextToken()
+    assert_equal 'input', token.tag_name
+    assert_equal 'checked', token.attr_hash['checked']
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,41 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.8.10
+specification_version: 1
+name: htmltokenizer
+version: !ruby/object:Gem::Version
+  version: "1.0"
+date: 2005-07-17
+summary: A class to tokenize HTML.
+require_paths:
+  - lib
+email: bg-rubyforge@infofiend.com
+homepage: http://htmltokenizer.rubyforge.org/
+rubyforge_project: htmltokenizer
+description: "This is a partial port of the functionality behind Perl's TokeParser Provided a
+  page it progressively returns tokens from that page"
+autorequire:
+default_executable:
+bindir: bin
+has_rdoc: true
+required_ruby_version: !ruby/object:Gem::Version::Requirement
+  requirements:
+    -
+      - ">"
+      - !ruby/object:Gem::Version
+        version: 0.0.0
+  version:
+platform: ruby
+authors:
+  - Ben Giddings
+files:
+  - lib/html/htmltokenizer.rb
+  - test/htmltokenizer_test.rb
+  - README
+test_files:
+  - test/htmltokenizer_test.rb
+rdoc_options: []
+extra_rdoc_files: []
+executables: []
+extensions: []
+requirements: []
+dependencies: []