RubyGems - utf8_validator - Versions diffs - 0.0.1 → 0.0.2 - Mend

utf8_validator 0.0.1 → 0.0.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

data/README.rdoc CHANGED Viewed

@@ -16,6 +16,7 @@ That functionality is left as an exercise for the reader.
 The Unicode Consortium:: At http://unicode.org/ for all the information published there.
 Frank Yung-Fong Tang:: For the state machine algorithm.  See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
+Markus Kuhn:: For invalid test data.  http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 == A Word On Ruby Versions

data/Rakefile CHANGED Viewed

@@ -21,11 +21,11 @@ Validation algorithm.}
   gem.email = "allard.guy.m@gmail.com"
   gem.authors = ["Guy Allard"]
-  # Include your dependencies below. Runtime dependencies are required when using your gem,
-  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
+  #  Runtime Dependencies - None at present
   #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
-  gem.add_development_dependency 'bundler', '>= 2.1.2'
+  #
+  # Bundler/Jeweler takes care of this via the Gemfile.lock process
+  # gem.add_development_dependency 'bundler', '>= 2.1.2'
 end
 Jeweler::RubygemsDotOrgTasks.new

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.0.1
1	+ 0.0.2

data/examples/fullstring.rb ADDED Viewed

@@ -0,0 +1,79 @@
+# encoding: utf-8
+#
+# Show how to parse a full string with multiple UTF8 validation failures.
+# Accumulate error information, and report it.
+#
+require 'rubygems' unless RUBY_VERSION =~ /1\.9/
+require 'utf8_validator'
+#
+# = Purpose
+#
+# A helper class for processing multiple validation errors in a single string.
+#
+class ValidationHelper
+  #
+  attr_reader :error_list
+  #
+  # Get a validator instance.
+  #
+  def initialize
+    @validator = UTF8::Validator.new
+  end
+  #
+  # Validate the whole string.
+  #
+  def scanstring(string)
+    @error_list = ""
+    work_string = string
+    run_pos = 0
+    begin
+      @validator.valid_encoding?(work_string, true)
+    rescue UTF8::ValidationError => e
+      # Extract offset of error, keep running offset up to date
+      last_colon = e.message.rindex(':')
+      last_lparen = e.message.rindex('(')
+      epos = e.message[last_colon+1..last_lparen-1]
+      sub_start = epos.to_i
+      if run_pos == 0
+        run_pos += sub_start
+      else
+        run_pos += sub_start + 1
+      end
+      # Start again at error offset + 1
+      work_string = work_string[sub_start+1..-1]
+      # Build next error message
+      next_emsg = e.message[0..last_colon]    # Part A of current message
+      # Add running offset position
+      run_pos_str = sprintf "%d(0x%x)", run_pos, run_pos
+      next_emsg += run_pos_str
+      #
+      @error_list += next_emsg
+      @error_list += "\n"
+      retry
+    end
+  end
+end
+#
+puts "Started"
+puts
+#
+helper = ValidationHelper.new
+#
+test_data = [
+  "a\xffbc\xfed",
+  "abcdefghijk\xffbcdefghijk\xfecdefg",
+  "anoerrorsz",
+  "errorlast\x80",
+  "a\xffbcd\xfeefgh\xfd123",
+]
+#
+test_data.each do |string|
+  puts "/" * 60
+  puts "#{string}"
+  helper.scanstring(string)
+  puts "#{helper.error_list}"
+end
+#
+puts
+puts "Complete"

data/lib/utf8_validator.rb CHANGED Viewed

@@ -1,4 +1,6 @@
-#
+#--
+# Copyright (c) 2011 Guy Allard
+#--
 require 'validation/validator'
 require 'validation/errors'

data/lib/validation/errors.rb CHANGED Viewed

@@ -1,3 +1,6 @@
+#--
+# Copyright (c) 2011 Guy Allard
+#--
 module UTF8
   #
   # == Purpose
@@ -7,8 +10,5 @@ module UTF8
   #
   class ValidationError < ::RuntimeError
     #
-    def message()
-      "general UTF-8 validation error"
-    end
   end
 end

data/lib/validation/validator.rb CHANGED Viewed

@@ -60,30 +60,33 @@ module UTF8
 # Instances of this class are thread safe, and a single instance may be used
 # safely by multiple concurrent threads, with one caveat:
 #
-# The value of #{Validator::DEBUG} must not be changed by any thread.
+# The value of #{DEBUG} must not be changed by any thread.
 #
 #--
 # Copyright (c) 2011 Guy Allard
-#
+#--
 class Validator
   #
   # For use during development only.
   #
   DEBUG=false
   #
   # Validate the supplied string for proper UTF-8 encoding.
   #
   # Calling Sequence:
   #
+  #    validator = UTF8::Validator.new                           -> validator
   #    validator.valid_encoding?(string)                         -> true or false
-  #    validator.valid_encoding?(string, raise_on_error)         -> true or exception
+  #    validator.valid_encoding?(string, true)                   -> true or exception
   #
   # Parameters:
   #
   # string::         the string to validate
   # raise_on_error:: a flag to indicate failure behavior
-  #
+  #
+  # When raise_on_error is _true_ and a string fails validation, an
+  # error of type #{UTF8::ValidationError} is raised.  The byte in error
+  # and the location of that byte are described in the error message.
   #
   def valid_encoding?(string, raise_on_error = false)
     bytes = string.bytes
@@ -169,7 +172,7 @@ class Validator
             else
               valid = false
               break
-          end # of the inner case
+          end # of the inner case, the 'start' state
         # The last continuation byte of a 2, 3, or 4 byte character
         # State: 'a'
@@ -185,6 +188,7 @@ class Validator
           end
         # The first continuation byte for most 3 byte characters
+        # (those with start bytes in: 0xe1-0xec or 0xee-0xef)
         # State: 'b'
         # o Input = 0x80-0xBF: change state to A
         # o Others: ERROR
@@ -198,6 +202,7 @@ class Validator
           end
         # The first continuation byte for some special 3 byte characters
+        # (those with start byte 0xe0)
         # State: 'c'
         # o Input = 0xA0-0xBF: change state to A
         # o Others: ERROR
@@ -211,6 +216,7 @@ class Validator
           end
         # The first continuation byte for the remaining 3 byte characters
+        # (those with start byte 0xed)
         # State: 'd'
         # o Input = 0x80-0x9F: change state to A
         # o Others: ERROR
@@ -224,6 +230,7 @@ class Validator
           end
         # The first continuation byte for some 4 byte characters
+        # (those with start bytes in: 0xf1-0xf3)
         # State: 'e'
         # o Input = 0x80-0xBF: change state to B
         # o Others: ERROR
@@ -237,6 +244,7 @@ class Validator
           end
         # The first continuation byte for some special 4 byte characters
+        # (those with start byte 0xf0)
         # State: 'f'
         # o Input = 0x90-0xBF: change state to B
         # o Others: ERROR
@@ -250,6 +258,7 @@ class Validator
           end
         # The first continuation byte for the remaining 4 byte characters
+        # (those with start byte 0xf4)
         # State: 'g'
         # o Input = 0x80-0x8F: change state to B
         # o Others: ERROR
@@ -271,10 +280,12 @@ class Validator
     puts "State at end: #{state}" if DEBUG
     # Catch truncation at end of string
     if valid and state != 'start'
+      puts "Resetting valid value" if DEBUG
       valid = false
     end
     #
     if !valid and raise_on_error
+      puts "Raising Error" if DEBUG
       raise ValidationError, "Invalid byte:#{next_byte_save}(0x#{nb_hex}),index:#{index}(0x#{ni_hex})"
     end
     #

data/test/test_raise_request.rb CHANGED Viewed

@@ -34,5 +34,15 @@ class TestRaiseRequect < Test::Unit::TestCase
     end
   end
+  # Check message from raise
+  def test_0030_check_raise_message
+    #
+    begin
+      @validator.valid_encoding?("a\xffb\xfec", true)
+    rescue UTF8::ValidationError => e
+      assert e.message =~ /^Invalid byte/
+    end
+  end
 end

data/test/test_utf8_validator.rb CHANGED Viewed

@@ -7,6 +7,10 @@ require 'helper'
 #
 # Tests for the #{UTF8::Validator} implementation.
 #
+# Some test data pulled directly from:
+#
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+#
 class TestUtf8Validator < Test::Unit::TestCase
   #
   def setup
@@ -92,6 +96,22 @@ class TestUtf8Validator < Test::Unit::TestCase
     end
   end
+  # Boundary conditions
+  def test_0070_boundary_conditions
+    test_data = [
+      "\xed\x9f\xbf", # = "\ud7ff"
+      "\xee\x80\x80", # = "\ue000"
+      "\xef\xbf\xbd", # = "\ufffd"
+#      "\xf4\x8f\xbf\xbf", # = "\U0010ffff" / maybe _should_ fail ??
+#      "\xf4\x90\x80\x80", # = U+110000 (beyond the Unicode maximum U+10FFFF) / maybe  _should_ fail ?? / research
+    ]
+    test_data.each do |string|
+      assert @validator.valid_encoding?(string), "boundary conditions: #{string}"
+      assert string.force_encoding("UTF-8").valid_encoding?, "boundary conditions 19: #{string}"  if RUBY_VERSION =~ /1\.9/
+    end
+  end
   #--
   # Validation should fail for the following tests
   #--
@@ -108,8 +128,13 @@ class TestUtf8Validator < Test::Unit::TestCase
   # UTF-16 Surrogate Halves
   def test0520_utf16_surrogate_halves
     test_data = [
-      "\xed\xa0\x80", # u-800 (lowest)
-      "\xed\xbf\xbf", # u-fff (highest)
+      "\xed\xa0\x80",
+      "\xed\xad\xbf",
+      "\xed\xae\x80",
+      "\xed\xaf\xbf",
+      "\xed\xb0\x80",
+      "\xed\xbe\x80",
+      "\xed\xbf\xbf",
     ]
     test_data.each do |string|
       assert !@validator.valid_encoding?(string), "UTF-16 Surrogate Halves: #{string}"
@@ -117,6 +142,14 @@ class TestUtf8Validator < Test::Unit::TestCase
     end
   end
+  #--
+  # I do not see a need to test UTF-16 surrogate pairs.  They are guaranteed
+  # to always fail if the preceding test succeeds.  This is because the
+  # preceding test data values are always the first surrogate of the pair.
+  #
+  # UTF-16 surrogates are clearly something I do not understand.
+  #--
   # Invalid single bytes
   def test0530_invalid_single_bytes
     test_data = [
@@ -222,5 +255,21 @@ class TestUtf8Validator < Test::Unit::TestCase
     end
   end
+  # Maximum overlong sequences
+  def test0580_max_overlong_seqs
+    test_data = [
+      "\xc1\xbf",
+      "\xe0\x9f\xbf",
+      "\xf0\x8f\xbf\xbf",
+      "\xf8\x87\xbf\xbf\xbf",
+      "\xfc\x83\xbf\xbf\xbf\xbf",
+    ]
+    test_data.each do |string|
+      assert !@validator.valid_encoding?(string), "max overlong seq: #{string}"
+      assert !string.force_encoding("UTF-8").valid_encoding?, "max overlong seq 19: #{string}"  if RUBY_VERSION =~ /1\.9/
+    end
+  end
 end

data/utf8_validator.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{utf8_validator}
-  s.version = "0.0.1"
+  s.version = "0.0.2"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Guy Allard"]
-  s.date = %q{2011-01-25}
+  s.date = %q{2011-01-26}
   s.description = %q{A State Machine implementation of a UTF-8 Encoding
 Validation algorithm.}
   s.email = %q{allard.guy.m@gmail.com}
@@ -25,6 +25,7 @@ Validation algorithm.}
     "README.rdoc",
     "Rakefile",
     "VERSION",
+    "examples/fullstring.rb",
     "lib/utf8_validator.rb",
     "lib/validation/errors.rb",
     "lib/validation/validator.rb",
@@ -39,6 +40,7 @@ Validation algorithm.}
   s.rubygems_version = %q{1.3.7}
   s.summary = %q{A UTF-8 Encoding Validator.}
   s.test_files = [
+    "examples/fullstring.rb",
     "test/helper.rb",
     "test/test_raise_request.rb",
     "test/test_utf8_validator.rb"
@@ -52,18 +54,15 @@ Validation algorithm.}
       s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
       s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
       s.add_development_dependency(%q<rcov>, [">= 0"])
-      s.add_development_dependency(%q<bundler>, [">= 2.1.2"])
     else
       s.add_dependency(%q<bundler>, ["~> 1.0.0"])
       s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
       s.add_dependency(%q<rcov>, [">= 0"])
-      s.add_dependency(%q<bundler>, [">= 2.1.2"])
     end
   else
     s.add_dependency(%q<bundler>, ["~> 1.0.0"])
     s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
     s.add_dependency(%q<rcov>, [">= 0"])
-    s.add_dependency(%q<bundler>, [">= 2.1.2"])
   end
 end

metadata CHANGED Viewed

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 0
-  - 1
-  version: 0.0.1
+  - 2
+  version: 0.0.2
 platform: ruby
 authors:
 - Guy Allard
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-01-25 00:00:00 -05:00
+date: 2011-01-26 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -60,21 +60,6 @@ dependencies:
   type: :development
   prerelease: false
   version_requirements: *id003
-- !ruby/object:Gem::Dependency
-  name: bundler
-  requirement: &id004 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        segments:
-        - 2
-        - 1
-        - 2
-        version: 2.1.2
-  type: :development
-  prerelease: false
-  version_requirements: *id004
 description: |-
   A State Machine implementation of a UTF-8 Encoding
   Validation algorithm.
@@ -94,6 +79,7 @@ files:
 - README.rdoc
 - Rakefile
 - VERSION
+- examples/fullstring.rb
 - lib/utf8_validator.rb
 - lib/validation/errors.rb
 - lib/validation/validator.rb
@@ -115,7 +101,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: -1120544117494340473
+      hash: 2884485592009813991
       segments:
       - 0
       version: "0"
@@ -135,6 +121,7 @@ signing_key:
 specification_version: 3
 summary: A UTF-8 Encoding Validator.
 test_files:
+- examples/fullstring.rb
 - test/helper.rb
 - test/test_raise_request.rb
 - test/test_utf8_validator.rb