RubyGems - rdf-tabular - Versions diffs - 0.4.0 → 3.1.0 - Mend

rdf-tabular 0.4.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

checksums.yaml +5 -5
data/README.md +24 -5
data/VERSION +1 -1
data/etc/csvw.jsonld +135 -50
data/lib/rdf/tabular/csvw.rb +215 -181
data/lib/rdf/tabular/format.rb +8 -6
data/lib/rdf/tabular/literal.rb +1 -1
data/lib/rdf/tabular/metadata.rb +61 -80
data/lib/rdf/tabular/reader.rb +18 -15
data/lib/rdf/tabular/uax35.rb +143 -38
data/spec/data/countries-minimal.json +38 -0
data/spec/data/countries-minimal.ttl +36 -0
data/spec/data/countries-standard.json +86 -0
data/spec/data/countries-standard.ttl +75 -0
data/spec/data/countries.csv +4 -0
data/spec/data/countries.csv-minimal.json +16 -0
data/spec/data/countries.csv-minimal.ttl +19 -0
data/spec/data/countries.csv-standard.json +33 -0
data/spec/data/countries.csv-standard.ttl +44 -0
data/spec/data/countries.html +88 -0
data/spec/data/countries.json +53 -0
data/spec/data/countries_embed-minimal.json +38 -0
data/spec/data/countries_embed-minimal.ttl +36 -0
data/spec/data/countries_embed-standard.json +86 -0
data/spec/data/countries_embed-standard.ttl +75 -0
data/spec/data/countries_embed.html +88 -0
data/spec/data/countries_html-minimal.json +38 -0
data/spec/data/countries_html-minimal.ttl +36 -0
data/spec/data/countries_html-standard.json +86 -0
data/spec/data/countries_html-standard.ttl +75 -0
data/spec/data/country-codes-and-names-minimal.json +19 -0
data/spec/data/country-codes-and-names-minimal.ttl +22 -0
data/spec/data/country-codes-and-names-standard.json +47 -0
data/spec/data/country-codes-and-names-standard.ttl +45 -0
data/spec/data/country-codes-and-names.csv +5 -0
data/spec/data/country_slice.csv +4 -0
data/spec/data/junior-roles.csv +3 -0
data/spec/data/junior-roles.json +54 -0
data/spec/data/roles-minimal.json +32 -0
data/spec/data/roles-minimal.ttl +36 -0
data/spec/data/roles-standard.json +56 -0
data/spec/data/roles-standard.ttl +66 -0
data/spec/data/roles.json +23 -0
data/spec/data/senior-roles.csv +3 -0
data/spec/data/senior-roles.json +52 -0
data/spec/data/test232-metadata.json +10 -0
data/spec/data/test232.csv +3 -0
data/spec/data/tree-ops-atd.json +1 -0
data/spec/data/tree-ops-ext-minimal.json +42 -0
data/spec/data/tree-ops-ext-minimal.ttl +34 -0
data/spec/data/tree-ops-ext-standard.json +93 -0
data/spec/data/tree-ops-ext-standard.ttl +82 -0
data/spec/data/tree-ops-ext.csv +4 -0
data/spec/data/tree-ops-ext.json +81 -0
data/spec/data/tree-ops-minimal.json +18 -0
data/spec/data/tree-ops-minimal.ttl +14 -0
data/spec/data/tree-ops-standard.json +44 -0
data/spec/data/tree-ops-standard.ttl +44 -0
data/spec/data/tree-ops-virtual-minimal.json +32 -0
data/spec/data/tree-ops-virtual-minimal.ttl +25 -0
data/spec/data/tree-ops-virtual-standard.json +49 -0
data/spec/data/tree-ops-virtual-standard.ttl +49 -0
data/spec/data/tree-ops-virtual.json +48 -0
data/spec/data/tree-ops.csv +3 -0
data/spec/data/tree-ops.csv-metadata.json +43 -0
data/spec/data/tree-ops.html +54 -0
data/spec/data/tree-ops.tsv +3 -0
data/spec/format_spec.rb +5 -4
data/spec/metadata_spec.rb +10 -16
data/spec/suite_helper.rb +2 -2
data/spec/suite_spec.rb +5 -6
data/spec/uax35_spec.rb +239 -0
metadata +149 -36
data/lib/rdf/tabular/json.rb +0 -0

data/lib/rdf/tabular/reader.rb CHANGED

@@ -28,16 +28,19 @@ module RDF::Tabular
         RDF::CLI::Option.new(
           symbol: :metadata,
           datatype: RDF::URI,
+          control: :url2,
           on: ["--metadata URI"],
           description: "user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location.") {|arg| RDF::URI(arg)},
         RDF::CLI::Option.new(
           symbol: :minimal,
+          control: :checkbox,
           datatype: TrueClass,
           on: ["--minimal"],
           description: "Includes only the information gleaned from the cells of the tabular data.") {true},
         RDF::CLI::Option.new(
           symbol: :noProv,
           datatype: TrueClass,
+          control: :checkbox,
           on: ["--no-prov"],
           description: "do not output optional provenance information.") {true},
       ]
@@ -60,7 +63,7 @@ module RDF::Tabular
     # @yieldparam  [RDF::Reader] reader
     # @yieldreturn [void] ignored
     # @raise [RDF::ReaderError] if the CSV document cannot be loaded
-    def initialize(input = $stdin, options = {}, &block)
+    def initialize(input = $stdin, **options, &block)
       super do
         # Base would be how we are to take this
         @options[:base] ||= base_uri.to_s if base_uri
@@ -86,7 +89,7 @@ module RDF::Tabular
           # If input is JSON, then the input is the metadata
           content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
           if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
-            @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
+            @metadata = Metadata.new(@input, filenames: @options[:base], **@options)
             # If @metadata is for a Table, turn it into a TableGroup
             @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
             @metadata.normalize!
@@ -99,7 +102,7 @@ module RDF::Tabular
               def script.content_type; "application/csvm+json"; end
               log_debug("Reader#initialize") {"Process HTML script block"}
               @input = script
-              @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
+              @metadata = Metadata.new(@input, filenames: @options[:base], **@options)
               # If @metadata is for a Table, turn it into a TableGroup
               @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
               @metadata.normalize!
@@ -116,7 +119,7 @@ module RDF::Tabular
             dialect.separator = "\t" if (input.content_type == "text/tsv" rescue nil)
             embed_options = @options.dup
             embed_options[:lang] = dialect_metadata.lang if dialect_metadata.lang
-            embedded_metadata = dialect.embedded_metadata(input, @options[:metadata], embed_options)
+            embedded_metadata = dialect.embedded_metadata(input, @options[:metadata], **embed_options)
             if (@metadata = @options[:metadata]) && @metadata.tableSchema
               @metadata.verify_compatible!(embedded_metadata)
@@ -133,7 +136,7 @@ module RDF::Tabular
           else
             # It's tabular data. Find metadata and proceed as if it was specified in the first place
             @options[:original_input] = @input unless @options[:metadata]
-            @input = @metadata = Metadata.for_input(@input, @options).normalize!
+            @input = @metadata = Metadata.for_input(@input, **@options).normalize!
           end
           log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
@@ -183,7 +186,7 @@ module RDF::Tabular
               if options[:original_input] && !input.describes_file?(options[:base_uri])
                 table_resource = RDF::Node.new
                 add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
-                Reader.new(options[:original_input], options.merge(
+                Reader.new(options[:original_input], **options.merge(
                     metadata: input.tables.first,
                     base: input.tables.first.url,
                     no_found_metadata: true,
@@ -202,7 +205,7 @@ module RDF::Tabular
                   end.flatten.compact
                   table_resource = table.id || RDF::Node.new
                   add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
-                  Reader.open(table.url, options.merge(
+                  Reader.open(table.url, **options.merge(
                       metadata: table,
                       base: table.url,
                       no_found_metadata: true,
@@ -418,9 +421,9 @@ module RDF::Tabular
       res = if io
         ::JSON::dump_default_options = json_state
-        ::JSON.dump(self.send(hash_fn, options), io)
+        ::JSON.dump(self.send(hash_fn, **options), io)
       else
-        hash = self.send(hash_fn, options)
+        hash = self.send(hash_fn, **options)
         ::JSON.generate(hash, json_state)
       end
@@ -440,7 +443,7 @@ module RDF::Tabular
     #
     # @param [Hash{Symbol => Object}] options
     # @return [Hash, Array]
-    def to_hash(options = {})
+    def to_hash(**options)
       # Construct metadata from that passed from file open, along with information from the file.
       if input.is_a?(Metadata)
         log_debug("each_statement: metadata") {input.inspect}
@@ -464,13 +467,13 @@ module RDF::Tabular
             table_group['tables'] = tables
             if options[:original_input] && !input.describes_file?(options[:base_uri])
-              Reader.new(options[:original_input], options.merge(
+              Reader.new(options[:original_input], **options.merge(
                   metadata:           input.tables.first,
                   base:               input.tables.first.url,
                   minimal:            minimal?,
                   no_found_metadata:  true,
               )) do |r|
-                case t = r.to_hash(options)
+                case t = r.to_hash(**options)
                 when Array then tables += t unless input.tables.first.suppressOutput
                 when Hash  then tables << t unless input.tables.first.suppressOutput
                 end
@@ -478,13 +481,13 @@ module RDF::Tabular
             else
               input.each_table do |table|
                 next if table.suppressOutput && !validate?
-                Reader.open(table.url, options.merge(
+                Reader.open(table.url, **options.merge(
                   metadata:           table,
                   base:               table.url,
                   minimal:            minimal?,
                   no_found_metadata:  true,
                 )) do |r|
-                  case t = r.to_hash(options)
+                  case t = r.to_hash(**options)
                   when Array then tables += t unless table.suppressOutput
                   when Hash  then tables << t unless table.suppressOutput
                   end
@@ -557,7 +560,7 @@ module RDF::Tabular
             co['@id'] = subject.to_s unless subject == 'null'
             prop = case cell.propertyUrl
             when RDF.type then '@type'
-            when nil then URI.decode(column.name) # Use URI-decoded name
+            when nil then CGI.unescape(column.name) # Use URI-decoded name
             else
               # Compact the property to a term or prefixed name
               metadata.context.compact_iri(cell.propertyUrl, vocab: true)

data/lib/rdf/tabular/uax35.rb CHANGED

@@ -7,50 +7,99 @@ module RDF::Tabular
   module UAX35
     ##
-    # Parse the date format (if provided), and match against the value (if provided)
-    # Otherwise, validate format and raise an error
+    # Parse the date pattern (if provided), and match against the value (if provided)
+    # Otherwise, validate pattern and raise an error.
     #
-    # @param [String] format
+    # Supported patterns are:
+    #
+    # * yyyy-MM-dd
+    # * yyyyMMdd
+    # * dd-MM-yyyy
+    # * d-M-yyyy
+    # * d-M-yy
+    # * d-M-y
+    # * MM-dd-yyyy
+    # * M-d-yyyy
+    # * M-d-yy
+    # * M-d-y
+    # * dd/MM/yyyy
+    # * d/M/yyyy
+    # * d/M/yy
+    # * d/M/y
+    # * MM/dd/yyyy
+    # * M/d/yyyy
+    # * M/d/yy
+    # * M/d/y
+    # * dd.MM.yyyy
+    # * d.M.yyyy
+    # * d.M.yy
+    # * d.M.y
+    # * MM.dd.yyyy
+    # * M.d.yyyy
+    # * M.d.yy
+    # * M.d.y
+    # * yyyy-MM-ddTHH:mm
+    # * yyyy-MM-ddTHH:mm:ss
+    # * yyyy-MM-ddTHH:mm:ss.S+
+    #
+    # Year components with fewer than four digits are normalized into the 1900 or 2000 range: values from 70 through 99 are placed in the 1900s; all other values (0 through 69 and 100 through 999) are placed in the 2000s.
+    #
+    # @param [String] pattern
     # @param [String] value
     # @return [String] XMLSchema version of value
-    # @raise [ArgumentError] if format is not valid, or nil, if value does not match
-    def parse_uax35_date(format, value)
-      date_format, time_format = nil, nil
-      return value unless format
-      value ||= ""
+    # @raise [ArgumentError] if pattern is not valid, or nil
+    # @raise [ParseError] if value does not match
+    def parse_uax35_date(pattern, value)
+      date_pattern, time_pattern = nil, nil
+      return value unless pattern
+      orig_value = value ||= ""
+      orig_pattern = pattern
       # Extract tz info
-      if md = format.match(/^(.*[dyms])+(\s*[xX]+)$/)
-        format, tz_format = md[1], md[2]
+      if md = pattern.match(/^(.*[dyms])+(\s*[xX]+)$/)
+        pattern, tz_pattern = md[1], md[2]
       end
-      date_format, time_format = format.split(' ')
-      date_format, time_format = nil, date_format if self.base.to_sym == :time
+      date_pattern, time_pattern = pattern.split(' ')
+      # Sniff out whether this is a time-only pattern (a single part containing none of T, y, M or d)
+      date_pattern, time_pattern = nil, date_pattern if time_pattern.nil? && !date_pattern.match(/[TyMd]/)
       # Extract date, if specified
-      date_part = case date_format
+      date_part = case date_pattern
       when 'yyyy-MM-dd' then value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})/)
       when 'yyyyMMdd'   then value.match(/^(?<yr>\d{4})(?<mo>\d{2})(?<da>\d{2})/)
       when 'dd-MM-yyyy' then value.match(/^(?<da>\d{2})-(?<mo>\d{2})-(?<yr>\d{4})/)
       when 'd-M-yyyy'   then value.match(/^(?<da>\d{1,2})-(?<mo>\d{1,2})-(?<yr>\d{4})/)
+      when 'd-M-yy'     then value.match(/^(?<da>\d{1,2})-(?<mo>\d{1,2})-(?<yr>\d{2})/)
+      when 'd-M-y'      then value.match(/^(?<da>\d{1,2})-(?<mo>\d{1,2})-(?<yr>\d{1,4})/)
       when 'MM-dd-yyyy' then value.match(/^(?<mo>\d{2})-(?<da>\d{2})-(?<yr>\d{4})/)
       when 'M-d-yyyy'   then value.match(/^(?<mo>\d{1,2})-(?<da>\d{1,2})-(?<yr>\d{4})/)
-      when 'dd/MM/yyyy' then value.match(/^(?<da>\d{2})\/(?<mo>\d{2})\/(?<yr>\d{4})/)
+      when 'M-d-yy'     then value.match(/^(?<mo>\d{1,2})-(?<da>\d{1,2})-(?<yr>\d{2})/)
+      when 'M-d-y'      then value.match(/^(?<mo>\d{1,2})-(?<da>\d{1,2})-(?<yr>\d{1,4})/)
+      when 'dd/MM/yyyy' then value.match(/^(?<da>\d{2})\/(?<mo>\d{2})\/(?<yr>\d{1,4})/)
       when 'd/M/yyyy'   then value.match(/^(?<da>\d{1,2})\/(?<mo>\d{1,2})\/(?<yr>\d{4})/)
-      when 'MM/dd/yyyy' then value.match(/^(?<mo>\d{2})\/(?<da>\d{2})\/(?<yr>\d{4})/)
+      when 'd/M/yy'     then value.match(/^(?<da>\d{1,2})\/(?<mo>\d{1,2})\/(?<yr>\d{2})/)
+      when 'd/M/y'      then value.match(/^(?<da>\d{1,2})\/(?<mo>\d{1,2})\/(?<yr>\d{1,4})/)
+      when 'MM/dd/yyyy' then value.match(/^(?<mo>\d{2})\/(?<da>\d{2})\/(?<yr>\d{1,4})/)
       when 'M/d/yyyy'   then value.match(/^(?<mo>\d{1,2})\/(?<da>\d{1,2})\/(?<yr>\d{4})/)
+      when 'M/d/yy'     then value.match(/^(?<mo>\d{1,2})\/(?<da>\d{1,2})\/(?<yr>\d{2})/)
+      when 'M/d/y'      then value.match(/^(?<mo>\d{1,2})\/(?<da>\d{1,2})\/(?<yr>\d{1,4})/)
       when 'dd.MM.yyyy' then value.match(/^(?<da>\d{2})\.(?<mo>\d{2})\.(?<yr>\d{4})/)
       when 'd.M.yyyy'   then value.match(/^(?<da>\d{1,2})\.(?<mo>\d{1,2})\.(?<yr>\d{4})/)
+      when 'd.M.yy'     then value.match(/^(?<da>\d{1,2})\.(?<mo>\d{1,2})\.(?<yr>\d{2})/)
+      when 'd.M.y'      then value.match(/^(?<da>\d{1,2})\.(?<mo>\d{1,2})\.(?<yr>\d{1,4})/)
       when 'MM.dd.yyyy' then value.match(/^(?<mo>\d{2})\.(?<da>\d{2})\.(?<yr>\d{4})/)
       when 'M.d.yyyy'   then value.match(/^(?<mo>\d{1,2})\.(?<da>\d{1,2})\.(?<yr>\d{4})/)
+      when 'M.d.yy'     then value.match(/^(?<mo>\d{1,2})\.(?<da>\d{1,2})\.(?<yr>\d{2})/)
+      when 'M.d.y'      then value.match(/^(?<mo>\d{1,2})\.(?<da>\d{1,2})\.(?<yr>\d{1,4})/)
       when 'yyyy-MM-ddTHH:mm' then value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})T(?<hr>\d{2}):(?<mi>\d{2})(?<se>(?<ms>))/)
       when 'yyyy-MM-ddTHH:mm:ss' then value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})T(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})(?<ms>)/)
       when /yyyy-MM-ddTHH:mm:ss\.S+/
         md = value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})T(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})\.(?<ms>\d+)/)
-        num_ms = date_format.match(/S+/).to_s.length
+        num_ms = date_pattern.match(/S+/).to_s.length
         md if md && md[:ms].length <= num_ms
       else
-        raise ArgumentError, "unrecognized date/time format #{date_format}" if date_format
+        raise ArgumentError, "unrecognized date/time pattern #{date_pattern}" if date_pattern
         nil
       end
@@ -61,25 +110,25 @@ module RDF::Tabular
       end
       # Extract time, if specified
-      time_part = case time_format
+      time_part = case time_pattern
       when 'HH:mm:ss' then value.match(/^(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})(?<ms>)/)
       when 'HHmmss'   then value.match(/^(?<hr>\d{2})(?<mi>\d{2})(?<se>\d{2})(?<ms>)/)
       when 'HH:mm'    then value.match(/^(?<hr>\d{2}):(?<mi>\d{2})(?<se>)(?<ms>)/)
       when 'HHmm'     then value.match(/^(?<hr>\d{2})(?<mi>\d{2})(?<se>)(?<ms>)/)
       when /HH:mm:ss\.S+/
         md = value.match(/^(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})\.(?<ms>\d+)/)
-        num_ms = time_format.match(/S+/).to_s.length
+        num_ms = time_pattern.match(/S+/).to_s.length
         md if md && md[:ms].length <= num_ms
       else
-        raise ArgumentError, "unrecognized date/time format #{time_format}" if time_format
+        raise ArgumentError, "unrecognized date/time pattern #{pattern}" if time_pattern
         nil
       end
-      # If there's a date_format but no date_part, match fails
-      return nil if date_format && date_part.nil?
+      # If there's a date_pattern but no date_part, match fails
+      raise ParseError, "#{orig_value} does not match pattern #{orig_pattern}" if !orig_value.empty? && date_pattern && date_part.nil?
-      # If there's a time_format but no time_part, match fails
-      return nil if time_format && time_part.nil?
+      # If there's a time_pattern but no time_part, match fails
+      raise ParseError, "#{orig_value} does not match pattern #{orig_pattern}" if !orig_value.empty? && time_pattern && time_part.nil?
       # Forward past time part
       value = value[time_part.to_s.length..-1] if time_part
@@ -88,8 +137,8 @@ module RDF::Tabular
       time_part = date_part if date_part && date_part.names.include?("hr")
       # If there's a timezone, it may optionally start with whitespace
-      value = value.lstrip if tz_format.to_s.start_with?(' ')
-      tz_part = case tz_format.to_s.lstrip
+      value = value.lstrip if tz_pattern.to_s.start_with?(' ')
+      tz_part = case tz_pattern.to_s.lstrip
       when 'x'    then value.match(/^(?:(?<hr>[+-]\d{2})(?<mi>\d{2})?)$/)
       when 'X'    then value.match(/^(?:(?:(?<hr>[+-]\d{2})(?<mi>\d{2})?)|(?<z>Z))$/)
       when 'xx'   then value.match(/^(?:(?<hr>[+-]\d{2})(?<mi>\d{2}))|$/)
@@ -97,15 +146,30 @@ module RDF::Tabular
       when 'xxx'  then value.match(/^(?:(?<hr>[+-]\d{2}):(?<mi>\d{2}))$/)
       when 'XXX'  then value.match(/^(?:(?:(?<hr>[+-]\d{2}):(?<mi>\d{2}))|(?<z>Z))$/)
       else
-        raise ArgumentError, "unrecognized timezone format #{tz_format.to_s.lstrip}" if tz_format
+        raise ArgumentError, "unrecognized timezone pattern #{tz_pattern.to_s.lstrip}" if tz_pattern
         nil
       end
-      # If there's a tz_format but no time_part, match fails
-      return nil if tz_format && tz_part.nil?
+      # If there's a tz_pattern but no tz_part, match fails
+      raise ParseError, "#{orig_value} does not match pattern #{orig_pattern}" if !orig_value.empty? && tz_pattern && tz_part.nil?
       # Compose normalized value
-      vd = ("%04d-%02d-%02d" % [date_part[:yr].to_i, date_part[:mo].to_i, date_part[:da].to_i]) if date_part
+      vd = if date_part
+        yr, mo, da = [date_part[:yr], date_part[:mo], date_part[:da]].map(&:to_i)
+        if date_part[:yr].length < 4
+          # Make sure that yr makes sense, if given
+          yr = case yr
+          when 0..69    then yr + 2000
+          when 100..999 then yr + 2000
+          when 70..99   then yr + 1900
+          else               yr
+          end
+        end
+        ("%04d-%02d-%02d" % [yr, mo, da])
+      end
       vt = ("%02d:%02d:%02d" % [time_part[:hr].to_i, time_part[:mi].to_i, time_part[:se].to_i]) if time_part
       # Add milliseconds, if matched
@@ -117,37 +181,74 @@ module RDF::Tabular
     end
     ##
-    # Parse the date format (if provided), and match against the value (if provided)
-    # Otherwise, validate format and raise an error
+    # Parse the number pattern (if provided), and match against the value (if provided)
+    # Otherwise, validate pattern and raise an error
     #
     # @param [String] pattern
     # @param [String] value
     # @param [String] groupChar
     # @param [String] decimalChar
     # @return [String] XMLSchema version of value or nil, if value does not match
-    # @raise [ArgumentError] if format is not valid
+    # @raise [ArgumentError] if pattern is not valid
     def parse_uax35_number(pattern, value, groupChar=",", decimalChar=".")
       value ||= ""
       re = build_number_re(pattern, groupChar, decimalChar)
+      raise ParseError, "#{value} has repeating #{groupChar.inspect}" if groupChar.length == 1 && value.include?(groupChar*2)
       # Upcase value and remove internal spaces
       value = value.upcase
       if value =~ re
         # Upcase value and remove internal spaces
         value = value.
-          upcase.
           gsub(/\s+/, '').
           gsub(groupChar, '').
           gsub(decimalChar, '.')
         # result re-assembles parts removed from value
         value
-      else
+      elsif !value.empty?
         # no match
-        nil
+        raise ParseError, "#{value.inspect} does not match #{pattern.inspect}"
+      end
+      # Extract percent or per-mille sign
+      case value
+      when /%/
+        value = value.sub('%', '')
+        lhs, rhs = value.split('.')
+        # Shift decimal
+        value = case lhs.length
+        when 0 then "0.00#{rhs}".sub('E', 'e')
+        when 1 then "0.0#{lhs}#{rhs}".sub('E', 'e')
+        when 2 then "0.#{lhs}#{rhs}".sub('E', 'e')
+        else
+          ll, lr = lhs[0..lhs.length-3], lhs[-2..-1]
+          ll = ll + "0" unless ll =~ /\d+/
+          "#{ll}.#{lr}#{rhs}".sub('E', 'e')
+        end
+      when /‰/
+        value = value.sub('‰', '')
+        lhs, rhs = value.split('.')
+        # Shift decimal
+        value = case lhs.length
+        when 0 then "0.000#{rhs}".sub('E', 'e')
+        when 1 then "0.00#{lhs}#{rhs}".sub('E', 'e')
+        when 2 then "0.0#{lhs}#{rhs}".sub('E', 'e')
+        when 3 then "0.#{lhs}#{rhs}".sub('E', 'e')
+        else
+          ll, lr = lhs[0..lhs.length-4], lhs[-3..-1]
+          ll = ll + "0" unless ll =~ /\d+/
+          "#{ll}.#{lr}#{rhs}".sub('E', 'e')
+        end
+      when /NAN/ then value.sub('NAN', 'NaN')
+      when /E/ then value.sub('E', 'e')
+      else
+        value
       end
     end
@@ -157,9 +258,10 @@ module RDF::Tabular
     # @param [String] groupChar
     # @param [String] decimalChar
     # @return [Regexp] Regular expression matching value
-    # @raise [ArgumentError] if format is not valid
+    # @raise [ArgumentError] if pattern is not valid
     def build_number_re(pattern, groupChar, decimalChar)
       # pattern must be composed of only 0, #, decimalChar, groupChar, E, %, and ‰
       ge = Regexp.escape groupChar
       de = Regexp.escape decimalChar
@@ -320,5 +422,8 @@ module RDF::Tabular
       Regexp.new("^(?<prefix>#{prefix})(?<numeric_part>#{integer_str}#{fractional_str}#{exponent_str})(?<suffix>#{suffix})$")
     end
+    # ParseError is raised when a value does not match the pattern
+    class ParseError < RuntimeError; end
   end
 end

data/spec/data/countries-minimal.json ADDED

@@ -0,0 +1,38 @@
+[
+  {
+    "@id": "http://example.org/countries.csv#AD",
+    "http://www.geonames.org/ontology#countryCode": "AD",
+    "schema:latitude": 42.546245,
+    "schema:longitude": 1.601554,
+    "schema:name": "Andorra"
+  },
+  {
+    "@id": "http://example.org/countries.csv#AE",
+    "http://www.geonames.org/ontology#countryCode": "AE",
+    "schema:latitude": 23.424076,
+    "schema:longitude": 53.847818,
+    "schema:name": "United Arab Emirates"
+  },
+  {
+    "@id": "http://example.org/countries.csv#AF",
+    "http://www.geonames.org/ontology#countryCode": "AF",
+    "schema:latitude": 33.93911,
+    "schema:longitude": 67.709953,
+    "schema:name": "Afghanistan"
+  },
+  {
+    "countryRef": "http://example.org/countries.csv#AF",
+    "year": "1960",
+    "population": 9616353
+  },
+  {
+    "countryRef": "http://example.org/countries.csv#AF",
+    "year": "1961",
+    "population": 9799379
+  },
+  {
+    "countryRef": "http://example.org/countries.csv#AF",
+    "year": "1962",
+    "population": 9989846
+  }
+]