rdf-tabular 0.4.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +24 -5
  3. data/VERSION +1 -1
  4. data/etc/csvw.jsonld +135 -50
  5. data/lib/rdf/tabular/csvw.rb +215 -181
  6. data/lib/rdf/tabular/format.rb +8 -6
  7. data/lib/rdf/tabular/literal.rb +1 -1
  8. data/lib/rdf/tabular/metadata.rb +61 -80
  9. data/lib/rdf/tabular/reader.rb +18 -15
  10. data/lib/rdf/tabular/uax35.rb +143 -38
  11. data/spec/data/countries-minimal.json +38 -0
  12. data/spec/data/countries-minimal.ttl +36 -0
  13. data/spec/data/countries-standard.json +86 -0
  14. data/spec/data/countries-standard.ttl +75 -0
  15. data/spec/data/countries.csv +4 -0
  16. data/spec/data/countries.csv-minimal.json +16 -0
  17. data/spec/data/countries.csv-minimal.ttl +19 -0
  18. data/spec/data/countries.csv-standard.json +33 -0
  19. data/spec/data/countries.csv-standard.ttl +44 -0
  20. data/spec/data/countries.html +88 -0
  21. data/spec/data/countries.json +53 -0
  22. data/spec/data/countries_embed-minimal.json +38 -0
  23. data/spec/data/countries_embed-minimal.ttl +36 -0
  24. data/spec/data/countries_embed-standard.json +86 -0
  25. data/spec/data/countries_embed-standard.ttl +75 -0
  26. data/spec/data/countries_embed.html +88 -0
  27. data/spec/data/countries_html-minimal.json +38 -0
  28. data/spec/data/countries_html-minimal.ttl +36 -0
  29. data/spec/data/countries_html-standard.json +86 -0
  30. data/spec/data/countries_html-standard.ttl +75 -0
  31. data/spec/data/country-codes-and-names-minimal.json +19 -0
  32. data/spec/data/country-codes-and-names-minimal.ttl +22 -0
  33. data/spec/data/country-codes-and-names-standard.json +47 -0
  34. data/spec/data/country-codes-and-names-standard.ttl +45 -0
  35. data/spec/data/country-codes-and-names.csv +5 -0
  36. data/spec/data/country_slice.csv +4 -0
  37. data/spec/data/junior-roles.csv +3 -0
  38. data/spec/data/junior-roles.json +54 -0
  39. data/spec/data/roles-minimal.json +32 -0
  40. data/spec/data/roles-minimal.ttl +36 -0
  41. data/spec/data/roles-standard.json +56 -0
  42. data/spec/data/roles-standard.ttl +66 -0
  43. data/spec/data/roles.json +23 -0
  44. data/spec/data/senior-roles.csv +3 -0
  45. data/spec/data/senior-roles.json +52 -0
  46. data/spec/data/test232-metadata.json +10 -0
  47. data/spec/data/test232.csv +3 -0
  48. data/spec/data/tree-ops-atd.json +1 -0
  49. data/spec/data/tree-ops-ext-minimal.json +42 -0
  50. data/spec/data/tree-ops-ext-minimal.ttl +34 -0
  51. data/spec/data/tree-ops-ext-standard.json +93 -0
  52. data/spec/data/tree-ops-ext-standard.ttl +82 -0
  53. data/spec/data/tree-ops-ext.csv +4 -0
  54. data/spec/data/tree-ops-ext.json +81 -0
  55. data/spec/data/tree-ops-minimal.json +18 -0
  56. data/spec/data/tree-ops-minimal.ttl +14 -0
  57. data/spec/data/tree-ops-standard.json +44 -0
  58. data/spec/data/tree-ops-standard.ttl +44 -0
  59. data/spec/data/tree-ops-virtual-minimal.json +32 -0
  60. data/spec/data/tree-ops-virtual-minimal.ttl +25 -0
  61. data/spec/data/tree-ops-virtual-standard.json +49 -0
  62. data/spec/data/tree-ops-virtual-standard.ttl +49 -0
  63. data/spec/data/tree-ops-virtual.json +48 -0
  64. data/spec/data/tree-ops.csv +3 -0
  65. data/spec/data/tree-ops.csv-metadata.json +43 -0
  66. data/spec/data/tree-ops.html +54 -0
  67. data/spec/data/tree-ops.tsv +3 -0
  68. data/spec/format_spec.rb +5 -4
  69. data/spec/metadata_spec.rb +10 -16
  70. data/spec/suite_helper.rb +2 -2
  71. data/spec/suite_spec.rb +5 -6
  72. data/spec/uax35_spec.rb +239 -0
  73. metadata +149 -36
  74. data/lib/rdf/tabular/json.rb +0 -0
@@ -28,16 +28,19 @@ module RDF::Tabular
28
28
  RDF::CLI::Option.new(
29
29
  symbol: :metadata,
30
30
  datatype: RDF::URI,
31
+ control: :url2,
31
32
  on: ["--metadata URI"],
32
33
  description: "user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location.") {|arg| RDF::URI(arg)},
33
34
  RDF::CLI::Option.new(
34
35
  symbol: :minimal,
36
+ control: :checkbox,
35
37
  datatype: TrueClass,
36
38
  on: ["--minimal"],
37
39
  description: "Includes only the information gleaned from the cells of the tabular data.") {true},
38
40
  RDF::CLI::Option.new(
39
41
  symbol: :noProv,
40
42
  datatype: TrueClass,
43
+ control: :checkbox,
41
44
  on: ["--no-prov"],
42
45
  description: "do not output optional provenance information.") {true},
43
46
  ]
@@ -60,7 +63,7 @@ module RDF::Tabular
60
63
  # @yieldparam [RDF::Reader] reader
61
64
  # @yieldreturn [void] ignored
62
65
  # @raise [RDF::ReaderError] if the CSV document cannot be loaded
63
- def initialize(input = $stdin, options = {}, &block)
66
+ def initialize(input = $stdin, **options, &block)
64
67
  super do
65
68
  # Base would be how we are to take this
66
69
  @options[:base] ||= base_uri.to_s if base_uri
@@ -86,7 +89,7 @@ module RDF::Tabular
86
89
  # If input is JSON, then the input is the metadata
87
90
  content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
88
91
  if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
89
- @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
92
+ @metadata = Metadata.new(@input, filenames: @options[:base], **@options)
90
93
  # If @metadata is for a Table, turn it into a TableGroup
91
94
  @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
92
95
  @metadata.normalize!
@@ -99,7 +102,7 @@ module RDF::Tabular
99
102
  def script.content_type; "application/csvm+json"; end
100
103
  log_debug("Reader#initialize") {"Process HTML script block"}
101
104
  @input = script
102
- @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
105
+ @metadata = Metadata.new(@input, filenames: @options[:base], **@options)
103
106
  # If @metadata is for a Table, turn it into a TableGroup
104
107
  @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
105
108
  @metadata.normalize!
@@ -116,7 +119,7 @@ module RDF::Tabular
116
119
  dialect.separator = "\t" if (input.content_type == "text/tsv" rescue nil)
117
120
  embed_options = @options.dup
118
121
  embed_options[:lang] = dialect_metadata.lang if dialect_metadata.lang
119
- embedded_metadata = dialect.embedded_metadata(input, @options[:metadata], embed_options)
122
+ embedded_metadata = dialect.embedded_metadata(input, @options[:metadata], **embed_options)
120
123
 
121
124
  if (@metadata = @options[:metadata]) && @metadata.tableSchema
122
125
  @metadata.verify_compatible!(embedded_metadata)
@@ -133,7 +136,7 @@ module RDF::Tabular
133
136
  else
134
137
  # It's tabular data. Find metadata and proceed as if it was specified in the first place
135
138
  @options[:original_input] = @input unless @options[:metadata]
136
- @input = @metadata = Metadata.for_input(@input, @options).normalize!
139
+ @input = @metadata = Metadata.for_input(@input, **@options).normalize!
137
140
  end
138
141
 
139
142
  log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
@@ -183,7 +186,7 @@ module RDF::Tabular
183
186
  if options[:original_input] && !input.describes_file?(options[:base_uri])
184
187
  table_resource = RDF::Node.new
185
188
  add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
186
- Reader.new(options[:original_input], options.merge(
189
+ Reader.new(options[:original_input], **options.merge(
187
190
  metadata: input.tables.first,
188
191
  base: input.tables.first.url,
189
192
  no_found_metadata: true,
@@ -202,7 +205,7 @@ module RDF::Tabular
202
205
  end.flatten.compact
203
206
  table_resource = table.id || RDF::Node.new
204
207
  add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
205
- Reader.open(table.url, options.merge(
208
+ Reader.open(table.url, **options.merge(
206
209
  metadata: table,
207
210
  base: table.url,
208
211
  no_found_metadata: true,
@@ -418,9 +421,9 @@ module RDF::Tabular
418
421
 
419
422
  res = if io
420
423
  ::JSON::dump_default_options = json_state
421
- ::JSON.dump(self.send(hash_fn, options), io)
424
+ ::JSON.dump(self.send(hash_fn, **options), io)
422
425
  else
423
- hash = self.send(hash_fn, options)
426
+ hash = self.send(hash_fn, **options)
424
427
  ::JSON.generate(hash, json_state)
425
428
  end
426
429
 
@@ -440,7 +443,7 @@ module RDF::Tabular
440
443
  #
441
444
  # @param [Hash{Symbol => Object}] options
442
445
  # @return [Hash, Array]
443
- def to_hash(options = {})
446
+ def to_hash(**options)
444
447
  # Construct metadata from that passed from file open, along with information from the file.
445
448
  if input.is_a?(Metadata)
446
449
  log_debug("each_statement: metadata") {input.inspect}
@@ -464,13 +467,13 @@ module RDF::Tabular
464
467
  table_group['tables'] = tables
465
468
 
466
469
  if options[:original_input] && !input.describes_file?(options[:base_uri])
467
- Reader.new(options[:original_input], options.merge(
470
+ Reader.new(options[:original_input], **options.merge(
468
471
  metadata: input.tables.first,
469
472
  base: input.tables.first.url,
470
473
  minimal: minimal?,
471
474
  no_found_metadata: true,
472
475
  )) do |r|
473
- case t = r.to_hash(options)
476
+ case t = r.to_hash(**options)
474
477
  when Array then tables += t unless input.tables.first.suppressOutput
475
478
  when Hash then tables << t unless input.tables.first.suppressOutput
476
479
  end
@@ -478,13 +481,13 @@ module RDF::Tabular
478
481
  else
479
482
  input.each_table do |table|
480
483
  next if table.suppressOutput && !validate?
481
- Reader.open(table.url, options.merge(
484
+ Reader.open(table.url, **options.merge(
482
485
  metadata: table,
483
486
  base: table.url,
484
487
  minimal: minimal?,
485
488
  no_found_metadata: true,
486
489
  )) do |r|
487
- case t = r.to_hash(options)
490
+ case t = r.to_hash(**options)
488
491
  when Array then tables += t unless table.suppressOutput
489
492
  when Hash then tables << t unless table.suppressOutput
490
493
  end
@@ -557,7 +560,7 @@ module RDF::Tabular
557
560
  co['@id'] = subject.to_s unless subject == 'null'
558
561
  prop = case cell.propertyUrl
559
562
  when RDF.type then '@type'
560
- when nil then URI.decode(column.name) # Use URI-decoded name
563
+ when nil then CGI.unescape(column.name) # Use URI-decoded name
561
564
  else
562
565
  # Compact the property to a term or prefixed name
563
566
  metadata.context.compact_iri(cell.propertyUrl, vocab: true)
@@ -7,50 +7,99 @@ module RDF::Tabular
7
7
  module UAX35
8
8
 
9
9
  ##
10
- # Parse the date format (if provided), and match against the value (if provided)
11
- # Otherwise, validate format and raise an error
10
+ # Parse the date pattern (if provided), and match against the value (if provided)
11
+ # Otherwise, validate pattern and raise an error.
12
12
  #
13
- # @param [String] format
13
+ # Supported patterns are:
14
+ #
15
+ # * yyyy-MM-dd
16
+ # * yyyyMMdd
17
+ # * dd-MM-yyyy
18
+ # * d-M-yyyy
19
+ # * d-M-yy
20
+ # * d-M-y
21
+ # * MM-dd-yyyy
22
+ # * M-d-yyyy
23
+ # * M-d-yy
24
+ # * M-d-y
25
+ # * dd/MM/yyyy
26
+ # * d/M/yyyy
27
+ # * d/M/yy
28
+ # * d/M/y
29
+ # * MM/dd/yyyy
30
+ # * M/d/yyyy
31
+ # * M/d/yy
32
+ # * M/d/y
33
+ # * dd.MM.yyyy
34
+ # * d.M.yyyy
35
+ # * d.M.yy
36
+ # * d.M.y
37
+ # * MM.dd.yyyy
38
+ # * M.d.yyyy
39
+ # * M.d.yy
40
+ # * M.d.y
41
+ # * yyyy-MM-ddTHH:mm
42
+ # * yyyy-MM-ddTHH:mm:ss
43
+ # * yyyy-MM-ddTHH:mm:ss.S+
44
+ #
45
+ # Year components of fewer than four digits are normalized to the 1900s or 2000s: values from 70 through 99 are considered to be in the 1900 range; all other values are based on 2000.
46
+ #
47
+ # @param [String] pattern
14
48
  # @param [String] value
15
49
  # @return [String] XMLSchema version of value
16
- # @raise [ArgumentError] if format is not valid, or nil, if value does not match
17
- def parse_uax35_date(format, value)
18
- date_format, time_format = nil, nil
19
- return value unless format
20
- value ||= ""
50
+ # @raise [ArgumentError] if pattern is not valid, or nil
51
+ # @raise [ParseError] if value does not match
52
+ def parse_uax35_date(pattern, value)
53
+ date_pattern, time_pattern = nil, nil
54
+ return value unless pattern
55
+ orig_value = value ||= ""
56
+ orig_pattern = pattern
21
57
 
22
58
  # Extract tz info
23
- if md = format.match(/^(.*[dyms])+(\s*[xX]+)$/)
24
- format, tz_format = md[1], md[2]
59
+ if md = pattern.match(/^(.*[dyms])+(\s*[xX]+)$/)
60
+ pattern, tz_pattern = md[1], md[2]
25
61
  end
26
62
 
27
- date_format, time_format = format.split(' ')
28
- date_format, time_format = nil, date_format if self.base.to_sym == :time
63
+ date_pattern, time_pattern = pattern.split(' ')
64
+ # Sniff out whether this is a time-only pattern (no time part was split off and no date or 'T' designators are present)
65
+ date_pattern, time_pattern = nil, date_pattern if time_pattern.nil? && !date_pattern.match(/[TyMd]/)
29
66
 
30
67
  # Extract date, if specified
31
- date_part = case date_format
68
+ date_part = case date_pattern
32
69
  when 'yyyy-MM-dd' then value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})/)
33
70
  when 'yyyyMMdd' then value.match(/^(?<yr>\d{4})(?<mo>\d{2})(?<da>\d{2})/)
34
71
  when 'dd-MM-yyyy' then value.match(/^(?<da>\d{2})-(?<mo>\d{2})-(?<yr>\d{4})/)
35
72
  when 'd-M-yyyy' then value.match(/^(?<da>\d{1,2})-(?<mo>\d{1,2})-(?<yr>\d{4})/)
73
+ when 'd-M-yy' then value.match(/^(?<da>\d{1,2})-(?<mo>\d{1,2})-(?<yr>\d{2})/)
74
+ when 'd-M-y' then value.match(/^(?<da>\d{1,2})-(?<mo>\d{1,2})-(?<yr>\d{1,4})/)
36
75
  when 'MM-dd-yyyy' then value.match(/^(?<mo>\d{2})-(?<da>\d{2})-(?<yr>\d{4})/)
37
76
  when 'M-d-yyyy' then value.match(/^(?<mo>\d{1,2})-(?<da>\d{1,2})-(?<yr>\d{4})/)
38
- when 'dd/MM/yyyy' then value.match(/^(?<da>\d{2})\/(?<mo>\d{2})\/(?<yr>\d{4})/)
77
+ when 'M-d-yy' then value.match(/^(?<mo>\d{1,2})-(?<da>\d{1,2})-(?<yr>\d{2})/)
78
+ when 'M-d-y' then value.match(/^(?<mo>\d{1,2})-(?<da>\d{1,2})-(?<yr>\d{1,4})/)
79
+ when 'dd/MM/yyyy' then value.match(/^(?<da>\d{2})\/(?<mo>\d{2})\/(?<yr>\d{1,4})/)
39
80
  when 'd/M/yyyy' then value.match(/^(?<da>\d{1,2})\/(?<mo>\d{1,2})\/(?<yr>\d{4})/)
40
- when 'MM/dd/yyyy' then value.match(/^(?<mo>\d{2})\/(?<da>\d{2})\/(?<yr>\d{4})/)
81
+ when 'd/M/yy' then value.match(/^(?<da>\d{1,2})\/(?<mo>\d{1,2})\/(?<yr>\d{2})/)
82
+ when 'd/M/y' then value.match(/^(?<da>\d{1,2})\/(?<mo>\d{1,2})\/(?<yr>\d{1,4})/)
83
+ when 'MM/dd/yyyy' then value.match(/^(?<mo>\d{2})\/(?<da>\d{2})\/(?<yr>\d{1,4})/)
41
84
  when 'M/d/yyyy' then value.match(/^(?<mo>\d{1,2})\/(?<da>\d{1,2})\/(?<yr>\d{4})/)
85
+ when 'M/d/yy' then value.match(/^(?<mo>\d{1,2})\/(?<da>\d{1,2})\/(?<yr>\d{2})/)
86
+ when 'M/d/y' then value.match(/^(?<mo>\d{1,2})\/(?<da>\d{1,2})\/(?<yr>\d{1,4})/)
42
87
  when 'dd.MM.yyyy' then value.match(/^(?<da>\d{2})\.(?<mo>\d{2})\.(?<yr>\d{4})/)
43
88
  when 'd.M.yyyy' then value.match(/^(?<da>\d{1,2})\.(?<mo>\d{1,2})\.(?<yr>\d{4})/)
89
+ when 'd.M.yy' then value.match(/^(?<da>\d{1,2})\.(?<mo>\d{1,2})\.(?<yr>\d{2})/)
90
+ when 'd.M.y' then value.match(/^(?<da>\d{1,2})\.(?<mo>\d{1,2})\.(?<yr>\d{1,4})/)
44
91
  when 'MM.dd.yyyy' then value.match(/^(?<mo>\d{2})\.(?<da>\d{2})\.(?<yr>\d{4})/)
45
92
  when 'M.d.yyyy' then value.match(/^(?<mo>\d{1,2})\.(?<da>\d{1,2})\.(?<yr>\d{4})/)
93
+ when 'M.d.yy' then value.match(/^(?<mo>\d{1,2})\.(?<da>\d{1,2})\.(?<yr>\d{2})/)
94
+ when 'M.d.y' then value.match(/^(?<mo>\d{1,2})\.(?<da>\d{1,2})\.(?<yr>\d{1,4})/)
46
95
  when 'yyyy-MM-ddTHH:mm' then value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})T(?<hr>\d{2}):(?<mi>\d{2})(?<se>(?<ms>))/)
47
96
  when 'yyyy-MM-ddTHH:mm:ss' then value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})T(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})(?<ms>)/)
48
97
  when /yyyy-MM-ddTHH:mm:ss\.S+/
49
98
  md = value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})T(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})\.(?<ms>\d+)/)
50
- num_ms = date_format.match(/S+/).to_s.length
99
+ num_ms = date_pattern.match(/S+/).to_s.length
51
100
  md if md && md[:ms].length <= num_ms
52
101
  else
53
- raise ArgumentError, "unrecognized date/time format #{date_format}" if date_format
102
+ raise ArgumentError, "unrecognized date/time pattern #{date_pattern}" if date_pattern
54
103
  nil
55
104
  end
56
105
 
@@ -61,25 +110,25 @@ module RDF::Tabular
61
110
  end
62
111
 
63
112
  # Extract time, if specified
64
- time_part = case time_format
113
+ time_part = case time_pattern
65
114
  when 'HH:mm:ss' then value.match(/^(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})(?<ms>)/)
66
115
  when 'HHmmss' then value.match(/^(?<hr>\d{2})(?<mi>\d{2})(?<se>\d{2})(?<ms>)/)
67
116
  when 'HH:mm' then value.match(/^(?<hr>\d{2}):(?<mi>\d{2})(?<se>)(?<ms>)/)
68
117
  when 'HHmm' then value.match(/^(?<hr>\d{2})(?<mi>\d{2})(?<se>)(?<ms>)/)
69
118
  when /HH:mm:ss\.S+/
70
119
  md = value.match(/^(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})\.(?<ms>\d+)/)
71
- num_ms = time_format.match(/S+/).to_s.length
120
+ num_ms = time_pattern.match(/S+/).to_s.length
72
121
  md if md && md[:ms].length <= num_ms
73
122
  else
74
- raise ArgumentError, "unrecognized date/time format #{time_format}" if time_format
123
+ raise ArgumentError, "unrecognized date/time pattern #{pattern}" if time_pattern
75
124
  nil
76
125
  end
77
126
 
78
- # If there's a date_format but no date_part, match fails
79
- return nil if date_format && date_part.nil?
127
+ # If there's a date_pattern but no date_part, match fails
128
+ raise ParseError, "#{orig_value} does not match pattern #{orig_pattern}" if !orig_value.empty? && date_pattern && date_part.nil?
80
129
 
81
- # If there's a time_format but no time_part, match fails
82
- return nil if time_format && time_part.nil?
130
+ # If there's a time_pattern but no time_part, match fails
131
+ raise ParseError, "#{orig_value} does not match pattern #{orig_pattern}" if !orig_value.empty? && time_pattern && time_part.nil?
83
132
 
84
133
  # Forward past time part
85
134
  value = value[time_part.to_s.length..-1] if time_part
@@ -88,8 +137,8 @@ module RDF::Tabular
88
137
  time_part = date_part if date_part && date_part.names.include?("hr")
89
138
 
90
139
  # If there's a timezone, it may optionally start with whitespace
91
- value = value.lstrip if tz_format.to_s.start_with?(' ')
92
- tz_part = case tz_format.to_s.lstrip
140
+ value = value.lstrip if tz_pattern.to_s.start_with?(' ')
141
+ tz_part = case tz_pattern.to_s.lstrip
93
142
  when 'x' then value.match(/^(?:(?<hr>[+-]\d{2})(?<mi>\d{2})?)$/)
94
143
  when 'X' then value.match(/^(?:(?:(?<hr>[+-]\d{2})(?<mi>\d{2})?)|(?<z>Z))$/)
95
144
  when 'xx' then value.match(/^(?:(?<hr>[+-]\d{2})(?<mi>\d{2}))|$/)
@@ -97,15 +146,30 @@ module RDF::Tabular
97
146
  when 'xxx' then value.match(/^(?:(?<hr>[+-]\d{2}):(?<mi>\d{2}))$/)
98
147
  when 'XXX' then value.match(/^(?:(?:(?<hr>[+-]\d{2}):(?<mi>\d{2}))|(?<z>Z))$/)
99
148
  else
100
- raise ArgumentError, "unrecognized timezone format #{tz_format.to_s.lstrip}" if tz_format
149
+ raise ArgumentError, "unrecognized timezone pattern #{tz_pattern.to_s.lstrip}" if tz_pattern
101
150
  nil
102
151
  end
103
152
 
104
- # If there's a tz_format but no time_part, match fails
105
- return nil if tz_format && tz_part.nil?
153
+ # If there's a tz_pattern but no tz_part, match fails
154
+ raise ParseError, "#{orig_value} does not match pattern #{orig_pattern}" if !orig_value.empty? && tz_pattern && tz_part.nil?
106
155
 
107
156
  # Compose normalized value
108
- vd = ("%04d-%02d-%02d" % [date_part[:yr].to_i, date_part[:mo].to_i, date_part[:da].to_i]) if date_part
157
+ vd = if date_part
158
+ yr, mo, da = [date_part[:yr], date_part[:mo], date_part[:da]].map(&:to_i)
159
+
160
+ if date_part[:yr].length < 4
161
+ # Make sure that yr makes sense, if given
162
+ yr = case yr
163
+ when 0..69 then yr + 2000
164
+ when 100..999 then yr + 2000
165
+ when 70..99 then yr + 1900
166
+ else yr
167
+ end
168
+ end
169
+
170
+ ("%04d-%02d-%02d" % [yr, mo, da])
171
+ end
172
+
109
173
  vt = ("%02d:%02d:%02d" % [time_part[:hr].to_i, time_part[:mi].to_i, time_part[:se].to_i]) if time_part
110
174
 
111
175
  # Add milliseconds, if matched
@@ -117,37 +181,74 @@ module RDF::Tabular
117
181
  end
118
182
 
119
183
  ##
120
- # Parse the date format (if provided), and match against the value (if provided)
121
- # Otherwise, validate format and raise an error
184
+ # Parse the number pattern (if provided), and match against the value (if provided)
185
+ # Otherwise, validate pattern and raise an error
122
186
  #
123
187
  # @param [String] pattern
124
188
  # @param [String] value
125
189
  # @param [String] groupChar
126
190
  # @param [String] decimalChar
127
191
  # @return [String] XMLSchema version of value or nil, if value does not match
128
- # @raise [ArgumentError] if format is not valid
192
+ # @raise [ArgumentError] if pattern is not valid
129
193
  def parse_uax35_number(pattern, value, groupChar=",", decimalChar=".")
130
194
  value ||= ""
131
195
 
132
196
  re = build_number_re(pattern, groupChar, decimalChar)
133
197
 
198
+ raise ParseError, "#{value} has repeating #{groupChar.inspect}" if groupChar.length == 1 && value.include?(groupChar*2)
199
+
134
200
  # Upcase value before matching against the pattern
135
201
  value = value.upcase
136
202
 
137
203
  if value =~ re
138
-
139
204
  # Upcase value and remove internal spaces
140
205
  value = value.
141
- upcase.
142
206
  gsub(/\s+/, '').
143
207
  gsub(groupChar, '').
144
208
  gsub(decimalChar, '.')
145
209
 
146
210
  # result re-assembles parts removed from value
147
211
  value
148
- else
212
+ elsif !value.empty?
149
213
  # no match
150
- nil
214
+ raise ParseError, "#{value.inspect} does not match #{pattern.inspect}"
215
+ end
216
+
217
+ # Extract percent or per-mille sign
218
+ case value
219
+ when /%/
220
+ value = value.sub('%', '')
221
+ lhs, rhs = value.split('.')
222
+
223
+ # Shift decimal
224
+ value = case lhs.length
225
+ when 0 then "0.00#{rhs}".sub('E', 'e')
226
+ when 1 then "0.0#{lhs}#{rhs}".sub('E', 'e')
227
+ when 2 then "0.#{lhs}#{rhs}".sub('E', 'e')
228
+ else
229
+ ll, lr = lhs[0..lhs.length-3], lhs[-2..-1]
230
+ ll = ll + "0" unless ll =~ /\d+/
231
+ "#{ll}.#{lr}#{rhs}".sub('E', 'e')
232
+ end
233
+ when /‰/
234
+ value = value.sub('‰', '')
235
+ lhs, rhs = value.split('.')
236
+
237
+ # Shift decimal
238
+ value = case lhs.length
239
+ when 0 then "0.000#{rhs}".sub('E', 'e')
240
+ when 1 then "0.00#{lhs}#{rhs}".sub('E', 'e')
241
+ when 2 then "0.0#{lhs}#{rhs}".sub('E', 'e')
242
+ when 3 then "0.#{lhs}#{rhs}".sub('E', 'e')
243
+ else
244
+ ll, lr = lhs[0..lhs.length-4], lhs[-3..-1]
245
+ ll = ll + "0" unless ll =~ /\d+/
246
+ "#{ll}.#{lr}#{rhs}".sub('E', 'e')
247
+ end
248
+ when /NAN/ then value.sub('NAN', 'NaN')
249
+ when /E/ then value.sub('E', 'e')
250
+ else
251
+ value
151
252
  end
152
253
  end
153
254
 
@@ -157,9 +258,10 @@ module RDF::Tabular
157
258
  # @param [String] groupChar
158
259
  # @param [String] decimalChar
159
260
  # @return [Regexp] Regular expression matching value
160
- # @raise [ArgumentError] if format is not valid
261
+ # @raise [ArgumentError] if pattern is not valid
161
262
  def build_number_re(pattern, groupChar, decimalChar)
162
263
  # pattern must be composed of only 0, #, decimalChar, groupChar, E, %, and ‰
264
+
163
265
  ge = Regexp.escape groupChar
164
266
  de = Regexp.escape decimalChar
165
267
 
@@ -320,5 +422,8 @@ module RDF::Tabular
320
422
 
321
423
  Regexp.new("^(?<prefix>#{prefix})(?<numeric_part>#{integer_str}#{fractional_str}#{exponent_str})(?<suffix>#{suffix})$")
322
424
  end
425
+
426
+ # ParseError is raised when a value does not match the pattern
427
+ class ParseError < RuntimeError; end
323
428
  end
324
429
  end
@@ -0,0 +1,38 @@
1
+ [
2
+ {
3
+ "@id": "http://example.org/countries.csv#AD",
4
+ "http://www.geonames.org/ontology#countryCode": "AD",
5
+ "schema:latitude": 42.546245,
6
+ "schema:longitude": 1.601554,
7
+ "schema:name": "Andorra"
8
+ },
9
+ {
10
+ "@id": "http://example.org/countries.csv#AE",
11
+ "http://www.geonames.org/ontology#countryCode": "AE",
12
+ "schema:latitude": 23.424076,
13
+ "schema:longitude": 53.847818,
14
+ "schema:name": "United Arab Emirates"
15
+ },
16
+ {
17
+ "@id": "http://example.org/countries.csv#AF",
18
+ "http://www.geonames.org/ontology#countryCode": "AF",
19
+ "schema:latitude": 33.93911,
20
+ "schema:longitude": 67.709953,
21
+ "schema:name": "Afghanistan"
22
+ },
23
+ {
24
+ "countryRef": "http://example.org/countries.csv#AF",
25
+ "year": "1960",
26
+ "population": 9616353
27
+ },
28
+ {
29
+ "countryRef": "http://example.org/countries.csv#AF",
30
+ "year": "1961",
31
+ "population": 9799379
32
+ },
33
+ {
34
+ "countryRef": "http://example.org/countries.csv#AF",
35
+ "year": "1962",
36
+ "population": 9989846
37
+ }
38
+ ]