rdf-tabular 0.4.0 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +24 -5
  3. data/VERSION +1 -1
  4. data/etc/csvw.jsonld +135 -50
  5. data/lib/rdf/tabular/csvw.rb +215 -181
  6. data/lib/rdf/tabular/format.rb +8 -6
  7. data/lib/rdf/tabular/literal.rb +1 -1
  8. data/lib/rdf/tabular/metadata.rb +61 -80
  9. data/lib/rdf/tabular/reader.rb +18 -15
  10. data/lib/rdf/tabular/uax35.rb +143 -38
  11. data/spec/data/countries-minimal.json +38 -0
  12. data/spec/data/countries-minimal.ttl +36 -0
  13. data/spec/data/countries-standard.json +86 -0
  14. data/spec/data/countries-standard.ttl +75 -0
  15. data/spec/data/countries.csv +4 -0
  16. data/spec/data/countries.csv-minimal.json +16 -0
  17. data/spec/data/countries.csv-minimal.ttl +19 -0
  18. data/spec/data/countries.csv-standard.json +33 -0
  19. data/spec/data/countries.csv-standard.ttl +44 -0
  20. data/spec/data/countries.html +88 -0
  21. data/spec/data/countries.json +53 -0
  22. data/spec/data/countries_embed-minimal.json +38 -0
  23. data/spec/data/countries_embed-minimal.ttl +36 -0
  24. data/spec/data/countries_embed-standard.json +86 -0
  25. data/spec/data/countries_embed-standard.ttl +75 -0
  26. data/spec/data/countries_embed.html +88 -0
  27. data/spec/data/countries_html-minimal.json +38 -0
  28. data/spec/data/countries_html-minimal.ttl +36 -0
  29. data/spec/data/countries_html-standard.json +86 -0
  30. data/spec/data/countries_html-standard.ttl +75 -0
  31. data/spec/data/country-codes-and-names-minimal.json +19 -0
  32. data/spec/data/country-codes-and-names-minimal.ttl +22 -0
  33. data/spec/data/country-codes-and-names-standard.json +47 -0
  34. data/spec/data/country-codes-and-names-standard.ttl +45 -0
  35. data/spec/data/country-codes-and-names.csv +5 -0
  36. data/spec/data/country_slice.csv +4 -0
  37. data/spec/data/junior-roles.csv +3 -0
  38. data/spec/data/junior-roles.json +54 -0
  39. data/spec/data/roles-minimal.json +32 -0
  40. data/spec/data/roles-minimal.ttl +36 -0
  41. data/spec/data/roles-standard.json +56 -0
  42. data/spec/data/roles-standard.ttl +66 -0
  43. data/spec/data/roles.json +23 -0
  44. data/spec/data/senior-roles.csv +3 -0
  45. data/spec/data/senior-roles.json +52 -0
  46. data/spec/data/test232-metadata.json +10 -0
  47. data/spec/data/test232.csv +3 -0
  48. data/spec/data/tree-ops-atd.json +1 -0
  49. data/spec/data/tree-ops-ext-minimal.json +42 -0
  50. data/spec/data/tree-ops-ext-minimal.ttl +34 -0
  51. data/spec/data/tree-ops-ext-standard.json +93 -0
  52. data/spec/data/tree-ops-ext-standard.ttl +82 -0
  53. data/spec/data/tree-ops-ext.csv +4 -0
  54. data/spec/data/tree-ops-ext.json +81 -0
  55. data/spec/data/tree-ops-minimal.json +18 -0
  56. data/spec/data/tree-ops-minimal.ttl +14 -0
  57. data/spec/data/tree-ops-standard.json +44 -0
  58. data/spec/data/tree-ops-standard.ttl +44 -0
  59. data/spec/data/tree-ops-virtual-minimal.json +32 -0
  60. data/spec/data/tree-ops-virtual-minimal.ttl +25 -0
  61. data/spec/data/tree-ops-virtual-standard.json +49 -0
  62. data/spec/data/tree-ops-virtual-standard.ttl +49 -0
  63. data/spec/data/tree-ops-virtual.json +48 -0
  64. data/spec/data/tree-ops.csv +3 -0
  65. data/spec/data/tree-ops.csv-metadata.json +43 -0
  66. data/spec/data/tree-ops.html +54 -0
  67. data/spec/data/tree-ops.tsv +3 -0
  68. data/spec/format_spec.rb +5 -4
  69. data/spec/metadata_spec.rb +10 -16
  70. data/spec/suite_helper.rb +2 -2
  71. data/spec/suite_spec.rb +5 -6
  72. data/spec/uax35_spec.rb +239 -0
  73. metadata +149 -36
  74. data/lib/rdf/tabular/json.rb +0 -0
@@ -28,16 +28,19 @@ module RDF::Tabular
28
28
  RDF::CLI::Option.new(
29
29
  symbol: :metadata,
30
30
  datatype: RDF::URI,
31
+ control: :url2,
31
32
  on: ["--metadata URI"],
32
33
  description: "user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location.") {|arg| RDF::URI(arg)},
33
34
  RDF::CLI::Option.new(
34
35
  symbol: :minimal,
36
+ control: :checkbox,
35
37
  datatype: TrueClass,
36
38
  on: ["--minimal"],
37
39
  description: "Includes only the information gleaned from the cells of the tabular data.") {true},
38
40
  RDF::CLI::Option.new(
39
41
  symbol: :noProv,
40
42
  datatype: TrueClass,
43
+ control: :checkbox,
41
44
  on: ["--no-prov"],
42
45
  description: "do not output optional provenance information.") {true},
43
46
  ]
@@ -60,7 +63,7 @@ module RDF::Tabular
60
63
  # @yieldparam [RDF::Reader] reader
61
64
  # @yieldreturn [void] ignored
62
65
  # @raise [RDF::ReaderError] if the CSV document cannot be loaded
63
- def initialize(input = $stdin, options = {}, &block)
66
+ def initialize(input = $stdin, **options, &block)
64
67
  super do
65
68
  # Base would be how we are to take this
66
69
  @options[:base] ||= base_uri.to_s if base_uri
@@ -86,7 +89,7 @@ module RDF::Tabular
86
89
  # If input is JSON, then the input is the metadata
87
90
  content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
88
91
  if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
89
- @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
92
+ @metadata = Metadata.new(@input, filenames: @options[:base], **@options)
90
93
  # If @metadata is for a Table, turn it into a TableGroup
91
94
  @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
92
95
  @metadata.normalize!
@@ -99,7 +102,7 @@ module RDF::Tabular
99
102
  def script.content_type; "application/csvm+json"; end
100
103
  log_debug("Reader#initialize") {"Process HTML script block"}
101
104
  @input = script
102
- @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
105
+ @metadata = Metadata.new(@input, filenames: @options[:base], **@options)
103
106
  # If @metadata is for a Table, turn it into a TableGroup
104
107
  @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
105
108
  @metadata.normalize!
@@ -116,7 +119,7 @@ module RDF::Tabular
116
119
  dialect.separator = "\t" if (input.content_type == "text/tsv" rescue nil)
117
120
  embed_options = @options.dup
118
121
  embed_options[:lang] = dialect_metadata.lang if dialect_metadata.lang
119
- embedded_metadata = dialect.embedded_metadata(input, @options[:metadata], embed_options)
122
+ embedded_metadata = dialect.embedded_metadata(input, @options[:metadata], **embed_options)
120
123
 
121
124
  if (@metadata = @options[:metadata]) && @metadata.tableSchema
122
125
  @metadata.verify_compatible!(embedded_metadata)
@@ -133,7 +136,7 @@ module RDF::Tabular
133
136
  else
134
137
  # It's tabular data. Find metadata and proceed as if it was specified in the first place
135
138
  @options[:original_input] = @input unless @options[:metadata]
136
- @input = @metadata = Metadata.for_input(@input, @options).normalize!
139
+ @input = @metadata = Metadata.for_input(@input, **@options).normalize!
137
140
  end
138
141
 
139
142
  log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
@@ -183,7 +186,7 @@ module RDF::Tabular
183
186
  if options[:original_input] && !input.describes_file?(options[:base_uri])
184
187
  table_resource = RDF::Node.new
185
188
  add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
186
- Reader.new(options[:original_input], options.merge(
189
+ Reader.new(options[:original_input], **options.merge(
187
190
  metadata: input.tables.first,
188
191
  base: input.tables.first.url,
189
192
  no_found_metadata: true,
@@ -202,7 +205,7 @@ module RDF::Tabular
202
205
  end.flatten.compact
203
206
  table_resource = table.id || RDF::Node.new
204
207
  add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
205
- Reader.open(table.url, options.merge(
208
+ Reader.open(table.url, **options.merge(
206
209
  metadata: table,
207
210
  base: table.url,
208
211
  no_found_metadata: true,
@@ -418,9 +421,9 @@ module RDF::Tabular
418
421
 
419
422
  res = if io
420
423
  ::JSON::dump_default_options = json_state
421
- ::JSON.dump(self.send(hash_fn, options), io)
424
+ ::JSON.dump(self.send(hash_fn, **options), io)
422
425
  else
423
- hash = self.send(hash_fn, options)
426
+ hash = self.send(hash_fn, **options)
424
427
  ::JSON.generate(hash, json_state)
425
428
  end
426
429
 
@@ -440,7 +443,7 @@ module RDF::Tabular
440
443
  #
441
444
  # @param [Hash{Symbol => Object}] options
442
445
  # @return [Hash, Array]
443
- def to_hash(options = {})
446
+ def to_hash(**options)
444
447
  # Construct metadata from that passed from file open, along with information from the file.
445
448
  if input.is_a?(Metadata)
446
449
  log_debug("each_statement: metadata") {input.inspect}
@@ -464,13 +467,13 @@ module RDF::Tabular
464
467
  table_group['tables'] = tables
465
468
 
466
469
  if options[:original_input] && !input.describes_file?(options[:base_uri])
467
- Reader.new(options[:original_input], options.merge(
470
+ Reader.new(options[:original_input], **options.merge(
468
471
  metadata: input.tables.first,
469
472
  base: input.tables.first.url,
470
473
  minimal: minimal?,
471
474
  no_found_metadata: true,
472
475
  )) do |r|
473
- case t = r.to_hash(options)
476
+ case t = r.to_hash(**options)
474
477
  when Array then tables += t unless input.tables.first.suppressOutput
475
478
  when Hash then tables << t unless input.tables.first.suppressOutput
476
479
  end
@@ -478,13 +481,13 @@ module RDF::Tabular
478
481
  else
479
482
  input.each_table do |table|
480
483
  next if table.suppressOutput && !validate?
481
- Reader.open(table.url, options.merge(
484
+ Reader.open(table.url, **options.merge(
482
485
  metadata: table,
483
486
  base: table.url,
484
487
  minimal: minimal?,
485
488
  no_found_metadata: true,
486
489
  )) do |r|
487
- case t = r.to_hash(options)
490
+ case t = r.to_hash(**options)
488
491
  when Array then tables += t unless table.suppressOutput
489
492
  when Hash then tables << t unless table.suppressOutput
490
493
  end
@@ -557,7 +560,7 @@ module RDF::Tabular
557
560
  co['@id'] = subject.to_s unless subject == 'null'
558
561
  prop = case cell.propertyUrl
559
562
  when RDF.type then '@type'
560
- when nil then URI.decode(column.name) # Use URI-decoded name
563
+ when nil then CGI.unescape(column.name) # Use URI-decoded name
561
564
  else
562
565
  # Compact the property to a term or prefixed name
563
566
  metadata.context.compact_iri(cell.propertyUrl, vocab: true)
@@ -7,50 +7,99 @@ module RDF::Tabular
7
7
  module UAX35
8
8
 
9
9
  ##
10
- # Parse the date format (if provided), and match against the value (if provided)
11
- # Otherwise, validate format and raise an error
10
+ # Parse the date pattern (if provided), and match against the value (if provided)
11
+ # Otherwise, validate pattern and raise an error.
12
12
  #
13
- # @param [String] format
13
+ # Supported patterns are:
14
+ #
15
+ # * yyyy-MM-dd
16
+ # * yyyyMMdd
17
+ # * dd-MM-yyyy
18
+ # * d-M-yyyy
19
+ # * d-M-yy
20
+ # * d-M-y
21
+ # * MM-dd-yyyy
22
+ # * M-d-yyyy
23
+ # * M-d-yy
24
+ # * M-d-y
25
+ # * dd/MM/yyyy
26
+ # * d/M/yyyy
27
+ # * d/M/yy
28
+ # * d/M/y
29
+ # * MM/dd/yyyy
30
+ # * M/d/yyyy
31
+ # * M/d/yy
32
+ # * M/d/y
33
+ # * dd.MM.yyyy
34
+ # * d.M.yyyy
35
+ # * d.M.yy
36
+ # * d.M.y
37
+ # * MM.dd.yyyy
38
+ # * M.d.yyyy
39
+ # * M.d.yy
40
+ # * M.d.y
41
+ # * yyyy-MM-ddTHH:mm
42
+ # * yyyy-MM-ddTHH:mm:ss
43
+ # * yyyy-MM-ddTHH:mm:ss.S+
44
+ #
45
+ # Year components of fewer than four digits are normalized into the 1900 or 2000 range: values from 70 through 99 are considered to be in the 1900 range; all other values are based on 2000.
46
+ #
47
+ # @param [String] pattern
14
48
  # @param [String] value
15
49
  # @return [String] XMLSchema version of value
16
- # @raise [ArgumentError] if format is not valid, or nil, if value does not match
17
- def parse_uax35_date(format, value)
18
- date_format, time_format = nil, nil
19
- return value unless format
20
- value ||= ""
50
+ # @raise [ArgumentError] if pattern is not valid, or nil
51
+ # @raise [ParseError] if value does not match
52
+ def parse_uax35_date(pattern, value)
53
+ date_pattern, time_pattern = nil, nil
54
+ return value unless pattern
55
+ orig_value = value ||= ""
56
+ orig_pattern = pattern
21
57
 
22
58
  # Extract tz info
23
- if md = format.match(/^(.*[dyms])+(\s*[xX]+)$/)
24
- format, tz_format = md[1], md[2]
59
+ if md = pattern.match(/^(.*[dyms])+(\s*[xX]+)$/)
60
+ pattern, tz_pattern = md[1], md[2]
25
61
  end
26
62
 
27
- date_format, time_format = format.split(' ')
28
- date_format, time_format = nil, date_format if self.base.to_sym == :time
63
+ date_pattern, time_pattern = pattern.split(' ')
64
+ # Sniff out if this is a Time pattern
65
+ date_pattern, time_pattern = nil, date_pattern if time_pattern.nil? && !date_pattern.match(/[TyMd]/)
29
66
 
30
67
  # Extract date, if specified
31
- date_part = case date_format
68
+ date_part = case date_pattern
32
69
  when 'yyyy-MM-dd' then value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})/)
33
70
  when 'yyyyMMdd' then value.match(/^(?<yr>\d{4})(?<mo>\d{2})(?<da>\d{2})/)
34
71
  when 'dd-MM-yyyy' then value.match(/^(?<da>\d{2})-(?<mo>\d{2})-(?<yr>\d{4})/)
35
72
  when 'd-M-yyyy' then value.match(/^(?<da>\d{1,2})-(?<mo>\d{1,2})-(?<yr>\d{4})/)
73
+ when 'd-M-yy' then value.match(/^(?<da>\d{1,2})-(?<mo>\d{1,2})-(?<yr>\d{2})/)
74
+ when 'd-M-y' then value.match(/^(?<da>\d{1,2})-(?<mo>\d{1,2})-(?<yr>\d{1,4})/)
36
75
  when 'MM-dd-yyyy' then value.match(/^(?<mo>\d{2})-(?<da>\d{2})-(?<yr>\d{4})/)
37
76
  when 'M-d-yyyy' then value.match(/^(?<mo>\d{1,2})-(?<da>\d{1,2})-(?<yr>\d{4})/)
38
- when 'dd/MM/yyyy' then value.match(/^(?<da>\d{2})\/(?<mo>\d{2})\/(?<yr>\d{4})/)
77
+ when 'M-d-yy' then value.match(/^(?<mo>\d{1,2})-(?<da>\d{1,2})-(?<yr>\d{2})/)
78
+ when 'M-d-y' then value.match(/^(?<mo>\d{1,2})-(?<da>\d{1,2})-(?<yr>\d{1,4})/)
79
+ when 'dd/MM/yyyy' then value.match(/^(?<da>\d{2})\/(?<mo>\d{2})\/(?<yr>\d{1,4})/)
39
80
  when 'd/M/yyyy' then value.match(/^(?<da>\d{1,2})\/(?<mo>\d{1,2})\/(?<yr>\d{4})/)
40
- when 'MM/dd/yyyy' then value.match(/^(?<mo>\d{2})\/(?<da>\d{2})\/(?<yr>\d{4})/)
81
+ when 'd/M/yy' then value.match(/^(?<da>\d{1,2})\/(?<mo>\d{1,2})\/(?<yr>\d{2})/)
82
+ when 'd/M/y' then value.match(/^(?<da>\d{1,2})\/(?<mo>\d{1,2})\/(?<yr>\d{1,4})/)
83
+ when 'MM/dd/yyyy' then value.match(/^(?<mo>\d{2})\/(?<da>\d{2})\/(?<yr>\d{1,4})/)
41
84
  when 'M/d/yyyy' then value.match(/^(?<mo>\d{1,2})\/(?<da>\d{1,2})\/(?<yr>\d{4})/)
85
+ when 'M/d/yy' then value.match(/^(?<mo>\d{1,2})\/(?<da>\d{1,2})\/(?<yr>\d{2})/)
86
+ when 'M/d/y' then value.match(/^(?<mo>\d{1,2})\/(?<da>\d{1,2})\/(?<yr>\d{1,4})/)
42
87
  when 'dd.MM.yyyy' then value.match(/^(?<da>\d{2})\.(?<mo>\d{2})\.(?<yr>\d{4})/)
43
88
  when 'd.M.yyyy' then value.match(/^(?<da>\d{1,2})\.(?<mo>\d{1,2})\.(?<yr>\d{4})/)
89
+ when 'd.M.yy' then value.match(/^(?<da>\d{1,2})\.(?<mo>\d{1,2})\.(?<yr>\d{2})/)
90
+ when 'd.M.y' then value.match(/^(?<da>\d{1,2})\.(?<mo>\d{1,2})\.(?<yr>\d{1,4})/)
44
91
  when 'MM.dd.yyyy' then value.match(/^(?<mo>\d{2})\.(?<da>\d{2})\.(?<yr>\d{4})/)
45
92
  when 'M.d.yyyy' then value.match(/^(?<mo>\d{1,2})\.(?<da>\d{1,2})\.(?<yr>\d{4})/)
93
+ when 'M.d.yy' then value.match(/^(?<mo>\d{1,2})\.(?<da>\d{1,2})\.(?<yr>\d{2})/)
94
+ when 'M.d.y' then value.match(/^(?<mo>\d{1,2})\.(?<da>\d{1,2})\.(?<yr>\d{1,4})/)
46
95
  when 'yyyy-MM-ddTHH:mm' then value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})T(?<hr>\d{2}):(?<mi>\d{2})(?<se>(?<ms>))/)
47
96
  when 'yyyy-MM-ddTHH:mm:ss' then value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})T(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})(?<ms>)/)
48
97
  when /yyyy-MM-ddTHH:mm:ss\.S+/
49
98
  md = value.match(/^(?<yr>\d{4})-(?<mo>\d{2})-(?<da>\d{2})T(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})\.(?<ms>\d+)/)
50
- num_ms = date_format.match(/S+/).to_s.length
99
+ num_ms = date_pattern.match(/S+/).to_s.length
51
100
  md if md && md[:ms].length <= num_ms
52
101
  else
53
- raise ArgumentError, "unrecognized date/time format #{date_format}" if date_format
102
+ raise ArgumentError, "unrecognized date/time pattern #{date_pattern}" if date_pattern
54
103
  nil
55
104
  end
56
105
 
@@ -61,25 +110,25 @@ module RDF::Tabular
61
110
  end
62
111
 
63
112
  # Extract time, if specified
64
- time_part = case time_format
113
+ time_part = case time_pattern
65
114
  when 'HH:mm:ss' then value.match(/^(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})(?<ms>)/)
66
115
  when 'HHmmss' then value.match(/^(?<hr>\d{2})(?<mi>\d{2})(?<se>\d{2})(?<ms>)/)
67
116
  when 'HH:mm' then value.match(/^(?<hr>\d{2}):(?<mi>\d{2})(?<se>)(?<ms>)/)
68
117
  when 'HHmm' then value.match(/^(?<hr>\d{2})(?<mi>\d{2})(?<se>)(?<ms>)/)
69
118
  when /HH:mm:ss\.S+/
70
119
  md = value.match(/^(?<hr>\d{2}):(?<mi>\d{2}):(?<se>\d{2})\.(?<ms>\d+)/)
71
- num_ms = time_format.match(/S+/).to_s.length
120
+ num_ms = time_pattern.match(/S+/).to_s.length
72
121
  md if md && md[:ms].length <= num_ms
73
122
  else
74
- raise ArgumentError, "unrecognized date/time format #{time_format}" if time_format
123
+ raise ArgumentError, "unrecognized date/time pattern #{pattern}" if time_pattern
75
124
  nil
76
125
  end
77
126
 
78
- # If there's a date_format but no date_part, match fails
79
- return nil if date_format && date_part.nil?
127
+ # If there's a date_pattern but no date_part, match fails
128
+ raise ParseError, "#{orig_value} does not match pattern #{orig_pattern}" if !orig_value.empty? && date_pattern && date_part.nil?
80
129
 
81
- # If there's a time_format but no time_part, match fails
82
- return nil if time_format && time_part.nil?
130
+ # If there's a time_pattern but no time_part, match fails
131
+ raise ParseError, "#{orig_value} does not match pattern #{orig_pattern}" if !orig_value.empty? && time_pattern && time_part.nil?
83
132
 
84
133
  # Forward past time part
85
134
  value = value[time_part.to_s.length..-1] if time_part
@@ -88,8 +137,8 @@ module RDF::Tabular
88
137
  time_part = date_part if date_part && date_part.names.include?("hr")
89
138
 
90
139
  # If there's a timezone, it may optionally start with whitespace
91
- value = value.lstrip if tz_format.to_s.start_with?(' ')
92
- tz_part = case tz_format.to_s.lstrip
140
+ value = value.lstrip if tz_pattern.to_s.start_with?(' ')
141
+ tz_part = case tz_pattern.to_s.lstrip
93
142
  when 'x' then value.match(/^(?:(?<hr>[+-]\d{2})(?<mi>\d{2})?)$/)
94
143
  when 'X' then value.match(/^(?:(?:(?<hr>[+-]\d{2})(?<mi>\d{2})?)|(?<z>Z))$/)
95
144
  when 'xx' then value.match(/^(?:(?<hr>[+-]\d{2})(?<mi>\d{2}))|$/)
@@ -97,15 +146,30 @@ module RDF::Tabular
97
146
  when 'xxx' then value.match(/^(?:(?<hr>[+-]\d{2}):(?<mi>\d{2}))$/)
98
147
  when 'XXX' then value.match(/^(?:(?:(?<hr>[+-]\d{2}):(?<mi>\d{2}))|(?<z>Z))$/)
99
148
  else
100
- raise ArgumentError, "unrecognized timezone format #{tz_format.to_s.lstrip}" if tz_format
149
+ raise ArgumentError, "unrecognized timezone pattern #{tz_pattern.to_s.lstrip}" if tz_pattern
101
150
  nil
102
151
  end
103
152
 
104
- # If there's a tz_format but no time_part, match fails
105
- return nil if tz_format && tz_part.nil?
153
+ # If there's a tz_pattern but no tz_part, match fails
154
+ raise ParseError, "#{orig_value} does not match pattern #{orig_pattern}" if !orig_value.empty? && tz_pattern && tz_part.nil?
106
155
 
107
156
  # Compose normalized value
108
- vd = ("%04d-%02d-%02d" % [date_part[:yr].to_i, date_part[:mo].to_i, date_part[:da].to_i]) if date_part
157
+ vd = if date_part
158
+ yr, mo, da = [date_part[:yr], date_part[:mo], date_part[:da]].map(&:to_i)
159
+
160
+ if date_part[:yr].length < 4
161
+ # Make sure that yr makes sense, if given
162
+ yr = case yr
163
+ when 0..69 then yr + 2000
164
+ when 100..999 then yr + 2000
165
+ when 70..99 then yr + 1900
166
+ else yr
167
+ end
168
+ end
169
+
170
+ ("%04d-%02d-%02d" % [yr, mo, da])
171
+ end
172
+
109
173
  vt = ("%02d:%02d:%02d" % [time_part[:hr].to_i, time_part[:mi].to_i, time_part[:se].to_i]) if time_part
110
174
 
111
175
  # Add milliseconds, if matched
@@ -117,37 +181,74 @@ module RDF::Tabular
117
181
  end
118
182
 
119
183
  ##
120
- # Parse the date format (if provided), and match against the value (if provided)
121
- # Otherwise, validate format and raise an error
184
+ # Parse the number pattern (if provided), and match against the value (if provided)
185
+ # Otherwise, validate pattern and raise an error
122
186
  #
123
187
  # @param [String] pattern
124
188
  # @param [String] value
125
189
  # @param [String] groupChar
126
190
  # @param [String] decimalChar
127
191
  # @return [String] XMLSchema version of value or nil, if value does not match
128
- # @raise [ArgumentError] if format is not valid
192
+ # @raise [ArgumentError] if pattern is not valid
129
193
  def parse_uax35_number(pattern, value, groupChar=",", decimalChar=".")
130
194
  value ||= ""
131
195
 
132
196
  re = build_number_re(pattern, groupChar, decimalChar)
133
197
 
198
+ raise ParseError, "#{value} has repeating #{groupChar.inspect}" if groupChar.length == 1 && value.include?(groupChar*2)
199
+
134
200
  # Upcase value and remove internal spaces
135
201
  value = value.upcase
136
202
 
137
203
  if value =~ re
138
-
139
204
  # Upcase value and remove internal spaces
140
205
  value = value.
141
- upcase.
142
206
  gsub(/\s+/, '').
143
207
  gsub(groupChar, '').
144
208
  gsub(decimalChar, '.')
145
209
 
146
210
  # result re-assembles parts removed from value
147
211
  value
148
- else
212
+ elsif !value.empty?
149
213
  # no match
150
- nil
214
+ raise ParseError, "#{value.inspect} does not match #{pattern.inspect}"
215
+ end
216
+
217
+ # Extract percent or per-mille sign
218
+ case value
219
+ when /%/
220
+ value = value.sub('%', '')
221
+ lhs, rhs = value.split('.')
222
+
223
+ # Shift decimal
224
+ value = case lhs.length
225
+ when 0 then "0.00#{rhs}".sub('E', 'e')
226
+ when 1 then "0.0#{lhs}#{rhs}".sub('E', 'e')
227
+ when 2 then "0.#{lhs}#{rhs}".sub('E', 'e')
228
+ else
229
+ ll, lr = lhs[0..lhs.length-3], lhs[-2..-1]
230
+ ll = ll + "0" unless ll =~ /\d+/
231
+ "#{ll}.#{lr}#{rhs}".sub('E', 'e')
232
+ end
233
+ when /‰/
234
+ value = value.sub('‰', '')
235
+ lhs, rhs = value.split('.')
236
+
237
+ # Shift decimal
238
+ value = case lhs.length
239
+ when 0 then "0.000#{rhs}".sub('E', 'e')
240
+ when 1 then "0.00#{lhs}#{rhs}".sub('E', 'e')
241
+ when 2 then "0.0#{lhs}#{rhs}".sub('E', 'e')
242
+ when 3 then "0.#{lhs}#{rhs}".sub('E', 'e')
243
+ else
244
+ ll, lr = lhs[0..lhs.length-4], lhs[-3..-1]
245
+ ll = ll + "0" unless ll =~ /\d+/
246
+ "#{ll}.#{lr}#{rhs}".sub('E', 'e')
247
+ end
248
+ when /NAN/ then value.sub('NAN', 'NaN')
249
+ when /E/ then value.sub('E', 'e')
250
+ else
251
+ value
151
252
  end
152
253
  end
153
254
 
@@ -157,9 +258,10 @@ module RDF::Tabular
157
258
  # @param [String] groupChar
158
259
  # @param [String] decimalChar
159
260
  # @return [Regexp] Regular expression matching value
160
- # @raise [ArgumentError] if format is not valid
261
+ # @raise [ArgumentError] if pattern is not valid
161
262
  def build_number_re(pattern, groupChar, decimalChar)
162
263
  # pattern must be composed of only 0, #, decimalChar, groupChar, E, %, and ‰
264
+
163
265
  ge = Regexp.escape groupChar
164
266
  de = Regexp.escape decimalChar
165
267
 
@@ -320,5 +422,8 @@ module RDF::Tabular
320
422
 
321
423
  Regexp.new("^(?<prefix>#{prefix})(?<numeric_part>#{integer_str}#{fractional_str}#{exponent_str})(?<suffix>#{suffix})$")
322
424
  end
425
+
426
+ # ParseError is raised when a value does not match the pattern
427
+ class ParseError < RuntimeError; end
323
428
  end
324
429
  end
@@ -0,0 +1,38 @@
1
+ [
2
+ {
3
+ "@id": "http://example.org/countries.csv#AD",
4
+ "http://www.geonames.org/ontology#countryCode": "AD",
5
+ "schema:latitude": 42.546245,
6
+ "schema:longitude": 1.601554,
7
+ "schema:name": "Andorra"
8
+ },
9
+ {
10
+ "@id": "http://example.org/countries.csv#AE",
11
+ "http://www.geonames.org/ontology#countryCode": "AE",
12
+ "schema:latitude": 23.424076,
13
+ "schema:longitude": 53.847818,
14
+ "schema:name": "United Arab Emirates"
15
+ },
16
+ {
17
+ "@id": "http://example.org/countries.csv#AF",
18
+ "http://www.geonames.org/ontology#countryCode": "AF",
19
+ "schema:latitude": 33.93911,
20
+ "schema:longitude": 67.709953,
21
+ "schema:name": "Afghanistan"
22
+ },
23
+ {
24
+ "countryRef": "http://example.org/countries.csv#AF",
25
+ "year": "1960",
26
+ "population": 9616353
27
+ },
28
+ {
29
+ "countryRef": "http://example.org/countries.csv#AF",
30
+ "year": "1961",
31
+ "population": 9799379
32
+ },
33
+ {
34
+ "countryRef": "http://example.org/countries.csv#AF",
35
+ "year": "1962",
36
+ "population": 9989846
37
+ }
38
+ ]