rdf-tabular 0.2.1 → 0.4.0.beta2

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -1,4 +1,5 @@
 require 'rdf'
+require 'rdf/vocab'
 
 module RDF::Tabular
   ##
@@ -7,7 +8,7 @@ module RDF::Tabular
   # @author [Gregg Kellogg](http://greggkellogg.net/)
   class Reader < RDF::Reader
     format Format
-    include Utils
+    include RDF::Util::Logger
 
     # Metadata associated with the CSV
     #
@@ -20,14 +21,27 @@ module RDF::Tabular
     attr_reader :input
 
     ##
-    # Warnings found during processing
-    # @return [Array<String>]
-    attr_reader :warnings
-
-    ##
-    # Accumulated errors found during processing
-    # @return [Array<String>]
-    attr_reader :errors
+    # Writer options
+    # @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Writer#options-class_method
+    def self.options
+      super + [
+        RDF::CLI::Option.new(
+          symbol: :metadata,
+          datatype: RDF::URI,
+          on: ["--metadata URI"],
+          description: "user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location.") {|arg| RDF::URI(arg)},
+        RDF::CLI::Option.new(
+          symbol: :minimal,
+          datatype: TrueClass,
+          on: ["--minimal"],
+          description: "Includes only the information gleaned from the cells of the tabular data.") {true},
+        RDF::CLI::Option.new(
+          symbol: :noProv,
+          datatype: TrueClass,
+          on: ["--no-prov"],
+          description: "do not output optional provenance information.") {true},
+      ]
+    end
 
     ##
     # Initializes the RDF::Tabular Reader instance.
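
The new `self.options` class method exposes the reader's `:metadata`, `:minimal`, and `:noProv` options to `RDF::CLI`, mirroring what `Reader#initialize` documents below. A minimal sketch of the equivalent programmatic use; the file names are illustrative only:

    require 'rdf/tabular'

    # Layer user-supplied metadata over what the reader extracts, emit minimal
    # output, and skip provenance (file names are hypothetical).
    graph = RDF::Graph.new
    RDF::Tabular::Reader.open("doap.csv",
                              metadata: "doap.csv-metadata.json",
                              minimal:  true,
                              noProv:   true) do |reader|
      reader.each_statement { |statement| graph << statement }
    end
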
@@ -40,10 +54,6 @@ module RDF::Tabular
     # @option options [Metadata, Hash, String, RDF::URI] :metadata user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location
     # @option options [Boolean] :minimal includes only the information gleaned from the cells of the tabular data
     # @option options [Boolean] :noProv do not output optional provenance information
-    # @option options [Array] :errors
-    #   array for placing errors found when processing metadata. If not set, and validating, errors are output to `$stderr`
-    # @option options [Array] :warnings
-    #   array for placing warnings found when processing metadata. If not set, and validating, warnings are output to `$stderr`
     # @option optinons [Array<Hash>] :fks_referencing_table
     #   When called with Table metadata, a list of the foreign keys referencing this table
     # @yield [reader] `self`
@@ -61,11 +71,7 @@ module RDF::Tabular
           @options[:base] = "file:/#{File.expand_path(@options[:base])}"
         end
 
-        @options[:depth] ||= 0
-        @errors = @options.fetch(:errors, [])
-        @warnings = @options.fetch(:warnings, [])
-
-        debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
+        log_debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
 
         # Minimal implies noProv
        @options[:noProv] ||= @options[:minimal]
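
With `Utils` replaced by `RDF::Util::Logger`, the reader no longer keeps `@errors`/`@warnings` arrays or accepts `:errors`/`:warnings` options; problems are reported through `log_warn`/`log_error` and summarized by `log_statistics`. A sketch of how a caller might capture diagnostics now, assuming the usual RDF.rb `logger:` option (file name illustrative):

    require 'rdf/tabular'
    require 'logger'
    require 'stringio'

    io  = StringIO.new
    log = Logger.new(io)
    begin
      RDF::Tabular::Reader.open("table.csv", validate: true, logger: log) do |reader|
        reader.each_statement { |_statement| }   # drain the reader; issues are logged
      end
    rescue RDF::ReaderError => e
      warn e.message                             # raised when validation logged errors
    end
    puts io.string                               # accumulated warnings and errors
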
@@ -76,7 +82,7 @@ module RDF::Tabular
         else input
         end
 
-        depth do
+        log_depth do
           # If input is JSON, then the input is the metadata
           content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
           if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
@@ -85,6 +91,20 @@ module RDF::Tabular
             @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
             @metadata.normalize!
             @input = @metadata
+          elsif (@options[:base].to_s.end_with?(".html") || %w(text/html application/xhtml+html).include?(content_type)) &&
+                !RDF::URI(@options[:base].to_s).fragment
+            require 'nokogiri' unless defined?(:Nokogiri)
+            doc = Nokogiri::HTML.parse(input)
+            doc.xpath("//script[@type='application/csvm+json']/text()").each do |script|
+              def script.content_type; "application/csvm+json"; end
+              log_debug("Reader#initialize") {"Process HTML script block"}
+              @input = script
+              @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
+              # If @metadata is for a Table, turn it into a TableGroup
+              @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
+              @metadata.normalize!
+              @input = @metadata
+            end
           elsif @options[:no_found_metadata]
             # Extract embedded metadata and merge
             dialect_metadata = @options[:metadata] || Table.new({}, context: "http://www.w3.org/ns/csvw")
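
The new `elsif` branch lets the reader pull CSVW metadata out of an HTML page: when the base URL looks like HTML and carries no fragment, each `<script type="application/csvm+json">` block is parsed as a standalone metadata document. A rough standalone sketch of the same extraction step (the sample page is made up):

    require 'nokogiri'

    html = <<~HTML
      <html><head>
        <script type="application/csvm+json">
          {"@context": "http://www.w3.org/ns/csvw", "url": "tree-ops.csv"}
        </script>
      </head><body></body></html>
    HTML

    doc = Nokogiri::HTML.parse(html)
    # Same XPath the reader uses: the text of each csvm+json script element.
    doc.xpath("//script[@type='application/csvm+json']/text()").each do |script|
      puts script.content   # raw JSON that would be handed to Metadata.new
    end
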
@@ -116,7 +136,7 @@ module RDF::Tabular
             @input = @metadata = Metadata.for_input(@input, @options).normalize!
           end
 
-          debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
+          log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
 
           if block_given?
             case block.arity
@@ -139,9 +159,9 @@ module RDF::Tabular
 
       # Construct metadata from that passed from file open, along with information from the file.
       if input.is_a?(Metadata)
-        debug("each_statement: metadata") {input.inspect}
+        log_debug("each_statement: metadata") {input.inspect}
 
-        depth do
+        log_depth do
           begin
             # Validate metadata
             input.validate!
@@ -168,8 +188,6 @@ module RDF::Tabular
                base: input.tables.first.url,
                no_found_metadata: true,
                table_resource: table_resource,
-               warnings: @warnings,
-               errors: @errors,
              )) do |r|
                r.each_statement(&block)
              end
@@ -190,8 +208,6 @@ module RDF::Tabular
                  no_found_metadata: true,
                  table_resource: table_resource,
                  fks_referencing_table: fks,
-                 warnings: @warnings,
-                 errors: @errors,
                )) do |r|
                  r.each_statement(&block)
                end
@@ -204,42 +220,38 @@ module RDF::Tabular
             # Provenance
             if prov?
               activity = RDF::Node.new
-              add_statement(0, table_group, RDF::PROV.wasGeneratedBy, activity)
-              add_statement(0, activity, RDF.type, RDF::PROV.Activity)
-              add_statement(0, activity, RDF::PROV.wasAssociatedWith, RDF::URI("http://rubygems.org/gems/rdf-tabular"))
-              add_statement(0, activity, RDF::PROV.startedAtTime, RDF::Literal::DateTime.new(start_time))
-              add_statement(0, activity, RDF::PROV.endedAtTime, RDF::Literal::DateTime.new(Time.now))
+              add_statement(0, table_group, RDF::Vocab::PROV.wasGeneratedBy, activity)
+              add_statement(0, activity, RDF.type, RDF::Vocab::PROV.Activity)
+              add_statement(0, activity, RDF::Vocab::PROV.wasAssociatedWith, RDF::URI("http://rubygems.org/gems/rdf-tabular"))
+              add_statement(0, activity, RDF::Vocab::PROV.startedAtTime, RDF::Literal::DateTime.new(start_time))
+              add_statement(0, activity, RDF::Vocab::PROV.endedAtTime, RDF::Literal::DateTime.new(Time.now))
 
               unless (urls = input.tables.map(&:url)).empty?
                 usage = RDF::Node.new
-                add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
-                add_statement(0, usage, RDF.type, RDF::PROV.Usage)
+                add_statement(0, activity, RDF::Vocab::PROV.qualifiedUsage, usage)
+                add_statement(0, usage, RDF.type, RDF::Vocab::PROV.Usage)
                 urls.each do |url|
-                  add_statement(0, usage, RDF::PROV.entity, RDF::URI(url))
+                  add_statement(0, usage, RDF::Vocab::PROV.entity, RDF::URI(url))
                 end
-                add_statement(0, usage, RDF::PROV.hadRole, CSVW.csvEncodedTabularData)
+                add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.csvEncodedTabularData)
               end
 
               unless Array(input.filenames).empty?
                 usage = RDF::Node.new
-                add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
-                add_statement(0, usage, RDF.type, RDF::PROV.Usage)
+                add_statement(0, activity, RDF::Vocab::PROV.qualifiedUsage, usage)
+                add_statement(0, usage, RDF.type, RDF::Vocab::PROV.Usage)
                 Array(input.filenames).each do |fn|
-                  add_statement(0, usage, RDF::PROV.entity, RDF::URI(fn))
+                  add_statement(0, usage, RDF::Vocab::PROV.entity, RDF::URI(fn))
                 end
-                add_statement(0, usage, RDF::PROV.hadRole, CSVW.tabularMetadata)
+                add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.tabularMetadata)
               end
             end
-          ensure
-            warnings = @warnings.concat(input.warnings)
-            if validate? && !warnings.empty? && !@options[:warnings]
-              $stderr.puts "Warnings: #{warnings.join("\n")}"
-            end
-            if validate? && !errors.empty? && !@options[:errors]
-              $stderr.puts "Errors: #{errors.join("\n")}"
-            end
          end
        end
+
+        if validate? && log_statistics[:error]
+          raise RDF::ReaderError, "Errors found during processing"
+        end
        return
      end
 
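
The PROV terms move from the `RDF::PROV` constant bundled with older RDF.rb releases to the separately packaged rdf-vocab gem, which is why `require 'rdf/vocab'` appears at the top of the file; the emitted triples are unchanged. A small sketch of the equivalence, assuming a matching rdf-vocab version:

    require 'rdf/vocab'

    # Only the constant lookup changes; the IRIs are the same PROV-O terms.
    puts RDF::Vocab::PROV.wasGeneratedBy   # => http://www.w3.org/ns/prov#wasGeneratedBy
    puts RDF::Vocab::PROV.to_uri           # => http://www.w3.org/ns/prov#
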
@@ -260,8 +272,9 @@ module RDF::Tabular
             row.subject = table_resource
             add_statement(last_row_num + 1, row) unless metadata.suppressOutput
             next
+          else
+            last_row_num = row.sourceNumber
           end
-          last_row_num = row.sourceNumber
 
           # Collect primary and foreign keys if validating
           if validate?
@@ -285,11 +298,20 @@ module RDF::Tabular
           end
           row.values.each_with_index do |cell, index|
             # Collect cell errors
-            (validate? ? errors : warnings) << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): " +
-              cell.errors.join("\n") unless Array(cell.errors).empty?
+            unless Array(cell.errors).empty?
+              self.send((validate? ? :log_error : :log_warn),
+                "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber})") do
+                cell.errors.join("\n")
+              end
+            end
             next if cell.column.suppressOutput # Skip ignored cells
             cell_subject = cell.aboutUrl || default_cell_subject
-            propertyUrl = cell.propertyUrl || RDF::URI("#{metadata.url}##{cell.column.name}")
+            propertyUrl = cell.propertyUrl || begin
+              # It's possible that the metadata URL already has a fragment, in which case we need to override it.
+              u = metadata.url.dup
+              u.fragment = cell.column.name
+              u
+            end
             add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?
 
             if cell.column.valueUrl
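
Building `propertyUrl` by overriding the fragment on a copy of the metadata URL avoids a double fragment when that URL already carries one (as it can when the metadata came from an HTML page). A quick illustration with a hypothetical URL:

    require 'rdf'

    url = RDF::URI("http://example.org/countries.html#csvw")

    # Old approach: string interpolation keeps the existing fragment.
    puts RDF::URI("#{url}#country")   # => http://example.org/countries.html#csvw#country

    # New approach: duplicate the URL and override its fragment.
    u = url.dup
    u.fragment = "country"
    puts u                            # => http://example.org/countries.html#country
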
@@ -338,16 +360,13 @@ module RDF::Tabular
     end
 
     ##
-    # Validate and raise an exception if any errors are found while processing either metadata or tables
-    # @return [self]
-    # @raise [Error]
+    # Do we have valid metadata?
+    # @raise [RDF::ReaderError]
     def validate!
-      each_statement {} # Read all rows
-      raise Error, errors.join("\n") unless errors.empty?
-      self
+      @options[:validate] = true
+      each_statement {}
     rescue RDF::ReaderError => e
       raise Error, e.message
-      self
     end
 
     ##
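
`validate!` no longer returns `self` or inspects an errors array: it switches the reader into validating mode, drains it, and lets the `RDF::ReaderError` raised by `each_statement` (when `log_statistics[:error]` is set) surface as an `RDF::Tabular::Error`. A usage sketch with a hypothetical file:

    require 'rdf/tabular'

    begin
      RDF::Tabular::Reader.open("tree-ops.csv") do |reader|
        reader.validate!
      end
      puts "valid"
    rescue RDF::Tabular::Error => e
      puts "invalid: #{e.message}"
    end
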
@@ -394,16 +413,22 @@ module RDF::Tabular
       end
       options = {} unless options.is_a?(Hash)
 
-      hash_fn = options[:atd] ? :to_atd : :to_hash
+      hash_fn = :to_hash
       options = options.merge(noProv: @options[:noProv])
 
-      if io
+      res = if io
         ::JSON::dump_default_options = json_state
         ::JSON.dump(self.send(hash_fn, options), io)
       else
         hash = self.send(hash_fn, options)
         ::JSON.generate(hash, json_state)
       end
+
+      if validate? && log_statistics[:error]
+        raise RDF::Tabular::Error, "Errors found during processing"
+      end
+
+      res
     rescue IOError => e
       raise RDF::Tabular::Error, e.message
     end
@@ -418,8 +443,8 @@ module RDF::Tabular
     def to_hash(options = {})
       # Construct metadata from that passed from file open, along with information from the file.
       if input.is_a?(Metadata)
-        debug("each_statement: metadata") {input.inspect}
-        depth do
+        log_debug("each_statement: metadata") {input.inspect}
+        log_depth do
           # Get Metadata to invoke and open referenced files
           begin
             # Validate metadata
@@ -444,8 +469,6 @@ module RDF::Tabular
                base: input.tables.first.url,
                minimal: minimal?,
                no_found_metadata: true,
-               warnings: @warnings,
-               errors: @errors,
              )) do |r|
                case t = r.to_hash(options)
                when Array then tables += t unless input.tables.first.suppressOutput
@@ -460,8 +483,6 @@ module RDF::Tabular
                  base: table.url,
                  minimal: minimal?,
                  no_found_metadata: true,
-                 warnings: @warnings,
-                 errors: @errors,
                )) do |r|
                  case t = r.to_hash(options)
                  when Array then tables += t unless table.suppressOutput
@@ -476,14 +497,6 @@ module RDF::Tabular
 
             # Result is table_group or array
             minimal? ? tables : table_group
-          ensure
-            warnings = @warnings.concat(input.warnings)
-            if validate? && !warnings.empty? && !@options[:warnings]
-              $stderr.puts "Warnings: #{warnings.join("\n")}"
-            end
-            if validate? && !errors.empty? && !@options[:errors]
-              $stderr.puts "Errors: #{errors.join("\n")}"
-            end
           end
         end
       else
@@ -523,8 +536,12 @@ module RDF::Tabular
             column = metadata.tableSchema.columns[index]
 
             # Collect cell errors
-            (validate? ? errors : warnings) << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): " +
-              cell.errors.join("\n") unless Array(cell.errors).empty?
+            unless Array(cell.errors).empty?
+              self.send(validate? ? :log_error : :log_warn,
+                "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): ") do
+                cell.errors.join("\n")
+              end
+            end
 
             # Ignore suppressed columns
             next if column.suppressOutput
@@ -606,71 +623,6 @@ module RDF::Tabular
       end
     end
 
-    # Return a hash representation of the annotated tabular data model for JSON serialization
-    # @param [Hash{Symbol => Object}] options
-    # @return [Hash]
-    def to_atd(options = {})
-      # Construct metadata from that passed from file open, along with information from the file.
-      if input.is_a?(Metadata)
-        debug("each_statement: metadata") {input.inspect}
-        depth do
-          # Get Metadata to invoke and open referenced files
-          case input.type
-          when :TableGroup
-            table_group = input.to_atd
-            if input.tables.empty? && options[:original_input]
-              Reader.new(options[:original_input], options.merge(
-                base: options[:base],
-                no_found_metadata: true
-              )) do |r|
-                table_group["tables"] << r.to_atd(options)
-              end
-            else
-              input.each_table do |table|
-                Reader.open(table.url, options.merge(
-                  metadata: table,
-                  base: table.url,
-                  no_found_metadata: true
-                )) do |r|
-                  table_group["tables"] << r.to_atd(options)
-                end
-              end
-            end
-
-            # Result is table_group
-            table_group
-          when :Table
-            table = nil
-            Reader.open(input.url, options.merge(
-              metadata: input,
-              base: input.url,
-              no_found_metadata: true
-            )) do |r|
-              table = r.to_atd(options)
-            end
-
-            table
-          else
-            raise "Opened inappropriate metadata type: #{input.type}"
-          end
-        end
-      else
-        rows = []
-        table = metadata.to_atd
-        rows, columns = table["rows"], table["columns"]
-
-        # Input is file containing CSV data.
-        # Output ROW-Level statements
-        metadata.each_row(input) do |row|
-          rows << row.to_atd
-          row.values.each_with_index do |cell, colndx|
-            columns[colndx]["cells"] << cell.to_atd
-          end
-        end
-        table
-      end
-    end
-
     def minimal?; @options[:minimal]; end
     def prov?; !(@options[:noProv]); end
 
@@ -690,9 +642,9 @@ module RDF::Tabular
     # @param [URI, BNode, Literal] object the object of the statement
     # @raise [ReaderError] Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
     def add_statement(node, *args)
-      statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement.new(*args)
+      statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement(*args)
       raise RDF::ReaderError, "#{statement.inspect} is invalid" if validate? && statement.invalid?
-      debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
+      log_debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
       @callback.call(statement)
     end
 
@@ -701,7 +653,7 @@ module RDF::Tabular
       pk_strings = {}
       primary_keys.reject(&:empty?).each do |row_pks|
        pk_names = row_pks.map {|cell| cell.value}.join(",")
-        errors << "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
+        log_error "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
        pk_strings[pk_names] ||= 0
        pk_strings[pk_names] += 1
      end
@@ -734,7 +686,7 @@ module RDF::Tabular
          fk[:reference_to] ||= {}
          cell_values = cells.map {|cell| cell.stringValue unless cell.stringValue.to_s.empty?}.compact
          next if cell_values.empty? # Don't record if empty
-          errors << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
+          log_error "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
          fk[:reference_to][cell_values] ||= row
        end
      end
@@ -747,7 +699,7 @@ module RDF::Tabular
        # Verify that reference_from entry exists in reference_to
        fk.fetch(:reference_from, {}).each do |cell_values, row|
          unless fk.fetch(:reference_to, {}).has_key?(cell_values)
-            errors << "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
+            log_error "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
              "Foreign Key violation, expected to find #{cell_values.map(&:to_s).inspect}"
          end
        end
@@ -134,7 +134,6 @@ module RDF::Tabular
      # Upcase value and remove internal spaces
      value = value.upcase
 
-      #require 'byebug'; byebug unless value.empty?
      if value =~ re
 
        # Upcase value and remove internal spaces
@@ -211,6 +210,7 @@ module RDF::Tabular
 
      min_integer_digits = integer_part.gsub(groupChar, '').gsub('#', '').length
      all_integer_digits = integer_part.gsub(groupChar, '').length
+      all_integer_digits += 1 if all_integer_digits == min_integer_digits
      min_fractional_digits = fractional_part.gsub(groupChar, '').gsub('#', '').length
      max_fractional_digits = fractional_part.gsub(groupChar, '').length
      exponent_sign = exponent_part[0] if exponent_part =~ /^[+-]/
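
The added line bumps `all_integer_digits` when the integer part of a number pattern has no optional `#` positions (all positions are `0`, i.e. required), which appears to let such patterns match "at least" rather than "exactly" that many digits in the regex built below. A standalone sketch of the digit counting, with a hypothetical helper name and `groupChar` assumed to be ",":

    # Count required ("0") and total digit positions in a pattern's integer part,
    # mirroring the calculation above (helper name is made up).
    def digit_counts(integer_part, group_char = ",")
      min = integer_part.gsub(group_char, '').gsub('#', '').length
      all = integer_part.gsub(group_char, '').length
      all += 1 if all == min   # no optional '#' positions: allow wider numbers
      [min, all]
    end

    p digit_counts("#,##0")   # => [1, 4]  one required digit, three optional
    p digit_counts("000")     # => [3, 4]  all required; widened by one
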
@@ -226,7 +226,7 @@ module RDF::Tabular
 
      # Construct regular expression for integer part
      integer_str = if primary_grouping_size == 0
-        all_integer_digits > min_integer_digits ? "\\d{#{min_integer_digits},}" : "\\d{#{min_integer_digits}}"
+        "\\d{#{min_integer_digits},}"
      else
        # These number of groupings must be there
        integer_parts = []
@@ -235,8 +235,8 @@ module RDF::Tabular
          sz = [primary_grouping_size, min_integer_digits].min
          integer_rem = primary_grouping_size - sz
          integer_parts << "\\d{#{sz}}"
-          min_integer_digits -= primary_grouping_size
-          all_integer_digits -= primary_grouping_size
+          min_integer_digits -= sz
+          all_integer_digits -= sz
          primary_grouping_size = secondary_grouping_size
        end
        required_digits = integer_parts.reverse.join(ge)
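
Subtracting `sz` (the digits actually consumed by the current group) instead of `primary_grouping_size` keeps the remaining-digit counters from going negative when the final group is narrower than the grouping size. A hedged trace of the counters for the pattern "#,##0" (grouping size 3, one required digit), using only the arithmetic shown in the hunk above:

    min_integer_digits = 1
    all_integer_digits = 4
    primary_grouping_size = 3

    sz = [primary_grouping_size, min_integer_digits].min    # => 1
    # Old behaviour: min_integer_digits -= primary_grouping_size  # => -2 (negative)
    min_integer_digits -= sz                                 # => 0
    all_integer_digits -= sz                                 # => 3
    p [sz, min_integer_digits, all_integer_digits]           # => [1, 0, 3]
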