rdf-tabular 0.2.1 → 0.4.0.beta2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,5 @@
1
1
  require 'rdf'
2
+ require 'rdf/vocab'
2
3
 
3
4
  module RDF::Tabular
4
5
  ##
@@ -7,7 +8,7 @@ module RDF::Tabular
7
8
  # @author [Gregg Kellogg](http://greggkellogg.net/)
8
9
  class Reader < RDF::Reader
9
10
  format Format
10
- include Utils
11
+ include RDF::Util::Logger
11
12
 
12
13
  # Metadata associated with the CSV
13
14
  #
@@ -20,14 +21,27 @@ module RDF::Tabular
20
21
  attr_reader :input
21
22
 
22
23
  ##
23
- # Warnings found during processing
24
- # @return [Array<String>]
25
- attr_reader :warnings
26
-
27
- ##
28
- # Accumulated errors found during processing
29
- # @return [Array<String>]
30
- attr_reader :errors
24
+ # Reader options
25
+ # @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Reader#options-class_method
26
+ def self.options
27
+ super + [
28
+ RDF::CLI::Option.new(
29
+ symbol: :metadata,
30
+ datatype: RDF::URI,
31
+ on: ["--metadata URI"],
32
+ description: "user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location.") {|arg| RDF::URI(arg)},
33
+ RDF::CLI::Option.new(
34
+ symbol: :minimal,
35
+ datatype: TrueClass,
36
+ on: ["--minimal"],
37
+ description: "Includes only the information gleaned from the cells of the tabular data.") {true},
38
+ RDF::CLI::Option.new(
39
+ symbol: :noProv,
40
+ datatype: TrueClass,
41
+ on: ["--no-prov"],
42
+ description: "do not output optional provenance information.") {true},
43
+ ]
44
+ end
31
45
 
32
46
  ##
33
47
  # Initializes the RDF::Tabular Reader instance.
@@ -40,10 +54,6 @@ module RDF::Tabular
40
54
  # @option options [Metadata, Hash, String, RDF::URI] :metadata user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loaded from that location
41
55
  # @option options [Boolean] :minimal includes only the information gleaned from the cells of the tabular data
42
56
  # @option options [Boolean] :noProv do not output optional provenance information
43
- # @option options [Array] :errors
44
- # array for placing errors found when processing metadata. If not set, and validating, errors are output to `$stderr`
45
- # @option options [Array] :warnings
46
- # array for placing warnings found when processing metadata. If not set, and validating, warnings are output to `$stderr`
47
57
  # @option options [Array<Hash>] :fks_referencing_table
48
58
  # When called with Table metadata, a list of the foreign keys referencing this table
49
59
  # @yield [reader] `self`
@@ -61,11 +71,7 @@ module RDF::Tabular
61
71
  @options[:base] = "file:/#{File.expand_path(@options[:base])}"
62
72
  end
63
73
 
64
- @options[:depth] ||= 0
65
- @errors = @options.fetch(:errors, [])
66
- @warnings = @options.fetch(:warnings, [])
67
-
68
- debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
74
+ log_debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
69
75
 
70
76
  # Minimal implies noProv
71
77
  @options[:noProv] ||= @options[:minimal]
@@ -76,7 +82,7 @@ module RDF::Tabular
76
82
  else input
77
83
  end
78
84
 
79
- depth do
85
+ log_depth do
80
86
  # If input is JSON, then the input is the metadata
81
87
  content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
82
88
  if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
@@ -85,6 +91,20 @@ module RDF::Tabular
85
91
  @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
86
92
  @metadata.normalize!
87
93
  @input = @metadata
94
+ elsif (@options[:base].to_s.end_with?(".html") || %w(text/html application/xhtml+html).include?(content_type)) &&
95
+ !RDF::URI(@options[:base].to_s).fragment
96
+ require 'nokogiri' unless defined?(:Nokogiri)
97
+ doc = Nokogiri::HTML.parse(input)
98
+ doc.xpath("//script[@type='application/csvm+json']/text()").each do |script|
99
+ def script.content_type; "application/csvm+json"; end
100
+ log_debug("Reader#initialize") {"Process HTML script block"}
101
+ @input = script
102
+ @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
103
+ # If @metadata is for a Table, turn it into a TableGroup
104
+ @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
105
+ @metadata.normalize!
106
+ @input = @metadata
107
+ end
88
108
  elsif @options[:no_found_metadata]
89
109
  # Extract embedded metadata and merge
90
110
  dialect_metadata = @options[:metadata] || Table.new({}, context: "http://www.w3.org/ns/csvw")
@@ -116,7 +136,7 @@ module RDF::Tabular
116
136
  @input = @metadata = Metadata.for_input(@input, @options).normalize!
117
137
  end
118
138
 
119
- debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
139
+ log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
120
140
 
121
141
  if block_given?
122
142
  case block.arity
@@ -139,9 +159,9 @@ module RDF::Tabular
139
159
 
140
160
  # Construct metadata from that passed from file open, along with information from the file.
141
161
  if input.is_a?(Metadata)
142
- debug("each_statement: metadata") {input.inspect}
162
+ log_debug("each_statement: metadata") {input.inspect}
143
163
 
144
- depth do
164
+ log_depth do
145
165
  begin
146
166
  # Validate metadata
147
167
  input.validate!
@@ -168,8 +188,6 @@ module RDF::Tabular
168
188
  base: input.tables.first.url,
169
189
  no_found_metadata: true,
170
190
  table_resource: table_resource,
171
- warnings: @warnings,
172
- errors: @errors,
173
191
  )) do |r|
174
192
  r.each_statement(&block)
175
193
  end
@@ -190,8 +208,6 @@ module RDF::Tabular
190
208
  no_found_metadata: true,
191
209
  table_resource: table_resource,
192
210
  fks_referencing_table: fks,
193
- warnings: @warnings,
194
- errors: @errors,
195
211
  )) do |r|
196
212
  r.each_statement(&block)
197
213
  end
@@ -204,42 +220,38 @@ module RDF::Tabular
204
220
  # Provenance
205
221
  if prov?
206
222
  activity = RDF::Node.new
207
- add_statement(0, table_group, RDF::PROV.wasGeneratedBy, activity)
208
- add_statement(0, activity, RDF.type, RDF::PROV.Activity)
209
- add_statement(0, activity, RDF::PROV.wasAssociatedWith, RDF::URI("http://rubygems.org/gems/rdf-tabular"))
210
- add_statement(0, activity, RDF::PROV.startedAtTime, RDF::Literal::DateTime.new(start_time))
211
- add_statement(0, activity, RDF::PROV.endedAtTime, RDF::Literal::DateTime.new(Time.now))
223
+ add_statement(0, table_group, RDF::Vocab::PROV.wasGeneratedBy, activity)
224
+ add_statement(0, activity, RDF.type, RDF::Vocab::PROV.Activity)
225
+ add_statement(0, activity, RDF::Vocab::PROV.wasAssociatedWith, RDF::URI("http://rubygems.org/gems/rdf-tabular"))
226
+ add_statement(0, activity, RDF::Vocab::PROV.startedAtTime, RDF::Literal::DateTime.new(start_time))
227
+ add_statement(0, activity, RDF::Vocab::PROV.endedAtTime, RDF::Literal::DateTime.new(Time.now))
212
228
 
213
229
  unless (urls = input.tables.map(&:url)).empty?
214
230
  usage = RDF::Node.new
215
- add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
216
- add_statement(0, usage, RDF.type, RDF::PROV.Usage)
231
+ add_statement(0, activity, RDF::Vocab::PROV.qualifiedUsage, usage)
232
+ add_statement(0, usage, RDF.type, RDF::Vocab::PROV.Usage)
217
233
  urls.each do |url|
218
- add_statement(0, usage, RDF::PROV.entity, RDF::URI(url))
234
+ add_statement(0, usage, RDF::Vocab::PROV.entity, RDF::URI(url))
219
235
  end
220
- add_statement(0, usage, RDF::PROV.hadRole, CSVW.csvEncodedTabularData)
236
+ add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.csvEncodedTabularData)
221
237
  end
222
238
 
223
239
  unless Array(input.filenames).empty?
224
240
  usage = RDF::Node.new
225
- add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
226
- add_statement(0, usage, RDF.type, RDF::PROV.Usage)
241
+ add_statement(0, activity, RDF::Vocab::PROV.qualifiedUsage, usage)
242
+ add_statement(0, usage, RDF.type, RDF::Vocab::PROV.Usage)
227
243
  Array(input.filenames).each do |fn|
228
- add_statement(0, usage, RDF::PROV.entity, RDF::URI(fn))
244
+ add_statement(0, usage, RDF::Vocab::PROV.entity, RDF::URI(fn))
229
245
  end
230
- add_statement(0, usage, RDF::PROV.hadRole, CSVW.tabularMetadata)
246
+ add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.tabularMetadata)
231
247
  end
232
248
  end
233
- ensure
234
- warnings = @warnings.concat(input.warnings)
235
- if validate? && !warnings.empty? && !@options[:warnings]
236
- $stderr.puts "Warnings: #{warnings.join("\n")}"
237
- end
238
- if validate? && !errors.empty? && !@options[:errors]
239
- $stderr.puts "Errors: #{errors.join("\n")}"
240
- end
241
249
  end
242
250
  end
251
+
252
+ if validate? && log_statistics[:error]
253
+ raise RDF::ReaderError, "Errors found during processing"
254
+ end
243
255
  return
244
256
  end
245
257
 
@@ -260,8 +272,9 @@ module RDF::Tabular
260
272
  row.subject = table_resource
261
273
  add_statement(last_row_num + 1, row) unless metadata.suppressOutput
262
274
  next
275
+ else
276
+ last_row_num = row.sourceNumber
263
277
  end
264
- last_row_num = row.sourceNumber
265
278
 
266
279
  # Collect primary and foreign keys if validating
267
280
  if validate?
@@ -285,11 +298,20 @@ module RDF::Tabular
285
298
  end
286
299
  row.values.each_with_index do |cell, index|
287
300
  # Collect cell errors
288
- (validate? ? errors : warnings) << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): " +
289
- cell.errors.join("\n") unless Array(cell.errors).empty?
301
+ unless Array(cell.errors).empty?
302
+ self.send((validate? ? :log_error : :log_warn),
303
+ "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber})") do
304
+ cell.errors.join("\n")
305
+ end
306
+ end
290
307
  next if cell.column.suppressOutput # Skip ignored cells
291
308
  cell_subject = cell.aboutUrl || default_cell_subject
292
- propertyUrl = cell.propertyUrl || RDF::URI("#{metadata.url}##{cell.column.name}")
309
+ propertyUrl = cell.propertyUrl || begin
310
+ # It's possible that the metadata URL already has a fragment, in which case we need to override it.
311
+ u = metadata.url.dup
312
+ u.fragment = cell.column.name
313
+ u
314
+ end
293
315
  add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?
294
316
 
295
317
  if cell.column.valueUrl
@@ -338,16 +360,13 @@ module RDF::Tabular
338
360
  end
339
361
 
340
362
  ##
341
- # Validate and raise an exception if any errors are found while processing either metadata or tables
342
- # @return [self]
343
- # @raise [Error]
363
+ # Do we have valid metadata?
364
+ # @raise [RDF::ReaderError]
344
365
  def validate!
345
- each_statement {} # Read all rows
346
- raise Error, errors.join("\n") unless errors.empty?
347
- self
366
+ @options[:validate] = true
367
+ each_statement {}
348
368
  rescue RDF::ReaderError => e
349
369
  raise Error, e.message
350
- self
351
370
  end
352
371
 
353
372
  ##
@@ -394,16 +413,22 @@ module RDF::Tabular
394
413
  end
395
414
  options = {} unless options.is_a?(Hash)
396
415
 
397
- hash_fn = options[:atd] ? :to_atd : :to_hash
416
+ hash_fn = :to_hash
398
417
  options = options.merge(noProv: @options[:noProv])
399
418
 
400
- if io
419
+ res = if io
401
420
  ::JSON::dump_default_options = json_state
402
421
  ::JSON.dump(self.send(hash_fn, options), io)
403
422
  else
404
423
  hash = self.send(hash_fn, options)
405
424
  ::JSON.generate(hash, json_state)
406
425
  end
426
+
427
+ if validate? && log_statistics[:error]
428
+ raise RDF::Tabular::Error, "Errors found during processing"
429
+ end
430
+
431
+ res
407
432
  rescue IOError => e
408
433
  raise RDF::Tabular::Error, e.message
409
434
  end
@@ -418,8 +443,8 @@ module RDF::Tabular
418
443
  def to_hash(options = {})
419
444
  # Construct metadata from that passed from file open, along with information from the file.
420
445
  if input.is_a?(Metadata)
421
- debug("each_statement: metadata") {input.inspect}
422
- depth do
446
+ log_debug("each_statement: metadata") {input.inspect}
447
+ log_depth do
423
448
  # Get Metadata to invoke and open referenced files
424
449
  begin
425
450
  # Validate metadata
@@ -444,8 +469,6 @@ module RDF::Tabular
444
469
  base: input.tables.first.url,
445
470
  minimal: minimal?,
446
471
  no_found_metadata: true,
447
- warnings: @warnings,
448
- errors: @errors,
449
472
  )) do |r|
450
473
  case t = r.to_hash(options)
451
474
  when Array then tables += t unless input.tables.first.suppressOutput
@@ -460,8 +483,6 @@ module RDF::Tabular
460
483
  base: table.url,
461
484
  minimal: minimal?,
462
485
  no_found_metadata: true,
463
- warnings: @warnings,
464
- errors: @errors,
465
486
  )) do |r|
466
487
  case t = r.to_hash(options)
467
488
  when Array then tables += t unless table.suppressOutput
@@ -476,14 +497,6 @@ module RDF::Tabular
476
497
 
477
498
  # Result is table_group or array
478
499
  minimal? ? tables : table_group
479
- ensure
480
- warnings = @warnings.concat(input.warnings)
481
- if validate? && !warnings.empty? && !@options[:warnings]
482
- $stderr.puts "Warnings: #{warnings.join("\n")}"
483
- end
484
- if validate? && !errors.empty? && !@options[:errors]
485
- $stderr.puts "Errors: #{errors.join("\n")}"
486
- end
487
500
  end
488
501
  end
489
502
  else
@@ -523,8 +536,12 @@ module RDF::Tabular
523
536
  column = metadata.tableSchema.columns[index]
524
537
 
525
538
  # Collect cell errors
526
- (validate? ? errors : warnings) << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): " +
527
- cell.errors.join("\n") unless Array(cell.errors).empty?
539
+ unless Array(cell.errors).empty?
540
+ self.send(validate? ? :log_error : :log_warn,
541
+ "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): ") do
542
+ cell.errors.join("\n")
543
+ end
544
+ end
528
545
 
529
546
  # Ignore suppressed columns
530
547
  next if column.suppressOutput
@@ -606,71 +623,6 @@ module RDF::Tabular
606
623
  end
607
624
  end
608
625
 
609
- # Return a hash representation of the annotated tabular data model for JSON serialization
610
- # @param [Hash{Symbol => Object}] options
611
- # @return [Hash]
612
- def to_atd(options = {})
613
- # Construct metadata from that passed from file open, along with information from the file.
614
- if input.is_a?(Metadata)
615
- debug("each_statement: metadata") {input.inspect}
616
- depth do
617
- # Get Metadata to invoke and open referenced files
618
- case input.type
619
- when :TableGroup
620
- table_group = input.to_atd
621
- if input.tables.empty? && options[:original_input]
622
- Reader.new(options[:original_input], options.merge(
623
- base: options[:base],
624
- no_found_metadata: true
625
- )) do |r|
626
- table_group["tables"] << r.to_atd(options)
627
- end
628
- else
629
- input.each_table do |table|
630
- Reader.open(table.url, options.merge(
631
- metadata: table,
632
- base: table.url,
633
- no_found_metadata: true
634
- )) do |r|
635
- table_group["tables"] << r.to_atd(options)
636
- end
637
- end
638
- end
639
-
640
- # Result is table_group
641
- table_group
642
- when :Table
643
- table = nil
644
- Reader.open(input.url, options.merge(
645
- metadata: input,
646
- base: input.url,
647
- no_found_metadata: true
648
- )) do |r|
649
- table = r.to_atd(options)
650
- end
651
-
652
- table
653
- else
654
- raise "Opened inappropriate metadata type: #{input.type}"
655
- end
656
- end
657
- else
658
- rows = []
659
- table = metadata.to_atd
660
- rows, columns = table["rows"], table["columns"]
661
-
662
- # Input is file containing CSV data.
663
- # Output ROW-Level statements
664
- metadata.each_row(input) do |row|
665
- rows << row.to_atd
666
- row.values.each_with_index do |cell, colndx|
667
- columns[colndx]["cells"] << cell.to_atd
668
- end
669
- end
670
- table
671
- end
672
- end
673
-
674
626
  def minimal?; @options[:minimal]; end
675
627
  def prov?; !(@options[:noProv]); end
676
628
 
@@ -690,9 +642,9 @@ module RDF::Tabular
690
642
  # @param [URI, BNode, Literal] object the object of the statement
691
643
  # @raise [ReaderError] Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
692
644
  def add_statement(node, *args)
693
- statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement.new(*args)
645
+ statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement(*args)
694
646
  raise RDF::ReaderError, "#{statement.inspect} is invalid" if validate? && statement.invalid?
695
- debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
647
+ log_debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
696
648
  @callback.call(statement)
697
649
  end
698
650
 
@@ -701,7 +653,7 @@ module RDF::Tabular
701
653
  pk_strings = {}
702
654
  primary_keys.reject(&:empty?).each do |row_pks|
703
655
  pk_names = row_pks.map {|cell| cell.value}.join(",")
704
- errors << "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
656
+ log_error "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
705
657
  pk_strings[pk_names] ||= 0
706
658
  pk_strings[pk_names] += 1
707
659
  end
@@ -734,7 +686,7 @@ module RDF::Tabular
734
686
  fk[:reference_to] ||= {}
735
687
  cell_values = cells.map {|cell| cell.stringValue unless cell.stringValue.to_s.empty?}.compact
736
688
  next if cell_values.empty? # Don't record if empty
737
- errors << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
689
+ log_error "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
738
690
  fk[:reference_to][cell_values] ||= row
739
691
  end
740
692
  end
@@ -747,7 +699,7 @@ module RDF::Tabular
747
699
  # Verify that reference_from entry exists in reference_to
748
700
  fk.fetch(:reference_from, {}).each do |cell_values, row|
749
701
  unless fk.fetch(:reference_to, {}).has_key?(cell_values)
750
- errors << "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
702
+ log_error "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
751
703
  "Foreign Key violation, expected to find #{cell_values.map(&:to_s).inspect}"
752
704
  end
753
705
  end
@@ -134,7 +134,6 @@ module RDF::Tabular
134
134
  # Upcase value and remove internal spaces
135
135
  value = value.upcase
136
136
 
137
- #require 'byebug'; byebug unless value.empty?
138
137
  if value =~ re
139
138
 
140
139
  # Upcase value and remove internal spaces
@@ -211,6 +210,7 @@ module RDF::Tabular
211
210
 
212
211
  min_integer_digits = integer_part.gsub(groupChar, '').gsub('#', '').length
213
212
  all_integer_digits = integer_part.gsub(groupChar, '').length
213
+ all_integer_digits += 1 if all_integer_digits == min_integer_digits
214
214
  min_fractional_digits = fractional_part.gsub(groupChar, '').gsub('#', '').length
215
215
  max_fractional_digits = fractional_part.gsub(groupChar, '').length
216
216
  exponent_sign = exponent_part[0] if exponent_part =~ /^[+-]/
@@ -226,7 +226,7 @@ module RDF::Tabular
226
226
 
227
227
  # Construct regular expression for integer part
228
228
  integer_str = if primary_grouping_size == 0
229
- all_integer_digits > min_integer_digits ? "\\d{#{min_integer_digits},}" : "\\d{#{min_integer_digits}}"
229
+ "\\d{#{min_integer_digits},}"
230
230
  else
231
231
  # These number of groupings must be there
232
232
  integer_parts = []
@@ -235,8 +235,8 @@ module RDF::Tabular
235
235
  sz = [primary_grouping_size, min_integer_digits].min
236
236
  integer_rem = primary_grouping_size - sz
237
237
  integer_parts << "\\d{#{sz}}"
238
- min_integer_digits -= primary_grouping_size
239
- all_integer_digits -= primary_grouping_size
238
+ min_integer_digits -= sz
239
+ all_integer_digits -= sz
240
240
  primary_grouping_size = secondary_grouping_size
241
241
  end
242
242
  required_digits = integer_parts.reverse.join(ge)