rdf-tabular 0.3.0 → 0.4.0.beta2

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,7 +8,7 @@ module RDF::Tabular
8
8
  # @author [Gregg Kellogg](http://greggkellogg.net/)
9
9
  class Reader < RDF::Reader
10
10
  format Format
11
- include Utils
11
+ include RDF::Util::Logger
12
12
 
13
13
  # Metadata associated with the CSV
14
14
  #
@@ -21,14 +21,27 @@ module RDF::Tabular
21
21
  attr_reader :input
22
22
 
23
23
  ##
24
- # Warnings found during processing
25
- # @return [Array<String>]
26
- attr_reader :warnings
27
-
28
- ##
29
- # Accumulated errors found during processing
30
- # @return [Array<String>]
31
- attr_reader :errors
24
+ # Reader options
25
+ # @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Writer#options-class_method
26
+ def self.options
27
+ super + [
28
+ RDF::CLI::Option.new(
29
+ symbol: :metadata,
30
+ datatype: RDF::URI,
31
+ on: ["--metadata URI"],
32
+ description: "user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location.") {|arg| RDF::URI(arg)},
33
+ RDF::CLI::Option.new(
34
+ symbol: :minimal,
35
+ datatype: TrueClass,
36
+ on: ["--minimal"],
37
+ description: "Includes only the information gleaned from the cells of the tabular data.") {true},
38
+ RDF::CLI::Option.new(
39
+ symbol: :noProv,
40
+ datatype: TrueClass,
41
+ on: ["--no-prov"],
42
+ description: "do not output optional provenance information.") {true},
43
+ ]
44
+ end
32
45
 
33
46
  ##
34
47
  # Initializes the RDF::Tabular Reader instance.
@@ -41,10 +54,6 @@ module RDF::Tabular
41
54
  # @option options [Metadata, Hash, String, RDF::URI] :metadata user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loaded from that location
42
55
  # @option options [Boolean] :minimal includes only the information gleaned from the cells of the tabular data
43
56
  # @option options [Boolean] :noProv do not output optional provenance information
44
- # @option options [Array] :errors
45
- # array for placing errors found when processing metadata. If not set, and validating, errors are output to `$stderr`
46
- # @option options [Array] :warnings
47
- # array for placing warnings found when processing metadata. If not set, and validating, warnings are output to `$stderr`
48
57
  # @option options [Array<Hash>] :fks_referencing_table
49
58
  # When called with Table metadata, a list of the foreign keys referencing this table
50
59
  # @yield [reader] `self`
@@ -62,11 +71,7 @@ module RDF::Tabular
62
71
  @options[:base] = "file:/#{File.expand_path(@options[:base])}"
63
72
  end
64
73
 
65
- @options[:depth] ||= 0
66
- @errors = @options.fetch(:errors, [])
67
- @warnings = @options.fetch(:warnings, [])
68
-
69
- debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
74
+ log_debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
70
75
 
71
76
  # Minimal implies noProv
72
77
  @options[:noProv] ||= @options[:minimal]
@@ -77,7 +82,7 @@ module RDF::Tabular
77
82
  else input
78
83
  end
79
84
 
80
- depth do
85
+ log_depth do
81
86
  # If input is JSON, then the input is the metadata
82
87
  content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
83
88
  if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
@@ -86,6 +91,20 @@ module RDF::Tabular
86
91
  @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
87
92
  @metadata.normalize!
88
93
  @input = @metadata
94
+ elsif (@options[:base].to_s.end_with?(".html") || %w(text/html application/xhtml+html).include?(content_type)) &&
95
+ !RDF::URI(@options[:base].to_s).fragment
96
+ require 'nokogiri' unless defined?(:Nokogiri)
97
+ doc = Nokogiri::HTML.parse(input)
98
+ doc.xpath("//script[@type='application/csvm+json']/text()").each do |script|
99
+ def script.content_type; "application/csvm+json"; end
100
+ log_debug("Reader#initialize") {"Process HTML script block"}
101
+ @input = script
102
+ @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
103
+ # If @metadata is for a Table, turn it into a TableGroup
104
+ @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
105
+ @metadata.normalize!
106
+ @input = @metadata
107
+ end
89
108
  elsif @options[:no_found_metadata]
90
109
  # Extract embedded metadata and merge
91
110
  dialect_metadata = @options[:metadata] || Table.new({}, context: "http://www.w3.org/ns/csvw")
@@ -117,7 +136,7 @@ module RDF::Tabular
117
136
  @input = @metadata = Metadata.for_input(@input, @options).normalize!
118
137
  end
119
138
 
120
- debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
139
+ log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
121
140
 
122
141
  if block_given?
123
142
  case block.arity
@@ -140,9 +159,9 @@ module RDF::Tabular
140
159
 
141
160
  # Construct metadata from that passed from file open, along with information from the file.
142
161
  if input.is_a?(Metadata)
143
- debug("each_statement: metadata") {input.inspect}
162
+ log_debug("each_statement: metadata") {input.inspect}
144
163
 
145
- depth do
164
+ log_depth do
146
165
  begin
147
166
  # Validate metadata
148
167
  input.validate!
@@ -169,8 +188,6 @@ module RDF::Tabular
169
188
  base: input.tables.first.url,
170
189
  no_found_metadata: true,
171
190
  table_resource: table_resource,
172
- warnings: @warnings,
173
- errors: @errors,
174
191
  )) do |r|
175
192
  r.each_statement(&block)
176
193
  end
@@ -191,8 +208,6 @@ module RDF::Tabular
191
208
  no_found_metadata: true,
192
209
  table_resource: table_resource,
193
210
  fks_referencing_table: fks,
194
- warnings: @warnings,
195
- errors: @errors,
196
211
  )) do |r|
197
212
  r.each_statement(&block)
198
213
  end
@@ -231,16 +246,12 @@ module RDF::Tabular
231
246
  add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.tabularMetadata)
232
247
  end
233
248
  end
234
- ensure
235
- warnings = @warnings.concat(input.warnings)
236
- if validate? && !warnings.empty? && !@options[:warnings]
237
- $stderr.puts "Warnings: #{warnings.join("\n")}"
238
- end
239
- if validate? && !errors.empty? && !@options[:errors]
240
- $stderr.puts "Errors: #{errors.join("\n")}"
241
- end
242
249
  end
243
250
  end
251
+
252
+ if validate? && log_statistics[:error]
253
+ raise RDF::ReaderError, "Errors found during processing"
254
+ end
244
255
  return
245
256
  end
246
257
 
@@ -261,8 +272,9 @@ module RDF::Tabular
261
272
  row.subject = table_resource
262
273
  add_statement(last_row_num + 1, row) unless metadata.suppressOutput
263
274
  next
275
+ else
276
+ last_row_num = row.sourceNumber
264
277
  end
265
- last_row_num = row.sourceNumber
266
278
 
267
279
  # Collect primary and foreign keys if validating
268
280
  if validate?
@@ -286,11 +298,20 @@ module RDF::Tabular
286
298
  end
287
299
  row.values.each_with_index do |cell, index|
288
300
  # Collect cell errors
289
- (validate? ? errors : warnings) << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): " +
290
- cell.errors.join("\n") unless Array(cell.errors).empty?
301
+ unless Array(cell.errors).empty?
302
+ self.send((validate? ? :log_error : :log_warn),
303
+ "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber})") do
304
+ cell.errors.join("\n")
305
+ end
306
+ end
291
307
  next if cell.column.suppressOutput # Skip ignored cells
292
308
  cell_subject = cell.aboutUrl || default_cell_subject
293
- propertyUrl = cell.propertyUrl || RDF::URI("#{metadata.url}##{cell.column.name}")
309
+ propertyUrl = cell.propertyUrl || begin
310
+ # It's possible that the metadata URL already has a fragment, in which case we need to override it.
311
+ u = metadata.url.dup
312
+ u.fragment = cell.column.name
313
+ u
314
+ end
294
315
  add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?
295
316
 
296
317
  if cell.column.valueUrl
@@ -339,16 +360,13 @@ module RDF::Tabular
339
360
  end
340
361
 
341
362
  ##
342
- # Validate and raise an exception if any errors are found while processing either metadata or tables
343
- # @return [self]
344
- # @raise [Error]
363
+ # Do we have valid metadata?
364
+ # @raise [RDF::ReaderError]
345
365
  def validate!
346
- each_statement {} # Read all rows
347
- raise Error, errors.join("\n") unless errors.empty?
348
- self
366
+ @options[:validate] = true
367
+ each_statement {}
349
368
  rescue RDF::ReaderError => e
350
369
  raise Error, e.message
351
- self
352
370
  end
353
371
 
354
372
  ##
@@ -395,16 +413,22 @@ module RDF::Tabular
395
413
  end
396
414
  options = {} unless options.is_a?(Hash)
397
415
 
398
- hash_fn = options[:atd] ? :to_atd : :to_hash
416
+ hash_fn = :to_hash
399
417
  options = options.merge(noProv: @options[:noProv])
400
418
 
401
- if io
419
+ res = if io
402
420
  ::JSON::dump_default_options = json_state
403
421
  ::JSON.dump(self.send(hash_fn, options), io)
404
422
  else
405
423
  hash = self.send(hash_fn, options)
406
424
  ::JSON.generate(hash, json_state)
407
425
  end
426
+
427
+ if validate? && log_statistics[:error]
428
+ raise RDF::Tabular::Error, "Errors found during processing"
429
+ end
430
+
431
+ res
408
432
  rescue IOError => e
409
433
  raise RDF::Tabular::Error, e.message
410
434
  end
@@ -419,8 +443,8 @@ module RDF::Tabular
419
443
  def to_hash(options = {})
420
444
  # Construct metadata from that passed from file open, along with information from the file.
421
445
  if input.is_a?(Metadata)
422
- debug("each_statement: metadata") {input.inspect}
423
- depth do
446
+ log_debug("each_statement: metadata") {input.inspect}
447
+ log_depth do
424
448
  # Get Metadata to invoke and open referenced files
425
449
  begin
426
450
  # Validate metadata
@@ -445,8 +469,6 @@ module RDF::Tabular
445
469
  base: input.tables.first.url,
446
470
  minimal: minimal?,
447
471
  no_found_metadata: true,
448
- warnings: @warnings,
449
- errors: @errors,
450
472
  )) do |r|
451
473
  case t = r.to_hash(options)
452
474
  when Array then tables += t unless input.tables.first.suppressOutput
@@ -461,8 +483,6 @@ module RDF::Tabular
461
483
  base: table.url,
462
484
  minimal: minimal?,
463
485
  no_found_metadata: true,
464
- warnings: @warnings,
465
- errors: @errors,
466
486
  )) do |r|
467
487
  case t = r.to_hash(options)
468
488
  when Array then tables += t unless table.suppressOutput
@@ -477,14 +497,6 @@ module RDF::Tabular
477
497
 
478
498
  # Result is table_group or array
479
499
  minimal? ? tables : table_group
480
- ensure
481
- warnings = @warnings.concat(input.warnings)
482
- if validate? && !warnings.empty? && !@options[:warnings]
483
- $stderr.puts "Warnings: #{warnings.join("\n")}"
484
- end
485
- if validate? && !errors.empty? && !@options[:errors]
486
- $stderr.puts "Errors: #{errors.join("\n")}"
487
- end
488
500
  end
489
501
  end
490
502
  else
@@ -524,8 +536,12 @@ module RDF::Tabular
524
536
  column = metadata.tableSchema.columns[index]
525
537
 
526
538
  # Collect cell errors
527
- (validate? ? errors : warnings) << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): " +
528
- cell.errors.join("\n") unless Array(cell.errors).empty?
539
+ unless Array(cell.errors).empty?
540
+ self.send(validate? ? :log_error : :log_warn,
541
+ "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): ") do
542
+ cell.errors.join("\n")
543
+ end
544
+ end
529
545
 
530
546
  # Ignore suppressed columns
531
547
  next if column.suppressOutput
@@ -607,71 +623,6 @@ module RDF::Tabular
607
623
  end
608
624
  end
609
625
 
610
- # Return a hash representation of the annotated tabular data model for JSON serialization
611
- # @param [Hash{Symbol => Object}] options
612
- # @return [Hash]
613
- def to_atd(options = {})
614
- # Construct metadata from that passed from file open, along with information from the file.
615
- if input.is_a?(Metadata)
616
- debug("each_statement: metadata") {input.inspect}
617
- depth do
618
- # Get Metadata to invoke and open referenced files
619
- case input.type
620
- when :TableGroup
621
- table_group = input.to_atd
622
- if input.tables.empty? && options[:original_input]
623
- Reader.new(options[:original_input], options.merge(
624
- base: options[:base],
625
- no_found_metadata: true
626
- )) do |r|
627
- table_group["tables"] << r.to_atd(options)
628
- end
629
- else
630
- input.each_table do |table|
631
- Reader.open(table.url, options.merge(
632
- metadata: table,
633
- base: table.url,
634
- no_found_metadata: true
635
- )) do |r|
636
- table_group["tables"] << r.to_atd(options)
637
- end
638
- end
639
- end
640
-
641
- # Result is table_group
642
- table_group
643
- when :Table
644
- table = nil
645
- Reader.open(input.url, options.merge(
646
- metadata: input,
647
- base: input.url,
648
- no_found_metadata: true
649
- )) do |r|
650
- table = r.to_atd(options)
651
- end
652
-
653
- table
654
- else
655
- raise "Opened inappropriate metadata type: #{input.type}"
656
- end
657
- end
658
- else
659
- rows = []
660
- table = metadata.to_atd
661
- rows, columns = table["rows"], table["columns"]
662
-
663
- # Input is file containing CSV data.
664
- # Output ROW-Level statements
665
- metadata.each_row(input) do |row|
666
- rows << row.to_atd
667
- row.values.each_with_index do |cell, colndx|
668
- columns[colndx]["cells"] << cell.to_atd
669
- end
670
- end
671
- table
672
- end
673
- end
674
-
675
626
  def minimal?; @options[:minimal]; end
676
627
  def prov?; !(@options[:noProv]); end
677
628
 
@@ -691,9 +642,9 @@ module RDF::Tabular
691
642
  # @param [URI, BNode, Literal] object the object of the statement
692
643
  # @raise [ReaderError] Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
693
644
  def add_statement(node, *args)
694
- statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement.new(*args)
645
+ statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement(*args)
695
646
  raise RDF::ReaderError, "#{statement.inspect} is invalid" if validate? && statement.invalid?
696
- debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
647
+ log_debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
697
648
  @callback.call(statement)
698
649
  end
699
650
 
@@ -702,7 +653,7 @@ module RDF::Tabular
702
653
  pk_strings = {}
703
654
  primary_keys.reject(&:empty?).each do |row_pks|
704
655
  pk_names = row_pks.map {|cell| cell.value}.join(",")
705
- errors << "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
656
+ log_error "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
706
657
  pk_strings[pk_names] ||= 0
707
658
  pk_strings[pk_names] += 1
708
659
  end
@@ -735,7 +686,7 @@ module RDF::Tabular
735
686
  fk[:reference_to] ||= {}
736
687
  cell_values = cells.map {|cell| cell.stringValue unless cell.stringValue.to_s.empty?}.compact
737
688
  next if cell_values.empty? # Don't record if empty
738
- errors << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
689
+ log_error "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
739
690
  fk[:reference_to][cell_values] ||= row
740
691
  end
741
692
  end
@@ -748,7 +699,7 @@ module RDF::Tabular
748
699
  # Verify that reference_from entry exists in reference_to
749
700
  fk.fetch(:reference_from, {}).each do |cell_values, row|
750
701
  unless fk.fetch(:reference_to, {}).has_key?(cell_values)
751
- errors << "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
702
+ log_error "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
752
703
  "Foreign Key violation, expected to find #{cell_values.map(&:to_s).inspect}"
753
704
  end
754
705
  end
@@ -134,7 +134,6 @@ module RDF::Tabular
134
134
  # Upcase value and remove internal spaces
135
135
  value = value.upcase
136
136
 
137
- #require 'byebug'; byebug unless value.empty?
138
137
  if value =~ re
139
138
 
140
139
  # Upcase value and remove internal spaces
@@ -226,7 +225,6 @@ module RDF::Tabular
226
225
  fractional_grouping_size = fractional_parts[0].to_s.length
227
226
 
228
227
  # Construct regular expression for integer part
229
- #require 'byebug'; byebug
230
228
  integer_str = if primary_grouping_size == 0
231
229
  "\\d{#{min_integer_digits},}"
232
230
  else
data/spec/format_spec.rb CHANGED
@@ -30,4 +30,38 @@ describe RDF::Tabular::Format do
30
30
  describe "#to_sym" do
31
31
  specify {expect(described_class.to_sym).to eq :tabular}
32
32
  end
33
+
34
+ describe ".cli_commands" do
35
+ before(:each) do
36
+ WebMock.stub_request(:any, %r(.*example.org.*)).
37
+ to_return(lambda {|request|
38
+ file = request.uri.to_s.split('/').last
39
+ content_type = case file
40
+ when /\.json/ then 'application/json'
41
+ when /\.csv/ then 'text/csv'
42
+ else 'text/plain'
43
+ end
44
+
45
+ path = File.expand_path("../data/#{file}", __FILE__)
46
+ if File.exist?(path)
47
+ {
48
+ body: File.read(path),
49
+ status: 200,
50
+ headers: {'Content-Type' => content_type}
51
+ }
52
+ else
53
+ {status: 401}
54
+ end
55
+ })
56
+ end
57
+ after(:each) {|example| puts logger.to_s if example.exception}
58
+
59
+ require 'rdf/cli'
60
+ let(:input) {File.expand_path("../data/countries.json", __FILE__)}
61
+ describe "#tabular-json" do
62
+ it "serializes to JSON" do
63
+ expect {RDF::CLI.exec_command("tabular-json", [input], format: :tabular)}.to write.to(:output)
64
+ end
65
+ end
66
+ end
33
67
  end