rdf-tabular 0.3.0 → 0.4.0.beta2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ module RDF::Tabular
8
8
  # @author [Gregg Kellogg](http://greggkellogg.net/)
9
9
  class Reader < RDF::Reader
10
10
  format Format
11
- include Utils
11
+ include RDF::Util::Logger
12
12
 
13
13
  # Metadata associated with the CSV
14
14
  #
@@ -21,14 +21,27 @@ module RDF::Tabular
21
21
  attr_reader :input
22
22
 
23
23
  ##
24
- # Warnings found during processing
25
- # @return [Array<String>]
26
- attr_reader :warnings
27
-
28
- ##
29
- # Accumulated errors found during processing
30
- # @return [Array<String>]
31
- attr_reader :errors
24
+ # Writer options
25
+ # @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Writer#options-class_method
26
+ def self.options
27
+ super + [
28
+ RDF::CLI::Option.new(
29
+ symbol: :metadata,
30
+ datatype: RDF::URI,
31
+ on: ["--metadata URI"],
32
+ description: "user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location.") {|arg| RDF::URI(arg)},
33
+ RDF::CLI::Option.new(
34
+ symbol: :minimal,
35
+ datatype: TrueClass,
36
+ on: ["--minimal"],
37
+ description: "Includes only the information gleaned from the cells of the tabular data.") {true},
38
+ RDF::CLI::Option.new(
39
+ symbol: :noProv,
40
+ datatype: TrueClass,
41
+ on: ["--no-prov"],
42
+ description: "do not output optional provenance information.") {true},
43
+ ]
44
+ end
32
45
 
33
46
  ##
34
47
  # Initializes the RDF::Tabular Reader instance.
@@ -41,10 +54,6 @@ module RDF::Tabular
41
54
  # @option options [Metadata, Hash, String, RDF::URI] :metadata user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loaded from that location
42
55
  # @option options [Boolean] :minimal includes only the information gleaned from the cells of the tabular data
43
56
  # @option options [Boolean] :noProv do not output optional provenance information
44
- # @option options [Array] :errors
45
- # array for placing errors found when processing metadata. If not set, and validating, errors are output to `$stderr`
46
- # @option options [Array] :warnings
47
- # array for placing warnings found when processing metadata. If not set, and validating, warnings are output to `$stderr`
48
57
  # @option options [Array<Hash>] :fks_referencing_table
49
58
  # When called with Table metadata, a list of the foreign keys referencing this table
50
59
  # @yield [reader] `self`
@@ -62,11 +71,7 @@ module RDF::Tabular
62
71
  @options[:base] = "file:/#{File.expand_path(@options[:base])}"
63
72
  end
64
73
 
65
- @options[:depth] ||= 0
66
- @errors = @options.fetch(:errors, [])
67
- @warnings = @options.fetch(:warnings, [])
68
-
69
- debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
74
+ log_debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
70
75
 
71
76
  # Minimal implies noProv
72
77
  @options[:noProv] ||= @options[:minimal]
@@ -77,7 +82,7 @@ module RDF::Tabular
77
82
  else input
78
83
  end
79
84
 
80
- depth do
85
+ log_depth do
81
86
  # If input is JSON, then the input is the metadata
82
87
  content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
83
88
  if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
@@ -86,6 +91,20 @@ module RDF::Tabular
86
91
  @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
87
92
  @metadata.normalize!
88
93
  @input = @metadata
94
+ elsif (@options[:base].to_s.end_with?(".html") || %w(text/html application/xhtml+html).include?(content_type)) &&
95
+ !RDF::URI(@options[:base].to_s).fragment
96
+ require 'nokogiri' unless defined?(:Nokogiri)
97
+ doc = Nokogiri::HTML.parse(input)
98
+ doc.xpath("//script[@type='application/csvm+json']/text()").each do |script|
99
+ def script.content_type; "application/csvm+json"; end
100
+ log_debug("Reader#initialize") {"Process HTML script block"}
101
+ @input = script
102
+ @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
103
+ # If @metadata is for a Table, turn it into a TableGroup
104
+ @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
105
+ @metadata.normalize!
106
+ @input = @metadata
107
+ end
89
108
  elsif @options[:no_found_metadata]
90
109
  # Extract embedded metadata and merge
91
110
  dialect_metadata = @options[:metadata] || Table.new({}, context: "http://www.w3.org/ns/csvw")
@@ -117,7 +136,7 @@ module RDF::Tabular
117
136
  @input = @metadata = Metadata.for_input(@input, @options).normalize!
118
137
  end
119
138
 
120
- debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
139
+ log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
121
140
 
122
141
  if block_given?
123
142
  case block.arity
@@ -140,9 +159,9 @@ module RDF::Tabular
140
159
 
141
160
  # Construct metadata from that passed from file open, along with information from the file.
142
161
  if input.is_a?(Metadata)
143
- debug("each_statement: metadata") {input.inspect}
162
+ log_debug("each_statement: metadata") {input.inspect}
144
163
 
145
- depth do
164
+ log_depth do
146
165
  begin
147
166
  # Validate metadata
148
167
  input.validate!
@@ -169,8 +188,6 @@ module RDF::Tabular
169
188
  base: input.tables.first.url,
170
189
  no_found_metadata: true,
171
190
  table_resource: table_resource,
172
- warnings: @warnings,
173
- errors: @errors,
174
191
  )) do |r|
175
192
  r.each_statement(&block)
176
193
  end
@@ -191,8 +208,6 @@ module RDF::Tabular
191
208
  no_found_metadata: true,
192
209
  table_resource: table_resource,
193
210
  fks_referencing_table: fks,
194
- warnings: @warnings,
195
- errors: @errors,
196
211
  )) do |r|
197
212
  r.each_statement(&block)
198
213
  end
@@ -231,16 +246,12 @@ module RDF::Tabular
231
246
  add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.tabularMetadata)
232
247
  end
233
248
  end
234
- ensure
235
- warnings = @warnings.concat(input.warnings)
236
- if validate? && !warnings.empty? && !@options[:warnings]
237
- $stderr.puts "Warnings: #{warnings.join("\n")}"
238
- end
239
- if validate? && !errors.empty? && !@options[:errors]
240
- $stderr.puts "Errors: #{errors.join("\n")}"
241
- end
242
249
  end
243
250
  end
251
+
252
+ if validate? && log_statistics[:error]
253
+ raise RDF::ReaderError, "Errors found during processing"
254
+ end
244
255
  return
245
256
  end
246
257
 
@@ -261,8 +272,9 @@ module RDF::Tabular
261
272
  row.subject = table_resource
262
273
  add_statement(last_row_num + 1, row) unless metadata.suppressOutput
263
274
  next
275
+ else
276
+ last_row_num = row.sourceNumber
264
277
  end
265
- last_row_num = row.sourceNumber
266
278
 
267
279
  # Collect primary and foreign keys if validating
268
280
  if validate?
@@ -286,11 +298,20 @@ module RDF::Tabular
286
298
  end
287
299
  row.values.each_with_index do |cell, index|
288
300
  # Collect cell errors
289
- (validate? ? errors : warnings) << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): " +
290
- cell.errors.join("\n") unless Array(cell.errors).empty?
301
+ unless Array(cell.errors).empty?
302
+ self.send((validate? ? :log_error : :log_warn),
303
+ "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber})") do
304
+ cell.errors.join("\n")
305
+ end
306
+ end
291
307
  next if cell.column.suppressOutput # Skip ignored cells
292
308
  cell_subject = cell.aboutUrl || default_cell_subject
293
- propertyUrl = cell.propertyUrl || RDF::URI("#{metadata.url}##{cell.column.name}")
309
+ propertyUrl = cell.propertyUrl || begin
310
+ # It's possible that the metadata URL already has a fragment, in which case we need to override it.
311
+ u = metadata.url.dup
312
+ u.fragment = cell.column.name
313
+ u
314
+ end
294
315
  add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?
295
316
 
296
317
  if cell.column.valueUrl
@@ -339,16 +360,13 @@ module RDF::Tabular
339
360
  end
340
361
 
341
362
  ##
342
- # Validate and raise an exception if any errors are found while processing either metadata or tables
343
- # @return [self]
344
- # @raise [Error]
363
+ # Do we have valid metadata?
364
+ # @raise [RDF::ReaderError]
345
365
  def validate!
346
- each_statement {} # Read all rows
347
- raise Error, errors.join("\n") unless errors.empty?
348
- self
366
+ @options[:validate] = true
367
+ each_statement {}
349
368
  rescue RDF::ReaderError => e
350
369
  raise Error, e.message
351
- self
352
370
  end
353
371
 
354
372
  ##
@@ -395,16 +413,22 @@ module RDF::Tabular
395
413
  end
396
414
  options = {} unless options.is_a?(Hash)
397
415
 
398
- hash_fn = options[:atd] ? :to_atd : :to_hash
416
+ hash_fn = :to_hash
399
417
  options = options.merge(noProv: @options[:noProv])
400
418
 
401
- if io
419
+ res = if io
402
420
  ::JSON::dump_default_options = json_state
403
421
  ::JSON.dump(self.send(hash_fn, options), io)
404
422
  else
405
423
  hash = self.send(hash_fn, options)
406
424
  ::JSON.generate(hash, json_state)
407
425
  end
426
+
427
+ if validate? && log_statistics[:error]
428
+ raise RDF::Tabular::Error, "Errors found during processing"
429
+ end
430
+
431
+ res
408
432
  rescue IOError => e
409
433
  raise RDF::Tabular::Error, e.message
410
434
  end
@@ -419,8 +443,8 @@ module RDF::Tabular
419
443
  def to_hash(options = {})
420
444
  # Construct metadata from that passed from file open, along with information from the file.
421
445
  if input.is_a?(Metadata)
422
- debug("each_statement: metadata") {input.inspect}
423
- depth do
446
+ log_debug("each_statement: metadata") {input.inspect}
447
+ log_depth do
424
448
  # Get Metadata to invoke and open referenced files
425
449
  begin
426
450
  # Validate metadata
@@ -445,8 +469,6 @@ module RDF::Tabular
445
469
  base: input.tables.first.url,
446
470
  minimal: minimal?,
447
471
  no_found_metadata: true,
448
- warnings: @warnings,
449
- errors: @errors,
450
472
  )) do |r|
451
473
  case t = r.to_hash(options)
452
474
  when Array then tables += t unless input.tables.first.suppressOutput
@@ -461,8 +483,6 @@ module RDF::Tabular
461
483
  base: table.url,
462
484
  minimal: minimal?,
463
485
  no_found_metadata: true,
464
- warnings: @warnings,
465
- errors: @errors,
466
486
  )) do |r|
467
487
  case t = r.to_hash(options)
468
488
  when Array then tables += t unless table.suppressOutput
@@ -477,14 +497,6 @@ module RDF::Tabular
477
497
 
478
498
  # Result is table_group or array
479
499
  minimal? ? tables : table_group
480
- ensure
481
- warnings = @warnings.concat(input.warnings)
482
- if validate? && !warnings.empty? && !@options[:warnings]
483
- $stderr.puts "Warnings: #{warnings.join("\n")}"
484
- end
485
- if validate? && !errors.empty? && !@options[:errors]
486
- $stderr.puts "Errors: #{errors.join("\n")}"
487
- end
488
500
  end
489
501
  end
490
502
  else
@@ -524,8 +536,12 @@ module RDF::Tabular
524
536
  column = metadata.tableSchema.columns[index]
525
537
 
526
538
  # Collect cell errors
527
- (validate? ? errors : warnings) << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): " +
528
- cell.errors.join("\n") unless Array(cell.errors).empty?
539
+ unless Array(cell.errors).empty?
540
+ self.send(validate? ? :log_error : :log_warn,
541
+ "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): ") do
542
+ cell.errors.join("\n")
543
+ end
544
+ end
529
545
 
530
546
  # Ignore suppressed columns
531
547
  next if column.suppressOutput
@@ -607,71 +623,6 @@ module RDF::Tabular
607
623
  end
608
624
  end
609
625
 
610
- # Return a hash representation of the annotated tabular data model for JSON serialization
611
- # @param [Hash{Symbol => Object}] options
612
- # @return [Hash]
613
- def to_atd(options = {})
614
- # Construct metadata from that passed from file open, along with information from the file.
615
- if input.is_a?(Metadata)
616
- debug("each_statement: metadata") {input.inspect}
617
- depth do
618
- # Get Metadata to invoke and open referenced files
619
- case input.type
620
- when :TableGroup
621
- table_group = input.to_atd
622
- if input.tables.empty? && options[:original_input]
623
- Reader.new(options[:original_input], options.merge(
624
- base: options[:base],
625
- no_found_metadata: true
626
- )) do |r|
627
- table_group["tables"] << r.to_atd(options)
628
- end
629
- else
630
- input.each_table do |table|
631
- Reader.open(table.url, options.merge(
632
- metadata: table,
633
- base: table.url,
634
- no_found_metadata: true
635
- )) do |r|
636
- table_group["tables"] << r.to_atd(options)
637
- end
638
- end
639
- end
640
-
641
- # Result is table_group
642
- table_group
643
- when :Table
644
- table = nil
645
- Reader.open(input.url, options.merge(
646
- metadata: input,
647
- base: input.url,
648
- no_found_metadata: true
649
- )) do |r|
650
- table = r.to_atd(options)
651
- end
652
-
653
- table
654
- else
655
- raise "Opened inappropriate metadata type: #{input.type}"
656
- end
657
- end
658
- else
659
- rows = []
660
- table = metadata.to_atd
661
- rows, columns = table["rows"], table["columns"]
662
-
663
- # Input is file containing CSV data.
664
- # Output ROW-Level statements
665
- metadata.each_row(input) do |row|
666
- rows << row.to_atd
667
- row.values.each_with_index do |cell, colndx|
668
- columns[colndx]["cells"] << cell.to_atd
669
- end
670
- end
671
- table
672
- end
673
- end
674
-
675
626
  def minimal?; @options[:minimal]; end
676
627
  def prov?; !(@options[:noProv]); end
677
628
 
@@ -691,9 +642,9 @@ module RDF::Tabular
691
642
  # @param [URI, BNode, Literal] object the object of the statement
692
643
  # @raise [ReaderError] Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
693
644
  def add_statement(node, *args)
694
- statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement.new(*args)
645
+ statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement(*args)
695
646
  raise RDF::ReaderError, "#{statement.inspect} is invalid" if validate? && statement.invalid?
696
- debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
647
+ log_debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
697
648
  @callback.call(statement)
698
649
  end
699
650
 
@@ -702,7 +653,7 @@ module RDF::Tabular
702
653
  pk_strings = {}
703
654
  primary_keys.reject(&:empty?).each do |row_pks|
704
655
  pk_names = row_pks.map {|cell| cell.value}.join(",")
705
- errors << "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
656
+ log_error "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
706
657
  pk_strings[pk_names] ||= 0
707
658
  pk_strings[pk_names] += 1
708
659
  end
@@ -735,7 +686,7 @@ module RDF::Tabular
735
686
  fk[:reference_to] ||= {}
736
687
  cell_values = cells.map {|cell| cell.stringValue unless cell.stringValue.to_s.empty?}.compact
737
688
  next if cell_values.empty? # Don't record if empty
738
- errors << "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
689
+ log_error "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
739
690
  fk[:reference_to][cell_values] ||= row
740
691
  end
741
692
  end
@@ -748,7 +699,7 @@ module RDF::Tabular
748
699
  # Verify that reference_from entry exists in reference_to
749
700
  fk.fetch(:reference_from, {}).each do |cell_values, row|
750
701
  unless fk.fetch(:reference_to, {}).has_key?(cell_values)
751
- errors << "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
702
+ log_error "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
752
703
  "Foreign Key violation, expected to find #{cell_values.map(&:to_s).inspect}"
753
704
  end
754
705
  end
@@ -134,7 +134,6 @@ module RDF::Tabular
134
134
  # Upcase value and remove internal spaces
135
135
  value = value.upcase
136
136
 
137
- #require 'byebug'; byebug unless value.empty?
138
137
  if value =~ re
139
138
 
140
139
  # Upcase value and remove internal spaces
@@ -226,7 +225,6 @@ module RDF::Tabular
226
225
  fractional_grouping_size = fractional_parts[0].to_s.length
227
226
 
228
227
  # Construct regular expression for integer part
229
- #require 'byebug'; byebug
230
228
  integer_str = if primary_grouping_size == 0
231
229
  "\\d{#{min_integer_digits},}"
232
230
  else
data/spec/format_spec.rb CHANGED
@@ -30,4 +30,38 @@ describe RDF::Tabular::Format do
30
30
  describe "#to_sym" do
31
31
  specify {expect(described_class.to_sym).to eq :tabular}
32
32
  end
33
+
34
+ describe ".cli_commands" do
35
+ before(:each) do
36
+ WebMock.stub_request(:any, %r(.*example.org.*)).
37
+ to_return(lambda {|request|
38
+ file = request.uri.to_s.split('/').last
39
+ content_type = case file
40
+ when /\.json/ then 'application/json'
41
+ when /\.csv/ then 'text/csv'
42
+ else 'text/plain'
43
+ end
44
+
45
+ path = File.expand_path("../data/#{file}", __FILE__)
46
+ if File.exist?(path)
47
+ {
48
+ body: File.read(path),
49
+ status: 200,
50
+ headers: {'Content-Type' => content_type}
51
+ }
52
+ else
53
+ {status: 401}
54
+ end
55
+ })
56
+ end
57
+ after(:each) {|example| puts logger.to_s if example.exception}
58
+
59
+ require 'rdf/cli'
60
+ let(:input) {File.expand_path("../data/countries.json", __FILE__)}
61
+ describe "#tabular-json" do
62
+ it "serializes to JSON" do
63
+ expect {RDF::CLI.exec_command("tabular-json", [input], format: :tabular)}.to write.to(:output)
64
+ end
65
+ end
66
+ end
33
67
  end