rdf-tabular 0.3.0 → 0.4.0.beta2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -2
- data/VERSION +1 -1
- data/lib/rdf/tabular.rb +0 -1
- data/lib/rdf/tabular/format.rb +16 -0
- data/lib/rdf/tabular/metadata.rb +223 -191
- data/lib/rdf/tabular/reader.rb +84 -133
- data/lib/rdf/tabular/uax35.rb +0 -2
- data/spec/format_spec.rb +34 -0
- data/spec/matchers.rb +3 -78
- data/spec/metadata_spec.rb +150 -80
- data/spec/reader_spec.rb +27 -24
- data/spec/spec_helper.rb +4 -3
- data/spec/suite_helper.rb +1 -1
- data/spec/suite_spec.rb +8 -9
- metadata +109 -60
- data/lib/rdf/tabular/utils.rb +0 -33
data/lib/rdf/tabular/reader.rb
CHANGED
@@ -8,7 +8,7 @@ module RDF::Tabular
|
|
8
8
|
# @author [Gregg Kellogg](http://greggkellogg.net/)
|
9
9
|
class Reader < RDF::Reader
|
10
10
|
format Format
|
11
|
-
include
|
11
|
+
include RDF::Util::Logger
|
12
12
|
|
13
13
|
# Metadata associated with the CSV
|
14
14
|
#
|
@@ -21,14 +21,27 @@ module RDF::Tabular
|
|
21
21
|
attr_reader :input
|
22
22
|
|
23
23
|
##
|
24
|
-
#
|
25
|
-
# @
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
24
|
+
# Reader options
|
25
|
+
# @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Reader#options-class_method
|
26
|
+
def self.options
|
27
|
+
super + [
|
28
|
+
RDF::CLI::Option.new(
|
29
|
+
symbol: :metadata,
|
30
|
+
datatype: RDF::URI,
|
31
|
+
on: ["--metadata URI"],
|
32
|
+
description: "user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location.") {|arg| RDF::URI(arg)},
|
33
|
+
RDF::CLI::Option.new(
|
34
|
+
symbol: :minimal,
|
35
|
+
datatype: TrueClass,
|
36
|
+
on: ["--minimal"],
|
37
|
+
description: "Includes only the information gleaned from the cells of the tabular data.") {true},
|
38
|
+
RDF::CLI::Option.new(
|
39
|
+
symbol: :noProv,
|
40
|
+
datatype: TrueClass,
|
41
|
+
on: ["--no-prov"],
|
42
|
+
description: "do not output optional provenance information.") {true},
|
43
|
+
]
|
44
|
+
end
|
32
45
|
|
33
46
|
##
|
34
47
|
# Initializes the RDF::Tabular Reader instance.
|
@@ -41,10 +54,6 @@ module RDF::Tabular
|
|
41
54
|
# @option options [Metadata, Hash, String, RDF::URI] :metadata user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loaded from that location
|
42
55
|
# @option options [Boolean] :minimal includes only the information gleaned from the cells of the tabular data
|
43
56
|
# @option options [Boolean] :noProv do not output optional provenance information
|
44
|
-
# @option options [Array] :errors
|
45
|
-
# array for placing errors found when processing metadata. If not set, and validating, errors are output to `$stderr`
|
46
|
-
# @option options [Array] :warnings
|
47
|
-
# array for placing warnings found when processing metadata. If not set, and validating, warnings are output to `$stderr`
|
48
57
|
# @option options [Array<Hash>] :fks_referencing_table
|
49
58
|
# When called with Table metadata, a list of the foreign keys referencing this table
|
50
59
|
# @yield [reader] `self`
|
@@ -62,11 +71,7 @@ module RDF::Tabular
|
|
62
71
|
@options[:base] = "file:/#{File.expand_path(@options[:base])}"
|
63
72
|
end
|
64
73
|
|
65
|
-
@options[:
|
66
|
-
@errors = @options.fetch(:errors, [])
|
67
|
-
@warnings = @options.fetch(:warnings, [])
|
68
|
-
|
69
|
-
debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
|
74
|
+
log_debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
|
70
75
|
|
71
76
|
# Minimal implies noProv
|
72
77
|
@options[:noProv] ||= @options[:minimal]
|
@@ -77,7 +82,7 @@ module RDF::Tabular
|
|
77
82
|
else input
|
78
83
|
end
|
79
84
|
|
80
|
-
|
85
|
+
log_depth do
|
81
86
|
# If input is JSON, then the input is the metadata
|
82
87
|
content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
|
83
88
|
if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
|
@@ -86,6 +91,20 @@ module RDF::Tabular
|
|
86
91
|
@metadata = @metadata.to_table_group if @metadata.is_a?(Table)
|
87
92
|
@metadata.normalize!
|
88
93
|
@input = @metadata
|
94
|
+
elsif (@options[:base].to_s.end_with?(".html") || %w(text/html application/xhtml+html).include?(content_type)) &&
|
95
|
+
!RDF::URI(@options[:base].to_s).fragment
|
96
|
+
require 'nokogiri' unless defined?(:Nokogiri)
|
97
|
+
doc = Nokogiri::HTML.parse(input)
|
98
|
+
doc.xpath("//script[@type='application/csvm+json']/text()").each do |script|
|
99
|
+
def script.content_type; "application/csvm+json"; end
|
100
|
+
log_debug("Reader#initialize") {"Process HTML script block"}
|
101
|
+
@input = script
|
102
|
+
@metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
|
103
|
+
# If @metadata is for a Table, turn it into a TableGroup
|
104
|
+
@metadata = @metadata.to_table_group if @metadata.is_a?(Table)
|
105
|
+
@metadata.normalize!
|
106
|
+
@input = @metadata
|
107
|
+
end
|
89
108
|
elsif @options[:no_found_metadata]
|
90
109
|
# Extract embedded metadata and merge
|
91
110
|
dialect_metadata = @options[:metadata] || Table.new({}, context: "http://www.w3.org/ns/csvw")
|
@@ -117,7 +136,7 @@ module RDF::Tabular
|
|
117
136
|
@input = @metadata = Metadata.for_input(@input, @options).normalize!
|
118
137
|
end
|
119
138
|
|
120
|
-
|
139
|
+
log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
|
121
140
|
|
122
141
|
if block_given?
|
123
142
|
case block.arity
|
@@ -140,9 +159,9 @@ module RDF::Tabular
|
|
140
159
|
|
141
160
|
# Construct metadata from that passed from file open, along with information from the file.
|
142
161
|
if input.is_a?(Metadata)
|
143
|
-
|
162
|
+
log_debug("each_statement: metadata") {input.inspect}
|
144
163
|
|
145
|
-
|
164
|
+
log_depth do
|
146
165
|
begin
|
147
166
|
# Validate metadata
|
148
167
|
input.validate!
|
@@ -169,8 +188,6 @@ module RDF::Tabular
|
|
169
188
|
base: input.tables.first.url,
|
170
189
|
no_found_metadata: true,
|
171
190
|
table_resource: table_resource,
|
172
|
-
warnings: @warnings,
|
173
|
-
errors: @errors,
|
174
191
|
)) do |r|
|
175
192
|
r.each_statement(&block)
|
176
193
|
end
|
@@ -191,8 +208,6 @@ module RDF::Tabular
|
|
191
208
|
no_found_metadata: true,
|
192
209
|
table_resource: table_resource,
|
193
210
|
fks_referencing_table: fks,
|
194
|
-
warnings: @warnings,
|
195
|
-
errors: @errors,
|
196
211
|
)) do |r|
|
197
212
|
r.each_statement(&block)
|
198
213
|
end
|
@@ -231,16 +246,12 @@ module RDF::Tabular
|
|
231
246
|
add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.tabularMetadata)
|
232
247
|
end
|
233
248
|
end
|
234
|
-
ensure
|
235
|
-
warnings = @warnings.concat(input.warnings)
|
236
|
-
if validate? && !warnings.empty? && !@options[:warnings]
|
237
|
-
$stderr.puts "Warnings: #{warnings.join("\n")}"
|
238
|
-
end
|
239
|
-
if validate? && !errors.empty? && !@options[:errors]
|
240
|
-
$stderr.puts "Errors: #{errors.join("\n")}"
|
241
|
-
end
|
242
249
|
end
|
243
250
|
end
|
251
|
+
|
252
|
+
if validate? && log_statistics[:error]
|
253
|
+
raise RDF::ReaderError, "Errors found during processing"
|
254
|
+
end
|
244
255
|
return
|
245
256
|
end
|
246
257
|
|
@@ -261,8 +272,9 @@ module RDF::Tabular
|
|
261
272
|
row.subject = table_resource
|
262
273
|
add_statement(last_row_num + 1, row) unless metadata.suppressOutput
|
263
274
|
next
|
275
|
+
else
|
276
|
+
last_row_num = row.sourceNumber
|
264
277
|
end
|
265
|
-
last_row_num = row.sourceNumber
|
266
278
|
|
267
279
|
# Collect primary and foreign keys if validating
|
268
280
|
if validate?
|
@@ -286,11 +298,20 @@ module RDF::Tabular
|
|
286
298
|
end
|
287
299
|
row.values.each_with_index do |cell, index|
|
288
300
|
# Collect cell errors
|
289
|
-
|
290
|
-
|
301
|
+
unless Array(cell.errors).empty?
|
302
|
+
self.send((validate? ? :log_error : :log_warn),
|
303
|
+
"Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber})") do
|
304
|
+
cell.errors.join("\n")
|
305
|
+
end
|
306
|
+
end
|
291
307
|
next if cell.column.suppressOutput # Skip ignored cells
|
292
308
|
cell_subject = cell.aboutUrl || default_cell_subject
|
293
|
-
propertyUrl = cell.propertyUrl ||
|
309
|
+
propertyUrl = cell.propertyUrl || begin
|
310
|
+
# It's possible that the metadata URL already has a fragment, in which case we need to override it.
|
311
|
+
u = metadata.url.dup
|
312
|
+
u.fragment = cell.column.name
|
313
|
+
u
|
314
|
+
end
|
294
315
|
add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?
|
295
316
|
|
296
317
|
if cell.column.valueUrl
|
@@ -339,16 +360,13 @@ module RDF::Tabular
|
|
339
360
|
end
|
340
361
|
|
341
362
|
##
|
342
|
-
#
|
343
|
-
# @
|
344
|
-
# @raise [Error]
|
363
|
+
# Do we have valid metadata?
|
364
|
+
# @raise [RDF::ReaderError]
|
345
365
|
def validate!
|
346
|
-
|
347
|
-
|
348
|
-
self
|
366
|
+
@options[:validate] = true
|
367
|
+
each_statement {}
|
349
368
|
rescue RDF::ReaderError => e
|
350
369
|
raise Error, e.message
|
351
|
-
self
|
352
370
|
end
|
353
371
|
|
354
372
|
##
|
@@ -395,16 +413,22 @@ module RDF::Tabular
|
|
395
413
|
end
|
396
414
|
options = {} unless options.is_a?(Hash)
|
397
415
|
|
398
|
-
hash_fn =
|
416
|
+
hash_fn = :to_hash
|
399
417
|
options = options.merge(noProv: @options[:noProv])
|
400
418
|
|
401
|
-
if io
|
419
|
+
res = if io
|
402
420
|
::JSON::dump_default_options = json_state
|
403
421
|
::JSON.dump(self.send(hash_fn, options), io)
|
404
422
|
else
|
405
423
|
hash = self.send(hash_fn, options)
|
406
424
|
::JSON.generate(hash, json_state)
|
407
425
|
end
|
426
|
+
|
427
|
+
if validate? && log_statistics[:error]
|
428
|
+
raise RDF::Tabular::Error, "Errors found during processing"
|
429
|
+
end
|
430
|
+
|
431
|
+
res
|
408
432
|
rescue IOError => e
|
409
433
|
raise RDF::Tabular::Error, e.message
|
410
434
|
end
|
@@ -419,8 +443,8 @@ module RDF::Tabular
|
|
419
443
|
def to_hash(options = {})
|
420
444
|
# Construct metadata from that passed from file open, along with information from the file.
|
421
445
|
if input.is_a?(Metadata)
|
422
|
-
|
423
|
-
|
446
|
+
log_debug("each_statement: metadata") {input.inspect}
|
447
|
+
log_depth do
|
424
448
|
# Get Metadata to invoke and open referenced files
|
425
449
|
begin
|
426
450
|
# Validate metadata
|
@@ -445,8 +469,6 @@ module RDF::Tabular
|
|
445
469
|
base: input.tables.first.url,
|
446
470
|
minimal: minimal?,
|
447
471
|
no_found_metadata: true,
|
448
|
-
warnings: @warnings,
|
449
|
-
errors: @errors,
|
450
472
|
)) do |r|
|
451
473
|
case t = r.to_hash(options)
|
452
474
|
when Array then tables += t unless input.tables.first.suppressOutput
|
@@ -461,8 +483,6 @@ module RDF::Tabular
|
|
461
483
|
base: table.url,
|
462
484
|
minimal: minimal?,
|
463
485
|
no_found_metadata: true,
|
464
|
-
warnings: @warnings,
|
465
|
-
errors: @errors,
|
466
486
|
)) do |r|
|
467
487
|
case t = r.to_hash(options)
|
468
488
|
when Array then tables += t unless table.suppressOutput
|
@@ -477,14 +497,6 @@ module RDF::Tabular
|
|
477
497
|
|
478
498
|
# Result is table_group or array
|
479
499
|
minimal? ? tables : table_group
|
480
|
-
ensure
|
481
|
-
warnings = @warnings.concat(input.warnings)
|
482
|
-
if validate? && !warnings.empty? && !@options[:warnings]
|
483
|
-
$stderr.puts "Warnings: #{warnings.join("\n")}"
|
484
|
-
end
|
485
|
-
if validate? && !errors.empty? && !@options[:errors]
|
486
|
-
$stderr.puts "Errors: #{errors.join("\n")}"
|
487
|
-
end
|
488
500
|
end
|
489
501
|
end
|
490
502
|
else
|
@@ -524,8 +536,12 @@ module RDF::Tabular
|
|
524
536
|
column = metadata.tableSchema.columns[index]
|
525
537
|
|
526
538
|
# Collect cell errors
|
527
|
-
|
528
|
-
|
539
|
+
unless Array(cell.errors).empty?
|
540
|
+
self.send(validate? ? :log_error : :log_warn,
|
541
|
+
"Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): ") do
|
542
|
+
cell.errors.join("\n")
|
543
|
+
end
|
544
|
+
end
|
529
545
|
|
530
546
|
# Ignore suppressed columns
|
531
547
|
next if column.suppressOutput
|
@@ -607,71 +623,6 @@ module RDF::Tabular
|
|
607
623
|
end
|
608
624
|
end
|
609
625
|
|
610
|
-
# Return a hash representation of the annotated tabular data model for JSON serialization
|
611
|
-
# @param [Hash{Symbol => Object}] options
|
612
|
-
# @return [Hash]
|
613
|
-
def to_atd(options = {})
|
614
|
-
# Construct metadata from that passed from file open, along with information from the file.
|
615
|
-
if input.is_a?(Metadata)
|
616
|
-
debug("each_statement: metadata") {input.inspect}
|
617
|
-
depth do
|
618
|
-
# Get Metadata to invoke and open referenced files
|
619
|
-
case input.type
|
620
|
-
when :TableGroup
|
621
|
-
table_group = input.to_atd
|
622
|
-
if input.tables.empty? && options[:original_input]
|
623
|
-
Reader.new(options[:original_input], options.merge(
|
624
|
-
base: options[:base],
|
625
|
-
no_found_metadata: true
|
626
|
-
)) do |r|
|
627
|
-
table_group["tables"] << r.to_atd(options)
|
628
|
-
end
|
629
|
-
else
|
630
|
-
input.each_table do |table|
|
631
|
-
Reader.open(table.url, options.merge(
|
632
|
-
metadata: table,
|
633
|
-
base: table.url,
|
634
|
-
no_found_metadata: true
|
635
|
-
)) do |r|
|
636
|
-
table_group["tables"] << r.to_atd(options)
|
637
|
-
end
|
638
|
-
end
|
639
|
-
end
|
640
|
-
|
641
|
-
# Result is table_group
|
642
|
-
table_group
|
643
|
-
when :Table
|
644
|
-
table = nil
|
645
|
-
Reader.open(input.url, options.merge(
|
646
|
-
metadata: input,
|
647
|
-
base: input.url,
|
648
|
-
no_found_metadata: true
|
649
|
-
)) do |r|
|
650
|
-
table = r.to_atd(options)
|
651
|
-
end
|
652
|
-
|
653
|
-
table
|
654
|
-
else
|
655
|
-
raise "Opened inappropriate metadata type: #{input.type}"
|
656
|
-
end
|
657
|
-
end
|
658
|
-
else
|
659
|
-
rows = []
|
660
|
-
table = metadata.to_atd
|
661
|
-
rows, columns = table["rows"], table["columns"]
|
662
|
-
|
663
|
-
# Input is file containing CSV data.
|
664
|
-
# Output ROW-Level statements
|
665
|
-
metadata.each_row(input) do |row|
|
666
|
-
rows << row.to_atd
|
667
|
-
row.values.each_with_index do |cell, colndx|
|
668
|
-
columns[colndx]["cells"] << cell.to_atd
|
669
|
-
end
|
670
|
-
end
|
671
|
-
table
|
672
|
-
end
|
673
|
-
end
|
674
|
-
|
675
626
|
def minimal?; @options[:minimal]; end
|
676
627
|
def prov?; !(@options[:noProv]); end
|
677
628
|
|
@@ -691,9 +642,9 @@ module RDF::Tabular
|
|
691
642
|
# @param [URI, BNode, Literal] object the object of the statement
|
692
643
|
# @raise [ReaderError] Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
|
693
644
|
def add_statement(node, *args)
|
694
|
-
statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement
|
645
|
+
statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement(*args)
|
695
646
|
raise RDF::ReaderError, "#{statement.inspect} is invalid" if validate? && statement.invalid?
|
696
|
-
|
647
|
+
log_debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
|
697
648
|
@callback.call(statement)
|
698
649
|
end
|
699
650
|
|
@@ -702,7 +653,7 @@ module RDF::Tabular
|
|
702
653
|
pk_strings = {}
|
703
654
|
primary_keys.reject(&:empty?).each do |row_pks|
|
704
655
|
pk_names = row_pks.map {|cell| cell.value}.join(",")
|
705
|
-
|
656
|
+
log_error "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
|
706
657
|
pk_strings[pk_names] ||= 0
|
707
658
|
pk_strings[pk_names] += 1
|
708
659
|
end
|
@@ -735,7 +686,7 @@ module RDF::Tabular
|
|
735
686
|
fk[:reference_to] ||= {}
|
736
687
|
cell_values = cells.map {|cell| cell.stringValue unless cell.stringValue.to_s.empty?}.compact
|
737
688
|
next if cell_values.empty? # Don't record if empty
|
738
|
-
|
689
|
+
log_error "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
|
739
690
|
fk[:reference_to][cell_values] ||= row
|
740
691
|
end
|
741
692
|
end
|
@@ -748,7 +699,7 @@ module RDF::Tabular
|
|
748
699
|
# Verify that reference_from entry exists in reference_to
|
749
700
|
fk.fetch(:reference_from, {}).each do |cell_values, row|
|
750
701
|
unless fk.fetch(:reference_to, {}).has_key?(cell_values)
|
751
|
-
|
702
|
+
log_error "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
|
752
703
|
"Foreign Key violation, expected to find #{cell_values.map(&:to_s).inspect}"
|
753
704
|
end
|
754
705
|
end
|
data/lib/rdf/tabular/uax35.rb
CHANGED
@@ -134,7 +134,6 @@ module RDF::Tabular
|
|
134
134
|
# Upcase value and remove internal spaces
|
135
135
|
value = value.upcase
|
136
136
|
|
137
|
-
#require 'byebug'; byebug unless value.empty?
|
138
137
|
if value =~ re
|
139
138
|
|
140
139
|
# Upcase value and remove internal spaces
|
@@ -226,7 +225,6 @@ module RDF::Tabular
|
|
226
225
|
fractional_grouping_size = fractional_parts[0].to_s.length
|
227
226
|
|
228
227
|
# Construct regular expression for integer part
|
229
|
-
#require 'byebug'; byebug
|
230
228
|
integer_str = if primary_grouping_size == 0
|
231
229
|
"\\d{#{min_integer_digits},}"
|
232
230
|
else
|
data/spec/format_spec.rb
CHANGED
@@ -30,4 +30,38 @@ describe RDF::Tabular::Format do
|
|
30
30
|
describe "#to_sym" do
|
31
31
|
specify {expect(described_class.to_sym).to eq :tabular}
|
32
32
|
end
|
33
|
+
|
34
|
+
describe ".cli_commands" do
|
35
|
+
before(:each) do
|
36
|
+
WebMock.stub_request(:any, %r(.*example.org.*)).
|
37
|
+
to_return(lambda {|request|
|
38
|
+
file = request.uri.to_s.split('/').last
|
39
|
+
content_type = case file
|
40
|
+
when /\.json/ then 'application/json'
|
41
|
+
when /\.csv/ then 'text/csv'
|
42
|
+
else 'text/plain'
|
43
|
+
end
|
44
|
+
|
45
|
+
path = File.expand_path("../data/#{file}", __FILE__)
|
46
|
+
if File.exist?(path)
|
47
|
+
{
|
48
|
+
body: File.read(path),
|
49
|
+
status: 200,
|
50
|
+
headers: {'Content-Type' => content_type}
|
51
|
+
}
|
52
|
+
else
|
53
|
+
{status: 401}
|
54
|
+
end
|
55
|
+
})
|
56
|
+
end
|
57
|
+
after(:each) {|example| puts logger.to_s if example.exception}
|
58
|
+
|
59
|
+
require 'rdf/cli'
|
60
|
+
let(:input) {File.expand_path("../data/countries.json", __FILE__)}
|
61
|
+
describe "#tabular-json" do
|
62
|
+
it "serializes to JSON" do
|
63
|
+
expect {RDF::CLI.exec_command("tabular-json", [input], format: :tabular)}.to write.to(:output)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
33
67
|
end
|