rdf-tabular 0.3.0 → 0.4.0.beta2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -2
- data/VERSION +1 -1
- data/lib/rdf/tabular.rb +0 -1
- data/lib/rdf/tabular/format.rb +16 -0
- data/lib/rdf/tabular/metadata.rb +223 -191
- data/lib/rdf/tabular/reader.rb +84 -133
- data/lib/rdf/tabular/uax35.rb +0 -2
- data/spec/format_spec.rb +34 -0
- data/spec/matchers.rb +3 -78
- data/spec/metadata_spec.rb +150 -80
- data/spec/reader_spec.rb +27 -24
- data/spec/spec_helper.rb +4 -3
- data/spec/suite_helper.rb +1 -1
- data/spec/suite_spec.rb +8 -9
- metadata +109 -60
- data/lib/rdf/tabular/utils.rb +0 -33
data/lib/rdf/tabular/reader.rb
CHANGED
@@ -8,7 +8,7 @@ module RDF::Tabular
|
|
8
8
|
# @author [Gregg Kellogg](http://greggkellogg.net/)
|
9
9
|
class Reader < RDF::Reader
|
10
10
|
format Format
|
11
|
-
include
|
11
|
+
include RDF::Util::Logger
|
12
12
|
|
13
13
|
# Metadata associated with the CSV
|
14
14
|
#
|
@@ -21,14 +21,27 @@ module RDF::Tabular
|
|
21
21
|
attr_reader :input
|
22
22
|
|
23
23
|
##
|
24
|
-
#
|
25
|
-
# @
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
24
|
+
# Reader options
|
25
|
+
# @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Reader#options-class_method
|
26
|
+
def self.options
|
27
|
+
super + [
|
28
|
+
RDF::CLI::Option.new(
|
29
|
+
symbol: :metadata,
|
30
|
+
datatype: RDF::URI,
|
31
|
+
on: ["--metadata URI"],
|
32
|
+
description: "user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loade from that location.") {|arg| RDF::URI(arg)},
|
33
|
+
RDF::CLI::Option.new(
|
34
|
+
symbol: :minimal,
|
35
|
+
datatype: TrueClass,
|
36
|
+
on: ["--minimal"],
|
37
|
+
description: "Includes only the information gleaned from the cells of the tabular data.") {true},
|
38
|
+
RDF::CLI::Option.new(
|
39
|
+
symbol: :noProv,
|
40
|
+
datatype: TrueClass,
|
41
|
+
on: ["--no-prov"],
|
42
|
+
description: "do not output optional provenance information.") {true},
|
43
|
+
]
|
44
|
+
end
|
32
45
|
|
33
46
|
##
|
34
47
|
# Initializes the RDF::Tabular Reader instance.
|
@@ -41,10 +54,6 @@ module RDF::Tabular
|
|
41
54
|
# @option options [Metadata, Hash, String, RDF::URI] :metadata user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loaded from that location
|
42
55
|
# @option options [Boolean] :minimal includes only the information gleaned from the cells of the tabular data
|
43
56
|
# @option options [Boolean] :noProv do not output optional provenance information
|
44
|
-
# @option options [Array] :errors
|
45
|
-
# array for placing errors found when processing metadata. If not set, and validating, errors are output to `$stderr`
|
46
|
-
# @option options [Array] :warnings
|
47
|
-
# array for placing warnings found when processing metadata. If not set, and validating, warnings are output to `$stderr`
|
48
57
|
# @option options [Array<Hash>] :fks_referencing_table
|
49
58
|
# When called with Table metadata, a list of the foreign keys referencing this table
|
50
59
|
# @yield [reader] `self`
|
@@ -62,11 +71,7 @@ module RDF::Tabular
|
|
62
71
|
@options[:base] = "file:/#{File.expand_path(@options[:base])}"
|
63
72
|
end
|
64
73
|
|
65
|
-
@options[:
|
66
|
-
@errors = @options.fetch(:errors, [])
|
67
|
-
@warnings = @options.fetch(:warnings, [])
|
68
|
-
|
69
|
-
debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
|
74
|
+
log_debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}
|
70
75
|
|
71
76
|
# Minimal implies noProv
|
72
77
|
@options[:noProv] ||= @options[:minimal]
|
@@ -77,7 +82,7 @@ module RDF::Tabular
|
|
77
82
|
else input
|
78
83
|
end
|
79
84
|
|
80
|
-
|
85
|
+
log_depth do
|
81
86
|
# If input is JSON, then the input is the metadata
|
82
87
|
content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
|
83
88
|
if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
|
@@ -86,6 +91,20 @@ module RDF::Tabular
|
|
86
91
|
@metadata = @metadata.to_table_group if @metadata.is_a?(Table)
|
87
92
|
@metadata.normalize!
|
88
93
|
@input = @metadata
|
94
|
+
elsif (@options[:base].to_s.end_with?(".html") || %w(text/html application/xhtml+html).include?(content_type)) &&
|
95
|
+
!RDF::URI(@options[:base].to_s).fragment
|
96
|
+
require 'nokogiri' unless defined?(:Nokogiri)
|
97
|
+
doc = Nokogiri::HTML.parse(input)
|
98
|
+
doc.xpath("//script[@type='application/csvm+json']/text()").each do |script|
|
99
|
+
def script.content_type; "application/csvm+json"; end
|
100
|
+
log_debug("Reader#initialize") {"Process HTML script block"}
|
101
|
+
@input = script
|
102
|
+
@metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
|
103
|
+
# If @metadata is for a Table, turn it into a TableGroup
|
104
|
+
@metadata = @metadata.to_table_group if @metadata.is_a?(Table)
|
105
|
+
@metadata.normalize!
|
106
|
+
@input = @metadata
|
107
|
+
end
|
89
108
|
elsif @options[:no_found_metadata]
|
90
109
|
# Extract embedded metadata and merge
|
91
110
|
dialect_metadata = @options[:metadata] || Table.new({}, context: "http://www.w3.org/ns/csvw")
|
@@ -117,7 +136,7 @@ module RDF::Tabular
|
|
117
136
|
@input = @metadata = Metadata.for_input(@input, @options).normalize!
|
118
137
|
end
|
119
138
|
|
120
|
-
|
139
|
+
log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}
|
121
140
|
|
122
141
|
if block_given?
|
123
142
|
case block.arity
|
@@ -140,9 +159,9 @@ module RDF::Tabular
|
|
140
159
|
|
141
160
|
# Construct metadata from that passed from file open, along with information from the file.
|
142
161
|
if input.is_a?(Metadata)
|
143
|
-
|
162
|
+
log_debug("each_statement: metadata") {input.inspect}
|
144
163
|
|
145
|
-
|
164
|
+
log_depth do
|
146
165
|
begin
|
147
166
|
# Validate metadata
|
148
167
|
input.validate!
|
@@ -169,8 +188,6 @@ module RDF::Tabular
|
|
169
188
|
base: input.tables.first.url,
|
170
189
|
no_found_metadata: true,
|
171
190
|
table_resource: table_resource,
|
172
|
-
warnings: @warnings,
|
173
|
-
errors: @errors,
|
174
191
|
)) do |r|
|
175
192
|
r.each_statement(&block)
|
176
193
|
end
|
@@ -191,8 +208,6 @@ module RDF::Tabular
|
|
191
208
|
no_found_metadata: true,
|
192
209
|
table_resource: table_resource,
|
193
210
|
fks_referencing_table: fks,
|
194
|
-
warnings: @warnings,
|
195
|
-
errors: @errors,
|
196
211
|
)) do |r|
|
197
212
|
r.each_statement(&block)
|
198
213
|
end
|
@@ -231,16 +246,12 @@ module RDF::Tabular
|
|
231
246
|
add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.tabularMetadata)
|
232
247
|
end
|
233
248
|
end
|
234
|
-
ensure
|
235
|
-
warnings = @warnings.concat(input.warnings)
|
236
|
-
if validate? && !warnings.empty? && !@options[:warnings]
|
237
|
-
$stderr.puts "Warnings: #{warnings.join("\n")}"
|
238
|
-
end
|
239
|
-
if validate? && !errors.empty? && !@options[:errors]
|
240
|
-
$stderr.puts "Errors: #{errors.join("\n")}"
|
241
|
-
end
|
242
249
|
end
|
243
250
|
end
|
251
|
+
|
252
|
+
if validate? && log_statistics[:error]
|
253
|
+
raise RDF::ReaderError, "Errors found during processing"
|
254
|
+
end
|
244
255
|
return
|
245
256
|
end
|
246
257
|
|
@@ -261,8 +272,9 @@ module RDF::Tabular
|
|
261
272
|
row.subject = table_resource
|
262
273
|
add_statement(last_row_num + 1, row) unless metadata.suppressOutput
|
263
274
|
next
|
275
|
+
else
|
276
|
+
last_row_num = row.sourceNumber
|
264
277
|
end
|
265
|
-
last_row_num = row.sourceNumber
|
266
278
|
|
267
279
|
# Collect primary and foreign keys if validating
|
268
280
|
if validate?
|
@@ -286,11 +298,20 @@ module RDF::Tabular
|
|
286
298
|
end
|
287
299
|
row.values.each_with_index do |cell, index|
|
288
300
|
# Collect cell errors
|
289
|
-
|
290
|
-
|
301
|
+
unless Array(cell.errors).empty?
|
302
|
+
self.send((validate? ? :log_error : :log_warn),
|
303
|
+
"Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber})") do
|
304
|
+
cell.errors.join("\n")
|
305
|
+
end
|
306
|
+
end
|
291
307
|
next if cell.column.suppressOutput # Skip ignored cells
|
292
308
|
cell_subject = cell.aboutUrl || default_cell_subject
|
293
|
-
propertyUrl = cell.propertyUrl ||
|
309
|
+
propertyUrl = cell.propertyUrl || begin
|
310
|
+
# It's possible that the metadata URL already has a fragment, in which case we need to override it.
|
311
|
+
u = metadata.url.dup
|
312
|
+
u.fragment = cell.column.name
|
313
|
+
u
|
314
|
+
end
|
294
315
|
add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?
|
295
316
|
|
296
317
|
if cell.column.valueUrl
|
@@ -339,16 +360,13 @@ module RDF::Tabular
|
|
339
360
|
end
|
340
361
|
|
341
362
|
##
|
342
|
-
#
|
343
|
-
# @
|
344
|
-
# @raise [Error]
|
363
|
+
# Do we have valid metadata?
|
364
|
+
# @raise [RDF::ReaderError]
|
345
365
|
def validate!
|
346
|
-
|
347
|
-
|
348
|
-
self
|
366
|
+
@options[:validate] = true
|
367
|
+
each_statement {}
|
349
368
|
rescue RDF::ReaderError => e
|
350
369
|
raise Error, e.message
|
351
|
-
self
|
352
370
|
end
|
353
371
|
|
354
372
|
##
|
@@ -395,16 +413,22 @@ module RDF::Tabular
|
|
395
413
|
end
|
396
414
|
options = {} unless options.is_a?(Hash)
|
397
415
|
|
398
|
-
hash_fn =
|
416
|
+
hash_fn = :to_hash
|
399
417
|
options = options.merge(noProv: @options[:noProv])
|
400
418
|
|
401
|
-
if io
|
419
|
+
res = if io
|
402
420
|
::JSON::dump_default_options = json_state
|
403
421
|
::JSON.dump(self.send(hash_fn, options), io)
|
404
422
|
else
|
405
423
|
hash = self.send(hash_fn, options)
|
406
424
|
::JSON.generate(hash, json_state)
|
407
425
|
end
|
426
|
+
|
427
|
+
if validate? && log_statistics[:error]
|
428
|
+
raise RDF::Tabular::Error, "Errors found during processing"
|
429
|
+
end
|
430
|
+
|
431
|
+
res
|
408
432
|
rescue IOError => e
|
409
433
|
raise RDF::Tabular::Error, e.message
|
410
434
|
end
|
@@ -419,8 +443,8 @@ module RDF::Tabular
|
|
419
443
|
def to_hash(options = {})
|
420
444
|
# Construct metadata from that passed from file open, along with information from the file.
|
421
445
|
if input.is_a?(Metadata)
|
422
|
-
|
423
|
-
|
446
|
+
log_debug("each_statement: metadata") {input.inspect}
|
447
|
+
log_depth do
|
424
448
|
# Get Metadata to invoke and open referenced files
|
425
449
|
begin
|
426
450
|
# Validate metadata
|
@@ -445,8 +469,6 @@ module RDF::Tabular
|
|
445
469
|
base: input.tables.first.url,
|
446
470
|
minimal: minimal?,
|
447
471
|
no_found_metadata: true,
|
448
|
-
warnings: @warnings,
|
449
|
-
errors: @errors,
|
450
472
|
)) do |r|
|
451
473
|
case t = r.to_hash(options)
|
452
474
|
when Array then tables += t unless input.tables.first.suppressOutput
|
@@ -461,8 +483,6 @@ module RDF::Tabular
|
|
461
483
|
base: table.url,
|
462
484
|
minimal: minimal?,
|
463
485
|
no_found_metadata: true,
|
464
|
-
warnings: @warnings,
|
465
|
-
errors: @errors,
|
466
486
|
)) do |r|
|
467
487
|
case t = r.to_hash(options)
|
468
488
|
when Array then tables += t unless table.suppressOutput
|
@@ -477,14 +497,6 @@ module RDF::Tabular
|
|
477
497
|
|
478
498
|
# Result is table_group or array
|
479
499
|
minimal? ? tables : table_group
|
480
|
-
ensure
|
481
|
-
warnings = @warnings.concat(input.warnings)
|
482
|
-
if validate? && !warnings.empty? && !@options[:warnings]
|
483
|
-
$stderr.puts "Warnings: #{warnings.join("\n")}"
|
484
|
-
end
|
485
|
-
if validate? && !errors.empty? && !@options[:errors]
|
486
|
-
$stderr.puts "Errors: #{errors.join("\n")}"
|
487
|
-
end
|
488
500
|
end
|
489
501
|
end
|
490
502
|
else
|
@@ -524,8 +536,12 @@ module RDF::Tabular
|
|
524
536
|
column = metadata.tableSchema.columns[index]
|
525
537
|
|
526
538
|
# Collect cell errors
|
527
|
-
|
528
|
-
|
539
|
+
unless Array(cell.errors).empty?
|
540
|
+
self.send(validate? ? :log_error : :log_warn,
|
541
|
+
"Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): ") do
|
542
|
+
cell.errors.join("\n")
|
543
|
+
end
|
544
|
+
end
|
529
545
|
|
530
546
|
# Ignore suppressed columns
|
531
547
|
next if column.suppressOutput
|
@@ -607,71 +623,6 @@ module RDF::Tabular
|
|
607
623
|
end
|
608
624
|
end
|
609
625
|
|
610
|
-
# Return a hash representation of the annotated tabular data model for JSON serialization
|
611
|
-
# @param [Hash{Symbol => Object}] options
|
612
|
-
# @return [Hash]
|
613
|
-
def to_atd(options = {})
|
614
|
-
# Construct metadata from that passed from file open, along with information from the file.
|
615
|
-
if input.is_a?(Metadata)
|
616
|
-
debug("each_statement: metadata") {input.inspect}
|
617
|
-
depth do
|
618
|
-
# Get Metadata to invoke and open referenced files
|
619
|
-
case input.type
|
620
|
-
when :TableGroup
|
621
|
-
table_group = input.to_atd
|
622
|
-
if input.tables.empty? && options[:original_input]
|
623
|
-
Reader.new(options[:original_input], options.merge(
|
624
|
-
base: options[:base],
|
625
|
-
no_found_metadata: true
|
626
|
-
)) do |r|
|
627
|
-
table_group["tables"] << r.to_atd(options)
|
628
|
-
end
|
629
|
-
else
|
630
|
-
input.each_table do |table|
|
631
|
-
Reader.open(table.url, options.merge(
|
632
|
-
metadata: table,
|
633
|
-
base: table.url,
|
634
|
-
no_found_metadata: true
|
635
|
-
)) do |r|
|
636
|
-
table_group["tables"] << r.to_atd(options)
|
637
|
-
end
|
638
|
-
end
|
639
|
-
end
|
640
|
-
|
641
|
-
# Result is table_group
|
642
|
-
table_group
|
643
|
-
when :Table
|
644
|
-
table = nil
|
645
|
-
Reader.open(input.url, options.merge(
|
646
|
-
metadata: input,
|
647
|
-
base: input.url,
|
648
|
-
no_found_metadata: true
|
649
|
-
)) do |r|
|
650
|
-
table = r.to_atd(options)
|
651
|
-
end
|
652
|
-
|
653
|
-
table
|
654
|
-
else
|
655
|
-
raise "Opened inappropriate metadata type: #{input.type}"
|
656
|
-
end
|
657
|
-
end
|
658
|
-
else
|
659
|
-
rows = []
|
660
|
-
table = metadata.to_atd
|
661
|
-
rows, columns = table["rows"], table["columns"]
|
662
|
-
|
663
|
-
# Input is file containing CSV data.
|
664
|
-
# Output ROW-Level statements
|
665
|
-
metadata.each_row(input) do |row|
|
666
|
-
rows << row.to_atd
|
667
|
-
row.values.each_with_index do |cell, colndx|
|
668
|
-
columns[colndx]["cells"] << cell.to_atd
|
669
|
-
end
|
670
|
-
end
|
671
|
-
table
|
672
|
-
end
|
673
|
-
end
|
674
|
-
|
675
626
|
def minimal?; @options[:minimal]; end
|
676
627
|
def prov?; !(@options[:noProv]); end
|
677
628
|
|
@@ -691,9 +642,9 @@ module RDF::Tabular
|
|
691
642
|
# @param [URI, BNode, Literal] object the object of the statement
|
692
643
|
# @raise [ReaderError] Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
|
693
644
|
def add_statement(node, *args)
|
694
|
-
statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement
|
645
|
+
statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement(*args)
|
695
646
|
raise RDF::ReaderError, "#{statement.inspect} is invalid" if validate? && statement.invalid?
|
696
|
-
|
647
|
+
log_debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
|
697
648
|
@callback.call(statement)
|
698
649
|
end
|
699
650
|
|
@@ -702,7 +653,7 @@ module RDF::Tabular
|
|
702
653
|
pk_strings = {}
|
703
654
|
primary_keys.reject(&:empty?).each do |row_pks|
|
704
655
|
pk_names = row_pks.map {|cell| cell.value}.join(",")
|
705
|
-
|
656
|
+
log_error "Table #{metadata.url} has duplicate primary key #{pk_names}" if pk_strings.has_key?(pk_names)
|
706
657
|
pk_strings[pk_names] ||= 0
|
707
658
|
pk_strings[pk_names] += 1
|
708
659
|
end
|
@@ -735,7 +686,7 @@ module RDF::Tabular
|
|
735
686
|
fk[:reference_to] ||= {}
|
736
687
|
cell_values = cells.map {|cell| cell.stringValue unless cell.stringValue.to_s.empty?}.compact
|
737
688
|
next if cell_values.empty? # Don't record if empty
|
738
|
-
|
689
|
+
log_error "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}): found duplicate foreign key target: #{cell_values.map(&:to_s).inspect}" if fk[:reference_to][cell_values]
|
739
690
|
fk[:reference_to][cell_values] ||= row
|
740
691
|
end
|
741
692
|
end
|
@@ -748,7 +699,7 @@ module RDF::Tabular
|
|
748
699
|
# Verify that reference_from entry exists in reference_to
|
749
700
|
fk.fetch(:reference_from, {}).each do |cell_values, row|
|
750
701
|
unless fk.fetch(:reference_to, {}).has_key?(cell_values)
|
751
|
-
|
702
|
+
log_error "Table #{table.url} row #{row.number}(src #{row.sourceNumber}): " +
|
752
703
|
"Foreign Key violation, expected to find #{cell_values.map(&:to_s).inspect}"
|
753
704
|
end
|
754
705
|
end
|
data/lib/rdf/tabular/uax35.rb
CHANGED
@@ -134,7 +134,6 @@ module RDF::Tabular
|
|
134
134
|
# Upcase value and remove internal spaces
|
135
135
|
value = value.upcase
|
136
136
|
|
137
|
-
#require 'byebug'; byebug unless value.empty?
|
138
137
|
if value =~ re
|
139
138
|
|
140
139
|
# Upcase value and remove internal spaces
|
@@ -226,7 +225,6 @@ module RDF::Tabular
|
|
226
225
|
fractional_grouping_size = fractional_parts[0].to_s.length
|
227
226
|
|
228
227
|
# Construct regular expression for integer part
|
229
|
-
#require 'byebug'; byebug
|
230
228
|
integer_str = if primary_grouping_size == 0
|
231
229
|
"\\d{#{min_integer_digits},}"
|
232
230
|
else
|
data/spec/format_spec.rb
CHANGED
@@ -30,4 +30,38 @@ describe RDF::Tabular::Format do
|
|
30
30
|
describe "#to_sym" do
|
31
31
|
specify {expect(described_class.to_sym).to eq :tabular}
|
32
32
|
end
|
33
|
+
|
34
|
+
describe ".cli_commands" do
|
35
|
+
before(:each) do
|
36
|
+
WebMock.stub_request(:any, %r(.*example.org.*)).
|
37
|
+
to_return(lambda {|request|
|
38
|
+
file = request.uri.to_s.split('/').last
|
39
|
+
content_type = case file
|
40
|
+
when /\.json/ then 'application/json'
|
41
|
+
when /\.csv/ then 'text/csv'
|
42
|
+
else 'text/plain'
|
43
|
+
end
|
44
|
+
|
45
|
+
path = File.expand_path("../data/#{file}", __FILE__)
|
46
|
+
if File.exist?(path)
|
47
|
+
{
|
48
|
+
body: File.read(path),
|
49
|
+
status: 200,
|
50
|
+
headers: {'Content-Type' => content_type}
|
51
|
+
}
|
52
|
+
else
|
53
|
+
{status: 401}
|
54
|
+
end
|
55
|
+
})
|
56
|
+
end
|
57
|
+
after(:each) {|example| puts logger.to_s if example.exception}
|
58
|
+
|
59
|
+
require 'rdf/cli'
|
60
|
+
let(:input) {File.expand_path("../data/countries.json", __FILE__)}
|
61
|
+
describe "#tabular-json" do
|
62
|
+
it "serializes to JSON" do
|
63
|
+
expect {RDF::CLI.exec_command("tabular-json", [input], format: :tabular)}.to write.to(:output)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
33
67
|
end
|