rdf-tabular 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,591 @@
1
+ require 'rdf'
2
+
3
module RDF::Tabular
  ##
  # A Tabular Data to RDF parser in Ruby.
  #
  # Reads CSV tabular data (or JSON metadata describing it) and emits RDF
  # statements following the W3C CSVW model.
  #
  # @author [Gregg Kellogg](http://greggkellogg.net/)
  class Reader < RDF::Reader
    # Register this reader with the Tabular Format class.
    format Format
    include Utils

    # Metadata associated with the CSV
    #
    # @return [Metadata]
    attr_reader :metadata

    ##
    # Input open to read.
    # NOTE(review): after #initialize this is either the opened IO/StringIO,
    # or a Metadata instance when the input turned out to be JSON metadata
    # (see #initialize, which assigns `@input = @metadata` in that case).
    # @return [IO, StringIO, Metadata]
    attr_reader :input
21
+
22
##
# Initializes the RDF::Tabular Reader instance.
#
# @param  [Util::File::RemoteDoc, IO, StringIO, Array<Array<String>>] input
#   An opened file possibly JSON Metadata,
#   or an Array used as an internalized array of arrays
# @param  [Hash{Symbol => Object}] options
#   any additional options (see `RDF::Reader#initialize`)
# @option options [Metadata, Hash, String, RDF::URI] :metadata user supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loaded from that location
# @option options [Boolean] :minimal includes only the information gleaned from the cells of the tabular data
# @option options [Boolean] :noProv do not output optional provenance information
# @yield  [reader] `self`
# @yieldparam  [RDF::Reader] reader
# @yieldreturn [void] ignored
# @raise [RDF::ReaderError] if the CSV document cannot be loaded
def initialize(input = $stdin, options = {}, &block)
  super do
    # Establish a base URI from the options or from whatever the input
    # object can tell us (remote doc, file, upload), in priority order.
    @options[:base] ||= base_uri.to_s if base_uri
    @options[:base] ||= input.base_uri if input.respond_to?(:base_uri)
    @options[:base] ||= input.path if input.respond_to?(:path)
    @options[:base] ||= input.filename if input.respond_to?(:filename)
    if RDF::URI(@options[:base]).relative? && File.exist?(@options[:base])
      @options[:base] = "file:/#{File.expand_path(@options[:base])}"
    end

    @options[:depth] ||= 0

    debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}

    # Minimal implies noProv
    @options[:noProv] ||= @options[:minimal]

    @input = input.is_a?(String) ? StringIO.new(input) : input

    depth do
      # If input is JSON, then the input is the metadata.
      # FIX: the original pattern %r(application/(?:ld+)json) matched neither
      # application/json nor application/ld+json (the group was mandatory and
      # `+` quantified the `d`); it must be an optional, escaped "ld+" prefix.
      if @options[:base] =~ /\.json(?:ld)?$/ ||
         @input.respond_to?(:content_type) && @input.content_type =~ %r(application/(?:ld\+)?json)
        @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
        # If @metadata is for a Table, merge with something empty to create a TableGroup metadata
        if @metadata.is_a?(TableGroup)
          @metadata.normalize!
        else
          @metadata = @metadata.merge(TableGroup.new({}))
        end
        @input = @metadata
      elsif @options[:no_found_metadata]
        # Extract embedded metadata and merge
        table_metadata = @options[:metadata]
        embedded_metadata = table_metadata.dialect.embedded_metadata(input, @options)
        @metadata = table_metadata.dup.merge!(embedded_metadata)
      else
        # HTTP flags — honor a `header=absent` hint in the Content-Type.
        # (Consistency fix: use @input throughout; at this point @input and
        # input are the same object whenever #headers can respond.)
        if @input.respond_to?(:headers) &&
           @input.headers.fetch(:content_type, '').split(';').include?('header=absent')
          @options[:metadata] ||= Table.new(url: @options[:base])
          @options[:metadata].dialect.header = false
        end

        # It's tabular data. Find metadata and proceed as if it was specified in the first place
        @metadata = Metadata.for_input(@input, @options)
        @input = @metadata
      end

      debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}

      if block_given?
        case block.arity
        when 0 then instance_eval(&block)
        else block.call(self)
        end
      end
    end
  end
end
98
+
99
##
# Emit each generated RDF statement to the given block.
#
# When `input` is a Metadata instance (TableGroup or Table), recursively
# opens the referenced CSV files with per-table Reader instances and
# forwards their statements; otherwise emits table- and row-level triples
# for the CSV data itself.
#
# @private
# @see RDF::Reader#each_statement
def each_statement(&block)
  if block_given?
    @callback = block

    # Captured up front so provenance can record when processing started.
    start_time = Time.now

    # Construct metadata from that passed from file open, along with information from the file.
    if input.is_a?(Metadata)
      debug("each_statement: metadata") {input.inspect}

      # Validate metadata
      input.validate!

      depth do
        # Get Metadata to invoke and open referenced files
        case input.type
        when :TableGroup
          # Use resolved @id of TableGroup, if available
          table_group = input.id || RDF::Node.new
          add_statement(0, table_group, RDF.type, CSVW.TableGroup) unless minimal?

          # Common Properties (prefixed keys) and notes on the group itself
          input.each do |key, value|
            next unless key.to_s.include?(':') || key == :notes
            input.common_properties(table_group, key, value) do |statement|
              add_statement(0, statement)
            end
          end unless minimal?

          # One recursive Reader per referenced table; statements are
          # forwarded straight to the caller's block.
          input.each_resource do |table|
            next if table.suppressOutput
            table_resource = table.id || RDF::Node.new
            add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
            Reader.open(table.url, options.merge(
                format: :tabular,
                metadata: table,
                base: table.url,
                no_found_metadata: true,
                table_resource: table_resource
            )) do |r|
              r.each_statement(&block)
            end
          end

          # Provenance (PROV activity with start/end times and used inputs)
          if prov?
            activity = RDF::Node.new
            add_statement(0, table_group, RDF::PROV.wasGeneratedBy, activity)
            add_statement(0, activity, RDF.type, RDF::PROV.Activity)
            add_statement(0, activity, RDF::PROV.wasAssociatedWith, RDF::URI("http://rubygems.org/gems/rdf-tabular"))
            add_statement(0, activity, RDF::PROV.startedAtTime, RDF::Literal::DateTime.new(start_time))
            add_statement(0, activity, RDF::PROV.endedAtTime, RDF::Literal::DateTime.new(Time.now))

            # CSV files used as inputs
            unless (urls = input.resources.map(&:url)).empty?
              usage = RDF::Node.new
              add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
              add_statement(0, usage, RDF.type, RDF::PROV.Usage)
              urls.each do |url|
                add_statement(0, usage, RDF::PROV.entity, RDF::URI(url))
              end
              add_statement(0, usage, RDF::PROV.hadRole, CSVW.csvEncodedTabularData)
            end

            # Metadata files used as inputs
            unless Array(input.filenames).empty?
              usage = RDF::Node.new
              add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
              add_statement(0, usage, RDF.type, RDF::PROV.Usage)
              Array(input.filenames).each do |fn|
                add_statement(0, usage, RDF::PROV.entity, RDF::URI(fn))
              end
              add_statement(0, usage, RDF::PROV.hadRole, CSVW.tabularMetadata)
            end
          end
        when :Table
          Reader.open(input.url, options.merge(format: :tabular, metadata: input, base: input.url, no_found_metadata: true)) do |r|
            r.each_statement(&block)
          end
        else
          raise "Opened inappropriate metadata type: #{input.type}"
        end
      end
      return
    end

    # Output Table-Level RDF triples
    table_resource = options.fetch(:table_resource, (metadata.id || RDF::Node.new))
    unless minimal?
      add_statement(0, table_resource, RDF.type, CSVW.Table)
      add_statement(0, table_resource, CSVW.url, RDF::URI(metadata.url))
    end

    # Common Properties
    metadata.each do |key, value|
      next unless key.to_s.include?(':') || key == :notes
      metadata.common_properties(table_resource, key, value) do |statement|
        add_statement(0, statement)
      end
    end unless minimal?

    # Input is file containing CSV data.
    # Output ROW-Level statements
    last_row_num = 0
    metadata.each_row(input) do |row|
      if row.is_a?(RDF::Statement)
        # May add additional comments (each_row can yield comment statements)
        row.subject = table_resource
        add_statement(last_row_num + 1, row)
        next
      end
      last_row_num = row.sourceNumber

      # Output row-level metadata
      row_resource = RDF::Node.new
      default_cell_subject = RDF::Node.new
      unless minimal?
        add_statement(row.sourceNumber, table_resource, CSVW.row, row_resource)
        add_statement(row.sourceNumber, row_resource, CSVW.rownum, row.number)
        add_statement(row.sourceNumber, row_resource, CSVW.url, row.id)
      end
      row.values.each_with_index do |cell, index|
        next if cell.column.suppressOutput # Skip ignored cells
        cell_subject = cell.aboutUrl || default_cell_subject
        propertyUrl = cell.propertyUrl || RDF::URI("#{metadata.url}##{cell.column.name}")
        add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?

        if cell.column.valueUrl
          add_statement(row.sourceNumber, cell_subject, propertyUrl, cell.valueUrl) if cell.valueUrl
        elsif cell.column.ordered && cell.column.separator
          # Ordered, separator-split values become an RDF Collection;
          # rdf:type rdf:List statements are suppressed from the list output.
          list = RDF::List[*Array(cell.value)]
          add_statement(row.sourceNumber, cell_subject, propertyUrl, list.subject)
          list.each_statement do |statement|
            next if statement.predicate == RDF.type && statement.object == RDF.List
            add_statement(row.sourceNumber, statement.subject, statement.predicate, statement.object)
          end
        else
          Array(cell.value).each do |v|
            add_statement(row.sourceNumber, cell_subject, propertyUrl, v)
          end
        end
      end
    end
  end
  enum_for(:each_statement)
end
246
+
247
##
# Yield every generated statement as a bare (subject, predicate, object)
# triple. Without a block, returns an Enumerator over the same triples.
#
# @private
# @see RDF::Reader#each_triple
def each_triple(&block)
  each_statement { |statement| block.call(*statement.to_triple) } if block_given?
  enum_for(:each_triple)
end
258
+
259
##
# Transform to JSON. Note that this must be run from within the reader context if the input is an open IO stream.
#
# @example outputting annotated CSV as JSON
#   result = nil
#   RDF::Tabular::Reader.open("etc/doap.csv") do |reader|
#     result = reader.to_json
#   end
#   result #=> {...}
#
# @example outputting annotated CSV as JSON from an in-memory structure
#   csv = %(
#     GID,On Street,Species,Trim Cycle,Inventory Date
#     1,ADDISON AV,Celtis australis,Large Tree Routine Prune,10/18/2010
#     2,EMERSON ST,Liquidambar styraciflua,Large Tree Routine Prune,6/2/2010
#     3,EMERSON ST,Liquidambar styraciflua,Large Tree Routine Prune,6/2/2010
#   ).gsub(/^\s+/, '')
#   r = RDF::Tabular::Reader.new(csv)
#   r.to_json #=> {...}
#
# @param [Hash{Symbol => Object}] options may also be a JSON state
# @option options [IO, StringIO] io to output to file
# @option options [::JSON::State] :state used when dumping
# @option options [Boolean] :atd output Abstract Table representation instead
# @return [String]
def to_json(options = {})
  io = case options
  when IO, StringIO then options
  when Hash then options[:io]
  end

  # FIX: only reference the generator State classes that are actually
  # loaded. Naming ::JSON::Pure::Generator::State (or the Ext variant)
  # unconditionally in a `when` clause raises NameError when that JSON
  # backend has not been required — e.g. for `to_json(nil)`.
  state_classes = [::JSON::State]
  state_classes << ::JSON::Ext::Generator::State if defined?(::JSON::Ext::Generator::State)
  state_classes << ::JSON::Pure::Generator::State if defined?(::JSON::Pure::Generator::State)

  json_state = case options
  when Hash
    case
    when options.has_key?(:state) then options[:state]
    when options.has_key?(:indent) then options
    else ::JSON::LD::JSON_STATE
    end
  when *state_classes
    options
  else ::JSON::LD::JSON_STATE
  end
  options = {} unless options.is_a?(Hash)

  # Serialize either the annotated-table-data model or the plain hash form.
  hash_fn = options[:atd] ? :to_atd : :to_hash
  options = options.merge(noProv: @options[:noProv])

  if io
    # NOTE(review): mutates process-wide JSON dump defaults as a side effect.
    ::JSON::dump_default_options = json_state
    ::JSON.dump(self.send(hash_fn, options), io)
  else
    hash = self.send(hash_fn, options)
    ::JSON.generate(hash, json_state)
  end
end
313
+
314
##
# Return a hash representation of the data for JSON serialization
#
# Produces an array if run in minimal mode.
#
# @param [Hash{Symbol => Object}] options
# @return [Hash, Array]
def to_hash(options = {})
  # Construct metadata from that passed from file open, along with information from the file.
  if input.is_a?(Metadata)
    debug("each_statement: metadata") {input.inspect}
    depth do
      # Get Metadata to invoke and open referenced files
      case input.type
      when :TableGroup
        # Validate metadata
        input.validate!

        tables = []
        table_group = {}
        table_group['@id'] = input.id.to_s if input.id

        # Common Properties (prefixed keys) and notes
        input.each do |key, value|
          next unless key.to_s.include?(':') || key == :notes
          table_group[key] = input.common_properties(nil, key, value)
          table_group[key] = [table_group[key]] if key == :notes && !table_group[key].is_a?(Array)
        end

        table_group['table'] = tables

        # Recursively render each referenced table into `tables`.
        input.each_resource do |table|
          next if table.suppressOutput
          Reader.open(table.url, options.merge(
              format: :tabular,
              metadata: table,
              base: table.url,
              minimal: minimal?,
              no_found_metadata: true
          )) do |r|
            case table = r.to_hash(options)
            when Array then tables += table
            when Hash then tables << table
            end
          end
        end

        # Result is table_group or array
        minimal? ? tables : table_group
      when :Table
        table = nil
        Reader.open(input.url, options.merge(
            format: :tabular,
            metadata: input,
            base: input.url,
            minimal: minimal?,
            no_found_metadata: true
        )) do |r|
          table = r.to_hash(options)
        end

        table
      else
        raise "Opened inappropriate metadata type: #{input.type}"
      end
    end
  else
    rows = []
    table = {}
    table['@id'] = metadata.id.to_s if metadata.id
    table['url'] = metadata.url.to_s

    # Use string values notes and common properties
    metadata.each do |key, value|
      next unless key.to_s.include?(':') || key == :notes
      table[key] = metadata.common_properties(nil, key, value)
      table[key] = [table[key]] if key == :notes && !table[key].is_a?(Array)
    end unless minimal?

    table.merge!("row" => rows)

    # Input is file containing CSV data.
    # Output ROW-Level statements
    metadata.each_row(input) do |row|
      if row.is_a?(RDF::Statement)
        # May add additional comments
        table['rdfs:comment'] ||= []
        table['rdfs:comment'] << row.object.to_s
        next
      end
      # Output row-level metadata
      r, a, values = {}, {}, {}
      r["url"] = row.id.to_s
      r["rownum"] = row.number

      row.values.each_with_index do |cell, index|
        column = metadata.tableSchema.columns[index]

        # Ignore suppressed columns
        next if column.suppressOutput

        # Skip valueUrl cells where the valueUrl is null
        next if cell.column.valueUrl && cell.valueUrl.nil?

        # Skip empty sequences
        next if !cell.column.valueUrl && cell.value.is_a?(Array) && cell.value.empty?

        subject = cell.aboutUrl || 'null'
        co = (a[subject.to_s] ||= {})
        co['@id'] = subject.to_s unless subject == 'null'
        prop = case cell.propertyUrl
        when RDF.type then '@type'
        when nil then column.name
        else
          # Compact the property to a term or prefixed name
          metadata.context.compact_iri(cell.propertyUrl, vocab: true)
        end

        value = case
        when prop == '@type'
          metadata.context.compact_iri(cell.valueUrl || cell.value, vocab: true)
        when cell.valueUrl
          # Remember the reference so the object can be nested later.
          unless subject == cell.valueUrl
            values[cell.valueUrl.to_s] ||= {o: co, prop: prop, count: 0}
            values[cell.valueUrl.to_s][:count] += 1
          end
          cell.valueUrl.to_s
        when cell.value.is_a?(RDF::Literal::Numeric)
          cell.value.object
        when cell.value.is_a?(RDF::Literal::Boolean)
          cell.value.object
        else
          cell.value
        end

        # Add or merge value
        merge_compacted_value(co, prop, value)
      end

      # Check for nesting: a value referenced exactly once is inlined in
      # place of its URL.
      values.keys.each do |valueUrl|
        next unless a.has_key?(valueUrl)
        ref = values[valueUrl]
        co = ref[:o]
        prop = ref[:prop]
        next if ref[:count] != 1
        # FIX: was `ref[o][prop]` — `o` is an undefined local variable, so
        # this raise itself crashed with NameError whenever the sanity check
        # fired; `co` is `ref[:o]`.
        raise "Expected #{co[prop].inspect} to include #{valueUrl.inspect}" unless Array(co[prop]).include?(valueUrl)
        co[prop] = Array(co[prop]).map {|e| e == valueUrl ? a.delete(valueUrl) : e}
        co[prop] = co[prop].first if co[prop].length == 1
      end

      r["describes"] = a.values

      if minimal?
        rows.concat(r["describes"])
      else
        rows << r
      end
    end

    minimal? ? table["row"] : table
  end
end
477
+
478
# Return a hash representation of the annotated tabular data model for JSON serialization
#
# For TableGroup/Table metadata this recursively opens the referenced CSV
# files and fills per-table "columns"/"rows" into the group structure.
#
# @param [Hash{Symbol => Object}] options
# @return [Hash]
def to_atd(options = {})
  # Construct metadata from that passed from file open, along with information from the file.
  if input.is_a?(Metadata)
    debug("each_statement: metadata") {input.inspect}
    depth do
      # Get Metadata to invoke and open referenced files
      case input.type
      when :TableGroup
        table_group = input.to_atd

        input.each_resource do |table|
          Reader.open(table.url, options.merge(
              format: :tabular,
              metadata: table,
              base: table.url,
              no_found_metadata: true, # FIXME: remove
              noProv: true
          )) do |r|
            # NOTE(review): reassigns the block variable `table` (originally
            # the Metadata resource) to the rendered hash — intentional here,
            # but the shadowing is easy to misread.
            table = r.to_atd(options)

            # Fill in columns and rows in table_group entry from returned table
            # NOTE(review): assumes a matching "url" entry always exists in
            # table_group[:resources]; `detect` returning nil would raise —
            # verify against Metadata#to_atd.
            t = table_group[:resources].detect {|tab| tab["url"] == table["url"]}
            t["columns"] = table["columns"]
            t["rows"] = table["rows"]
          end
        end

        # Result is table_group
        table_group
      when :Table
        table = nil
        Reader.open(input.url, options.merge(
            format: :tabular,
            metadata: input,
            base: input.url,
            no_found_metadata: true,
            noProv: true
        )) do |r|
          table = r.to_atd(options)
        end

        table
      else
        raise "Opened inappropriate metadata type: #{input.type}"
      end
    end
  else
    # NOTE(review): this initial empty array is immediately overwritten by
    # the multiple assignment below; it is redundant.
    rows = []
    table = metadata.to_atd
    rows, columns = table["rows"], table["columns"]

    # Input is file containing CSV data.
    # Output ROW-Level statements
    metadata.each_row(input) do |row|
      rows << row.to_atd
      # Record each cell id under its column's "cells" list.
      row.values.each_with_index do |cell, colndx|
        columns[colndx]["cells"] << cell.id
      end
    end
    table
  end
end
543
+
544
# True when the reader was opened in minimal output mode.
def minimal?
  @options[:minimal]
end

# True unless provenance output was disabled via the :noProv option.
def prov?
  !@options[:noProv]
end
546
+
547
private

##
# @overload add_statement(lineno, statement)
#   Add a statement, object can be literal or URI or bnode
#   @param [String] lineno
#   @param [RDF::Statement] statement
#   @yield [RDF::Statement]
#   @raise [ReaderError] Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
#
# @overload add_statement(lineno, subject, predicate, object)
#   Add a triple
#   @param [URI, BNode] subject the subject of the statement
#   @param [URI] predicate the predicate of the statement
#   @param [URI, BNode, Literal] object the object of the statement
#   @raise [ReaderError] Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
def add_statement(node, *args)
  # Accept either a ready-made statement or raw triple components.
  statement = args.first
  statement = RDF::Statement.new(*args) unless statement.is_a?(RDF::Statement)
  raise RDF::ReaderError, "#{statement.inspect} is invalid" if validate? && statement.invalid?
  debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
  @callback.call(statement)
end
568
+
569
# Merge values into compacted results, creating arrays if necessary.
# A first value is stored as-is; any collision promotes the slot to an
# array and appends (or concatenates, when the new value is an array).
# A nil hash is silently ignored.
def merge_compacted_value(hash, key, value)
  return unless hash
  existing = hash[key]
  if existing.nil?
    hash[key] = value
  else
    hash[key] = [existing] unless existing.is_a?(Array)
    if value.is_a?(Array)
      hash[key].concat(value)
    else
      hash[key] << value
    end
  end
end
589
+ end
590
+ end
591
+