rdf-tabular 0.1.1

require 'rdf'

module RDF::Tabular
  ##
  # A Tabular Data to RDF parser in Ruby.
  #
  # @author [Gregg Kellogg](http://greggkellogg.net/)
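  #
  # Illustrative usage (an editorial sketch, not part of the original source;
  # "etc/doap.csv" mirrors the path used in the #to_json examples below):
  #
  # @example Open a CSV file and enumerate its statements
  #   RDF::Tabular::Reader.open("etc/doap.csv") do |reader|
  #     reader.each_statement {|statement| puts statement.inspect}
  #   end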
  class Reader < RDF::Reader
    format Format
    include Utils

    # Metadata associated with the CSV
    #
    # @return [Metadata]
    attr_reader :metadata

    ##
    # Input open to read
    # @return [:read]
    attr_reader :input

    ##
    # Initializes the RDF::Tabular Reader instance.
    #
    # @param [Util::File::RemoteDoc, IO, StringIO, Array<Array<String>>] input
    #   An opened file, possibly JSON Metadata,
    #   or an Array used as an internalized array of arrays
    # @param [Hash{Symbol => Object}] options
    #   any additional options (see `RDF::Reader#initialize`)
    # @option options [Metadata, Hash, String, RDF::URI] :metadata user-supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loaded from that location
    # @option options [Boolean] :minimal includes only the information gleaned from the cells of the tabular data
    # @option options [Boolean] :noProv do not output optional provenance information
    # @yield [reader] `self`
    # @yieldparam [RDF::Reader] reader
    # @yieldreturn [void] ignored
    # @raise [RDF::ReaderError] if the CSV document cannot be loaded
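    #
    # Illustrative call (an editorial sketch, not part of the original source;
    # the file name and metadata URL are placeholders):
    #
    # @example Initializing a reader with user-supplied metadata
    #   RDF::Tabular::Reader.new(File.open("data.csv"),
    #     metadata: "http://example.org/metadata.json",
    #     minimal:  true)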
    def initialize(input = $stdin, options = {}, &block)
      super do
        # Determine the base URI from the options or from the input itself
        @options[:base] ||= base_uri.to_s if base_uri
        @options[:base] ||= input.base_uri if input.respond_to?(:base_uri)
        @options[:base] ||= input.path if input.respond_to?(:path)
        @options[:base] ||= input.filename if input.respond_to?(:filename)
        if RDF::URI(@options[:base]).relative? && File.exist?(@options[:base])
          @options[:base] = "file:/#{File.expand_path(@options[:base])}"
        end

        @options[:depth] ||= 0

        debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}

        # Minimal implies noProv
        @options[:noProv] ||= @options[:minimal]

        @input = input.is_a?(String) ? StringIO.new(input) : input

        depth do
          # If input is JSON, then the input is the metadata
          if @options[:base] =~ /\.json(?:ld)?$/ ||
             @input.respond_to?(:content_type) && @input.content_type =~ %r(application/(?:ld+)json)
            @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
            # If @metadata is for a Table, merge with something empty to create a TableGroup metadata
            if @metadata.is_a?(TableGroup)
              @metadata.normalize!
            else
              @metadata = @metadata.merge(TableGroup.new({}))
            end
            @input = @metadata
          elsif @options[:no_found_metadata]
            # Extract embedded metadata and merge
            table_metadata = @options[:metadata]
            embedded_metadata = table_metadata.dialect.embedded_metadata(input, @options)
            @metadata = table_metadata.dup.merge!(embedded_metadata)
          else
            # HTTP flags
            if @input.respond_to?(:headers) &&
               input.headers.fetch(:content_type, '').split(';').include?('header=absent')
              @options[:metadata] ||= Table.new(url: @options[:base])
              @options[:metadata].dialect.header = false
            end

            # It's tabular data. Find metadata and proceed as if it was specified in the first place
            @metadata = Metadata.for_input(@input, @options)
            @input = @metadata
          end

          debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}

          if block_given?
            case block.arity
            when 0 then instance_eval(&block)
            else block.call(self)
            end
          end
        end
      end
    end

    ##
    # @private
    # @see RDF::Reader#each_statement
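    #
    # Illustrative enumeration (an editorial sketch, not part of the original source),
    # given a Reader instance `reader`:
    #
    # @example Serializing each statement as N-Triples
    #   reader.each_statement do |statement|
    #     puts RDF::NTriples.serialize(statement)
    #   end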
    def each_statement(&block)
      if block_given?
        @callback = block

        start_time = Time.now

        # Construct metadata from that passed from file open, along with information from the file.
        if input.is_a?(Metadata)
          debug("each_statement: metadata") {input.inspect}

          # Validate metadata
          input.validate!

          depth do
            # Get Metadata to invoke and open referenced files
            case input.type
            when :TableGroup
              # Use resolved @id of TableGroup, if available
              table_group = input.id || RDF::Node.new
              add_statement(0, table_group, RDF.type, CSVW.TableGroup) unless minimal?

              # Common Properties
              input.each do |key, value|
                next unless key.to_s.include?(':') || key == :notes
                input.common_properties(table_group, key, value) do |statement|
                  add_statement(0, statement)
                end
              end unless minimal?

              input.each_resource do |table|
                next if table.suppressOutput
                table_resource = table.id || RDF::Node.new
                add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
                Reader.open(table.url, options.merge(
                  format: :tabular,
                  metadata: table,
                  base: table.url,
                  no_found_metadata: true,
                  table_resource: table_resource
                )) do |r|
                  r.each_statement(&block)
                end
              end

              # Provenance
              if prov?
                activity = RDF::Node.new
                add_statement(0, table_group, RDF::PROV.wasGeneratedBy, activity)
                add_statement(0, activity, RDF.type, RDF::PROV.Activity)
                add_statement(0, activity, RDF::PROV.wasAssociatedWith, RDF::URI("http://rubygems.org/gems/rdf-tabular"))
                add_statement(0, activity, RDF::PROV.startedAtTime, RDF::Literal::DateTime.new(start_time))
                add_statement(0, activity, RDF::PROV.endedAtTime, RDF::Literal::DateTime.new(Time.now))

                unless (urls = input.resources.map(&:url)).empty?
                  usage = RDF::Node.new
                  add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
                  add_statement(0, usage, RDF.type, RDF::PROV.Usage)
                  urls.each do |url|
                    add_statement(0, usage, RDF::PROV.entity, RDF::URI(url))
                  end
                  add_statement(0, usage, RDF::PROV.hadRole, CSVW.csvEncodedTabularData)
                end

                unless Array(input.filenames).empty?
                  usage = RDF::Node.new
                  add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
                  add_statement(0, usage, RDF.type, RDF::PROV.Usage)
                  Array(input.filenames).each do |fn|
                    add_statement(0, usage, RDF::PROV.entity, RDF::URI(fn))
                  end
                  add_statement(0, usage, RDF::PROV.hadRole, CSVW.tabularMetadata)
                end
              end
            when :Table
              Reader.open(input.url, options.merge(format: :tabular, metadata: input, base: input.url, no_found_metadata: true)) do |r|
                r.each_statement(&block)
              end
            else
              raise "Opened inappropriate metadata type: #{input.type}"
            end
          end
          return
        end

        # Output Table-Level RDF triples
        table_resource = options.fetch(:table_resource, (metadata.id || RDF::Node.new))
        unless minimal?
          add_statement(0, table_resource, RDF.type, CSVW.Table)
          add_statement(0, table_resource, CSVW.url, RDF::URI(metadata.url))
        end

        # Common Properties
        metadata.each do |key, value|
          next unless key.to_s.include?(':') || key == :notes
          metadata.common_properties(table_resource, key, value) do |statement|
            add_statement(0, statement)
          end
        end unless minimal?

        # Input is file containing CSV data.
        # Output ROW-Level statements
        last_row_num = 0
        metadata.each_row(input) do |row|
          if row.is_a?(RDF::Statement)
            # May add additional comments
            row.subject = table_resource
            add_statement(last_row_num + 1, row)
            next
          end
          last_row_num = row.sourceNumber

          # Output row-level metadata
          row_resource = RDF::Node.new
          default_cell_subject = RDF::Node.new
          unless minimal?
            add_statement(row.sourceNumber, table_resource, CSVW.row, row_resource)
            add_statement(row.sourceNumber, row_resource, CSVW.rownum, row.number)
            add_statement(row.sourceNumber, row_resource, CSVW.url, row.id)
          end
          row.values.each_with_index do |cell, index|
            next if cell.column.suppressOutput # Skip ignored cells
            cell_subject = cell.aboutUrl || default_cell_subject
            propertyUrl = cell.propertyUrl || RDF::URI("#{metadata.url}##{cell.column.name}")
            add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?

            if cell.column.valueUrl
              add_statement(row.sourceNumber, cell_subject, propertyUrl, cell.valueUrl) if cell.valueUrl
            elsif cell.column.ordered && cell.column.separator
              list = RDF::List[*Array(cell.value)]
              add_statement(row.sourceNumber, cell_subject, propertyUrl, list.subject)
              list.each_statement do |statement|
                next if statement.predicate == RDF.type && statement.object == RDF.List
                add_statement(row.sourceNumber, statement.subject, statement.predicate, statement.object)
              end
            else
              Array(cell.value).each do |v|
                add_statement(row.sourceNumber, cell_subject, propertyUrl, v)
              end
            end
          end
        end
      end
      enum_for(:each_statement)
    end

    ##
    # @private
    # @see RDF::Reader#each_triple
    def each_triple(&block)
      if block_given?
        each_statement do |statement|
          block.call(*statement.to_triple)
        end
      end
      enum_for(:each_triple)
    end

    ##
    # Transform to JSON. Note that this must be run from within the reader context if the input is an open IO stream.
    #
    # @example outputting annotated CSV as JSON
    #   result = nil
    #   RDF::Tabular::Reader.open("etc/doap.csv") do |reader|
    #     result = reader.to_json
    #   end
    #   result #=> {...}
    #
    # @example outputting annotated CSV as JSON from an in-memory structure
    #   csv = %(
    #     GID,On Street,Species,Trim Cycle,Inventory Date
    #     1,ADDISON AV,Celtis australis,Large Tree Routine Prune,10/18/2010
    #     2,EMERSON ST,Liquidambar styraciflua,Large Tree Routine Prune,6/2/2010
    #     3,EMERSON ST,Liquidambar styraciflua,Large Tree Routine Prune,6/2/2010
    #   ).gsub(/^\s+/, '')
    #   r = RDF::Tabular::Reader.new(csv)
    #   r.to_json #=> {...}
    #
    # @param [Hash{Symbol => Object}] options may also be a JSON state
    # @option options [IO, StringIO] :io to output to file
    # @option options [::JSON::State] :state used when dumping
    # @option options [Boolean] :atd output Abstract Table representation instead
    # @return [String]
    def to_json(options = {})
      io = case options
      when IO, StringIO then options
      when Hash then options[:io]
      end
      json_state = case options
      when Hash
        case
        when options.has_key?(:state) then options[:state]
        when options.has_key?(:indent) then options
        else ::JSON::LD::JSON_STATE
        end
      when ::JSON::State, ::JSON::Ext::Generator::State, ::JSON::Pure::Generator::State
        options
      else ::JSON::LD::JSON_STATE
      end
      options = {} unless options.is_a?(Hash)

      hash_fn = options[:atd] ? :to_atd : :to_hash
      options = options.merge(noProv: @options[:noProv])

      if io
        ::JSON::dump_default_options = json_state
        ::JSON.dump(self.send(hash_fn, options), io)
      else
        hash = self.send(hash_fn, options)
        ::JSON.generate(hash, json_state)
      end
    end

    ##
    # Return a hash representation of the data for JSON serialization
    #
    # Produces an array if run in minimal mode.
    #
    # @param [Hash{Symbol => Object}] options
    # @return [Hash, Array]
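    #
    # Illustrative call (an editorial sketch, not part of the original source),
    # reusing the in-memory CSV string from the #to_json example above:
    #
    # @example Hash representation of an in-memory CSV
    #   r = RDF::Tabular::Reader.new(csv)
    #   r.to_hash #=> {...}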
    def to_hash(options = {})
      # Construct metadata from that passed from file open, along with information from the file.
      if input.is_a?(Metadata)
        debug("each_statement: metadata") {input.inspect}
        depth do
          # Get Metadata to invoke and open referenced files
          case input.type
          when :TableGroup
            # Validate metadata
            input.validate!

            tables = []
            table_group = {}
            table_group['@id'] = input.id.to_s if input.id

            # Common Properties
            input.each do |key, value|
              next unless key.to_s.include?(':') || key == :notes
              table_group[key] = input.common_properties(nil, key, value)
              table_group[key] = [table_group[key]] if key == :notes && !table_group[key].is_a?(Array)
            end

            table_group['table'] = tables

            input.each_resource do |table|
              next if table.suppressOutput
              Reader.open(table.url, options.merge(
                format: :tabular,
                metadata: table,
                base: table.url,
                minimal: minimal?,
                no_found_metadata: true
              )) do |r|
                case table = r.to_hash(options)
                when Array then tables += table
                when Hash then tables << table
                end
              end
            end

            # Result is table_group or array
            minimal? ? tables : table_group
          when :Table
            table = nil
            Reader.open(input.url, options.merge(
              format: :tabular,
              metadata: input,
              base: input.url,
              minimal: minimal?,
              no_found_metadata: true
            )) do |r|
              table = r.to_hash(options)
            end

            table
          else
            raise "Opened inappropriate metadata type: #{input.type}"
          end
        end
      else
        rows = []
        table = {}
        table['@id'] = metadata.id.to_s if metadata.id
        table['url'] = metadata.url.to_s

        # Use string values for notes and common properties
        metadata.each do |key, value|
          next unless key.to_s.include?(':') || key == :notes
          table[key] = metadata.common_properties(nil, key, value)
          table[key] = [table[key]] if key == :notes && !table[key].is_a?(Array)
        end unless minimal?

        table.merge!("row" => rows)

        # Input is file containing CSV data.
        # Output ROW-Level statements
        metadata.each_row(input) do |row|
          if row.is_a?(RDF::Statement)
            # May add additional comments
            table['rdfs:comment'] ||= []
            table['rdfs:comment'] << row.object.to_s
            next
          end
          # Output row-level metadata
          r, a, values = {}, {}, {}
          r["url"] = row.id.to_s
          r["rownum"] = row.number

          row.values.each_with_index do |cell, index|
            column = metadata.tableSchema.columns[index]

            # Ignore suppressed columns
            next if column.suppressOutput

            # Skip valueUrl cells where the valueUrl is null
            next if cell.column.valueUrl && cell.valueUrl.nil?

            # Skip empty sequences
            next if !cell.column.valueUrl && cell.value.is_a?(Array) && cell.value.empty?

            subject = cell.aboutUrl || 'null'
            co = (a[subject.to_s] ||= {})
            co['@id'] = subject.to_s unless subject == 'null'
            prop = case cell.propertyUrl
            when RDF.type then '@type'
            when nil then column.name
            else
              # Compact the property to a term or prefixed name
              metadata.context.compact_iri(cell.propertyUrl, vocab: true)
            end

            value = case
            when prop == '@type'
              metadata.context.compact_iri(cell.valueUrl || cell.value, vocab: true)
            when cell.valueUrl
              unless subject == cell.valueUrl
                values[cell.valueUrl.to_s] ||= {o: co, prop: prop, count: 0}
                values[cell.valueUrl.to_s][:count] += 1
              end
              cell.valueUrl.to_s
            when cell.value.is_a?(RDF::Literal::Numeric)
              cell.value.object
            when cell.value.is_a?(RDF::Literal::Boolean)
              cell.value.object
            else
              cell.value
            end

            # Add or merge value
            merge_compacted_value(co, prop, value)
          end

          # Check for nesting
          values.keys.each do |valueUrl|
            next unless a.has_key?(valueUrl)
            ref = values[valueUrl]
            co = ref[:o]
            prop = ref[:prop]
            next if ref[:count] != 1
+ raise "Expected #{ref[o][prop].inspect} to include #{valueUrl.inspect}" unless Array(co[prop]).include?(valueUrl)
            co[prop] = Array(co[prop]).map {|e| e == valueUrl ? a.delete(valueUrl) : e}
            co[prop] = co[prop].first if co[prop].length == 1
          end

          r["describes"] = a.values

          if minimal?
            rows.concat(r["describes"])
          else
            rows << r
          end
        end

        minimal? ? table["row"] : table
      end
    end

    # Return a hash representation of the annotated tabular data model for JSON serialization
    # @param [Hash{Symbol => Object}] options
    # @return [Hash]
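    #
    # Illustrative call (an editorial sketch, not part of the original source),
    # reusing the in-memory CSV string from the #to_json example above; #to_json
    # with the :atd option routes through this method:
    #
    # @example Abstract tabular data model for an in-memory CSV
    #   r = RDF::Tabular::Reader.new(csv)
    #   r.to_atd #=> {...}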
    def to_atd(options = {})
      # Construct metadata from that passed from file open, along with information from the file.
      if input.is_a?(Metadata)
        debug("each_statement: metadata") {input.inspect}
        depth do
          # Get Metadata to invoke and open referenced files
          case input.type
          when :TableGroup
            table_group = input.to_atd

            input.each_resource do |table|
              Reader.open(table.url, options.merge(
                format: :tabular,
                metadata: table,
                base: table.url,
                no_found_metadata: true, # FIXME: remove
                noProv: true
              )) do |r|
                table = r.to_atd(options)

                # Fill in columns and rows in table_group entry from returned table
                t = table_group[:resources].detect {|tab| tab["url"] == table["url"]}
                t["columns"] = table["columns"]
                t["rows"] = table["rows"]
              end
            end

            # Result is table_group
            table_group
          when :Table
            table = nil
            Reader.open(input.url, options.merge(
              format: :tabular,
              metadata: input,
              base: input.url,
              no_found_metadata: true,
              noProv: true
            )) do |r|
              table = r.to_atd(options)
            end

            table
          else
            raise "Opened inappropriate metadata type: #{input.type}"
          end
        end
      else
        rows = []
        table = metadata.to_atd
        rows, columns = table["rows"], table["columns"]

        # Input is file containing CSV data.
        # Output ROW-Level statements
        metadata.each_row(input) do |row|
          rows << row.to_atd
          row.values.each_with_index do |cell, colndx|
            columns[colndx]["cells"] << cell.id
          end
        end
        table
      end
    end

    def minimal?; @options[:minimal]; end
    def prov?; !(@options[:noProv]); end

    private
    ##
    # @overload add_statement(lineno, statement)
    #   Add a statement, object can be literal or URI or bnode
    #   @param [String] lineno
    #   @param [RDF::Statement] statement
    #   @yield [RDF::Statement]
    #   @raise [ReaderError] Checks parameter types and raises if they are incorrect when parsing mode is _validate_.
    #
    # @overload add_statement(lineno, subject, predicate, object)
    #   Add a triple
    #   @param [URI, BNode] subject the subject of the statement
    #   @param [URI] predicate the predicate of the statement
    #   @param [URI, BNode, Literal] object the object of the statement
    #   @raise [ReaderError] Checks parameter types and raises if they are incorrect when parsing mode is _validate_.
    def add_statement(node, *args)
      statement = args[0].is_a?(RDF::Statement) ? args[0] : RDF::Statement.new(*args)
      raise RDF::ReaderError, "#{statement.inspect} is invalid" if validate? && statement.invalid?
      debug(node) {"statement: #{RDF::NTriples.serialize(statement)}".chomp}
      @callback.call(statement)
    end

    # Merge values into compacted results, creating arrays if necessary
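    #
    # For illustration (an editorial sketch, not part of the original source):
    #   h = {"a" => 1}
    #   merge_compacted_value(h, "a", 2)    # h is now {"a" => [1, 2]}
    #   merge_compacted_value(h, "b", "x")  # h is now {"a" => [1, 2], "b" => "x"}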
    def merge_compacted_value(hash, key, value)
      return unless hash
      case hash[key]
      when nil then hash[key] = value
      when Array
        if value.is_a?(Array)
          hash[key].concat(value)
        else
          hash[key] << value
        end
      else
        hash[key] = [hash[key]]
        if value.is_a?(Array)
          hash[key].concat(value)
        else
          hash[key] << value
        end
      end
    end
  end
end