spread2rdf 0.0.1pre.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MzBlOGJjZDMyMjBjMjNmNDA5MTk3MmY1NjMzMmE2OWNiZGE3MTljYw==
5
+ data.tar.gz: !binary |-
6
+ Y2IxMjY2MDBjZDk1OGYxNjE2YjlmM2UyZGZkYTQ2NmZjNDk5MWYyYQ==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ ODlkODhkZDQ1NzZmNGVhY2E0MGIwNDgyOWVmZDFkODkyYjJlYmNiZjYyMzlh
10
+ ZmEzNjk4YjJiNjIyZjM1OTM2YTQ1ZGZlZDJjZTFhOTA1Mzc0ZWJlZTY4NjY1
11
+ YmEyNGZmNGM1ZjBkYTMxMTNmZWQ2YTZkNzVkOTViMDA2Y2I0ZTk=
12
+ data.tar.gz: !binary |-
13
+ MDExZWZiZjY4YjBlNDdkYzhkN2ExZjNmNTVhNWVkNGEwMDBmYzg0YTY0ZGM2
14
+ Njk3ZmQ4NmU3MmY3MjAxYzkwZjkzZThjM2VhZWMzYjYyZmJjYjBjMTc0MTdi
15
+ YzUwNzlhNTRhMjY0NDJiNzIxZTdiYTkzNTZlM2U0ZTBjZTNmODY=
data/.gitignore ADDED
@@ -0,0 +1,23 @@
1
+ .DS_Store
2
+ .idea
3
+ .idea45
4
+ *.gem
5
+ *.rbc
6
+ .bundle
7
+ .config
8
+ .yardoc
9
+ Gemfile.lock
10
+ InstalledFiles
11
+ _yardoc
12
+ coverage
13
+ doc/
14
+ lib/bundler/man
15
+ pkg
16
+ rdoc
17
+ spec/reports
18
+ test/tmp
19
+ test/version_tmp
20
+ testdata
21
+ tmp
22
+
23
+ lib/**/-*
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in spread2rdf.gemspec
4
+ gemspec
5
+
6
+ group :development, :test do
7
+ gem 'rake'
8
+ gem 'pry', '~> 0.9.12.2'
9
+ gem 'pry-nav', '~> 0.2.3'
10
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Marcel Otto
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,44 @@
1
+ # Spread2RDF
2
+
3
+ Spread2RDF is a converter for complex spreadsheets to RDF and a DSL for
4
+ specifying the mapping rules for this conversion.
5
+
6
+ ## Features
7
+
8
+ * Supports Excel/Excelx, Google spreadsheets, OpenOffice, LibreOffice and CSV
9
+ spreadsheets as input, thanks to [Roo](https://github.com/Empact/roo).
10
+ (Currently, it has only been tested with Excel.
11
+ If you have a problem with another spreadsheet type,
12
+ [raise an issue](https://github.com/marcelotto/spread2rdf/issues).)
13
+ * Supports many RDF serialization formats for the output, thanks to
14
+ [RDF.rb](https://github.com/ruby-rdf/rdf).
15
+
16
+ ## Installation
17
+
18
+ Install [Ruby](http://www.ruby-lang.org/) and execute the following command
19
+ in a terminal:
20
+
21
+ $ gem install spread2rdf
22
+
23
+ ## Command-line interface
24
+
25
+ For a description of all available parameters, type the following in a terminal:
26
+
27
+ $ spread2rdf --help
28
+
29
+ ## Mapping DSL
30
+
31
+ A description of the mapping DSL will follow soon.
32
+
33
+ ## Contributing
34
+
35
+ 1. Fork it
36
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
37
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
38
+ 4. Push to the branch (`git push origin my-new-feature`)
39
+ 5. Create new Pull Request
40
+
41
+
42
+ ## Authors
43
+
44
+ * Marcel Otto
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1pre.1
data/bin/spread2rdf ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+ require 'spread2rdf'
3
+ Spread2RDF::Cli.new.run
@@ -0,0 +1,45 @@
1
module Spread2RDF
  # Mixin for declaring attributes with default values on a class.
  #
  # A class declares its attributes with `self.attributes = { name: default }`.
  # An accessor is generated for every declared attribute, and declarations of
  # superclasses are inherited (own declarations win over inherited ones).
  module Attributes
    extend ActiveSupport::Concern

    module ClassMethods
      # Returns the Hash of attribute names to default values for this class,
      # including the attributes declared by superclasses.
      #
      # Always returns a Hash: a class without own declarations yields the
      # inherited attributes (or an empty Hash) instead of failing on nil.
      def attributes
        own_attributes = @attributes || {}
        return own_attributes unless superclass.respond_to?(:attributes)

        inherited = superclass.attributes
        inherited.is_a?(Hash) ? inherited.merge(own_attributes) : own_attributes
      end

      # Declares attributes and their defaults; generates an accessor for each
      # one. Repeated declarations on the same class are merged.
      def attributes=(defaults)
        defaults.each_key { |attribute| attr_accessor attribute }
        @attributes = @attributes ? @attributes.merge(defaults) : defaults
      end
    end

    # Sets the instance variable of every declared attribute from
    # +initial_values+, falling back to the declared default when no value
    # (or nil) was given.
    #
    # The consumed keys are removed from +initial_values+; the remaining,
    # unknown entries are returned.
    def init_attributes(initial_values)
      self.class.attributes.each do |attribute, default_value|
        given = initial_values.delete(attribute)
        # Only nil selects the default, so an explicit `false` is kept.
        instance_variable_set("@#{attribute}", given.nil? ? default_value : given)
      end
      initial_values
    end

    # Overwrites the declared attributes present in +update_values+; entries
    # for undeclared attributes are ignored. Returns +update_values+.
    def update_attributes(update_values)
      declared = self.class.attributes
      update_values.each do |attribute, value|
        next unless declared.include? attribute
        instance_variable_set("@#{attribute}", value)
      end
      update_values
    end

    # Human-readable listing of all declared attributes and their values.
    def inspect
      values = self.class.attributes.keys.map do |attribute|
        "#{attribute}=#{send(attribute)}"
      end
      "#{self}: #{values.join(', ')}"
    end

  end
end
@@ -0,0 +1,91 @@
1
+ # coding: utf-8
2
+
3
module Spread2RDF
  # Command-line interface: parses the options in ARGV, loads a schema
  # specification file, reads the input spreadsheet through the first defined
  # Spreadsheet schema and writes the resulting RDF graph to a file.
  class Cli
    # Parses the command line; prints the usage and exits on invalid input.
    def initialize
      parse_command_line!
    end

    # Runs the conversion.
    #
    # schema_spec_file - path of the schema specification to load; defaults to
    #                    the file given with the -s option.
    #
    # Aborts the process when the specification is missing or defines no
    # schema. Returns self.
    def run(schema_spec_file = nil)
      schema_spec_file ||= @options[:schema_spec_file]
      abort "No schema specification file given" if schema_spec_file.nil?
      abort "Couldn't find schema specification file #{schema_spec_file}" unless
        File.exist?(schema_spec_file)
      load schema_spec_file
      abort "No schema specification found" if Spreadsheet.definitions.empty?
      puts "Reading #{@input_file} ..."
      @table = Spreadsheet.definitions.first
      @table.read(@input_file)
      write_output
      self
    end

    private

    # Parse command line options
    #
    # Fills @options and @input_file from ARGV. On a parse error the message
    # and the usage are printed and the process exits with status 1, so that
    # calling scripts can detect the failure.
    def parse_command_line!(options={})
      @options = options
      optparse = OptionParser.new do |opts|
        opts.banner = 'Usage: spread2rdf [options] -s SPEC_FILE SPREAD_SHEET_FILE'

        opts.on( '-h', '--help', 'Display this information' ) do
          puts opts
          exit
        end

        @options[:output_dir] = '.'
        opts.on( '-o', '--output DIR', 'Output directory (default: current directory)' ) do |dir|
          abort "Output directory #{dir} doesn't exist" unless Dir.exist?(dir)
          @options[:output_dir] = dir
        end

        @options[:output_format] = 'ttl'
        opts.on( '-f', '--output-format FORMAT', 'Serialization format for the RDF data',
                 "FORMAT being one of: nt, n3, ttl, rdf, xml, html, json (default: ttl)") do |format|
          #format = 'turtle' if format == 'ttl'
          @options[:output_format] = format.strip.downcase
        end

        @options[:schema_spec_file] = nil
        opts.on( '-s', '--schema SPEC_FILE', 'Schema specification file (required)' ) do |file|
          @options[:schema_spec_file] = file
        end

      end

      optparse.parse!
      raise OptionParser::ParseError, 'required file arguments missing' if ARGV.empty?
      raise OptionParser::ParseError, 'required schema specification file missing' if @options[:schema_spec_file].nil?

      @input_file = ARGV.first
    rescue OptionParser::ParseError => e
      puts e.message
      puts optparse.help
      exit 1
    end

    # Path of the output file: the input file's base name with the output
    # format as extension, inside the output directory.
    def output_filename
      name = File.basename(@input_file, File.extname(@input_file))
      # File.join avoids a doubled separator when the directory ends in '/'.
      File.join(@options[:output_dir], "#{name}.#{@options[:output_format]}")
    end

    # Serializes the RDF graph of the processed spreadsheet to the output
    # file, registering a prefix for every known vocabulary and namespace.
    # Aborts when there is no RDF data. Returns self.
    def write_output
      filename = output_filename
      # Build the graph once and reuse it for the emptiness check.
      graph = @table.try(:to_rdf)
      abort 'No RDF data to write!' if graph.blank?
      puts "Writing #{graph.count} RDF statements to #{filename} ... "
      # TODO: base_uri: ... for writer constructor
      RDF::Writer.open(filename) do |writer|
        RDF::Vocabulary.each do |vocabulary|
          writer.prefix vocabulary.__prefix__, vocabulary.to_s
        end
        Namespace.namespace.each do |name, namespace|
          writer.prefix name.to_s.downcase, namespace.to_s
        end
        graph.each_statement { |statement| writer << statement }
      end
      self
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'zip'
2
+ #require 'zip/zipfilesystem'
3
+ # source (adapted to newer version of Roo and ruby-zip): https://gist.github.com/roblingle/1333908
4
+
5
+ # Easy access to xlsm files through the roo gem, version 1.10.0.
6
+ # The error that led me to write this is in the file below for google fodder. Not exactly sure what was causing
7
+ # the problem, so I'm not sure that this change won't break everything on your computer or summon zombies.
8
+ #
9
+ # Be sure to tell roo that you don't care about the extension mismatch:
10
+ # xl = Roo::Excelx.new("C:/path/to/spreadsheet_with_macro.xlsm", :zip, :warning)
11
+ #
12
class Roo::Excelx

  # Keep roo's own constructor reachable under another name.
  alias_method :old_initialize, :initialize

  # Remembers the file name exactly as given, then delegates to roo's
  # original constructor. The name is stored first because #extract_content
  # below reads it (presumably while the original constructor runs - confirm
  # against the roo version in use).
  def initialize(filename, options = {}) # , packed=nil, file_warning = :error)
    @original_file = filename
    old_initialize(filename, options)
  end

  # Extracts the files from the zip archive; replaces the method of the same
  # name in lib/roo/excelx.rb. The second argument is ignored in favour of
  # the file name remembered by the constructor.
  def extract_content(tmpdir, zipfilename_unused)
    source_file = @original_file
    Zip::File.open(source_file) do |archive|
      process_zipfile(tmpdir, source_file, archive)
    end
  end

end
@@ -0,0 +1,14 @@
1
module Spread2RDF
  # Stateless helper functions, callable as Helper.resource_name(...).
  module Helper

    module_function

    # Turns a label into a resource name: every ", " sequence and every
    # remaining space is replaced by a dash.
    #
    # TODO: include this in the MappingContext(s)
    def resource_name(string)
      without_comma_separators = string.gsub(', ', '-')
      without_comma_separators.tr(' ', '-')
    end

  end
end
@@ -0,0 +1,50 @@
1
module Spread2RDF
  # Registry of the namespaces available to a mapping. Namespaces can be
  # looked up with Namespace[:name], or as constants of Namespace (NS) and
  # Spread2RDF through const_missing.
  module Namespace
    class << self
      # Looks up a namespace by name: first among the registered namespaces,
      # then among the constants of the RDF module.
      #
      # Returns nil when nothing is found. Names that are not valid constant
      # names (e.g. :foo) are never looked up in RDF, because
      # Module#const_defined? raises a NameError for them.
      def [](name)
        name = name.to_sym
        self.namespace[name] || rdf_constant(name)
      end

      # Registers a namespace under the given name.
      #
      # namespace - an RDF::Vocabulary, or a String / RDF::URI that is wrapped
      #             in a new RDF::Vocabulary.
      #
      # Raises ArgumentError for any other kind of value.
      def []=(name, namespace)
        name = name.to_sym
        self.namespace[name] =
          case namespace
          when RDF::Vocabulary then namespace
          when String, RDF::URI then RDF::Vocabulary.new(namespace)
          else raise ArgumentError, "expecting a namespace but got #{namespace}:#{namespace.class}"
          end
      end

      # The Hash of registered namespaces, keyed by name (as Symbol).
      def namespace
        @namespace ||= {}
      end

      # All registered namespaces, without their names.
      def namespaces
        namespace.values
      end

      # Resolves a namespace descriptor: a Symbol is looked up with
      # Namespace[]; a vocabulary, URI or String is returned as a String.
      #
      # Raises a RuntimeError for any other kind of descriptor.
      def resolve_to_namespace(namespace_descriptor)
        case namespace_descriptor
        when Symbol
          Namespace[namespace_descriptor]
        when RDF::Vocabulary, RDF::URI, String
          namespace_descriptor.to_s
        else
          raise "invalid namespace: #{namespace_descriptor.inspect}"
        end
      end

      # Makes namespaces available as constants of this module.
      def const_missing(name)
        self[name] or super
      end

      private

      # Returns the constant of the RDF module with the given name, or nil
      # when the name is no valid constant name or no such constant exists.
      def rdf_constant(name)
        return nil unless name.to_s =~ /\A[A-Z]\w*\z/
        (RDF.const_defined?(name) && RDF.const_get(name)) || nil
      end
    end
  end
  NS = Namespace

  # Makes namespaces available as constants of Spread2RDF as well.
  def self.const_missing(name)
    Namespace[name] or super
  end

end
@@ -0,0 +1,48 @@
1
module Spread2RDF
  class Spreadsheet
    class Sheet
      # A column of a sheet, together with the rules for mapping the values of
      # its cells (predicate, object and statement specification).
      class Column < Element

        attr_reader :coord # this is set by Worksheet#index_columns!

        self.attributes = {
            predicate:  nil,
            object:     nil,
            statement:  nil
        }

        def initialize(sheet, options = {}, &block)
          super
        end

        alias sheet parent

        # The nearest ancestor that is a Worksheet, or nil if there is none.
        def worksheet
          ancestor = parent
          ancestor = ancestor.parent until ancestor.nil? || ancestor.is_a?(Worksheet)
          ancestor
        end

        # Maps the cell(s) of this column in the given row(s).
        #
        # range   - a row number (Integer) or a Range of row numbers.
        # context - the parent mapping context; its subject becomes the
        #           subject of the cell mapping.
        #
        # The mapping context of a cell is created only once and cached in
        # worksheet.cell_mapping; later calls reuse the cached context.
        #
        # Returns the mapped object for an Integer, or an Array of mapped
        # objects for a Range. Raises ArgumentError for anything else.
        def map(range, context)
          #puts "mapping #{self} in #{range} ..."
          case range
          when Integer
            cell = Coord[row: range, column: coord]
            # Assign the result of `||=`, not its right-hand side: on a cache
            # hit the right-hand side is not evaluated at all.
            mapping = (worksheet.cell_mapping[cell.to_sym] ||=
              create_context(context, row: range,
                             subject: context.subject, predicate: predicate))
            mapping.object
          when Range
            range.map { |row| map(row, context) }
          else
            raise ArgumentError, "expecting an Integer or a Range, but got #{range.inspect}"
          end
        end

        def to_s
          "#{super} of #{sheet}"
        end

      end
    end
  end
end
@@ -0,0 +1,156 @@
1
module Spread2RDF
  class Spreadsheet
    class Sheet
      class Column
        # Mapping context of a single cell: reads the cell value for its row
        # and column, maps it to an RDF object and generates the statements
        # specified for the column.
        class MappingContext < Spreadsheet::MappingContext

          self.attributes = {
              subject:    nil,
              predicate:  nil,
              row:        nil
          }

          alias column element
          alias property predicate

          attr_reader :value

          # Reads the cell value and, unless it is blank, generates the
          # statements for it and adds this context's graph to the graph of
          # the worksheet.
          def initialize(sheet, parent_context = nil, attr = {})
            super
            @value = cell_value(row: row, column: column.coord)
            return if @value.blank?
            statements_to_object
            worksheet.graph << self.graph
          end

          # The coordinates of the mapped cell.
          def cell_coord
            Coord[row: row, column: column.coord]
          end

          # The subject given to this context, or else the subject of the
          # parent context.
          def subject
            @subject or parent_context.try(:subject)
          end

          # The RDF object the cell value is mapped to (memoized); nil or
          # false when the cell has no value.
          def object
            @object ||= @value && map_to_object(value)
          end

          # The value of the cell in the same row of the column +name+.
          # Raises a RuntimeError when the sheet has no such column.
          def value_of_column(name)
            other_column = sheet.column[name]
            raise "couldn't find column #{name} when mapping #{column}" if
                other_column.nil?
            cell_value(row: row, column: other_column.coord)
          end

          # The mapped object of the cell in the same row of the column
          # +name+. Raises a RuntimeError when the sheet has no such column.
          def object_of_column(name)
            other_column = sheet.column[name]
            raise "couldn't find column #{name} when mapping #{column}" if
                other_column.nil?
            cell(row: row, column: other_column.coord).object
          end

          ######################################################################
          # Value-to-object mapping

          private

          # Maps the cell value to an object according to the mapping mode of
          # the column. The :custom mode is not implemented yet and yields nil.
          def map_to_object(value)
            case object_mapping_mode
              when :to_string
                value
              when :resource_ref
                resolve_resource_ref
              when :new_resource
                create_resource_object
              when :custom
                # TODO execute a mapping block in the context of Column::MappingContext
              else
                raise 'internal error: unknown column mapping type'
            end
          end

          # Derives the mapping mode from the object specification of the
          # column: none -> :to_string, a Proc -> :custom, a :uri entry ->
          # :new_resource, a :from entry -> :resource_ref.
          def object_mapping_mode
            case
              when column.object.nil?           then :to_string
              when column.object.is_a?(Proc)    then :custom
              when !column.object[:uri].nil?    then :new_resource
              when !column.object[:from].nil?   then :resource_ref
              else
                raise "mapping specification error: don't know how to map #{column}"
            end
          end

          # Finds the resource of the source worksheet that has the cell value
          # as object of the source predicate (rdfs:label by default).
          #
          # Raises when the source worksheet is unknown, or when no resource
          # or more than one resource matches.
          def resolve_resource_ref
            source = column.object[:from]
            # A bare Symbol is shorthand for { worksheet: symbol }.
            source = { worksheet: source } if source.is_a? Symbol
            raise ArgumentError, "expecting a Hash as source, but got #{source}" unless source.is_a? Hash
            source_worksheet = source[:worksheet]
            source_worksheet = spreadsheet.worksheet[source_worksheet]
            raise "#{column}: couldn't find source worksheet #{source[:worksheet]}" if source_worksheet.nil?
            source_predicate = source[:predicate] || RDF::RDFS.label
            result = source_worksheet.graph.query([nil, source_predicate, value])
            raise "#{column}: couldn't find a resource for #{value} in #{source_worksheet}" if result.empty?
            raise "#{column}: found multiple resources for #{value} in #{source_worksheet}: #{result.map(&:subject)}" if result.count > 1
            result.first.subject
          end

          # Creates a new resource as object; only blank nodes (uri: :bnode)
          # are supported so far.
          #
          # NOTE(review): the `|| object` fallback would re-enter #object
          # while it is being computed; it looks unreachable because this
          # method is only called when the :uri entry is set - confirm.
          #
          # TODO: Should we reuse/share mapping logic with Sheet::MappingContext (#subject etc.)?
          def create_resource_object
            case
              when (column.object.try(:fetch, :uri, nil) || object) == :bnode
                RDF::Node.new
              else
                raise NotImplementedError
            end
          end


          ######################################################################
          # Statement mapping

          # Decides how statements are generated: :ignore when the column
          # specifies statement: :none or has no predicate, :restriction when
          # a restriction is specified, :default otherwise.
          def statement_mapping_mode
            case
              when column.statement == :none  then :ignore
              when column.predicate.nil?      then :ignore
              when restriction_mode           then :restriction
              else :default
            end
          end

          # The restriction property to use: owl:hasValue for
          # statement: :restriction, the :restriction entry of a Hash
          # specification, nil otherwise.
          def restriction_mode
            restriction_mode = column.statement
            case restriction_mode
              when :restriction then RDF::OWL.hasValue
              when Hash         then restriction_mode[:restriction]
              else nil
            end
          end

          # Generates the statements for the mapped object and afterwards
          # executes the block of the column, if any, with the cell value.
          def statements_to_object
            case statement_mapping_mode
              when :default
                statement(subject, predicate, object)
              when :restriction
                restriction_class = RDF::Node.new
                statements(
                    [ subject, RDF::RDFS.subClassOf, restriction_class ],
                    [ restriction_class, RDF.type, RDF::OWL.Restriction ],
                    [ restriction_class, RDF::OWL.onProperty, predicate ],
                    [ restriction_class, restriction_mode, object ]
                )
            end
            exec(value, &column.block) if column.block
          end

          # Evaluates the block in the context of this mapping context,
          # passing the cell value as block argument.
          def exec(value, &block)
            #puts "executing block of #{@___column___} in row #{row}"
            self.instance_exec(value, &block)
          end

        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
module Spread2RDF
  class Spreadsheet
    # Coordinates of a spreadsheet cell: a column (letters) and a row number.
    class Coord < Struct.new(:column, :row)
      # Creates coordinates from one of
      #
      #   Coord.new('A', 12)             # column and row
      #   Coord.new(column: 'A', row: 12)
      #   Coord.new(:A12) / Coord.new('A12')
      #
      # Raises a RuntimeError for a String/Symbol that is not made of column
      # letters followed by a row number, and an ArgumentError for any other
      # kind or number of arguments.
      def initialize(*args)
        case args.length
          when 2 then super
          when 1
            case spec = args.first
              when Hash
                super(spec[:column], spec[:row])
              when Symbol, String
                # Letters only for the column and anchored on both ends: a
                # greedy \w+ would swallow leading row digits ("A12" would
                # become column "A1", row 2).
                match = /\A([A-Za-z]+)(\d+)\z/.match(spec.to_s)
                raise "Invalid cell coordinates #{spec}" if match.nil?
                super(match[1], match[2].to_i)
              else raise ArgumentError, "can't handle argument #{spec}"
            end
          else
            raise ArgumentError,
                  "expecting 1 or 2 arguments, but got #{args.length}: #{args}"
        end
      end

      # The column as the number Roo::Base.letter_to_number yields for it.
      def column_as_number
        Roo::Base.letter_to_number(column)
      end

      # The column number reduced by one.
      def column_as_index
        column_as_number - 1
      end

      # The letters of the column +count+ columns after this one.
      def increment_column(count = 1)
        self.class.increment_column(self.column, count)
      end

      def to_s
        "#{column}#{row}"
      end

      def to_sym
        to_s.to_sym
      end

      class << self
        alias [] new

        # The letters of the column +count+ columns after +column+.
        def increment_column(column, count=1)
          Roo::Base.number_to_letter(Roo::Base.letter_to_number(column) + count)
        end
      end

    end
  end
end
@@ -0,0 +1,53 @@
1
module Spread2RDF
  class Spreadsheet
    # Base class of the elements of a spreadsheet schema: knows its parent
    # element, an optional block and the attributes `name` and `source_name`.
    class Element
      include Attributes

      self.attributes = { name: nil, source_name: nil }

      attr_reader :parent, :block

      # parent - the enclosing element
      # attr   - initial attribute values, handed to init_attributes
      # block  - optional block, stored for later use
      def initialize(parent, attr={}, &block)
        @block  = block
        @parent = parent
        init_attributes(attr)
      end

      # Intentionally empty.
      def init
      end

      # The name as Symbol; falls back to the source name.
      def name
        (@name || @source_name).try(:to_sym)
      end

      # The source name as String; falls back to the name.
      def source_name
        (@source_name || @name).try(:to_s)
      end

      # The spreadsheet this element belongs to, as reported by the parent.
      def spreadsheet
        parent.spreadsheet
      end

      # The unqualified class name followed by the name; the source name is
      # appended in parentheses when it differs from the name.
      def to_s
        label = self.name
        unless self.name.to_s == self.source_name.to_s
          label = "#{self.name} (#{self.source_name})"
        end
        kind = self.class.name.split('::').last
        "#{kind} #{label}"
      end

      private

      # Instantiates the MappingContext class found through the element's
      # own class for this element.
      def create_context(parent_context, attr)
        self.class.const_get(:MappingContext).new(self, parent_context, attr)
      end

    end
  end
end
53
+