dwcr 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ module DwCR
5
+ module Metaschema
6
+
7
+ # This class represents _field_ nodes in the DarwinCoreArchive
8
+     # that correspond to columns in tables of the DwCA schema
9
+ # and where applicable the ContentFile instances
10
+ # * +type+: the column type of the node
11
+ # default: 'string'
12
+ # * +name+: the name for the node
13
+ # default: term without namespace in underscore (snake case) form
14
+ # * +term+: the full term (a uri), including namespace for the node
15
+ # see http://rs.tdwg.org/dwc/terms/index.htm
16
+     # * +default+: the default value for columns in the
17
+ # DwCA schema table
18
+ # * +index+: the column index in the ContentFile instances
19
+ # associated with the Attribute instances' parent Entity instance
20
+ # * +max_content_length+: the maximum string length of values
21
+ # in the corresponding column in the ContentFile instances
22
+ # associated with the Attribute instances' parent Entity instance
23
+ # * *#entity*:
24
+ # the Entity instance the Attribute instance belongs to
25
+ class Attribute < Sequel::Model
26
+ include XMLParsable
27
+
28
+ many_to_one :entity
29
+
30
+ # Returns the last component of the term
31
+ # will return the full term is the term is not unambiguous
32
+ def baseterm
33
+ unambiguous ? term&.split('/')&.last : term
34
+ end
35
+
36
+       # Returns the +name+ converted to a symbol; this symbol is
37
+       # the name of the column in the DarwinCoreArchive schema
38
+       def column_name
39
+         name.to_sym
40
+       end
41
+
42
+ # Reurns +true+ if the _field_ node is the foreign key in the
43
+ # _core_ or an _extension_ node in the DwCA schema, +false+ otherwise
44
+ def foreign_key?
45
+ index == entity.key_column && !entity.is_core
46
+ end
47
+
48
+ # Returns the maximum length for values in the corresponding column
49
+ # in the DwCA schema
50
+ # the returned value is the greater of either the length of the +default+
51
+ # or the +max_content_length+ or +nil+ if neither is set
52
+ def length
53
+ [default&.length, max_content_length].compact.max
54
+ end
55
+
56
+ # Returns an array that can be splatted as arguments into the
57
+ # Sequel::Schema::CreatTableGenerator#column method:
58
+ # <tt>[name, type, options]</tt>
59
+ def to_table_column
60
+ col_type = type ? type.to_sym : :string
61
+ [column_name, col_type, { index: index_options, default: default }]
62
+ end
63
+
64
+ alias length= max_content_length=
65
+
66
+ private
67
+
68
+ # Returns the index options for the Sequel database column
69
+ # by default DwCR does not persist the foreign key field of any _extension_
70
+ # nodes in the DwCA schema (as that relation is re-established through
71
+ # associations) using proper SQL foreign keys
72
+ # therefore the below else clause will never be reached in DwCR files
73
+ # generated from an existing DarwinCoreArchive
74
+ def index_options
75
+ return false unless index && index == entity.key_column
76
+ if entity.is_core
77
+ { unique: true }
78
+ else
79
+ true
80
+ end
81
+ end
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ module DwCR
5
+ #
6
+ module Metaschema
7
+ # This class represents _location_ nodes (children of the _files_ nodes)
8
+ # in DarwinCoreArchive, which represent csv files associated with a
9
+ # _core_ or _extension_ node
10
+ # * +name+:
11
+ # the basename of the file with extension (normally .csv)
12
+ # * +path+:
13
+ # the directory path where the file is located
14
+ # set this attribute to load content files from arbitrary directories
15
+ # * +is_loaded+:
16
+ # a flag that is set to +true+ when the ContentFile instance's
17
+ # contents have been loaded from the CSV file into the database
18
+ # * *#entity*:
19
+ # the Entity instance the ContentFile instance belongs to
20
+ class ContentFile < Sequel::Model
21
+ many_to_one :entity
22
+
23
+       # Returns the full file name: +path+ and +name+ joined with File.join
24
+       def file_name
25
+         File.join(path, name)
26
+       end
27
+
28
+ # Returns an array of symbols for column names for each column in the
29
+ # (headerless) CSV file specified in the +file+ attribute
30
+ # the array is mapped from the ContentFile instance's parent Entity
31
+ # instance's Attribute instances that have an +index+
32
+ # The array is sorted by the +index+
33
+ def content_headers
34
+ entity.attributes_dataset
35
+ .exclude(index: nil)
36
+ .exclude(type: nil)
37
+ .order(:index)
38
+ .map(&:column_name)
39
+ end
40
+
41
+ # Inserts all rows of the CSV file belonging to the ContentFile instance
42
+ # into the table of the DwCA represented by the instance's parent
43
+ # Entity instance
44
+ # Will raise an error for _extension_ instances
45
+ # if the _core_ instance is not loaded
46
+ def load
47
+ return if is_loaded
48
+ load_error = 'core needs to be loaded before extension files'
49
+ raise load_error unless entity.is_core || entity.core.loaded?
50
+ CSV.foreach(file_name) { |row| insert_row(row) }
51
+ self.is_loaded = true
52
+ save
53
+ is_loaded
54
+ end
55
+
56
+       # Deletes all rows of the CSV file belonging to the ContentFile instance
57
+       # from the table of the DwCA represented by the instance's parent
58
+       # Entity instance and resets the +is_loaded+ flag
59
+       # *Warning*: If this is called on a _core_ instance,
60
+       # this will also destroy any dependent _extension_ records!
61
+       def unload!
62
+         return unless is_loaded
63
+         CSV.foreach(file_name) { |row| delete_row(row) }
64
+         self.is_loaded = false
65
+         save
66
+       end
67
+
68
+ private
69
+
70
+ # removes all cells from a row for which the column in all
71
+ # associated CSV files is empty (the associated Attribute instance
72
+ # has <tt>type == nil</tt>)
73
+ def compact(row)
74
+ empty_cols = entity.attributes_dataset
75
+ .exclude(index: nil)
76
+ .where(type: nil)
77
+ .map(&:index)
78
+ row.delete_if.with_index { |_, index| empty_cols.include?(index) }
79
+ end
80
+
81
+ # Delets a CSV row from the DwCA table represented
82
+ # by the instance's parent Entity instance
83
+ def delete_row(row)
84
+ row_vals = row_to_hash row
85
+ row_vals.delete(entity.key) unless entity.is_core
86
+ rec = entity.model_get.first(row_vals)
87
+ rec.destroy
88
+ end
89
+
90
+ # Inserts a CSV row into the DwCA schema's _core_ table
91
+ def insert_into_core(row)
92
+ return unless entity.is_core
93
+ entity.model_get.create(row_to_hash(row))
94
+ end
95
+
96
+       # Inserts a CSV row into an _extension_ table via its _core_ record
97
+       def insert_into_extension(row)
98
+         row_vals = row_to_hash row
99
+         core_row(row_vals.delete(entity.key)).send(add_related, row_vals)
100
+       end
101
+
102
+       # Inserts a CSV row into the DwCA table represented by the instance's
103
+       # parent Entity instance, then adds the new record to that Entity
104
+       def insert_row(row)
105
+         rec = insert_into_core(row) || insert_into_extension(row)
106
+         entity.send(add_related, rec)
107
+       end
108
+
109
+       # Returns the first _core_ record whose key equals +foreign_key+
110
+       def core_row(foreign_key)
111
+         entity.core.model_get.first(entity.core.key => foreign_key)
112
+       end
113
+
114
+ # Returns a string that is the method name to add a related record via
115
+ # an association on the DwCA model
116
+ def add_related
117
+ 'add_' + entity.name.singularize
118
+ end
119
+
120
+ # Creates a hash from a headerless CSV row
121
+ # Entity instance's :attributes colum names are keys
122
+ # the CSV row cells are values
123
+ def row_to_hash(row)
124
+ keys = content_headers
125
+ vals = compact row
126
+ keys.zip(vals).to_h.select { |_k, v| v && !v.empty? }
127
+ end
128
+ end
129
+ end
130
+ end
@@ -0,0 +1,195 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ module DwCR
5
+ #
6
+ module Metaschema
7
+ # This class represents _core_ or _extension_ nodes in DarwinCoreArchive
8
+ # * +name+: the name for the node
9
+ # default: pluralized term without namespace
10
+ # in underscore (snake case) form
11
+ # * +term+: the full term (a uri), including namespace for the node
12
+ # see http://rs.tdwg.org/dwc/terms/index.htm
13
+ # * +is_core+:
14
+ # _true_ if the node is the core node of the DarwinCoreArchive,
15
+ # _false_ otherwise
16
+ # * +key_column+: the column in the node representing
17
+ # the primary key of the core node or
18
+     #   the foreign key in an extension node
19
+ # * +fields_enclosed_by+: directive to form CSV in :content_files
20
+ # default: '&quot;'
21
+ # * +fields_terminated_by+: fieldsTerminatedBy=","
22
+ # * +lines_terminated_by+: linesTerminatedBy="\r\n"
23
+ # * *#archive*:
24
+ # the Archive instance the Entity instance belongs to
25
+ # * *#attributes*:
26
+ # Attribute instances associated with the Entity
27
+ # * *#content_files*:
28
+ # ContentFile instances associated with the Entity
29
+ # * *#core*:
30
+ # if the Entity instance is an _extension_
31
+ # returns the Entity instance representing the _core_ node
32
+ # nil otherwise
33
+ # * *#extensions*:
34
+ # if the Entity instance is the _core_ node
35
+ # returns any Entity instances representing _extension_ nodes
36
+ class Entity < Sequel::Model
37
+ include XMLParsable
38
+
39
+ ensure_not_core = lambda do |ent, attr|
40
+ e = 'extensions must be associated with a core'
41
+ attr.is_core = false
42
+ raise ArgumentError, e unless ent.is_core
43
+ attr.archive = ent.archive
44
+ end
45
+
46
+ ensure_path = lambda do |ent, attr|
47
+ attr.path ||= ent.archive.path
48
+ end
49
+
50
+ ensure_unique_name = lambda do |ent, attr|
51
+ attr.name ||= attr.term&.split('/')&.last&.underscore
52
+ name_taken = ent.attributes_dataset.first(name: attr.name)
53
+ if name_taken
54
+ attr.name += '!'
55
+ attr.unambiguous = false
56
+ name_taken.unambiguous = false
57
+ name_taken.save
58
+ end
59
+ end
60
+
61
+ many_to_one :archive
62
+ one_to_many :attributes, before_add: ensure_unique_name
63
+ one_to_many :content_files, before_add: ensure_path
64
+ many_to_one :core, class: self
65
+ one_to_many :extensions, key: :core_id, class: self,
66
+ before_add: ensure_not_core
67
+
68
+ # Returns the last component of the term
69
+ def baseterm
70
+ term.split('/').last
71
+ end
72
+
73
+       # Returns the Entity instance's +name+ run through +classify+;
74
+       # this is the name of the Sequel::Model in the DarwinCoreArchive schema
75
+       def class_name
76
+         name.classify
77
+       end
78
+
79
+       # Returns an array with the +file_name+ (full name including the path)
80
+       # of every associated ContentFile instance
81
+       def files
82
+         content_files.map(&:file_name)
83
+       end
84
+
85
+ # Returns *true* if all _content_files_ have been loaded,
86
+ # _false_ otherwise
87
+ def loaded?
88
+ loaded_files = content_files_dataset.where(is_loaded: true)
89
+ return true if loaded_files.count == content_files.size
90
+ loaded_files.empty? ? false : loaded_files.map(&:file_name)
91
+ end
92
+
93
+       # Returns the +foreign_key+ form of the +class_name+ as a symbol
94
+       def foreign_key
95
+         class_name.foreign_key.to_sym
96
+       end
97
+
98
+       # Returns a symbol for the +name+ of
99
+       # the associated Attribute instance whose +index+ is the key_column
100
+       # of the DarwinCoreArchive node the Entity represents
101
+       def key
102
+         attributes_dataset.first(index: key_column).name.to_sym
103
+       end
104
+
105
+ # Returns an array of parameters for all associations of the Sequel::Model
106
+ # in the DarwinCoreArchive schema that the Entity represents
107
+ # each set of parameters is an array
108
+ # <tt>[association_type, association_name, options]</tt>
109
+ # that can be splattet as arguments into
110
+ # Sequel::Model::Associations::ClassMethods#associate
111
+ def model_associations
112
+ # add the assoc to Entity here
113
+ meta_assoc = [:many_to_one, :entity, { class: Entity }]
114
+ if is_core
115
+ a = extensions.map { |extension| association_with(extension) }
116
+ a.unshift meta_assoc
117
+ else
118
+ [meta_assoc, association_with(core)]
119
+ end
120
+ end
121
+
122
+ # Returns the constant with module name for the Entity instance
123
+ # this is the constant of the Sequel::Model
124
+ # in the DarwinCoreArchive schema
125
+ def model_get
126
+ modelname = 'DwCR::' + class_name
127
+ modelname.constantize
128
+ end
129
+
130
+       # Returns a symbol for the pluralized +name+ that is
131
+       # the name of the table in the DarwinCoreArchive schema
132
+       def table_name
133
+         name.tableize.to_sym
134
+       end
135
+
136
+ # Analyzes the Entity instance's content_files
137
+ # for any parameters given as modifiers and updates the
138
+ # asssociated _attributes_ with the new values
139
+ # * _:length_ or _'length'_
140
+ # will update the Attribute instances' +max_content_length+
141
+ # * _:type_ or _'type'_
142
+ # will update the Attribute instances' +type+ attributes
143
+ def update_attributes!(*modifiers)
144
+ DwCAContentAnalyzer::FileSet.new(files, modifiers).columns.each do |cp|
145
+ column = attributes_dataset.first(index: cp[:index])
146
+ modifiers.each { |m| column.send(m.to_s + '=', cp[m]) }
147
+ column.save
148
+ end
149
+ end
150
+
151
+ # Methods to add records to the :attributes and
152
+       # :content_files associations from xml
153
+
154
+ # Creates a Attribute instance from an xml node (_field_)
155
+ # given that the instance has not been previously defined
156
+ # if an instance has been previously defined, it will be updated
157
+ def add_attribute_from(xml)
158
+ attribute = attributes_dataset.first(term: term_from(xml))
159
+ attribute ||= add_attribute(values_from(xml, :term, :index, :default))
160
+ attribute.update_from(xml, :index, :default)
161
+ end
162
+
163
+       # Creates a ContentFile instance for each file found in an xml node
164
+       # a +path+ can be given to add files from arbitrary directories
165
+       # +name+ is parsed from the _location_ node
166
+       def add_files_from(xml, path: nil)
167
+         files_from(xml).each { |file| add_content_file(name: file, path: path) }
168
+       end
169
+
170
+ private
171
+
172
+       # Sequel Model hook: requires an Archive, defaults +name+ from +term+
173
+       def before_create
174
+         e = 'Entity instances need to belong to a Archive'
175
+         raise ArgumentError, e unless archive
176
+         self.name ||= term&.split('/')&.last&.underscore
177
+         super
178
+       end
179
+
180
+ # Returns an array that can be splattet as arguments into the
181
+ # Sequel::Model::Associations::ClassMethods#associate method:
182
+ # <tt>[association_type, association_name, options]</tt>
183
+ def association_with(entity)
184
+ options = { class: entity.class_name, class_namespace: 'DwCR' }
185
+ if is_core
186
+ options[:key] = foreign_key
187
+ [:one_to_many, entity.table_name, options]
188
+ else
189
+ options[:key] = entity.foreign_key
190
+ [:many_to_one, entity.name.singularize.to_sym, options]
191
+ end
192
+ end
193
+ end
194
+ end
195
+ end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'xml_parsable'
4
+
5
+ #
6
+ module DwCR
7
+ # This module provides functionality to create, update, and verify the
8
+ # part of the DwCR schema needed to persist information about the DwCR
9
+ module Metaschema
10
+ # Creates the tables for the metaschema present in every DwCR
11
+ # _archives_, _entities_, _attributes_, _content_files_
12
+ # loads the Sequel::Model classes for these tables
13
+ def self.create
14
+ tabledefs = Psych.load_file(File.join(__dir__, 'metaschema_tables.yml'))
15
+ tabledefs.to_h.each do |table, columns|
16
+ DB.create_table? table do
17
+ primary_key :id
18
+ columns.each { |c| column(*c) }
19
+ end
20
+ end
21
+ load_models
22
+ end
23
+
24
+     # Requires the files defining the metaschema's Sequel::Model classes
25
+     def self.load_models
26
+       require_relative 'archive'
27
+       require_relative 'entity'
28
+       require_relative 'attribute'
29
+       require_relative 'content_file'
30
+     end
31
+
32
+     # Returns schema or index parameters for a table, depending on the
33
+     # second argument (+:schema+ or +:indexes+)
34
+     # returns +false+ if the database raises a Sequel::Error
35
+     def self.inspect_table(table, method)
36
+       # query only what was asked for (+DB.schema+ or +DB.indexes+)
37
+       DB.send(method, table)
38
+     rescue Sequel::Error
39
+       false
40
+     end
41
+
42
+ # Performs an integrety check on +table+, veryfies all columns
43
+ # are present with the parameters given in +columns+;
44
+ # a column parameter is an array with the structure:
45
+ # <tt>[:column_name, :column_type, {column_options} ]</tt>
46
+ def self.columns?(table, *columns)
47
+ db_cols = inspect_table(table, :schema)
48
+ return unless db_cols
49
+ exp_cols = columns.map(&:first).unshift(:id)
50
+ exp_cols == db_cols.map(&:first)
51
+ end
52
+
53
+ # Performs an integrety check on +table+, veryfies all indices
54
+ # are present with the parameters given in +columns+;
55
+ # a column parameter is an array with the structure:
56
+ # <tt>[:column_name, :column_type, {column_options} ]</tt>
57
+ def self.indexes?(table, *columns)
58
+ db_idxs = inspect_table(table, :indexes)
59
+ return unless db_idxs
60
+ exp_idxs = columns.select { |column| column[2]&.fetch(:index, false) }
61
+ .map(&:first)
62
+ exp_idxs & db_idxs.values.map { |x| x[:columns] }.flatten == exp_idxs
63
+ end
64
+
65
+ # Updates all Attribute instances in a Archive
66
+ # with parameters from files in ContentFile
67
+ # _schema_options_: a Hash with attribute names as keys and boolean values
68
+ # <tt>{ :type => true, :length => true }</tt>
69
+ # updates any attribute given as key where value is _true_
70
+ def self.update(archive, **options)
71
+ return if options.empty?
72
+
73
+ # FIXME: handle situation where schema tables have been created
74
+ options.select! { |_k, v| v == true }
75
+ archive.entities
76
+ .each { |entity| entity.update_attributes!(*options.keys) }
77
+ end
78
+
79
+ # Performs an integrity check on the metaschema in the DWCR file
80
+ # (the current database connection)
81
+ # returns true if all tables, columns, and indices as given in
82
+ # _config/metaschema_tables.yml_ are present
83
+ def self.valid?
84
+ tabledefs = Psych.load_file('lib/dwcr/metaschema/metaschema_tables.yml')
85
+ status = tabledefs.map do |td|
86
+ table = td.first
87
+ columns = td.last
88
+ columns?(table, *columns) && indexes?(table, *columns)
89
+ end
90
+ return false if status.uniq.size > 1
91
+ status.first
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,130 @@
1
+ ---
2
+ !ruby/sym archives:
3
+ -
4
+ - !ruby/sym core_id
5
+ - !ruby/sym integer
6
+ - !ruby/sym index:
7
+ true
8
+ -
9
+ - !ruby/sym name
10
+ - !ruby/sym string
11
+ -
12
+ - !ruby/sym path
13
+ - !ruby/sym string
14
+ -
15
+ - !ruby/sym xmlns
16
+ - !ruby/sym string
17
+ - !ruby/sym default:
18
+ 'http://rs.tdwg.org/dwc/text/'
19
+ -
20
+ - !ruby/sym xmlns__xs
21
+ - !ruby/sym string
22
+ - !ruby/sym default:
23
+ 'http://www.w3.org/2001/XMLSchema'
24
+ -
25
+ - !ruby/sym xmlns__xsi
26
+ - !ruby/sym string
27
+ - !ruby/sym default:
28
+ 'http://www.w3.org/2001/XMLSchema-instance'
29
+ -
30
+ - !ruby/sym xsi__schema_location
31
+ - !ruby/sym string
32
+ - !ruby/sym default:
33
+ 'http://rs.tdwg.org/dwc/text/ http://rs.tdwg.org/dwc/text/tdwg_dwc_text.xsd'
34
+ !ruby/sym attributes:
35
+ -
36
+ - !ruby/sym entity_id
37
+ - !ruby/sym integer
38
+ - !ruby/sym index:
39
+ true
40
+ -
41
+ - !ruby/sym type
42
+ - !ruby/sym string
43
+ - !ruby/sym default:
44
+ string
45
+ -
46
+ - !ruby/sym name
47
+ - !ruby/sym string
48
+ -
49
+ !ruby/sym index: true
50
+ !ruby/sym null: false
51
+ -
52
+ - !ruby/sym term
53
+ - !ruby/sym string
54
+ -
55
+ - !ruby/sym default
56
+ - !ruby/sym string
57
+ -
58
+ - !ruby/sym index
59
+ - !ruby/sym integer
60
+ -
61
+ - !ruby/sym max_content_length
62
+ - !ruby/sym integer
63
+ -
64
+ - !ruby/sym unambiguous
65
+ - !ruby/sym boolean
66
+ - !ruby/sym default:
67
+ true
68
+
69
+
70
+ !ruby/sym entities:
71
+ -
72
+ - !ruby/sym archive_id
73
+ - !ruby/sym integer
74
+ - !ruby/sym index:
75
+ true
76
+ -
77
+ - !ruby/sym core_id
78
+ - !ruby/sym integer
79
+ - !ruby/sym index:
80
+ true
81
+ -
82
+ - !ruby/sym name
83
+ - !ruby/sym string
84
+ - !ruby/sym null:
85
+ false
86
+ -
87
+ - !ruby/sym term
88
+ - !ruby/sym string
89
+ -
90
+ - !ruby/sym is_core
91
+ - !ruby/sym boolean
92
+ -
93
+ - !ruby/sym key_column
94
+ - !ruby/sym integer
95
+ -
96
+ - !ruby/sym fields_enclosed_by
97
+ - !ruby/sym string
98
+ - !ruby/sym default:
99
+ '&quot;'
100
+ -
101
+ - !ruby/sym fields_terminated_by
102
+ - !ruby/sym string
103
+ - !ruby/sym default:
104
+ ','
105
+ -
106
+ - !ruby/sym lines_terminated_by
107
+ - !ruby/sym string
108
+ - !ruby/sym default:
109
+ '\r\n'
110
+
111
+ !ruby/sym content_files:
112
+ -
113
+ - !ruby/sym entity_id
114
+ - !ruby/sym integer
115
+ - !ruby/sym index:
116
+ true
117
+ -
118
+ - !ruby/sym name
119
+ - !ruby/sym string
120
+ - !ruby/sym null:
121
+ false
122
+ -
123
+ - !ruby/sym path
124
+ - !ruby/sym string
125
+ -
126
+ - !ruby/sym is_loaded
127
+ - !ruby/sym boolean
128
+ - !ruby/sym default:
129
+ false
130
+