taxonifi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +155 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/assessor/assessor.rb +31 -0
- data/lib/assessor/base.rb +17 -0
- data/lib/assessor/row_assessor.rb +131 -0
- data/lib/export/export.rb +9 -0
- data/lib/export/format/base.rb +43 -0
- data/lib/export/format/species_file.rb +341 -0
- data/lib/lumper/lumper.rb +334 -0
- data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
- data/lib/models/author_year.rb +39 -0
- data/lib/models/base.rb +73 -0
- data/lib/models/collection.rb +92 -0
- data/lib/models/generic_object.rb +15 -0
- data/lib/models/geog.rb +59 -0
- data/lib/models/geog_collection.rb +28 -0
- data/lib/models/name.rb +206 -0
- data/lib/models/name_collection.rb +149 -0
- data/lib/models/person.rb +49 -0
- data/lib/models/ref.rb +85 -0
- data/lib/models/ref_collection.rb +106 -0
- data/lib/models/species_name.rb +85 -0
- data/lib/splitter/builder.rb +26 -0
- data/lib/splitter/lexer.rb +70 -0
- data/lib/splitter/parser.rb +54 -0
- data/lib/splitter/splitter.rb +45 -0
- data/lib/splitter/tokens.rb +322 -0
- data/lib/taxonifi.rb +36 -0
- data/test/file_fixtures/Lygaeoidea.csv +801 -0
- data/test/helper.rb +38 -0
- data/test/test_exporter.rb +32 -0
- data/test/test_lumper_geogs.rb +59 -0
- data/test/test_lumper_hierarchical_collection.rb +88 -0
- data/test/test_lumper_names.rb +119 -0
- data/test/test_lumper_parent_child_name_collection.rb +41 -0
- data/test/test_lumper_refs.rb +91 -0
- data/test/test_parser.rb +34 -0
- data/test/test_splitter.rb +27 -0
- data/test/test_splitter_tokens.rb +403 -0
- data/test/test_taxonifi.rb +11 -0
- data/test/test_taxonifi_accessor.rb +61 -0
- data/test/test_taxonifi_geog.rb +51 -0
- data/test/test_taxonifi_name.rb +186 -0
- data/test/test_taxonifi_name_collection.rb +158 -0
- data/test/test_taxonifi_ref.rb +90 -0
- data/test/test_taxonifi_ref_collection.rb +69 -0
- data/test/test_taxonifi_species_name.rb +95 -0
- metadata +167 -0
@@ -0,0 +1,341 @@
|
|
1
|
+
|
2
|
+
module Taxonifi::Export

  # Dumps tables identical to the existing structure in SpeciesFile.
  # Will only work in the pre Identity world. Will reconfigure
  # as templates for Jim's work after the fact.
  class SpeciesFile < Taxonifi::Export::Base

    # Map of rank name => SpeciesFile RankID, copied from tblRanks 5/17/2012.
    # NOTE(review): 'suborder' => 8 breaks the otherwise ascending sequence
    # (a value between 45 and 50 would be expected there) -- confirm against tblRanks.
    SPECIES_FILE_RANKS = {
      'subspecies' => 5,
      'species' => 10,
      'species subgroup' => 11,
      'species group' => 12,
      'species series' => 14,
      'infragenus' => 16,
      'subgenus' => 18,
      'genus' => 20,
      'genus group' => 22,
      'subtribe' => 28,
      'tribe' => 30,
      'supertribe' => 32,
      'infrafamily' => 36,
      'subfamily' => 38,
      'subfamily group' => 39,
      'family' => 40,
      'epifamily' => 41,
      'superfamily' => 42,
      'superfamily group' => 44,
      'subinfraordinal group' => 45,
      'infraorder' => 46,
      'suborder' => 8,
      'order' => 50,
      'mirorder' => 51,
      'superorder' => 52,
      'magnorder' => 53,
      'cohort' => 54,
      'supercohort' => 55,
      'infraclass' => 56,
      'subclass' => 58,
      'class' => 60,
      'superclass' => 62,
      'infraphylum' => 66,
      'subphylum' => 68,
      'phylum' => 70,
      'superphylum' => 72,
      'infrakingdom' => 76,
      'subkingdom' => 78,
      'kingdom' => 80,
      'superkingdom' => 82,
      'life' => 90,
      'unknown' => 100
    }

    # The Taxonifi::Model::NameCollection being exported.
    attr_accessor :name_collection
    # NOTE(review): declared but never assigned in this file; the code reads
    # @name_collection.ref_collection instead -- confirm whether this is still used.
    attr_accessor :ref_collection
    # Hash of Person#compact_string => Person, built by #build_author_index.
    attr_accessor :author_index
    # Name-string => id registries filled during export; @nomenclator maps a
    # nomenclator string => NomenclatorID (assigned in #tblNomenclator).
    attr_accessor :genus_names, :species_names, :nomenclator
    # SF user id stamped into every ModifiedBy column; export timestamp for LastUpdate.
    attr_accessor :authorized_user_id, :time

    # MANIFEST order is important: tblGenusNames/tblSpeciesNames assign the name ids
    # that tblNomenclator reads, and tblNomenclator fills @nomenclator for tblCites.
    MANIFEST = %w{tblTaxa tblRefs tblPeople tblRefAuthors tblGenusNames tblSpeciesNames tblNomenclator tblCites}
+
|
64
|
+
# Build a SpeciesFile exporter.
#
# @param options [Hash]
# @option options [Taxonifi::Model::NameCollection] :nc the collection to export
# @option options [String] :export_folder folder name written into
# @option options [Integer] :authorized_user_id SF user id stamped into ModifiedBy
# @raise [Taxonifi::Export::ExportError] when :nc is not a NameCollection or
#   :authorized_user_id is not provided
def initialize(options = {})
  opts = {
    :nc => Taxonifi::Model::NameCollection.new,
    :export_folder => 'species_file',
    :authorized_user_id => nil
  }.merge!(options)

  super(opts)
  # BUG FIX: the original `if ! opts[:nc].class == Taxonifi::Model::NameCollection`
  # parsed as `(!opts[:nc].class) == ...`, which is always false, so the guard
  # could never fire. Use an explicit type check instead.
  raise Taxonifi::Export::ExportError, 'NameCollection not passed to SpeciesFile export.' unless opts[:nc].kind_of?(Taxonifi::Model::NameCollection)
  raise Taxonifi::Export::ExportError, 'You must provide authorized_user_id for species_file export initialization.' if opts[:authorized_user_id].nil?
  @name_collection = opts[:nc]
  @authorized_user_id = opts[:authorized_user_id]
  @author_index = {}

  #
  # Careful here, at present we are just generating Reference micro-citations from our names, so the indexing "just works"
  # because it's all internal. There is a strong potential for key collisions if this pipeline is modified to
  # include references external to the initialized name_collection. See also export_references.
  #
  @by_author_reference_index = {}
  @genus_names = {}
  @species_names = {}
  @nomenclator = {}
  @time = Time.now.strftime("%F %T")
end
|
89
|
+
|
90
|
+
# Export only the ref_collection. Sidesteps the main name-centric exports.
# Note that this still uses the base @name_collection object as a starting reference,
# it just references @name_collection.ref_collection. So you can do:
#   nc = Taxonifi::Model::NameCollection.new
#   nc.ref_collection = Taxonifi::Model::RefCollection.new
#   etc.
#
# @param options [Hash]
# @option options [Integer] :starting_ref_id
# @option options [Integer] :starting_author_id
def export_references(options = {})
  opts = {
    :starting_ref_id => 0,
    :starting_author_id => 0
  }.merge!(options) # BUG FIX: options was built but never merged, so caller overrides were silently ignored
  # NOTE(review): opts is not yet read below -- the starting ids appear to be
  # unimplemented; confirm intended use before wiring them in.

  configure_folders
  build_author_index

  # order matters
  ['tblPeople', 'tblRefs', 'tblRefAuthors', 'sqlRefs' ].each do |t|
    write_file(t, send(t))
  end
end
|
110
|
+
|
111
|
+
# Assumes names that are the same are the same person.
# Indexes every unique author of the collection's references by Person#compact_string.
def build_author_index
  index = {}
  @name_collection.ref_collection.unique_authors.each do |person|
    index[person.compact_string] = person
  end
  @author_index = index
end
|
115
|
+
|
116
|
+
# Run the full export: prime the reference/author indices, then write every
# table in MANIFEST to the export folder. MANIFEST order matters -- later
# tables read the registries populated by earlier ones.
def export()
  super
  # Generate micro-citations from the names themselves (see #initialize notes).
  @name_collection.generate_ref_collection(1)

  # Give authors unique ids
  @name_collection.ref_collection.uniquify_authors(1)
  build_author_index

  # See notes in #initialize re potential key collisions!
  @by_author_reference_index = @name_collection.ref_collection.collection.inject({}){|hsh, r| hsh.merge!(r.author_year_index => r)}

  # Seed the name => id registries with nil ids; the real ids are assigned
  # when tblGenusNames / tblSpeciesNames are generated.
  @name_collection.names_at_rank('genus').inject(@genus_names){|hsh, n| hsh.merge!(n.name => nil)}
  @name_collection.names_at_rank('subgenus').inject(@genus_names){|hsh, n| hsh.merge!(n.name => nil)}
  @name_collection.names_at_rank('species').inject(@species_names){|hsh, n| hsh.merge!(n.name => nil)}
  @name_collection.names_at_rank('subspecies').inject(@species_names){|hsh, n| hsh.merge!(n.name => nil)}

  MANIFEST.each do |f|
    write_file(f, send(f))
  end
end
|
136
|
+
|
137
|
+
# Generate a tblTaxa CSV string, one row per name in the collection.
# Reads @by_author_reference_index (built in #export) to resolve RefID.
def tblTaxa
  @headers = %w{TaxonNameID TaxonNameStr RankID Name Parens AboveID RefID DataFlags AccessCode NameStatus StatusFlags OriginalGenusID LastUpdate ModifiedBy}
  @csv_string = CSV.generate() do |csv|
    csv << @headers
    @name_collection.collection.each do |n|
      ref = @by_author_reference_index[n.author_year_index]
      cols = {
        TaxonNameID: n.id,
        TaxonNameStr: n.parent_ids_sf_style, # closure -> ends with 1
        RankID: SPECIES_FILE_RANKS[n.rank],
        Name: n.name,
        Parens: (n.parens ? 1 : 0),
        AboveID: (n.related_name.nil? ? (n.parent ? n.parent.id : 0) : n.related_name.id), # !! SF folks like to pre-populate with zeros
        RefID: (ref ? ref.id : 0),
        DataFlags: 0, # see http://software.speciesfile.org/Design/TaxaTables.aspx#Taxon, a flag populated when data is reviewed, initialize to zero
        AccessCode: 0,
        NameStatus: (n.related_name.nil? ? 0 : 7), # 0: valid, 7: synonym
        StatusFlags: (n.related_name.nil? ? 0 : 262144), # 0: valid, 262144: jr. synonym
        OriginalGenusID: (!n.parens && n.parent_at_rank('genus') ? n.parent_at_rank('genus').id : 0), # SF must be pre-configured with 0 filler (this restriction needs to go)
        LastUpdate: @time,
        ModifiedBy: @authorized_user_id,
      }
      csv << @headers.collect{|h| cols[h.to_sym]}
    end
  end
  @csv_string
end
|
164
|
+
|
165
|
+
# Generate a tblRefs string.
|
166
|
+
def tblRefs
|
167
|
+
@headers = %w{RefID ActualYear Title PubID Verbatim}
|
168
|
+
@csv_string = CSV.generate(:col_sep => "\t") do |csv|
|
169
|
+
csv << @headers
|
170
|
+
@name_collection.ref_collection.collection.each_with_index do |r,i|
|
171
|
+
cols = {
|
172
|
+
RefID: r.id, # i + 1,
|
173
|
+
Title: (r.title.nil? ? """""" : r.title),
|
174
|
+
PubID: 0, # Careful - assumes you have a pre-generated PubID of Zero in there, PubID table is not included in CSV imports
|
175
|
+
ActualYear: r.year,
|
176
|
+
Verbatim: r.full_citation
|
177
|
+
}
|
178
|
+
csv << @headers.collect{|h| cols[h.to_sym]}
|
179
|
+
end
|
180
|
+
end
|
181
|
+
@csv_string
|
182
|
+
end
|
183
|
+
|
184
|
+
# TODO make a standard transaction wrapper
# Emit tblRefs rows as a T-SQL INSERT batch wrapped in a TRY/TRANSACTION block.
# Columns are emitted in sorted header order; single quotes are doubled for SQL.
def sqlRefs
  @headers = %w{RefID ActualYear Title PubID Verbatim}
  statements = ['BEGIN TRY', 'BEGIN TRANSACTION']
  sorted_headers = @headers.sort
  @name_collection.ref_collection.collection.each_with_index do |ref, i|
    values = {
      RefID: ref.id, # i + 1,
      Title: ref.title.nil? ? '' : ref.title,
      PubID: 0, # Careful - assumes you have a pre-generated PubID of zero in there; the PubID table is not included in CSV imports
      ActualYear: ref.year,
      Verbatim: ref.full_citation
    }
    quoted = sorted_headers.map { |h| "'#{values[h.to_sym].to_s.gsub(/'/, "''")}'" }
    statements << "INSERT INTO tblRefs (#{sorted_headers.join(",")}) VALUES (#{quoted.join(",")});"
  end
  statements << ['COMMIT', 'END TRY', 'BEGIN CATCH', 'ROLLBACK', 'END CATCH']
  statements.join("\n")
end
|
201
|
+
|
202
|
+
# Generate tblPeople string: one row per unique author in @author_index
# (built by #build_author_index).
def tblPeople
  @headers = %w{PersonID FamilyName GivenNames GivenInitials Suffix Role LastUpdate ModifiedBy}
  @csv_string = CSV.generate() do |csv|
    csv << @headers
    @author_index.keys.each_with_index do |k,i|
      a = @author_index[k]
      # a.id = i + 1
      cols = {
        PersonID: a.id,
        FamilyName: a.last_name,
        # BUG FIX: the key was :GivenName, which never matched the GivenNames
        # header, so this column was blank in every exported row.
        GivenNames: a.first_name,
        GivenInitials: a.initials_string,
        Suffix: a.suffix,
        Role: 1, # authors
        LastUpdate: @time,
        ModifiedBy: @authorized_user_id
      }
      csv << @headers.collect{|h| cols[h.to_sym]}
    end
  end
  @csv_string
end
|
225
|
+
|
226
|
+
# Generate tblRefAuthors string: the reference <-> author join table.
# One row per (reference, author) pair; authors are resolved to shared Person
# records through @author_index, so identical names collapse to one PersonID.
def tblRefAuthors
  @headers = %w{RefID PersonID SeqNum AuthorCount LastUpdate ModifiedBy}
  @csv_string = CSV.generate() do |csv|
    csv << @headers
    @name_collection.ref_collection.collection.each do |r|
      r.authors.each_with_index do |x, i|
        a = @author_index[x.compact_string]
        cols = {
          RefID: r.id,
          PersonID: a.id,
          SeqNum: i + 1, # 1-based position of the author within the reference
          AuthorCount: r.authors.size,
          LastUpdate: @time,
          ModifiedBy: @authorized_user_id
        }
        csv << @headers.collect{|h| cols[h.to_sym]}
      end
    end
  end
  @csv_string
end
|
248
|
+
|
249
|
+
# Generate tblCites string: one original citation per name that has a
# matching micro-citation. Reads @nomenclator, which is populated by
# #tblNomenclator -- MANIFEST order guarantees that runs first.
def tblCites
  @headers = %w{TaxonNameID SeqNum RefID NomenclatorID LastUpdate ModifiedBy NewNameStatus CitePages Note TypeClarification CurrentConcept ConceptChange InfoFlags InfoFlagStatus PolynomialStatus}
  @csv_string = CSV.generate() do |csv|
    csv << @headers
    @name_collection.collection.each do |n|
      ref = @by_author_reference_index[n.author_year_index]
      next if ref.nil? # names without a resolvable reference get no cite row
      cols = {
        TaxonNameID: n.id,
        SeqNum: 1,
        RefID: ref.id,
        NomenclatorID: @nomenclator[n.nomenclator_name],
        LastUpdate: @time,
        ModifiedBy: @authorized_user_id,
        CitePages: """""", # equates to "" in CSV speak
        NewNameStatus: 0,
        Note: """""",
        TypeClarification: 0, # We might derive more data from this
        CurrentConcept: 1, # Boolean, right?
        ConceptChange: 0, # Unspecified
        InfoFlags: 0, #
        InfoFlagStatus: 1, # 1 => needs review
        PolynomialStatus: 0
      }
      csv << @headers.collect{|h| cols[h.to_sym]}
    end
  end
  @csv_string
end
|
279
|
+
|
280
|
+
# Generate tblGenusNames string; delegates to the shared genus/species builder.
def tblGenusNames
  @csv_string = csv_for_genus_and_species_names_tables('Genus')
end
|
284
|
+
|
285
|
+
# Generate tblSpeciesNames string; delegates to the shared genus/species builder.
def tblSpeciesNames
  @csv_string = csv_for_genus_and_species_names_tables('Species')
end
|
289
|
+
|
290
|
+
# Build the CSV shared by tblGenusNames and tblSpeciesNames.
# type is 'Genus' or 'Species'.
# Side effect: assigns the sequential ids (i + 1) back into the @genus_names /
# @species_names registry; #tblNomenclator reads those ids afterwards, which is
# why MANIFEST runs this before tblNomenclator.
def csv_for_genus_and_species_names_tables(type)
  col = "#{type}NameID"
  @headers = [col, "Name", "LastUpdate", "ModifiedBy", "Italicize"]
  @csv_string = CSV.generate() do |csv|
    csv << @headers
    var = self.send("#{type.downcase}_names") # @genus_names or @species_names
    var.keys.each_with_index do |n,i|
      var[n] = i + 1 # record the id for later lookup by tblNomenclator
      cols = {
        col.to_sym => i + 1,
        Name: n,
        LastUpdate: @time,
        ModifiedBy: @authorized_user_id,
        Italicize: 1 # always true for these data
      }
      csv << @headers.collect{|h| cols[h.to_sym]}
    end
  end
  @csv_string
end
|
310
|
+
|
311
|
+
# Generate tblNomenclator string.
# Must be called after tblGenusNames and tblSpeciesNames: it reads the ids they
# assigned into @genus_names / @species_names. Also populates @nomenclator
# (nomenclator string => NomenclatorID) for tblCites.
def tblNomenclator
  @headers = %w{NomenclatorID GenusNameID SubgenusNameID SpeciesNameID SubspeciesNameID LastUpdate ModifiedBy SuitableForGenus SuitableForSpecies InfrasubspeciesNameID InfrasubKind}
  @csv_string = CSV.generate() do |csv|
    csv << @headers
    i = 1
    @name_collection.collection.each do |n|
      # Skip names above genus-group rank (lower index = higher rank here;
      # NOTE(review): depends on Taxonifi::RANKS ordering -- confirm).
      next if Taxonifi::RANKS.index(n.rank) < Taxonifi::RANKS.index('genus')
      cols = {
        NomenclatorID: i,
        GenusNameID: @genus_names[n.parent_name_at_rank('genus')] || 0,
        SubgenusNameID: @genus_names[n.parent_name_at_rank('subgenus')] || 0,
        SpeciesNameID: @species_names[n.parent_name_at_rank('species')] || 0,
        SubspeciesNameID: @species_names[n.parent_name_at_rank('subspecies')] || 0,
        InfrasubspeciesNameID: 0,
        InfrasubKind: 0, # this might be wrong
        LastUpdate: @time,
        ModifiedBy: @authorized_user_id,
        SuitableForGenus: 0, # Set in SF
        SuitableForSpecies: 0 # Set in SF
      }
      @nomenclator.merge!(n.nomenclator_name => i)
      i += 1
      csv << @headers.collect{|h| cols[h.to_sym]}
    end
  end
  @csv_string
end
|
339
|
+
|
340
|
+
end
|
341
|
+
end
|
@@ -0,0 +1,334 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '../taxonifi'))
|
2
|
+
|
3
|
+
# The lumper lumps! Tools for recognizing and using
|
4
|
+
# combinations of column types.
|
5
|
+
module Taxonifi::Lumper
|
6
|
+
|
7
|
+
# Define groups of columns/fields and include
|
8
|
+
# functionality to determine whether your
|
9
|
+
# columns match a given set.
|
10
|
+
module Lumps
|
11
|
+
Dir.glob( File.expand_path(File.join(File.dirname(__FILE__), "lumps/*.rb") )) do |file|
|
12
|
+
require file
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class LumperError < StandardError; end
|
17
|
+
|
18
|
+
# Columns used for species epithets.
|
19
|
+
# !! Todo: map DwC URIs to these labels (at present they largely correllate with Tokens,
|
20
|
+
# perhaps map URIs to tokens!?)
|
21
|
+
QUAD = ['genus', 'subgenus', 'species', 'subspecies']
|
22
|
+
|
23
|
+
# Columns representing author and year
|
24
|
+
AUTHOR_YEAR = ['author', 'year']
|
25
|
+
|
26
|
+
# A Hash of named column combinations
|
27
|
+
LUMPS = {
|
28
|
+
quadrinomial: QUAD,
|
29
|
+
quad_author_year: QUAD + AUTHOR_YEAR,
|
30
|
+
names: Taxonifi::RANKS + AUTHOR_YEAR,
|
31
|
+
higher: Taxonifi::RANKS - [QUAD + AUTHOR_YEAR],
|
32
|
+
species: ['species', 'subspecies'],
|
33
|
+
genera: ['genus', 'subgenus'],
|
34
|
+
citation_basic: %w{authors year title publication volume number pages pg_start pg_end},
|
35
|
+
citation_small: %w{authors year title publication volume_number pages},
|
36
|
+
basic_geog: %w{country state county}, # add 'continent'
|
37
|
+
eol_basic: %w{identifier parent child rank synonyms}
|
38
|
+
}
|
39
|
+
|
40
|
+
# Lumps for which all columns are represented
# TODO: This is really an assessor method
def self.available_lumps(columns)
  raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.available_lumps.' unless columns.class == Array
  LUMPS.keys.select { |lump| (LUMPS[lump] - columns).empty? }
end
|
46
|
+
|
47
|
+
# Lumps for which any column is represented
# TODO: This is really an assessor method
def self.intersecting_lumps(columns)
  raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.intersecting_lumps.' unless columns.class == Array
  LUMPS.keys.select { |lump| (LUMPS[lump] & columns).size > 0 }
end
|
57
|
+
|
58
|
+
# Return a Taxonifi::Model::NameCollection from a csv file.
# Names are deduplicated per rank: two cells with the same string are the same
# Name only when their full parent-id closures match.
def self.create_name_collection(csv)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_name_collection.' if csv.class != CSV::Table
  nc = Taxonifi::Model::NameCollection.new

  row_size = csv.size

  # The row index contains a vector of parent ids like
  #   [0, 4, 29]
  # This implies that Name with #id 29 has Parent with #id 4.
  # Initialize an empty index.
  row_index = []
  (0..(row_size-1)).each do |i|
    row_index[i] = []
  end

  # The name_index keeps track of unique name per rank like
  #   :genus => {'Foo' => [0,2]}
  # This says that "Foo" is instantiated two times in the
  # name collection, with id 0, and id 2.
  name_index = {}

  # First pass, create and index names, outer loop walks ranks left-to-right
  # so each row's parents are created before their children.
  Taxonifi::Assessor::RowAssessor.rank_headers(csv.headers).each do |rank|
    name_index[rank] = {}
    csv.each_with_index do |row, i|
      row_rank = Taxonifi::Assessor::RowAssessor.lump_name_rank(row).to_s # metadata (e.g. author year) apply to this rank

      name = row[rank]

      if !name.nil? # cell has data
        n = nil # a Name if necessary
        name_id = nil # index the new or existing name

        if name_index[rank][name] # name (string) exists
          exists = false
          name_index[rank][name].each do |id|
            # Compare vectors of parent_ids for name presence
            if nc.parent_id_vector(id) == row_index[i]
              exists = true
              name_id = id
              break # don't need to check further
            end
          end

          if !exists # name (string) exists, but parents are different, create new name
            n = Taxonifi::Model::Name.new()
          end

        else # no version of the name exists
          n = Taxonifi::Model::Name.new()
        end # end name exists

        # If we created a new name
        if !n.nil?
          n.rank = rank
          n.name = name
          n.parent = nc.object_by_id(row_index[i].last) if row_index[i].size > 0 # its parent is the previous id in this row
          n.row_number = i

          # Name/year needs to be standardized / cased out;
          # headers are overlapping at times.
          # Author/year metadata is only attached at the row's terminal rank.
          if row['author_year'] && row_rank == rank
            builder = Taxonifi::Splitter::Builder.build_author_year(row['author_year'])
            n.author = builder.people
            n.year = builder.year
            # NOTE(review): the negation here looks inverted -- confirm the
            # semantics of builder.parens vs Name#parens.
            n.parens = !builder.parens
          end

          name_id = nc.add_object(n).id
          # Add the name to the index of unique names
          name_index[rank][name] ||= []
          name_index[rank][name].push name_id
        end

        # build a by-row vector of parent child relationships
        row_index[i].push name_id
      end # end cell has data

    end
  end

  nc
end
|
144
|
+
|
145
|
+
# Return a Taxonifi::Model::RefCollection from a CSV file.
# Rows matching the :citation_small lump become Refs; identical references
# (by Ref#compact_string) are collapsed to a single Ref.
def self.create_ref_collection(csv)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_ref_collection.' if csv.class != CSV::Table
  rc = Taxonifi::Model::RefCollection.new
  row_size = csv.size

  # Ref#compact_string => Ref id, used to deduplicate references across rows.
  ref_index = {}
  csv.each_with_index do |row, i|
    if Taxonifi::Assessor::RowAssessor.intersecting_lumps_with_data(row, [:citation_small]).include?(:citation_small)
      r = Taxonifi::Model::Ref.new(
        :year => row['year'],
        :title => row['title'],
        :publication => row['publication']
      )

      # TODO: break out each of these lexes to a builder
      if row['authors'] && !row['authors'].empty?
        lexer = Taxonifi::Splitter::Lexer.new(row['authors'])
        authors = lexer.pop(Taxonifi::Splitter::Tokens::Authors)
        authors.names.each do |a|
          n = Taxonifi::Model::Person.new()
          n.last_name = a[:last_name]
          n.initials = a[:initials]
          r.authors.push n
        end
      end

      if row['volume_number'] && !row['volume_number'].empty?
        lexer = Taxonifi::Splitter::Lexer.new(row['volume_number'], :volume_number)
        t = lexer.pop(Taxonifi::Splitter::Tokens::VolumeNumber)
        r.volume = t.volume
        r.number = t.number
      end

      if row['pages'] && !row['pages'].empty?
        # If our regex doesn't match dump the field into pages
        begin
          lexer = Taxonifi::Splitter::Lexer.new(row['pages'], :pages)
          t = lexer.pop(Taxonifi::Splitter::Tokens::Pages)
          r.pg_start = t.pg_start
          r.pg_end = t.pg_end
        rescue
          r.pages = row['pages']
        end
      end

      # Do some indexing.
      ref_str = r.compact_string
      if !ref_index.keys.include?(ref_str)
        ref_id = rc.add_object(r).id
        ref_index.merge!(ref_str => ref_id)
        # NOTE(review): this branch stores the Ref OBJECT in row_index while the
        # else branch stores the integer id -- confirm which is intended;
        # consumers likely expect one consistent type.
        rc.row_index[i] = r
      else
        rc.row_index[i] = ref_index[ref_str]
      end
    end
  end
  rc
end
|
204
|
+
|
205
|
+
# Creates a generic Collection with Objects of GenericObject.
# Objects are assigned to parents (rank) according to the order provided in headers.
# Objects are considered the same if they have the same name and the same parents closure, e.g.
#
#   a b c
#   a b d
#   e b f
#
# Will return 7 objects named in order a,b,c,d,e,b,f
#
# a,b b,c b,d e,b b,f are the unique parent/child relationships stored
#
# @param csv [CSV::Table] the parsed input table
# @param headers [Array<String>] column names, ordered parent-most first
# @raise [Taxonifi::Lumper::LumperError] on bad input
def self.create_hierarchical_collection(csv, headers)
  # BUG FIX: error message previously named create_name_collection (copy/paste).
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_hierarchical_collection.' if csv.class != CSV::Table
  raise Taxonifi::Lumper::LumperError, 'No headers provided to create_hierarchical_collection.' if headers.size == 0

  c = Taxonifi::Model::Collection.new
  row_size = csv.size

  # See create_name_collection: row_index[i] is the vector of parent ids
  # accumulated left-to-right for row i.
  row_index = []
  (0..(row_size-1)).each do |i|
    row_index[i] = []
  end

  # Per-header index of name string => [ids] for deduplication.
  name_index = {}
  headers.each do |h|
    name_index[h] = {}
  end

  csv.each_with_index do |row, i|
    headers.each do |rank|
      name = row[rank]
      if !name.nil? && !name.empty? # cell has data
        o = nil # a GenericObject if necessary
        name_id = nil # index the new or existing object

        if name_index[rank][name] # name exists
          exists = false
          name_index[rank][name].each do |id|
            # Same name AND same parent closure => same object.
            if c.parent_id_vector(id) == row_index[i]
              exists = true
              name_id = id
              break
            end
          end

          if !exists
            o = Taxonifi::Model::GenericObject.new()
          end
        else
          o = Taxonifi::Model::GenericObject.new()
        end

        if !o.nil? # a new object was created; wire it up and register it
          o.name = name
          o.rank = rank
          o.row_number = i
          o.parent = c.object_by_id(row_index[i].last) if row_index[i].size > 0 # its parent is the previous id in this row

          name_id = c.add_object(o).id
          name_index[rank][name] ||= []
          name_index[rank][name].push name_id
        end
        row_index[i].push name_id
      end
    end
  end
  c
end
|
278
|
+
|
279
|
+
# Return a geog collection from a csv file.
# Unlike taxon names, a geog name string is treated as globally unique per
# level (see the note below), so deduplication ignores the parent closure.
def self.create_geog_collection(csv)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_geog_collection.' if csv.class != CSV::Table
  gc = Taxonifi::Model::GeogCollection.new

  row_size = csv.size

  # See create_name_collection: row_index[i] is the vector of parent ids
  # accumulated left-to-right for row i.
  row_index = []
  (0..(row_size-1)).each do |i|
    row_index[i] = []
  end

  name_index = {}
  geog_headers = Taxonifi::Assessor::RowAssessor.geog_headers(csv.headers)
  geog_headers.each do |h|
    name_index[h] = {}
  end

  # We don't have the same problems as with taxon names, i.e.
  # boo in
  #   Foo nil boo
  #   Foo bar boo
  # is the same thing wrt geography, not the case for taxon names.
  # We can use a row-first loop to build as we go.
  csv.each_with_index do |row, i|
    geog_headers.each do |level|
      name = row[level]
      if !name.nil? && !name.empty? # cell has data
        g = nil # a Geog if necessary
        name_id = nil # index the new or existing geog

        if name_index[level][name] # name exists, reuse its id
          name_id = name_index[level][name]
        else
          g = Taxonifi::Model::Geog.new()
          name_id = gc.add_object(g).id
        end

        if !g.nil? # newly created: attributes are set after registration
          g.name = name
          g.rank = level
          g.parent = gc.object_by_id(row_index[i].last) if row_index[i].size > 0 # its parent is the previous id in this row
        end

        name_index[level][name] = name_id
        row_index[i].push name_id
      end
    end
  end
  gc
end
|
332
|
+
|
333
|
+
end # end Lumper Module
|
334
|
+
|