taxonifi 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +155 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/assessor/assessor.rb +31 -0
- data/lib/assessor/base.rb +17 -0
- data/lib/assessor/row_assessor.rb +131 -0
- data/lib/export/export.rb +9 -0
- data/lib/export/format/base.rb +43 -0
- data/lib/export/format/species_file.rb +341 -0
- data/lib/lumper/lumper.rb +334 -0
- data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
- data/lib/models/author_year.rb +39 -0
- data/lib/models/base.rb +73 -0
- data/lib/models/collection.rb +92 -0
- data/lib/models/generic_object.rb +15 -0
- data/lib/models/geog.rb +59 -0
- data/lib/models/geog_collection.rb +28 -0
- data/lib/models/name.rb +206 -0
- data/lib/models/name_collection.rb +149 -0
- data/lib/models/person.rb +49 -0
- data/lib/models/ref.rb +85 -0
- data/lib/models/ref_collection.rb +106 -0
- data/lib/models/species_name.rb +85 -0
- data/lib/splitter/builder.rb +26 -0
- data/lib/splitter/lexer.rb +70 -0
- data/lib/splitter/parser.rb +54 -0
- data/lib/splitter/splitter.rb +45 -0
- data/lib/splitter/tokens.rb +322 -0
- data/lib/taxonifi.rb +36 -0
- data/test/file_fixtures/Lygaeoidea.csv +801 -0
- data/test/helper.rb +38 -0
- data/test/test_exporter.rb +32 -0
- data/test/test_lumper_geogs.rb +59 -0
- data/test/test_lumper_hierarchical_collection.rb +88 -0
- data/test/test_lumper_names.rb +119 -0
- data/test/test_lumper_parent_child_name_collection.rb +41 -0
- data/test/test_lumper_refs.rb +91 -0
- data/test/test_parser.rb +34 -0
- data/test/test_splitter.rb +27 -0
- data/test/test_splitter_tokens.rb +403 -0
- data/test/test_taxonifi.rb +11 -0
- data/test/test_taxonifi_accessor.rb +61 -0
- data/test/test_taxonifi_geog.rb +51 -0
- data/test/test_taxonifi_name.rb +186 -0
- data/test/test_taxonifi_name_collection.rb +158 -0
- data/test/test_taxonifi_ref.rb +90 -0
- data/test/test_taxonifi_ref_collection.rb +69 -0
- data/test/test_taxonifi_species_name.rb +95 -0
- metadata +167 -0
@@ -0,0 +1,341 @@
|
|
1
|
+
|
2
|
+
module Taxonifi::Export
|
3
|
+
|
4
|
+
# Dumps tables identical to the existing structure in SpeciesFile.
|
5
|
+
# Will only work in the pre Identity world. Will reconfigure
|
6
|
+
# as templates for Jim's work after the fact.
|
7
|
+
class SpeciesFile < Taxonifi::Export::Base
|
8
|
+
|
9
|
+
# tblRanks 5/17/2012
|
10
|
+
# Maps a rank name to the RankID used when writing tblTaxa.
# The ids ascend with rank, from 'subspecies' (5) to 'life' (90), with
# 'unknown' at 100.
SPECIES_FILE_RANKS = {
  'subspecies' => 5,
  'species' => 10,
  'species subgroup' => 11,
  'species group' => 12,
  'species series' => 14,
  'infragenus' => 16,
  'subgenus' => 18,
  'genus' => 20,
  'genus group' => 22,
  'subtribe' => 28,
  'tribe' => 30,
  'supertribe' => 32,
  'infrafamily' => 36,
  'subfamily' => 38,
  'subfamily group' => 39,
  'family' => 40,
  'epifamily' => 41,
  'superfamily' => 42,
  'superfamily group' => 44,
  'subinfraordinal group' => 45,
  'infraorder' => 46,
  # Sits between infraorder (46) and order (50); a value of 8 would place
  # suborder between subspecies and species.
  # NOTE(review): confirm 48 against the live tblRanks.
  'suborder' => 48,
  'order' => 50,
  'mirorder' => 51,
  'superorder' => 52,
  'magnorder' => 53,
  'cohort' => 54,
  'supercohort' => 55,
  'infraclass' => 56,
  'subclass' => 58,
  'class' => 60,
  'superclass' => 62,
  'infraphylum' => 66,
  'subphylum' => 68,
  'phylum' => 70,
  'superphylum' => 72,
  'infrakingdom' => 76,
  'subkingdom' => 78,
  'kingdom' => 80,
  'superkingdom' => 82,
  'life' => 90,
  'unknown' => 100
}.freeze
|
54
|
+
|
55
|
+
attr_accessor :name_collection
|
56
|
+
attr_accessor :ref_collection
|
57
|
+
attr_accessor :author_index
|
58
|
+
attr_accessor :genus_names, :species_names, :nomenclator
|
59
|
+
attr_accessor :authorized_user_id, :time
|
60
|
+
|
61
|
+
# MANIFEST order is important
|
62
|
+
# Names of the table-generating methods run by #export, in execution order.
# tblNomenclator reads the ids assigned while tblGenusNames and
# tblSpeciesNames are generated, and tblCites reads the ids assigned by
# tblNomenclator, so those tables must keep this relative order.
MANIFEST = %w{tblTaxa tblRefs tblPeople tblRefAuthors tblGenusNames tblSpeciesNames tblNomenclator tblCites}.freeze
|
63
|
+
|
64
|
+
# Set up a SpeciesFile export.
#
# options (Hash):
#   :nc                 - the Taxonifi::Model::NameCollection to export;
#                         defaults to a new collection
#   :export_folder      - defaults to 'species_file'
#   :authorized_user_id - required; written as ModifiedBy in generated tables
#
# The merged options are passed to the superclass initializer.
#
# Raises Taxonifi::Export::ExportError when :nc is not a NameCollection or
# when :authorized_user_id is nil.
def initialize(options = {})
  opts = {
    :nc => Taxonifi::Model::NameCollection.new,
    :export_folder => 'species_file',
    :authorized_user_id => nil
  }.merge!(options)

  super(opts)

  # `! x.class == Klass` parses as `(!x.class) == Klass`, which is never
  # true, so the type is tested directly to make this guard effective.
  unless opts[:nc].is_a?(Taxonifi::Model::NameCollection)
    raise Taxonifi::Export::ExportError, 'NameCollection not passed to SpeciesFile export.'
  end
  if opts[:authorized_user_id].nil?
    raise Taxonifi::Export::ExportError, 'You must provide authorized_user_id for species_file export initialization.'
  end

  @name_collection = opts[:nc]
  @authorized_user_id = opts[:authorized_user_id]
  @author_index = {}

  #
  # Careful here, at present we are just generating Reference micro-citations from our names, so the indexing "just works"
  # because it's all internal. There is a strong potential for key collisions if this pipeline is modified to
  # include references external to the initialized name_collection. See also export_references.
  #
  @by_author_reference_index = {}
  @genus_names = {}
  @species_names = {}
  @nomenclator = {}
  @time = Time.now.strftime("%F %T") # one timestamp shared by every LastUpdate value
end
|
89
|
+
|
90
|
+
# Export only the ref_collection. Sidesteps the main name-centric exports
|
91
|
+
# Note that this still uses the base @name_collection object as a starting reference,
|
92
|
+
# it just references @name_collection.ref_collection. So you can do:
|
93
|
+
# nc = Taxonifi::Model::NameCollection.new
|
94
|
+
# nc.ref_collection = Taxonifi::Model::RefCollection.new
|
95
|
+
# etc.
|
96
|
+
# Writes only the reference-related files: tblPeople, tblRefs,
# tblRefAuthors and sqlRefs, in that order, after configuring the export
# folders and rebuilding the author index.
#
# NOTE(review): `options` is accepted but never consulted, and the defaults
# below are not applied anywhere in this method.
def export_references(options = {})
  opts = {
    :starting_ref_id => 0,
    :starting_author_id => 0
  }

  configure_folders
  build_author_index

  # Order matters.
  %w{tblPeople tblRefs tblRefAuthors sqlRefs}.each do |table|
    write_file(table, send(table))
  end
end
|
110
|
+
|
111
|
+
# Assumes names that are the same are the same person.
|
112
|
+
# Rebuilds @author_index as a Hash of compact_string => author for the
# unique authors of the name collection's ref_collection. When two authors
# share a compact_string the later one wins.
# Returns the new index.
def build_author_index
  authors = @name_collection.ref_collection.unique_authors
  @author_index = authors.each_with_object({}) do |author, index|
    index[author.compact_string] = author
  end
end
|
115
|
+
|
116
|
+
# Runs the full name-centric export: generates the reference collection
# from the names, gives authors unique ids, builds the lookup indexes and
# then writes one file per MANIFEST entry, in MANIFEST order.
def export
  super
  @name_collection.generate_ref_collection(1)

  # Give authors unique ids
  @name_collection.ref_collection.uniquify_authors(1)
  build_author_index

  # See notes in #initialize re potential key collisions!
  @by_author_reference_index = {}
  @name_collection.ref_collection.collection.each do |ref|
    @by_author_reference_index[ref.author_year_index] = ref
  end

  # Register every genus-group and species-group name with a nil id; the
  # ids are assigned later, when the names tables are generated.
  [['genus', @genus_names], ['subgenus', @genus_names],
   ['species', @species_names], ['subspecies', @species_names]].each do |rank, names|
    @name_collection.names_at_rank(rank).each { |n| names[n.name] = nil }
  end

  MANIFEST.each do |table|
    write_file(table, send(table))
  end
end
|
136
|
+
|
137
|
+
# Generate a tblTaxa CSV string, one row per name in the name collection.
#
# Names with a related_name are written as junior synonyms
# (NameStatus 7, StatusFlags 262144) whose AboveID is the related name;
# all other names are written as valid and point at their parent.
# Sets @headers and @csv_string; returns the CSV string.
def tblTaxa
  @headers = %w{TaxonNameID TaxonNameStr RankID Name Parens AboveID RefID DataFlags AccessCode NameStatus StatusFlags OriginalGenusID LastUpdate ModifiedBy}
  @csv_string = CSV.generate do |csv|
    csv << @headers
    @name_collection.collection.each do |n|
      ref = @by_author_reference_index[n.author_year_index]

      # Look these up once per row rather than once per column that uses them.
      related = n.related_name
      synonym = !related.nil?
      original_genus = n.parens ? nil : n.parent_at_rank('genus')

      above_id = if synonym
                   related.id
                 else
                   n.parent ? n.parent.id : 0 # !! SF folks like to pre-populate with zeros
                 end

      cols = {
        TaxonNameID: n.id,
        TaxonNameStr: n.parent_ids_sf_style, # closure -> ends with 1
        RankID: SPECIES_FILE_RANKS[n.rank],
        Name: n.name,
        Parens: (n.parens ? 1 : 0),
        AboveID: above_id,
        RefID: (ref ? ref.id : 0),
        DataFlags: 0, # see http://software.speciesfile.org/Design/TaxaTables.aspx#Taxon, a flag populated when data is reviewed, initialize to zero
        AccessCode: 0,
        NameStatus: (synonym ? 7 : 0), # 0: valid, 7: synonym
        StatusFlags: (synonym ? 262144 : 0), # 0: valid, 262144: jr. synonym
        OriginalGenusID: (original_genus ? original_genus.id : 0), # SF must be pre-configured with 0 filler (this restriction needs to go)
        LastUpdate: @time,
        ModifiedBy: @authorized_user_id
      }
      csv << @headers.map { |h| cols[h.to_sym] }
    end
  end
end
|
164
|
+
|
165
|
+
# Generate a tblRefs string.
|
166
|
+
# Generate a tab-separated tblRefs string, one row per Ref in the name
# collection's ref_collection.
# Sets @headers and @csv_string; returns the string.
def tblRefs
  @headers = %w{RefID ActualYear Title PubID Verbatim}
  @csv_string = CSV.generate(:col_sep => "\t") do |csv|
    csv << @headers
    @name_collection.ref_collection.collection.each do |r|
      cols = {
        RefID: r.id,
        Title: (r.title.nil? ? '' : r.title), # an empty String is written as "", a nil as nothing at all
        PubID: 0, # Careful - assumes you have a pre-generated PubID of Zero in there, PubID table is not included in CSV imports
        ActualYear: r.year,
        Verbatim: r.full_citation
      }
      csv << @headers.map { |h| cols[h.to_sym] }
    end
  end
end
|
183
|
+
|
184
|
+
# TODO make a standard transaction wrapper
|
185
|
+
# Generate a SQL script that inserts every Ref of the name collection's
# ref_collection into tblRefs, wrapped in BEGIN TRY / BEGIN TRANSACTION ...
# COMMIT with a ROLLBACK in the CATCH block.
#
# Columns are emitted in sorted header order. Every value is written as a
# single-quoted literal with embedded single quotes doubled.
# Sets @headers; returns the script as one newline-joined String.
def sqlRefs
  sql = ['BEGIN TRY', 'BEGIN TRANSACTION']
  @headers = %w{RefID ActualYear Title PubID Verbatim}

  # The column order is the same for every statement, so compute it once.
  columns = @headers.sort
  column_list = columns.join(",")

  @name_collection.ref_collection.collection.each do |r|
    cols = {
      RefID: r.id,
      Title: (r.title.nil? ? '' : r.title),
      PubID: 0, # Careful - assumes you have a pre-generated PubID of Zero in there, PubID table is not included in CSV imports
      ActualYear: r.year,
      Verbatim: r.full_citation
    }
    values = columns.map { |h| "'#{cols[h.to_sym].to_s.gsub(/'/, "''")}'" }
    sql << "INSERT INTO tblRefs (#{column_list}) VALUES (#{values.join(",")});"
  end

  # concat keeps the statement list flat instead of nesting an Array in it.
  sql.concat(['COMMIT', 'END TRY', 'BEGIN CATCH', 'ROLLBACK', 'END CATCH'])
  sql.join("\n")
end
|
201
|
+
|
202
|
+
# Generate tblPeople string.
|
203
|
+
# Generate a tblPeople CSV string, one row per author in @author_index
# (in index order). Every person is written with Role 1 (author).
# Sets @headers and @csv_string; returns the CSV string.
def tblPeople
  @headers = %w{PersonID FamilyName GivenNames GivenInitials Suffix Role LastUpdate ModifiedBy}
  @csv_string = CSV.generate do |csv|
    csv << @headers
    @author_index.each_value do |a|
      cols = {
        PersonID: a.id,
        FamilyName: a.last_name,
        GivenNames: a.first_name, # the key must match the 'GivenNames' header or the column is written blank
        GivenInitials: a.initials_string,
        Suffix: a.suffix,
        Role: 1, # authors
        LastUpdate: @time,
        ModifiedBy: @authorized_user_id
      }
      csv << @headers.map { |h| cols[h.to_sym] }
    end
  end
end
|
225
|
+
|
226
|
+
# Generate tblRefAuthors string.
|
227
|
+
# Generate a tblRefAuthors CSV string: one row per (reference, author)
# pair, with SeqNum giving the author's 1-based position within the
# reference and AuthorCount the reference's total number of authors.
# PersonID comes from the @author_index entry that matches the author's
# compact_string.
# Sets @headers and @csv_string; returns the CSV string.
def tblRefAuthors
  @headers = %w{RefID PersonID SeqNum AuthorCount LastUpdate ModifiedBy}
  @csv_string = CSV.generate do |csv|
    csv << @headers
    @name_collection.ref_collection.collection.each do |ref|
      ref.authors.each_with_index do |author, position|
        person = @author_index[author.compact_string]
        row = {
          RefID: ref.id,
          PersonID: person.id,
          SeqNum: position + 1,
          AuthorCount: ref.authors.size,
          LastUpdate: @time,
          ModifiedBy: @authorized_user_id
        }
        csv << @headers.map { |header| row[header.to_sym] }
      end
    end
  end
  @csv_string
end
|
248
|
+
|
249
|
+
# Generate tblCites string.
|
250
|
+
# Generate a tblCites CSV string: one row for every name whose
# author_year_index has an entry in @by_author_reference_index. Names
# without such a reference are skipped. NomenclatorID is read from
# @nomenclator by the name's nomenclator_name.
# Sets @headers and @csv_string; returns the CSV string.
def tblCites
  @headers = %w{TaxonNameID SeqNum RefID NomenclatorID LastUpdate ModifiedBy NewNameStatus CitePages Note TypeClarification CurrentConcept ConceptChange InfoFlags InfoFlagStatus PolynomialStatus}

  # Values that are identical on every row.
  shared = {
    SeqNum: 1,
    LastUpdate: @time,
    ModifiedBy: @authorized_user_id,
    CitePages: "", # an empty String is written as "" in the CSV
    NewNameStatus: 0,
    Note: "",
    TypeClarification: 0, # We might derive more data from this
    CurrentConcept: 1, # Boolean, right?
    ConceptChange: 0, # Unspecified
    InfoFlags: 0,
    InfoFlagStatus: 1, # 1 => needs review
    PolynomialStatus: 0
  }

  @csv_string = CSV.generate do |csv|
    csv << @headers
    @name_collection.collection.each do |name|
      citation = @by_author_reference_index[name.author_year_index]
      next if citation.nil?

      row = shared.merge(
        TaxonNameID: name.id,
        RefID: citation.id,
        NomenclatorID: @nomenclator[name.nomenclator_name]
      )
      csv << @headers.map { |header| row[header.to_sym] }
    end
  end
  @csv_string
end
|
279
|
+
|
280
|
+
# Generate the tblGenusNames CSV string by delegating to
# csv_for_genus_and_species_names_tables with type 'Genus'.
# Stores the result in @csv_string and returns it.
def tblGenusNames
  @csv_string = csv_for_genus_and_species_names_tables('Genus')
end
|
284
|
+
|
285
|
+
# Generate the tblSpeciesNames CSV string by delegating to
# csv_for_genus_and_species_names_tables with type 'Species'.
# Stores the result in @csv_string and returns it.
def tblSpeciesNames
  @csv_string = csv_for_genus_and_species_names_tables('Species')
end
|
289
|
+
|
290
|
+
# Shared generator for tblGenusNames and tblSpeciesNames.
#
# type - 'Genus' or 'Species'; selects the "<type>NameID" id column and the
#        `<type downcased>_names` Hash (name => id) to read from.
#
# Each name in that Hash is given a 1-based id in key order. The id is
# written to the CSV and also stored back into the Hash as the name's value.
# Sets @headers and @csv_string; returns the CSV string.
def csv_for_genus_and_species_names_tables(type)
  id_column = "#{type}NameID"
  @headers = [id_column, "Name", "LastUpdate", "ModifiedBy", "Italicize"]
  @csv_string = CSV.generate do |csv|
    csv << @headers
    names = send("#{type.downcase}_names")
    next_id = 0
    names.keys.each do |name|
      next_id += 1
      names[name] = next_id
      # Same column order as @headers; Italicize is always 1 for these data.
      csv << [next_id, name, @time, @authorized_user_id, 1]
    end
  end
  @csv_string
end
|
310
|
+
|
311
|
+
# must be called post tblGenusNames and tblSpeciesNames
|
312
|
+
# Generate a tblNomenclator CSV string.
#
# Names whose rank comes before 'genus' in Taxonifi::RANKS are skipped.
# Every other name gets the next 1-based NomenclatorID, which is also
# recorded in @nomenclator under the name's nomenclator_name. The genus and
# species name ids are read from @genus_names and @species_names, falling
# back to 0 when absent, so tblGenusNames and tblSpeciesNames must have
# been generated first.
# Sets @headers and @csv_string; returns the CSV string.
def tblNomenclator
  @headers = %w{NomenclatorID GenusNameID SubgenusNameID SpeciesNameID SubspeciesNameID LastUpdate ModifiedBy SuitableForGenus SuitableForSpecies InfrasubspeciesNameID InfrasubKind}
  @csv_string = CSV.generate do |csv|
    csv << @headers
    nomenclator_id = 1
    @name_collection.collection.each do |n|
      next if Taxonifi::RANKS.index(n.rank) < Taxonifi::RANKS.index('genus')

      genus, subgenus, species, subspecies =
        %w{genus subgenus species subspecies}.map { |rank| n.parent_name_at_rank(rank) }

      row = {
        NomenclatorID: nomenclator_id,
        GenusNameID: @genus_names[genus] || 0,
        SubgenusNameID: @genus_names[subgenus] || 0,
        SpeciesNameID: @species_names[species] || 0,
        SubspeciesNameID: @species_names[subspecies] || 0,
        InfrasubspeciesNameID: 0,
        InfrasubKind: 0, # this might be wrong
        LastUpdate: @time,
        ModifiedBy: @authorized_user_id,
        SuitableForGenus: 0, # Set in SF
        SuitableForSpecies: 0 # Set in SF
      }

      @nomenclator[n.nomenclator_name] = nomenclator_id
      nomenclator_id += 1
      csv << @headers.map { |header| row[header.to_sym] }
    end
  end
  @csv_string
end
|
339
|
+
|
340
|
+
end
|
341
|
+
end
|
@@ -0,0 +1,334 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '../taxonifi'))
|
2
|
+
|
3
|
+
# The lumper lumps! Tools for recognizing and using
|
4
|
+
# combinations of column types.
|
5
|
+
module Taxonifi::Lumper
|
6
|
+
|
7
|
+
# Define groups of columns/fields and include
|
8
|
+
# functionality to determine whether your
|
9
|
+
# columns match a given set.
|
10
|
+
# Namespace for lump definitions. Requires every Ruby file found in the
# lumps/ directory that sits next to this file.
module Lumps
  # Sorted so the files load in the same order on every platform;
  # Dir.glob alone does not promise an order on older Rubies.
  Dir.glob(File.expand_path(File.join(File.dirname(__FILE__), "lumps/*.rb"))).sort.each do |file|
    require file
  end
end
|
15
|
+
|
16
|
+
# Raised by the Lumper builders when they are handed the wrong kind of input.
class LumperError < StandardError
end
|
17
|
+
|
18
|
+
# Columns used for species epithets.
|
19
|
+
# !! Todo: map DwC URIs to these labels (at present they largely correlate with Tokens,
|
20
|
+
# perhaps map URIs to tokens!?)
|
21
|
+
# Columns used for species epithets.
QUAD = ['genus', 'subgenus', 'species', 'subspecies']

# Columns representing author and year
AUTHOR_YEAR = ['author', 'year']

# A Hash of named column combinations
LUMPS = {
  quadrinomial: QUAD,
  quad_author_year: QUAD + AUTHOR_YEAR,
  names: Taxonifi::RANKS + AUTHOR_YEAR,
  # Subtract the flat column list. Wrapping it in another Array would
  # subtract a single nested element that never matches a rank name.
  higher: Taxonifi::RANKS - (QUAD + AUTHOR_YEAR),
  species: ['species', 'subspecies'],
  genera: ['genus', 'subgenus'],
  citation_basic: %w{authors year title publication volume number pages pg_start pg_end},
  citation_small: %w{authors year title publication volume_number pages},
  basic_geog: %w{country state county}, # add 'continent'
  eol_basic: %w{identifier parent child rank synonyms}
}
|
39
|
+
|
40
|
+
# Lumps for which all columns are represented
|
41
|
+
# TODO: This is really an assessor method
|
42
|
+
# Returns the keys of LUMPS whose columns are all present in `columns`.
#
# columns - an Array of column names.
#
# Raises Taxonifi::Lumper::LumperError unless `columns` is exactly an Array.
def self.available_lumps(columns)
  unless columns.class == Array
    raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.available_lumps.'
  end

  LUMPS.select { |_lump, required| (required - columns).empty? }.keys
end
|
46
|
+
|
47
|
+
# Lumps for which any column is represented
|
48
|
+
# TODO: This is really an assessor method
|
49
|
+
# Returns the keys of LUMPS that share at least one column with `columns`.
#
# columns - an Array of column names.
#
# Raises Taxonifi::Lumper::LumperError unless `columns` is exactly an Array.
def self.intersecting_lumps(columns)
  unless columns.class == Array
    raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.intersecting_lumps.'
  end

  LUMPS.keys.reject { |lump| (LUMPS[lump] & columns).empty? }
end
|
57
|
+
|
58
|
+
# Return a Taxonifi::Model::NameCollection from a csv file.
|
59
|
+
# Build a Taxonifi::Model::NameCollection from a CSV::Table.
#
# The table is walked one rank column at a time (in the order given by
# RowAssessor.rank_headers), and within each column row by row. A cell
# reuses an existing Name when a Name with the same string at that rank
# already has the same chain of parent ids as the current row; otherwise a
# new Name is created whose parent is the last name recorded for the row.
# Author/year data in 'author_year' is attached only to the name whose rank
# matches RowAssessor.lump_name_rank for the row.
#
# Cells that are nil or empty are skipped.
#
# Raises Taxonifi::Lumper::LumperError unless csv is a CSV::Table.
def self.create_name_collection(csv)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_name_collection.' if csv.class != CSV::Table
  nc = Taxonifi::Model::NameCollection.new

  # The row index contains a vector of parent ids like
  #   [0, 4, 29]
  # This implies that Name with #id 29 has Parent with #id 4
  # Initialize an empty index, one vector per row.
  row_index = Array.new(csv.size) { [] }

  # The name_index keeps track of unique name per rank like
  #   :genus => {'Foo' => [0,2]}
  # This says that "Foo" is instantiated two times in the
  # name collection, with id 0, and id 2.
  name_index = {}

  Taxonifi::Assessor::RowAssessor.rank_headers(csv.headers).each do |rank|
    name_index[rank] = {}
    csv.each_with_index do |row, i|
      row_rank = Taxonifi::Assessor::RowAssessor.lump_name_rank(row).to_s # metadata (e.g. author year) apply to this rank

      name = row[rank]
      next if name.nil? || name.empty? # cell has no data

      # Reuse a name with the same string only when its parents match this row's.
      candidates = name_index[rank][name] || []
      name_id = candidates.find { |id| nc.parent_id_vector(id) == row_index[i] }

      if name_id.nil? # unseen string, or same string under different parents
        n = Taxonifi::Model::Name.new()
        n.rank = rank
        n.name = name
        n.parent = nc.object_by_id(row_index[i].last) if row_index[i].size > 0 # its parent is the previous id in this row
        n.row_number = i

        # Name/year needs to be standardized / cased out
        # headers are overlapping at times
        if row['author_year'] && row_rank == rank
          builder = Taxonifi::Splitter::Builder.build_author_year(row['author_year'])
          n.author = builder.people
          n.year = builder.year
          n.parens = !builder.parens # NOTE(review): stored negated; confirm this matches how Name#parens is read
        end

        name_id = nc.add_object(n).id
        # Add the name to the index of unique names
        name_index[rank][name] ||= []
        name_index[rank][name].push name_id
      end

      # build a by row vector of parent child relationships
      row_index[i].push name_id
    end
  end

  nc
end
|
144
|
+
|
145
|
+
# Return a Taxonifi::Model::RefCollection from a CSV file.
|
146
|
+
# Build a Taxonifi::Model::RefCollection from a CSV::Table.
#
# A Ref is built for every row for which
# RowAssessor.intersecting_lumps_with_data reports :citation_small. Rows
# that produce the same Ref#compact_string share a single Ref; only the
# first one is added to the collection.
#
# rc.row_index[i] is set to the Ref for row i. This holds for repeated
# citations too, so the index never mixes Ref objects with ids.
#
# Raises Taxonifi::Lumper::LumperError unless csv is a CSV::Table.
def self.create_ref_collection(csv)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_ref_collection.' if csv.class != CSV::Table
  rc = Taxonifi::Model::RefCollection.new

  ref_index = {} # compact_string => the Ref first created for it
  csv.each_with_index do |row, i|
    next unless Taxonifi::Assessor::RowAssessor.intersecting_lumps_with_data(row, [:citation_small]).include?(:citation_small)

    r = Taxonifi::Model::Ref.new(
      :year => row['year'],
      :title => row['title'],
      :publication => row['publication']
    )

    # TODO: break out each of these lexes to a builder
    if row['authors'] && !row['authors'].empty?
      lexer = Taxonifi::Splitter::Lexer.new(row['authors'])
      authors = lexer.pop(Taxonifi::Splitter::Tokens::Authors)
      authors.names.each do |a|
        n = Taxonifi::Model::Person.new()
        n.last_name = a[:last_name]
        n.initials = a[:initials]
        r.authors.push n
      end
    end

    if row['volume_number'] && !row['volume_number'].empty?
      lexer = Taxonifi::Splitter::Lexer.new(row['volume_number'], :volume_number)
      t = lexer.pop(Taxonifi::Splitter::Tokens::VolumeNumber)
      r.volume = t.volume
      r.number = t.number
    end

    if row['pages'] && !row['pages'].empty?
      # If our regex doesn't match dump the field into pages
      begin
        lexer = Taxonifi::Splitter::Lexer.new(row['pages'], :pages)
        t = lexer.pop(Taxonifi::Splitter::Tokens::Pages)
        r.pg_start = t.pg_start
        r.pg_end = t.pg_end
      rescue
        r.pages = row['pages']
      end
    end

    # Do some indexing.
    ref_str = r.compact_string
    unless ref_index.key?(ref_str)
      rc.add_object(r)
      ref_index[ref_str] = r
    end
    rc.row_index[i] = ref_index[ref_str]
  end
  rc
end
|
204
|
+
|
205
|
+
# Creates a generic Collection with Objects of GenericObject
|
206
|
+
# Objects are assigned to parents (rank) according to the order provided in headers.
|
207
|
+
# Objects are considered the same if they have the same name and the same parents closure, e.g.
|
208
|
+
#
|
209
|
+
# a b c
|
210
|
+
# a b d
|
211
|
+
# e b f
|
212
|
+
#
|
213
|
+
# Will return 7 objects named in order a,b,c,d,e,b,f
|
214
|
+
#
|
215
|
+
# a,b b,c b,d e,b b,f are the unique parent/child relationships stored
|
216
|
+
#
|
217
|
+
#
|
218
|
+
# Build a Taxonifi::Model::Collection of GenericObjects from a CSV::Table.
#
# csv     - a CSV::Table.
# headers - the column names to read, ordered from parent to child.
#
# Rows are processed one at a time and, within a row, columns in `headers`
# order. A cell reuses an existing object when one with the same name in
# that column already has the same chain of parent ids; otherwise a new
# object is created whose parent is the last object recorded for the row.
# Cells that are nil or empty are skipped.
#
# Raises Taxonifi::Lumper::LumperError unless csv is a CSV::Table, or when
# headers is empty.
def self.create_hierarchical_collection(csv, headers)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_hierarchical_collection.' if csv.class != CSV::Table
  raise Taxonifi::Lumper::LumperError, 'No headers provided to create_hierarchical_collection.' if headers.size == 0

  c = Taxonifi::Model::Collection.new

  # row_index[i] is the vector of object ids found so far in row i.
  row_index = Array.new(csv.size) { [] }

  # name_index[column][name] lists the ids of every object with that name in that column.
  name_index = {}
  headers.each do |h|
    name_index[h] = {}
  end

  csv.each_with_index do |row, i|
    headers.each do |rank|
      name = row[rank]
      next if name.nil? || name.empty? # cell has no data

      # Reuse an object with the same name only when its parents match this row's.
      candidates = name_index[rank][name] || []
      name_id = candidates.find { |id| c.parent_id_vector(id) == row_index[i] }

      if name_id.nil?
        o = Taxonifi::Model::GenericObject.new()
        o.name = name
        o.rank = rank
        o.row_number = i
        o.parent = c.object_by_id(row_index[i].last) if row_index[i].size > 0 # its parent is the previous id in this row

        name_id = c.add_object(o).id
        name_index[rank][name] ||= []
        name_index[rank][name].push name_id
      end

      row_index[i].push name_id
    end
  end
  c
end
|
278
|
+
|
279
|
+
# Return a geog collection from a csv file.
|
280
|
+
# Build a Taxonifi::Model::GeogCollection from a CSV::Table.
#
# The columns read are those returned by RowAssessor.geog_headers for the
# table's headers. A name is created once per column: any later cell with
# the same name in the same column reuses the first Geog, whatever its
# parents in that row. A new Geog's parent is the last Geog recorded for
# the row being processed. Cells that are nil or empty are skipped.
#
# Raises Taxonifi::Lumper::LumperError unless csv is a CSV::Table.
def self.create_geog_collection(csv)
  unless csv.class == CSV::Table
    raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_geog_collection.'
  end

  gc = Taxonifi::Model::GeogCollection.new

  # See create_name_collection: ids_by_row[i] is the vector of ids found in row i.
  ids_by_row = Array.new(csv.size) { [] }

  levels = Taxonifi::Assessor::RowAssessor.geog_headers(csv.headers)
  ids_by_name = {}
  levels.each { |level| ids_by_name[level] = {} }

  # We don't have the same problems as with taxon names, i.e.
  # boo in
  #   Foo nil boo
  #   Foo bar boo
  # is the same thing wrt geography, not the case for taxon names.
  # We can use a row first loop to build as we go
  csv.each_with_index do |row, i|
    levels.each do |level|
      name = row[level]
      next if name.nil? || name.empty? # cell has no data

      geog_id = ids_by_name[level][name]
      unless geog_id # first time this name is seen at this level
        geog = Taxonifi::Model::Geog.new()
        geog_id = gc.add_object(geog).id
        geog.name = name
        geog.rank = level
        geog.parent = gc.object_by_id(ids_by_row[i].last) if ids_by_row[i].size > 0 # its parent is the previous id in this row
      end

      ids_by_name[level][name] = geog_id
      ids_by_row[i].push geog_id
    end
  end
  gc
end
|
332
|
+
|
333
|
+
end # end Lumper Module
|
334
|
+
|