taxonifi 0.2.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +59 -0
- data/.travis.yml +11 -0
- data/Gemfile +5 -17
- data/Gemfile.lock +22 -40
- data/README.md +192 -0
- data/Rakefile +35 -26
- data/lib/export/format/base.rb +1 -1
- data/lib/export/format/species_file.rb +154 -152
- data/lib/lumper/clump.rb +1 -1
- data/lib/lumper/lumper.rb +22 -18
- data/lib/lumper/lumps/parent_child_name_collection.rb +1 -2
- data/lib/lumper/name_index.rb +21 -0
- data/lib/{models → model}/author_year.rb +2 -2
- data/lib/{models → model}/base.rb +35 -5
- data/lib/{models → model}/collection.rb +8 -1
- data/lib/{models → model}/name.rb +128 -36
- data/lib/{models → model}/name_collection.rb +134 -33
- data/lib/{models → model}/person.rb +1 -1
- data/lib/{models → model}/ref.rb +4 -2
- data/lib/model/ref_collection.rb +171 -0
- data/lib/{models → model}/species_name.rb +24 -3
- data/lib/splitter/builder.rb +1 -1
- data/lib/splitter/parser.rb +5 -0
- data/lib/splitter/tokens.rb +54 -9
- data/lib/taxonifi/version.rb +3 -0
- data/lib/taxonifi.rb +5 -9
- data/taxonifi.gemspec +29 -99
- data/test/helper.rb +1 -1
- data/test/test_exporter.rb +1 -1
- data/test/test_lumper_names.rb +9 -9
- data/test/test_lumper_refs.rb +4 -4
- data/test/test_parser.rb +97 -26
- data/test/test_splitter_tokens.rb +25 -4
- data/test/test_taxonifi_base.rb +1 -1
- data/test/test_taxonifi_geog.rb +1 -1
- data/test/test_taxonifi_name.rb +13 -14
- data/test/test_taxonifi_name_collection.rb +11 -5
- data/test/test_taxonifi_ref.rb +1 -1
- data/test/test_taxonifi_ref_collection.rb +40 -3
- data/test/test_taxonifi_species_name.rb +51 -1
- data/travis/before_install.sh +2 -0
- metadata +96 -66
- data/README.rdoc +0 -154
- data/VERSION +0 -1
- data/lib/models/ref_collection.rb +0 -107
- /data/lib/{models → model}/generic_object.rb +0 -0
- /data/lib/{models → model}/geog.rb +0 -0
- /data/lib/{models → model}/geog_collection.rb +0 -0
- /data/lib/{models → model}/shared_class_methods.rb +0 -0
@@ -8,6 +8,7 @@ module Taxonifi::Export
|
|
8
8
|
|
9
9
|
# tblRanks 5/17/2012
|
10
10
|
SPECIES_FILE_RANKS = {
|
11
|
+
'variety' => 5, # there is no variety rank per se in SFs, they are handled this way according to DE
|
11
12
|
'subspecies' => 5,
|
12
13
|
'species' => 10,
|
13
14
|
'species subgroup' => 11,
|
@@ -55,18 +56,17 @@ module Taxonifi::Export
|
|
55
56
|
attr_accessor :name_collection
|
56
57
|
attr_accessor :ref_collection
|
57
58
|
attr_accessor :pub_collection
|
58
|
-
attr_accessor :author_index
|
59
59
|
attr_accessor :genus_names, :species_names, :nomenclator
|
60
60
|
attr_accessor :authorized_user_id, :time
|
61
|
-
|
61
|
+
|
62
|
+
attr_accessor :built_nomenclators
|
62
63
|
|
63
64
|
def initialize(options = {})
|
64
65
|
opts = {
|
65
66
|
:nc => Taxonifi::Model::NameCollection.new,
|
66
67
|
:export_folder => 'species_file',
|
67
68
|
:authorized_user_id => nil,
|
68
|
-
:
|
69
|
-
:manifest => %w{tblPubs tblRefs tblPeople tblRefAuthors tblTaxa tblGenusNames tblSpeciesNames tblNomenclator tblCites}
|
69
|
+
:manifest => %w{tblPubs tblRefs tblPeople tblRefAuthors tblTaxa tblGenusNames tblSpeciesNames tblNomenclator tblCites tblTypeSpecies}
|
70
70
|
}.merge!(options)
|
71
71
|
|
72
72
|
@manifest = opts[:manifest]
|
@@ -77,9 +77,7 @@ module Taxonifi::Export
|
|
77
77
|
@name_collection = opts[:nc]
|
78
78
|
@pub_collection = {} # title => id
|
79
79
|
@authorized_user_id = opts[:authorized_user_id]
|
80
|
-
|
81
|
-
@starting_ref_id = opts[:starting_ref_id]
|
82
|
-
|
80
|
+
|
83
81
|
# Careful here, at present we are just generating Reference micro-citations from our names, so the indexing "just works"
|
84
82
|
# because it's all internal. There is a strong potential for key collisions if this pipeline is modified to
|
85
83
|
# include references external to the initialized name_collection. See also export_references.
|
@@ -93,11 +91,6 @@ module Taxonifi::Export
|
|
93
91
|
@empty_quotes = ""
|
94
92
|
end
|
95
93
|
|
96
|
-
# Assumes names that are the same are the same person.
|
97
|
-
def build_author_index
|
98
|
-
@author_index = @name_collection.ref_collection.unique_authors.inject({}){|hsh, a| hsh.merge!(a.compact_string => a)}
|
99
|
-
end
|
100
|
-
|
101
94
|
def export()
|
102
95
|
super
|
103
96
|
# You must have
|
@@ -109,9 +102,6 @@ module Taxonifi::Export
|
|
109
102
|
# Give authors unique ids:
|
110
103
|
# @name_collection.ref_collection.uniquify_authors(1)
|
111
104
|
|
112
|
-
if @name_collection.ref_collection
|
113
|
-
build_author_index
|
114
|
-
end
|
115
105
|
|
116
106
|
# raise Taxonifi::Export::ExportError, 'NameCollection has no RefCollection, you might try @name_collection.generate_ref_collection(1), or alter the manifest: hash.' if ! @name_collection.ref_collection.nil?
|
117
107
|
|
@@ -122,14 +112,28 @@ module Taxonifi::Export
|
|
122
112
|
@name_collection.names_at_rank('subgenus').inject(@genus_names){|hsh, n| hsh.merge!(n.name => nil)}
|
123
113
|
@name_collection.names_at_rank('species').inject(@species_names){|hsh, n| hsh.merge!(n.name => nil)}
|
124
114
|
@name_collection.names_at_rank('subspecies').inject(@species_names){|hsh, n| hsh.merge!(n.name => nil)}
|
115
|
+
@name_collection.names_at_rank('variety').inject(@species_names){|hsh, n| hsh.merge!(n.name => nil)}
|
116
|
+
|
117
|
+
# Add combinations of names from nomenclators/citations as well
|
118
|
+
|
119
|
+
@name_collection.nomenclators.keys.each do |k|
|
120
|
+
@genus_names.merge!(@name_collection.nomenclators[k][0] => nil)
|
121
|
+
@genus_names.merge!(@name_collection.nomenclators[k][1] => nil)
|
122
|
+
@species_names.merge!(@name_collection.nomenclators[k][2] => nil)
|
123
|
+
@species_names.merge!(@name_collection.nomenclators[k][3] => nil)
|
124
|
+
@species_names.merge!(@name_collection.nomenclators[k][4] => nil)
|
125
|
+
end
|
125
126
|
|
127
|
+
@genus_names.delete_if{|key,value| key.nil? || key.length == 0}
|
128
|
+
@species_names.delete_if{|key,value| key.nil? || key.length == 0}
|
129
|
+
|
126
130
|
str = [ 'BEGIN TRY', 'BEGIN TRANSACTION']
|
127
131
|
@manifest.each do |f|
|
128
132
|
str << send(f)
|
129
133
|
end
|
130
134
|
str << ['COMMIT', 'END TRY', 'BEGIN CATCH',
|
131
|
-
|
132
|
-
|
135
|
+
'SELECT ERROR_LINE() AS ErrorLine, ERROR_NUMBER() AS ErrorNumber, ERROR_MESSAGE() AS ErrorMessage;',
|
136
|
+
'ROLLBACK', 'END CATCH']
|
133
137
|
write_file('everything.sql', str.join("\n\n"))
|
134
138
|
true
|
135
139
|
end
|
@@ -142,56 +146,56 @@ module Taxonifi::Export
|
|
142
146
|
# nc.ref_collection = Taxonifi::Model::RefCollection.new
|
143
147
|
# etc.
|
144
148
|
def export_references(options = {})
|
145
|
-
raise Taxonifi::Export::ExportError, 'Method deprecated, alter manifest
|
146
|
-
|
147
|
-
|
148
|
-
# :starting_author_id => 0
|
149
|
-
#}
|
150
|
-
|
151
|
-
#configure_folders
|
152
|
-
#build_author_index
|
153
|
-
|
154
|
-
## order matters
|
155
|
-
#['tblPeople', 'tblRefs', 'tblRefAuthors', 'sqlRefs' ].each do |t|
|
156
|
-
# write_file(t, send(t))
|
157
|
-
#end
|
158
|
-
end
|
149
|
+
raise Taxonifi::Export::ExportError, 'Method deprecated, alter manifest to achieve a similar result.'
|
150
|
+
#configure_folders
|
151
|
+
end
|
159
152
|
|
160
|
-
#
|
161
|
-
# by .
|
153
|
+
# Gets the reference for a name as referenced
|
154
|
+
# by .properties[:link_to_ref_from_row]
|
162
155
|
def get_ref(name)
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
156
|
+
# if not name.properties[:link_to_ref_from_row].nil?
|
157
|
+
# return @name_collection.ref_collection.object_from_row(name.properties[:link_to_ref_from_row])
|
158
|
+
# end
|
159
|
+
# nil
|
160
|
+
name.original_description_reference ? name.original_description_reference : nil
|
167
161
|
end
|
168
162
|
|
169
163
|
def tblTaxa
|
170
|
-
@headers = %w{TaxonNameID TaxonNameStr RankID Name Parens AboveID RefID DataFlags AccessCode NameStatus StatusFlags OriginalGenusID LastUpdate ModifiedBy}
|
164
|
+
@headers = %w{TaxonNameID TaxonNameStr RankID Name Parens AboveID RefID DataFlags AccessCode Extinct NameStatus StatusFlags OriginalGenusID LastUpdate ModifiedBy}
|
171
165
|
sql = []
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
166
|
+
sql_above = []
|
167
|
+
|
168
|
+
# Need to add by rank for FK constraint handling
|
169
|
+
|
170
|
+
Taxonifi::RANKS.each do |rank|
|
171
|
+
@name_collection.names_at_rank(rank).each do |n|
|
172
|
+
$DEBUG && $stderr.puts("#{n.name} is too long") if n.name.length > 30
|
173
|
+
|
174
|
+
# ref = get_ref(n)
|
175
|
+
cols = {
|
176
|
+
TaxonNameID: n.id,
|
177
|
+
TaxonNameStr: n.parent_ids_sf_style, # closure -> ends with 1
|
178
|
+
RankID: SPECIES_FILE_RANKS[n.rank],
|
179
|
+
Name: n.name,
|
180
|
+
Parens: (n.parens ? 1 : 0),
|
181
|
+
AboveID: 0,
|
182
|
+
RefID: (n.original_description_reference ? n.original_description_reference.id : 0),
|
183
|
+
DataFlags: 0, # see http://software.speciesfile.org/Design/TaxaTables.aspx#Taxon, a flag populated when data is reviewed, initialize to zero
|
184
|
+
AccessCode: 0,
|
185
|
+
Extinct: (n.properties && n.properties['extinct'] == 'true' ? 1 : 0),
|
186
|
+
NameStatus: (n.related_name.nil? ? 0 : 7), # 0 :valid, 7: synonym)
|
187
|
+
StatusFlags: (n.related_name.nil? ? 0 : 262144), # 0 :valid, 262144: jr. synonym
|
188
|
+
OriginalGenusID: (n.properties && !n.properties['original_genus_id'].nil? ? n.properties['original_genus_id'] : 0), # SF must be pre-configured with 0 filler (this restriction needs to go)
|
189
|
+
LastUpdate: @time,
|
190
|
+
ModifiedBy: @authorized_user_id,
|
191
|
+
}
|
192
|
+
sql << sql_insert_statement('tblTaxa', cols)
|
193
|
+
above_id = (n.related_name.nil? ? (n.parent ? n.parent.id : 0) : n.related_name.id)
|
194
|
+
sql_above.push "UPDATE tblTaxa SET AboveID = #{above_id} where TaxonNameID = #{n.id};"
|
195
|
+
end
|
193
196
|
end
|
194
|
-
|
197
|
+
|
198
|
+
sql.join("\n") + sql_above.join("\n")
|
195
199
|
end
|
196
200
|
|
197
201
|
# Generate a tblRefs string.
|
@@ -202,6 +206,16 @@ module Taxonifi::Export
|
|
202
206
|
# Assumes the 0 "null" pub id is there
|
203
207
|
pub_id = @pub_collection[r.publication] ? @pub_collection[r.publication] : 0
|
204
208
|
|
209
|
+
# Build a note based on "unused" properties
|
210
|
+
note = []
|
211
|
+
if r.properties
|
212
|
+
r.properties.keys.each do |k|
|
213
|
+
note.push "#{k}: #{r.properties[k]}" if r.properties[k] && r.properties.length > 0
|
214
|
+
end
|
215
|
+
end
|
216
|
+
note = note.join("; ")
|
217
|
+
note = @empty_quotes if note.length == 0
|
218
|
+
|
205
219
|
cols = {
|
206
220
|
RefID: r.id,
|
207
221
|
ContainingRefID: 0,
|
@@ -210,12 +224,12 @@ module Taxonifi::Export
|
|
210
224
|
Series: @empty_quotes,
|
211
225
|
Volume: (r.volume ? r.volume : @empty_quotes),
|
212
226
|
Issue: (r.number ? r.number : @empty_quotes),
|
213
|
-
RefPages: r.page_string, # always a
|
227
|
+
RefPages: r.page_string, # always a strings
|
214
228
|
ActualYear: (r.year ? r.year : @empty_quotes),
|
215
229
|
StatedYear: @empty_quotes,
|
216
230
|
AccessCode: 0,
|
217
231
|
Flags: 0,
|
218
|
-
Note:
|
232
|
+
Note: note,
|
219
233
|
LastUpdate: @time,
|
220
234
|
LinkID: 0,
|
221
235
|
ModifiedBy: @authorized_user_id,
|
@@ -231,7 +245,7 @@ module Taxonifi::Export
|
|
231
245
|
def tblPubs
|
232
246
|
sql = []
|
233
247
|
@headers = %w{PubID PrefID PubType ShortName FullName Note LastUpdate ModifiedBy Publisher PlacePublished PubRegID Status StartYear EndYear BHL}
|
234
|
-
|
248
|
+
|
235
249
|
# Hackish should build this elsewhere, but degrades OK
|
236
250
|
pubs = @name_collection.ref_collection.collection.collect{|r| r.publication}.compact.uniq
|
237
251
|
|
@@ -263,9 +277,7 @@ module Taxonifi::Export
|
|
263
277
|
def tblPeople
|
264
278
|
@headers = %w{PersonID FamilyName GivenNames GivenInitials Suffix Role LastUpdate ModifiedBy}
|
265
279
|
sql = []
|
266
|
-
@
|
267
|
-
a = @author_index[k]
|
268
|
-
# a.id = i + 1
|
280
|
+
@name_collection.ref_collection.all_authors.each do |a|
|
269
281
|
cols = {
|
270
282
|
PersonID: a.id,
|
271
283
|
FamilyName: (a.last_name.length > 0 ? a.last_name : "Unknown"),
|
@@ -287,12 +299,11 @@ module Taxonifi::Export
|
|
287
299
|
sql = []
|
288
300
|
@name_collection.ref_collection.collection.each do |r|
|
289
301
|
r.authors.each_with_index do |x, i|
|
290
|
-
a = @author_index[x.compact_string]
|
291
302
|
cols = {
|
292
303
|
RefID: r.id,
|
293
|
-
PersonID:
|
304
|
+
PersonID: x.id,
|
294
305
|
SeqNum: i + 1,
|
295
|
-
AuthorCount: r.authors.size,
|
306
|
+
AuthorCount: r.authors.size + 1,
|
296
307
|
LastUpdate: @time,
|
297
308
|
ModifiedBy: @authorized_user_id
|
298
309
|
}
|
@@ -306,35 +317,62 @@ module Taxonifi::Export
|
|
306
317
|
def tblCites
|
307
318
|
@headers = %w{TaxonNameID SeqNum RefID NomenclatorID LastUpdate ModifiedBy NewNameStatus CitePages Note TypeClarification CurrentConcept ConceptChange InfoFlags InfoFlagStatus PolynomialStatus}
|
308
319
|
sql = []
|
309
|
-
|
310
|
-
@name_collection.collection.each do |n|
|
311
|
-
next if @nomenclator[n.nomenclator_name].nil? # Only create nomenclator records if they are original citations, otherwise not !! Might need updating in future imports
|
312
|
-
ref = get_ref(n)
|
313
320
|
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
321
|
+
@name_collection.citations.keys.each do |name_id|
|
322
|
+
seq_num = 1
|
323
|
+
@name_collection.citations[name_id].each do |ref_id, nomenclator_index, properties|
|
324
|
+
cols = {
|
325
|
+
TaxonNameID: name_id,
|
326
|
+
SeqNum: seq_num,
|
327
|
+
RefID: ref_id,
|
328
|
+
NomenclatorID: nomenclator_index,
|
329
|
+
LastUpdate: @time,
|
330
|
+
ModifiedBy: @authorized_user_id,
|
331
|
+
CitePages: (properties[:cite_pages] ? properties[:cite_pages] : @empty_quotes),
|
332
|
+
NewNameStatus: 0,
|
333
|
+
Note: (properties[:note] ? properties[:note] : @empty_quotes),
|
334
|
+
TypeClarification: 0, # We might derive more data from this
|
335
|
+
CurrentConcept: (properties[:current_concept] == true ? 1 : 0), # Boolean, right?
|
336
|
+
ConceptChange: 0, # Unspecified
|
337
|
+
InfoFlags: 0, #
|
338
|
+
InfoFlagStatus: 1, # 1 => needs review
|
339
|
+
PolynomialStatus: 0
|
340
|
+
}
|
341
|
+
sql << sql_insert_statement('tblCites', cols)
|
342
|
+
seq_num += 1
|
343
|
+
end
|
334
344
|
end
|
335
345
|
sql.join("\n")
|
336
346
|
end
|
337
347
|
|
348
|
+
# Generate the tblTypeSpecies string.
#
# Walks every name at rank 'genus' and then 'subgenus' in @name_collection and
# emits one INSERT statement (via sql_insert_statement) for each name that
#   * has a :type_species_id entry in its properties, and
#   * has a reference according to get_ref.
#
# Side effect: sets @headers to the tblTypeSpecies column list.
#
# Returns a String with the statements joined by newlines ("" when no name
# qualifies).
def tblTypeSpecies
  @headers = %w{GenusNameID SpeciesNameID Reason AuthorityRefID FirstFamGrpNameID LastUpdate ModifiedBy NewID}
  sql = []

  names = @name_collection.names_at_rank('genus') + @name_collection.names_at_rank('subgenus')
  names.each do |n|
    # properties may be unset on a name; guard it the same way tblTaxa and
    # tblRefs do rather than raising NoMethodError on nil.
    type_species_id = n.properties && n.properties[:type_species_id]
    next unless type_species_id

    # The reference only gates whether a row is written; AuthorityRefID
    # itself is always exported as 0.
    next if get_ref(n).nil?

    cols = {
      GenusNameID: n.id,
      SpeciesNameID: type_species_id,
      Reason: 0,
      AuthorityRefID: 0,
      FirstFamGrpNameID: 0,
      LastUpdate: @time,
      ModifiedBy: @authorized_user_id,
      NewID: 0 # meaning of this column is not established here; exported as 0
    }
    sql << sql_insert_statement('tblTypeSpecies', cols)
  end

  sql.join("\n")
end
|
375
|
+
|
338
376
|
def tblGenusNames
|
339
377
|
# TODO: SF tests catch unused names based on some names not being included in Nomenclator data. We could optimize so that the workaround is removed.
|
340
378
|
# I.e., all the names get added here, not all the names get added to Nomenclator/Cites because of citations which are not original combinations
|
@@ -374,78 +412,42 @@ module Taxonifi::Export
|
|
374
412
|
@headers = %w{NomenclatorID GenusNameID SubgenusNameID SpeciesNameID SubspeciesNameID LastUpdate ModifiedBy SuitableForGenus SuitableForSpecies InfrasubspeciesNameID InfrasubKind}
|
375
413
|
sql = []
|
376
414
|
i = 1
|
377
|
-
@name_collection.collection.each do |n|
|
378
|
-
gid, sgid = 0,0
|
379
|
-
sid = @species_names[n.parent_name_at_rank('species')] || 0
|
380
|
-
ssid = @species_names[n.parent_name_at_rank('subspecies')] || 0
|
381
415
|
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
416
|
+
# Ugh, move build from here
|
417
|
+
@name_collection.nomenclators.keys.each do |i|
|
418
|
+
name = @name_collection.nomenclators[i]
|
419
|
+
genus_id = @genus_names[name[0]]
|
420
|
+
genus_id ||= 0
|
421
|
+
subgenus_id = @genus_names[name[1]]
|
422
|
+
subgenus_id ||= 0
|
423
|
+
species_id = @species_names[name[2]]
|
424
|
+
species_id ||= 0
|
425
|
+
subspecies_id = @species_names[name[3]]
|
426
|
+
subspecies_id ||= 0
|
427
|
+
variety_id = @species_names[name[4]]
|
428
|
+
variety_id ||= 0
|
386
429
|
|
387
|
-
next if Taxonifi::RANKS.index(n.rank) < Taxonifi::RANKS.index('subtribe')
|
388
|
-
|
389
|
-
ref = get_ref(n)
|
390
|
-
# debugger
|
391
|
-
# ref = @by_author_reference_index[n.author_year_index]
|
392
|
-
|
393
|
-
next if ref.nil?
|
394
430
|
cols = {
|
395
431
|
NomenclatorID: i,
|
396
|
-
GenusNameID:
|
397
|
-
SubgenusNameID:
|
398
|
-
SpeciesNameID:
|
399
|
-
SubspeciesNameID:
|
400
|
-
InfrasubspeciesNameID:
|
401
|
-
InfrasubKind: 0
|
432
|
+
GenusNameID: genus_id,
|
433
|
+
SubgenusNameID: subgenus_id,
|
434
|
+
SpeciesNameID: species_id,
|
435
|
+
SubspeciesNameID: subspecies_id,
|
436
|
+
InfrasubspeciesNameID: variety_id,
|
437
|
+
InfrasubKind: (variety_id == 0 ? 0 : 2),
|
402
438
|
LastUpdate: @time,
|
403
439
|
ModifiedBy: @authorized_user_id,
|
404
|
-
SuitableForGenus: 0, # Set in SF
|
405
|
-
SuitableForSpecies: 0 # Set in SF
|
440
|
+
SuitableForGenus: 0, # Set in SF w test
|
441
|
+
SuitableForSpecies: 0 # Set in SF w test
|
406
442
|
}
|
407
|
-
@nomenclator.merge!(n.nomenclator_name => i)
|
408
443
|
i += 1
|
409
|
-
|
410
444
|
sql << sql_insert_statement('tblNomenclator', cols)
|
411
445
|
end
|
412
446
|
|
413
|
-
# TODO: DRY this up with above?!
|
414
|
-
@name_collection.combinations.each do |c|
|
415
|
-
gid, sgid = 0,0
|
416
|
-
sid = (c[2].nil? ? 0 : @species_names[c[2].name])
|
417
|
-
ssid = (c[3].nil? ? 0 : @species_names[c[3].name])
|
418
|
-
|
419
|
-
if c.compact.last.parens == false
|
420
|
-
gid = (c[0].nil? ? 0 : @genus_names[c[0].name])
|
421
|
-
sgid = (c[1].nil? ? 0 : @genus_names[c[1].name])
|
422
|
-
end
|
423
|
-
|
424
|
-
# ref = @by_author_reference_index[c.compact.last.author_year_index]
|
425
|
-
ref = @name_collection.ref_collection.object_from_row(c.compact.last.related[:link_to_ref_from_row])
|
426
|
-
|
427
|
-
next if ref.nil?
|
428
|
-
|
429
|
-
cols = {
|
430
|
-
NomenclatorID: i,
|
431
|
-
GenusNameID: gid ,
|
432
|
-
SubgenusNameID: sgid ,
|
433
|
-
SpeciesNameID: sid ,
|
434
|
-
SubspeciesNameID: ssid ,
|
435
|
-
InfrasubspeciesNameID: 0,
|
436
|
-
InfrasubKind: 0, # this might be wrong
|
437
|
-
LastUpdate: @time,
|
438
|
-
ModifiedBy: @authorized_user_id,
|
439
|
-
SuitableForGenus: 0, # Set in SF
|
440
|
-
SuitableForSpecies: 0 # Set in SF
|
441
|
-
}
|
442
|
-
# check!?
|
443
|
-
@nomenclator.merge!(c.compact.last.nomenclator_name => i)
|
444
|
-
sql << sql_insert_statement('tblNomenclator', cols)
|
445
|
-
i += 1
|
446
|
-
end
|
447
447
|
sql.join("\n")
|
448
448
|
end
|
449
449
|
|
450
|
+
|
451
|
+
|
450
452
|
end # End class
|
451
453
|
end # End module
|
data/lib/lumper/clump.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# require File.expand_path(File.join(File.dirname(__FILE__), '../taxonifi'))
|
2
2
|
|
3
|
-
# A Clump is a "C"ollection of lump derivatives and the
|
3
|
+
# A Clump is a "C"ollection of lump derivatives and the relationships between these derivatives!
|
4
4
|
# It's used to define relationships among objects derived, for example, between single rows of data
|
5
5
|
module Taxonifi::Lumper:Clumps
|
6
6
|
|
data/lib/lumper/lumper.rb
CHANGED
@@ -63,7 +63,7 @@ module Taxonifi::Lumper
|
|
63
63
|
opts = {
|
64
64
|
:csv => [],
|
65
65
|
:initial_id => 0,
|
66
|
-
:capture_related_fields => true # Stores other column values in (column_header => value) pairs in Name
|
66
|
+
:capture_related_fields => true # Stores other column values in (column_header => value) pairs in Name#properties
|
67
67
|
}.merge!(options)
|
68
68
|
|
69
69
|
csv = opts[:csv]
|
@@ -82,14 +82,14 @@ module Taxonifi::Lumper
|
|
82
82
|
# :genus => {'Foo' => [0,2]}
|
83
83
|
# This says that "Foo" is instantiated two times in the
|
84
84
|
# name collection, with id 0, and id 2.
|
85
|
-
name_index = {}
|
85
|
+
name_index = {} # Taxonifi::Lumper::NameIndex.new # {}
|
86
86
|
|
87
87
|
has_ref_fields = ([:citation_basic, :citation_small] & Taxonifi::Lumper.intersecting_lumps(csv.headers)).size > 0
|
88
88
|
unused_fields = csv.headers - Taxonifi::Lumper::LUMPS[:names]
|
89
89
|
|
90
|
-
|
91
90
|
# First pass, create and index names
|
92
91
|
Taxonifi::Assessor::RowAssessor.rank_headers(csv.headers).each do |rank|
|
92
|
+
# name_index.new_rank(rank)
|
93
93
|
name_index[rank] = {}
|
94
94
|
csv.each_with_index do |row, i|
|
95
95
|
shares_rank = (rank == Taxonifi::Assessor::RowAssessor.lump_name_rank(row).to_s)
|
@@ -99,9 +99,8 @@ module Taxonifi::Lumper
|
|
99
99
|
n = nil # a Name if necessary
|
100
100
|
name_id = nil # index the new or existing Name
|
101
101
|
|
102
|
+
exists = false
|
102
103
|
if name_index[rank][name] # A matching name (String) has been previously added
|
103
|
-
exists = false
|
104
|
-
|
105
104
|
name_index[rank][name].each do |id|
|
106
105
|
# Compare vectors of parent_ids for name presence
|
107
106
|
if nc.parent_id_vector(id) == row_index[i]
|
@@ -110,15 +109,12 @@ module Taxonifi::Lumper
|
|
110
109
|
break
|
111
110
|
end
|
112
111
|
end
|
113
|
-
|
114
|
-
if !exists # name (string) exists, but parents are different, create new name
|
115
|
-
n = Taxonifi::Model::Name.new()
|
116
|
-
end
|
117
|
-
|
118
|
-
else # no version of the name exists
|
119
|
-
n = Taxonifi::Model::Name.new()
|
120
112
|
end # end name exists
|
121
113
|
|
114
|
+
n = Taxonifi::Model::Name.new() if !exists
|
115
|
+
|
116
|
+
unused_data = row.to_hash.select{|f| unused_fields.include?(f)}
|
117
|
+
row_identifier = (row['identifier'] ? row['identifier'] : i)
|
122
118
|
|
123
119
|
# Populate the new name if created. Previously matched names are not affected.
|
124
120
|
if !n.nil?
|
@@ -134,13 +130,13 @@ module Taxonifi::Lumper
|
|
134
130
|
if shares_rank
|
135
131
|
if row['author_year']
|
136
132
|
builder = Taxonifi::Splitter::Builder.build_author_year(row['author_year'])
|
137
|
-
n.
|
133
|
+
n.authors = builder.people # was author!?
|
138
134
|
n.year = builder.year
|
139
|
-
n.parens =
|
135
|
+
n.parens = builder.parens
|
140
136
|
end
|
141
137
|
|
142
|
-
n.
|
143
|
-
n.
|
138
|
+
n.add_property(:link_to_ref_from_row, i) if has_ref_fields # TODO: update this
|
139
|
+
n.add_properties(unused_data) if opts[:capture_related_fields]
|
144
140
|
end
|
145
141
|
|
146
142
|
name_id = nc.add_object(n).id
|
@@ -150,6 +146,14 @@ module Taxonifi::Lumper
|
|
150
146
|
$DEBUG && $stderr.puts("added #{nc.collection.size - 1} | #{n.name} | #{n.rank} | #{n.parent ? n.parent.name : '-'} | #{n.parent ? n.parent.id : '-'}")
|
151
147
|
else
|
152
148
|
$DEBUG && $stderr.puts("already present #{rank} | #{name}")
|
149
|
+
if shares_rank
|
150
|
+
# original::
|
151
|
+
nc.add_duplicate_entry_metadata(name_id, row_identifier, unused_data)
|
152
|
+
|
153
|
+
# hack
|
154
|
+
# nc.add_duplicate_entry_metadata(name_id, row_identifier, row.to_hash)
|
155
|
+
|
156
|
+
end
|
153
157
|
end
|
154
158
|
|
155
159
|
# build a by row vector of parent child relationships
|
@@ -166,7 +170,7 @@ module Taxonifi::Lumper
|
|
166
170
|
opts = {
|
167
171
|
:csv => nil,
|
168
172
|
:inital_id => 1,
|
169
|
-
:capture_related_fields => true # Stores other column values in (column_header => value) pairs in Ref
|
173
|
+
:capture_related_fields => true # Stores other column values in (column_header => value) pairs in Ref#related
|
170
174
|
}.merge!(options)
|
171
175
|
csv = opts[:csv]
|
172
176
|
|
@@ -219,7 +223,7 @@ module Taxonifi::Lumper
|
|
219
223
|
end
|
220
224
|
end
|
221
225
|
|
222
|
-
r.
|
226
|
+
r.add_properties(row.to_hash.select{|f| unused_fields.include?(f)}) if opts[:capture_related_fields]
|
223
227
|
|
224
228
|
# Do some indexing.
|
225
229
|
ref_str = r.compact_string
|
@@ -56,7 +56,7 @@ module Taxonifi::Lumper::Lumps::ParentChildNameCollection
|
|
56
56
|
n.rank = rank
|
57
57
|
n.name = name
|
58
58
|
n.row_number = i
|
59
|
-
n.
|
59
|
+
n.add_property(:external_id, external_id)
|
60
60
|
|
61
61
|
if parent = external_index[parent_id]
|
62
62
|
n.parent = parent
|
@@ -134,7 +134,6 @@ module Taxonifi::Lumper::Lumps::ParentChildNameCollection
|
|
134
134
|
# validation in general, something to look at, for now, throw up our hands and move on.
|
135
135
|
return last_id if (real_genus.nil? || real_species.nil?)
|
136
136
|
|
137
|
-
# debugger if real_genus.id == 399
|
138
137
|
real_subgenus = nil # revisit
|
139
138
|
real_subspecies = nc.object_by_id(nc.name_exists?(tmp_subspecies)) if !tmp_subspecies.nil?
|
140
139
|
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '../taxonifi'))
|
2
|
+
|
3
|
+
module Taxonifi::Lumper
|
4
|
+
# A two-level lookup of names by rank: rank => { name => value }.
#
# The class itself does not constrain what is stored per name.
# NOTE(review): presumably the value is the list of Name ids sharing that
# name string, as in the plain Hash used by the lumper -- confirm before
# relying on it.
class NameIndex
  # The underlying Hash of rank => { name => value }.
  attr_accessor :index

  def initialize
    @index = {}
  end

  # Creates (or resets) the table for +rank+.
  #
  # Returns the new, empty Hash for that rank.
  def new_rank(rank)
    @index[rank] = {}
  end

  # Looks up +name+ within +rank+.
  #
  # Returns the value stored for the name when there is one (truthy),
  # otherwise nil -- both when the rank has not been registered with
  # new_rank and when the name is absent from that rank.
  def name_exists_at_rank?(name, rank)
    # Must read the instance's own index; there is no local named
    # name_index in this class.
    @index[rank] && @index[rank][name]
  end
end
|
20
|
+
end
|
21
|
+
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require File.expand_path(File.join(File.dirname(__FILE__), "../
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "../model/base.rb"))
|
2
2
|
|
3
3
|
module Taxonifi
|
4
4
|
module Model
|
@@ -28,7 +28,7 @@ module Taxonifi
|
|
28
28
|
def compact_index
|
29
29
|
index = [@year]
|
30
30
|
@people.each do |a|
|
31
|
-
index.push
|
31
|
+
index.push(a.compact_string)
|
32
32
|
end
|
33
33
|
index.join("-")
|
34
34
|
end
|