taxonifi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. data/.document +5 -0
  2. data/Gemfile +18 -0
  3. data/Gemfile.lock +30 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +155 -0
  6. data/Rakefile +53 -0
  7. data/VERSION +1 -0
  8. data/lib/assessor/assessor.rb +31 -0
  9. data/lib/assessor/base.rb +17 -0
  10. data/lib/assessor/row_assessor.rb +131 -0
  11. data/lib/export/export.rb +9 -0
  12. data/lib/export/format/base.rb +43 -0
  13. data/lib/export/format/species_file.rb +341 -0
  14. data/lib/lumper/lumper.rb +334 -0
  15. data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
  16. data/lib/models/author_year.rb +39 -0
  17. data/lib/models/base.rb +73 -0
  18. data/lib/models/collection.rb +92 -0
  19. data/lib/models/generic_object.rb +15 -0
  20. data/lib/models/geog.rb +59 -0
  21. data/lib/models/geog_collection.rb +28 -0
  22. data/lib/models/name.rb +206 -0
  23. data/lib/models/name_collection.rb +149 -0
  24. data/lib/models/person.rb +49 -0
  25. data/lib/models/ref.rb +85 -0
  26. data/lib/models/ref_collection.rb +106 -0
  27. data/lib/models/species_name.rb +85 -0
  28. data/lib/splitter/builder.rb +26 -0
  29. data/lib/splitter/lexer.rb +70 -0
  30. data/lib/splitter/parser.rb +54 -0
  31. data/lib/splitter/splitter.rb +45 -0
  32. data/lib/splitter/tokens.rb +322 -0
  33. data/lib/taxonifi.rb +36 -0
  34. data/test/file_fixtures/Lygaeoidea.csv +801 -0
  35. data/test/helper.rb +38 -0
  36. data/test/test_exporter.rb +32 -0
  37. data/test/test_lumper_geogs.rb +59 -0
  38. data/test/test_lumper_hierarchical_collection.rb +88 -0
  39. data/test/test_lumper_names.rb +119 -0
  40. data/test/test_lumper_parent_child_name_collection.rb +41 -0
  41. data/test/test_lumper_refs.rb +91 -0
  42. data/test/test_parser.rb +34 -0
  43. data/test/test_splitter.rb +27 -0
  44. data/test/test_splitter_tokens.rb +403 -0
  45. data/test/test_taxonifi.rb +11 -0
  46. data/test/test_taxonifi_accessor.rb +61 -0
  47. data/test/test_taxonifi_geog.rb +51 -0
  48. data/test/test_taxonifi_name.rb +186 -0
  49. data/test/test_taxonifi_name_collection.rb +158 -0
  50. data/test/test_taxonifi_ref.rb +90 -0
  51. data/test/test_taxonifi_ref_collection.rb +69 -0
  52. data/test/test_taxonifi_species_name.rb +95 -0
  53. metadata +167 -0
@@ -0,0 +1,341 @@
1
+
2
module Taxonifi::Export

  # Dumps tables identical to the existing structure in SpeciesFile.
  # Will only work in the pre Identity world. Will reconfigure
  # as templates for Jim's work after the fact.
  #
  # Every +tbl*+ method returns the delimited text of one table as a String
  # and leaves that text in @csv_string and the column list in @headers.
  # #export and #export_references hand those strings to +write_file+, which
  # (like +configure_folders+) is not defined in this class and is presumably
  # inherited from Taxonifi::Export::Base.
  class SpeciesFile < Taxonifi::Export::Base

    # tblRanks 5/17/2012
    # Maps a rank name to the RankID written to tblTaxa.
    SPECIES_FILE_RANKS = {
      'subspecies'            => 5,
      'species'               => 10,
      'species subgroup'      => 11,
      'species group'         => 12,
      'species series'        => 14,
      'infragenus'            => 16,
      'subgenus'              => 18,
      'genus'                 => 20,
      'genus group'           => 22,
      'subtribe'              => 28,
      'tribe'                 => 30,
      'supertribe'            => 32,
      'infrafamily'           => 36,
      'subfamily'             => 38,
      'subfamily group'       => 39,
      'family'                => 40,
      'epifamily'             => 41,
      'superfamily'           => 42,
      'superfamily group'     => 44,
      'subinfraordinal group' => 45,
      'infraorder'            => 46,
      'suborder'              => 48, # was 8, which broke the ascending order of this table (infraorder 46 < suborder < order 50)
      'order'                 => 50,
      'mirorder'              => 51,
      'superorder'            => 52,
      'magnorder'             => 53,
      'cohort'                => 54,
      'supercohort'           => 55,
      'infraclass'            => 56,
      'subclass'              => 58,
      'class'                 => 60,
      'superclass'            => 62,
      'infraphylum'           => 66,
      'subphylum'             => 68,
      'phylum'                => 70,
      'superphylum'           => 72,
      'infrakingdom'          => 76,
      'subkingdom'            => 78,
      'kingdom'               => 80,
      'superkingdom'          => 82,
      'life'                  => 90,
      'unknown'               => 100
    }

    attr_accessor :name_collection
    attr_accessor :ref_collection
    attr_accessor :author_index
    attr_accessor :genus_names, :species_names, :nomenclator
    attr_accessor :authorized_user_id, :time

    # MANIFEST order is important: tblNomenclator reads the ids assigned by
    # tblGenusNames/tblSpeciesNames, and tblCites reads the ids assigned by
    # tblNomenclator.
    MANIFEST = %w{tblTaxa tblRefs tblPeople tblRefAuthors tblGenusNames tblSpeciesNames tblNomenclator tblCites}

    # Options (merged over the defaults and passed on to the superclass):
    #   :nc                  a Taxonifi::Model::NameCollection (default: a new, empty one)
    #   :export_folder       default 'species_file'
    #   :authorized_user_id  required, written to every ModifiedBy column
    #
    # Raises Taxonifi::Export::ExportError when :nc is not a NameCollection or
    # when :authorized_user_id is nil.
    def initialize(options = {})
      opts = {
        :nc => Taxonifi::Model::NameCollection.new,
        :export_folder => 'species_file',
        :authorized_user_id => nil
      }.merge(options)

      super(opts)

      # The original guard read `if ! opts[:nc].class == NameCollection`, which
      # Ruby parses as `(!opts[:nc].class) == NameCollection`: always false, so
      # the check could never fire.
      unless opts[:nc].is_a?(Taxonifi::Model::NameCollection)
        raise Taxonifi::Export::ExportError, 'NameCollection not passed to SpeciesFile export.'
      end
      if opts[:authorized_user_id].nil?
        raise Taxonifi::Export::ExportError, 'You must provide authorized_user_id for species_file export initialization.'
      end

      @name_collection = opts[:nc]
      @authorized_user_id = opts[:authorized_user_id]
      @author_index = {}

      #
      # Careful here, at present we are just generating Reference micro-citations from our names, so the indexing "just works"
      # because it's all internal. There is a strong potential for key collisions if this pipeline is modified to
      # include references external to the initialized name_collection. See also export_references.
      #
      @by_author_reference_index = {}
      @genus_names = {}
      @species_names = {}
      @nomenclator = {}
      @time = Time.now.strftime("%F %T") # "YYYY-MM-DD HH:MM:SS", used for every LastUpdate column
    end

    # Export only the ref_collection. Sidesteps the main name-centric exports
    # Note that this still uses the base @name_collection object as a starting reference,
    # it just references @name_collection.ref_collection. So you can do:
    #   nc = Taxonifi::Model::NameCollection.new
    #   nc.ref_collection = Taxonifi::Model::RefCollection.new
    #   etc.
    #
    # NOTE(review): :starting_ref_id and :starting_author_id are accepted but
    # nothing in this method reads them yet; ids are whatever the collection
    # already holds.
    def export_references(options = {})
      opts = {
        :starting_ref_id => 0,
        :starting_author_id => 0
      }.merge(options)

      configure_folders
      build_author_index

      # order matters
      %w{tblPeople tblRefs tblRefAuthors sqlRefs}.each do |t|
        write_file(t, send(t))
      end
    end

    # Rebuilds @author_index as compact_string => author.
    # Assumes names that are the same are the same person.
    def build_author_index
      @author_index = @name_collection.ref_collection.unique_authors.each_with_object({}) do |author, index|
        index[author.compact_string] = author
      end
    end

    # Generates references and author ids from the name collection, builds the
    # lookup indices and writes every table listed in MANIFEST, in order.
    def export
      super
      @name_collection.generate_ref_collection(1)

      # Give authors unique ids
      @name_collection.ref_collection.uniquify_authors(1)
      build_author_index

      # See notes in #initialize re potential key collisions!
      @by_author_reference_index = @name_collection.ref_collection.collection.each_with_object({}) do |ref, index|
        index[ref.author_year_index] = ref
      end

      # Register every distinct name string; the ids (values) are assigned
      # later by tblGenusNames / tblSpeciesNames.
      %w{genus subgenus}.each do |rank|
        @name_collection.names_at_rank(rank).each { |n| @genus_names[n.name] = nil }
      end
      %w{species subspecies}.each do |rank|
        @name_collection.names_at_rank(rank).each { |n| @species_names[n.name] = nil }
      end

      MANIFEST.each do |f|
        write_file(f, send(f))
      end
    end

    # Generate a tblTaxa string, one row per name in the name collection.
    def tblTaxa
      @headers = %w{TaxonNameID TaxonNameStr RankID Name Parens AboveID RefID DataFlags AccessCode NameStatus StatusFlags OriginalGenusID LastUpdate ModifiedBy}
      @csv_string = CSV.generate do |csv|
        csv << @headers
        @name_collection.collection.each do |n|
          ref = @by_author_reference_index[n.author_year_index]
          synonym = !n.related_name.nil?
          # Only looked up for names without parens, as in the original expression.
          original_genus = n.parens ? nil : n.parent_at_rank('genus')
          cols = {
            TaxonNameID: n.id,
            TaxonNameStr: n.parent_ids_sf_style, # closure -> ends with 1
            RankID: SPECIES_FILE_RANKS[n.rank],
            Name: n.name,
            Parens: (n.parens ? 1 : 0),
            AboveID: (synonym ? n.related_name.id : (n.parent ? n.parent.id : 0)), # !! SF folks like to pre-populate with zeros
            RefID: (ref ? ref.id : 0),
            DataFlags: 0, # see http://software.speciesfile.org/Design/TaxaTables.aspx#Taxon, a flag populated when data is reviewed, initialize to zero
            AccessCode: 0,
            NameStatus: (synonym ? 7 : 0), # 0: valid, 7: synonym
            StatusFlags: (synonym ? 262144 : 0), # 0: valid, 262144: jr. synonym
            OriginalGenusID: (original_genus ? original_genus.id : 0), # SF must be pre-configured with 0 filler (this restriction needs to go)
            LastUpdate: @time,
            ModifiedBy: @authorized_user_id,
          }
          csv << @headers.map { |h| cols[h.to_sym] }
        end
      end
      @csv_string
    end

    # Generate a tblRefs string. Unlike the other tables this one is
    # tab-delimited.
    def tblRefs
      @headers = %w{RefID ActualYear Title PubID Verbatim}
      @csv_string = CSV.generate(:col_sep => "\t") do |csv|
        csv << @headers
        @name_collection.ref_collection.collection.each do |r|
          cols = {
            RefID: r.id,
            Title: (r.title.nil? ? '' : r.title), # an empty String is written as "" by CSV, nil as nothing
            PubID: 0, # Careful - assumes you have a pre-generated PubID of Zero in there, PubID table is not included in CSV imports
            ActualYear: r.year,
            Verbatim: r.full_citation
          }
          csv << @headers.map { |h| cols[h.to_sym] }
        end
      end
      @csv_string
    end

    # Generate the tblRefs data as SQL: one INSERT per reference wrapped in a
    # BEGIN TRY / BEGIN TRANSACTION ... COMMIT / CATCH ... ROLLBACK block.
    # Every value is emitted as a single-quoted string with embedded single
    # quotes doubled.
    # TODO make a standard transaction wrapper
    def sqlRefs
      @headers = %w{RefID ActualYear Title PubID Verbatim}
      columns = @headers.sort # column order of the INSERT statements
      sql = ['BEGIN TRY', 'BEGIN TRANSACTION']
      @name_collection.ref_collection.collection.each do |r|
        cols = {
          RefID: r.id,
          Title: (r.title.nil? ? '' : r.title),
          PubID: 0, # Careful - assumes you have a pre-generated PubID of Zero in there, PubID table is not included in CSV imports
          ActualYear: r.year,
          Verbatim: r.full_citation
        }
        values = columns.map { |h| "'#{cols[h.to_sym].to_s.gsub("'", "''")}'" }
        sql << "INSERT INTO tblRefs (#{columns.join(",")}) VALUES (#{values.join(",")});"
      end
      sql.concat(['COMMIT', 'END TRY', 'BEGIN CATCH', 'ROLLBACK', 'END CATCH'])
      sql.join("\n")
    end

    # Generate tblPeople string, one row per entry in @author_index
    # (see #build_author_index).
    def tblPeople
      @headers = %w{PersonID FamilyName GivenNames GivenInitials Suffix Role LastUpdate ModifiedBy}
      @csv_string = CSV.generate do |csv|
        csv << @headers
        @author_index.each_value do |a|
          cols = {
            PersonID: a.id,
            FamilyName: a.last_name,
            GivenNames: a.first_name, # key must match the header; it was 'GivenName', leaving the column empty
            GivenInitials: a.initials_string,
            Suffix: a.suffix,
            Role: 1, # authors
            LastUpdate: @time,
            ModifiedBy: @authorized_user_id
          }
          csv << @headers.map { |h| cols[h.to_sym] }
        end
      end
      @csv_string
    end

    # Generate tblRefAuthors string: one row per (reference, author) pair,
    # with SeqNum giving the 1-based author position within the reference.
    def tblRefAuthors
      @headers = %w{RefID PersonID SeqNum AuthorCount LastUpdate ModifiedBy}
      @csv_string = CSV.generate do |csv|
        csv << @headers
        @name_collection.ref_collection.collection.each do |r|
          r.authors.each_with_index do |x, i|
            a = @author_index[x.compact_string]
            cols = {
              RefID: r.id,
              PersonID: a.id,
              SeqNum: i + 1,
              AuthorCount: r.authors.size,
              LastUpdate: @time,
              ModifiedBy: @authorized_user_id
            }
            csv << @headers.map { |h| cols[h.to_sym] }
          end
        end
      end
      @csv_string
    end

    # Generate tblCites string. Names without a matching reference in
    # @by_author_reference_index are skipped. NomenclatorID comes from
    # @nomenclator, which is filled by #tblNomenclator.
    def tblCites
      @headers = %w{TaxonNameID SeqNum RefID NomenclatorID LastUpdate ModifiedBy NewNameStatus CitePages Note TypeClarification CurrentConcept ConceptChange InfoFlags InfoFlagStatus PolynomialStatus}
      @csv_string = CSV.generate do |csv|
        csv << @headers
        @name_collection.collection.each do |n|
          ref = @by_author_reference_index[n.author_year_index]
          next if ref.nil?
          cols = {
            TaxonNameID: n.id,
            SeqNum: 1,
            RefID: ref.id,
            NomenclatorID: @nomenclator[n.nomenclator_name],
            LastUpdate: @time,
            ModifiedBy: @authorized_user_id,
            CitePages: '', # equates to "" in CSV speak
            NewNameStatus: 0,
            Note: '',
            TypeClarification: 0, # We might derive more data from this
            CurrentConcept: 1, # Boolean, right?
            ConceptChange: 0, # Unspecified
            InfoFlags: 0, #
            InfoFlagStatus: 1, # 1 => needs review
            PolynomialStatus: 0
          }
          csv << @headers.map { |h| cols[h.to_sym] }
        end
      end
      @csv_string
    end

    # Generate tblGenusNames string (also assigns the ids in @genus_names).
    def tblGenusNames
      @csv_string = csv_for_genus_and_species_names_tables('Genus')
      @csv_string
    end

    # Generate tblSpeciesNames string (also assigns the ids in @species_names).
    def tblSpeciesNames
      @csv_string = csv_for_genus_and_species_names_tables('Species')
      @csv_string
    end

    # Shared generator for tblGenusNames / tblSpeciesNames.
    # +type+ is 'Genus' or 'Species'; it selects both the id column name
    # ("<type>NameID") and the accessor (genus_names / species_names) whose
    # keys are written out. Each key is given a 1-based id in iteration order,
    # and that id is stored back into the hash as the key's value.
    def csv_for_genus_and_species_names_tables(type)
      col = "#{type}NameID"
      @headers = [col, "Name", "LastUpdate", "ModifiedBy", "Italicize"]
      @csv_string = CSV.generate do |csv|
        csv << @headers
        names = send("#{type.downcase}_names")
        names.keys.each_with_index do |name, i|
          id = i + 1
          names[name] = id
          cols = {
            col.to_sym => id,
            Name: name,
            LastUpdate: @time,
            ModifiedBy: @authorized_user_id,
            Italicize: 1 # always true for these data
          }
          csv << @headers.map { |h| cols[h.to_sym] }
        end
      end
      @csv_string
    end

    # Generate tblNomenclator string and record nomenclator_name => NomenclatorID
    # in @nomenclator.
    # must be called post tblGenusNames and tblSpeciesNames
    def tblNomenclator
      @headers = %w{NomenclatorID GenusNameID SubgenusNameID SpeciesNameID SubspeciesNameID LastUpdate ModifiedBy SuitableForGenus SuitableForSpecies InfrasubspeciesNameID InfrasubKind}
      @csv_string = CSV.generate do |csv|
        csv << @headers
        genus_position = Taxonifi::RANKS.index('genus')
        i = 1
        @name_collection.collection.each do |n|
          # Skip names positioned before 'genus' in Taxonifi::RANKS.
          # NOTE(review): a rank missing from Taxonifi::RANKS makes index return nil and this comparison raise.
          next if Taxonifi::RANKS.index(n.rank) < genus_position
          cols = {
            NomenclatorID: i,
            GenusNameID: @genus_names[n.parent_name_at_rank('genus')] || 0,
            SubgenusNameID: @genus_names[n.parent_name_at_rank('subgenus')] || 0,
            SpeciesNameID: @species_names[n.parent_name_at_rank('species')] || 0,
            SubspeciesNameID: @species_names[n.parent_name_at_rank('subspecies')] || 0,
            InfrasubspeciesNameID: 0,
            InfrasubKind: 0, # this might be wrong
            LastUpdate: @time,
            ModifiedBy: @authorized_user_id,
            SuitableForGenus: 0, # Set in SF
            SuitableForSpecies: 0 # Set in SF
          }
          @nomenclator[n.nomenclator_name] = i
          i += 1
          csv << @headers.map { |h| cols[h.to_sym] }
        end
      end
      @csv_string
    end

  end
end
@@ -0,0 +1,334 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '../taxonifi'))
2
+
3
# The lumper lumps! Tools for recognizing and using
# combinations of column types.
module Taxonifi::Lumper

  # Define groups of columns/fields and include
  # functionality to determine whether your
  # columns match a given set.
  module Lumps
    Dir.glob(File.expand_path(File.join(File.dirname(__FILE__), "lumps/*.rb"))) do |file|
      require file
    end
  end

  class LumperError < StandardError; end

  # Columns used for species epithets.
  # !! Todo: map DwC URIs to these labels (at present they largely correlate with Tokens,
  # perhaps map URIs to tokens!?)
  QUAD = ['genus', 'subgenus', 'species', 'subspecies']

  # Columns representing author and year
  AUTHOR_YEAR = ['author', 'year']

  # A Hash of named column combinations
  LUMPS = {
    quadrinomial: QUAD,
    quad_author_year: QUAD + AUTHOR_YEAR,
    names: Taxonifi::RANKS + AUTHOR_YEAR,
    # Was `RANKS - [QUAD + AUTHOR_YEAR]`: subtracting a one-element array that
    # holds an array removes nothing, so :higher was identical to all ranks.
    higher: Taxonifi::RANKS - (QUAD + AUTHOR_YEAR),
    species: ['species', 'subspecies'],
    genera: ['genus', 'subgenus'],
    citation_basic: %w{authors year title publication volume number pages pg_start pg_end},
    citation_small: %w{authors year title publication volume_number pages},
    basic_geog: %w{country state county}, # add 'continent'
    eol_basic: %w{identifier parent child rank synonyms}
  }

  # Lumps for which all columns are represented.
  # Returns the LUMPS keys whose every column is present in +columns+.
  # Raises LumperError unless +columns+ is an Array.
  # TODO: This is really an assessor method
  def self.available_lumps(columns)
    raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.available_lumps.' unless columns.is_a?(Array)
    LUMPS.keys.select { |k| (LUMPS[k] - columns).empty? }
  end

  # Lumps for which any column is represented.
  # Returns the LUMPS keys sharing at least one column with +columns+.
  # Raises LumperError unless +columns+ is an Array.
  # TODO: This is really an assessor method
  def self.intersecting_lumps(columns)
    raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.intersecting_lumps.' unless columns.is_a?(Array)
    LUMPS.keys.reject { |k| (LUMPS[k] & columns).empty? }
  end

  # Return a Taxonifi::Model::NameCollection from a csv file.
  # Raises LumperError unless +csv+ is a CSV::Table.
  #
  # Columns are processed rank by rank (in the order returned by
  # RowAssessor.rank_headers), and within a rank row by row. A cell yields a
  # new Name unless a Name with the same rank, the same string and the same
  # parent id vector already exists.
  def self.create_name_collection(csv)
    require_csv_table(csv, 'create_name_collection')
    nc = Taxonifi::Model::NameCollection.new

    # The row index contains a vector of parent ids like
    #   [0, 4, 29]
    # This implies that Name with #id 29 has Parent with #id 4
    # Initialize an empty index.
    row_index = Array.new(csv.size) { [] }

    # The name_index keeps track of unique name per rank like
    #   :genus => {'Foo' => [0,2]}
    # This says that "Foo" is instantiated two times in the
    # name collection, with id 0, and id 2.
    name_index = {}

    # First pass, create and index names
    Taxonifi::Assessor::RowAssessor.rank_headers(csv.headers).each do |rank|
      name_index[rank] = {}
      csv.each_with_index do |row, i|
        row_rank = Taxonifi::Assessor::RowAssessor.lump_name_rank(row).to_s # metadata (e.g. author year) apply to this rank

        name = row[rank]
        next if name.nil? # cell has no data

        # Reuse an existing name only when its parents match this row so far.
        name_id = id_with_parents(nc, name_index[rank][name], row_index[i])

        if name_id.nil? # no version of the name exists under these parents
          n = Taxonifi::Model::Name.new
          n.rank = rank
          n.name = name
          n.parent = nc.object_by_id(row_index[i].last) if row_index[i].size > 0 # its parent is the previous id in this row
          n.row_number = i

          # Name/year needs to be standardized / cased out
          # headers are overlapping at times
          if row['author_year'] && row_rank == rank
            builder = Taxonifi::Splitter::Builder.build_author_year(row['author_year'])
            n.author = builder.people
            n.year = builder.year
            n.parens = !builder.parens # NOTE(review): negation kept from the original; confirm against Builder#parens
          end

          name_id = nc.add_object(n).id
          # Add the name to the index of unique names
          (name_index[rank][name] ||= []).push name_id
        end

        # build a by row vector of parent child relationships
        row_index[i].push name_id
      end
    end

    nc
  end

  # Return a Taxonifi::Model::RefCollection from a CSV file.
  # Raises LumperError unless +csv+ is a CSV::Table.
  #
  # Only rows for which RowAssessor reports :citation_small data are used.
  # References with an identical compact_string are added to the collection
  # once; rc.row_index[i] holds the Ref object that row i resolved to.
  def self.create_ref_collection(csv)
    require_csv_table(csv, 'create_ref_collection')
    rc = Taxonifi::Model::RefCollection.new

    ref_index = {} # compact_string => Ref already added to rc
    csv.each_with_index do |row, i|
      next unless Taxonifi::Assessor::RowAssessor.intersecting_lumps_with_data(row, [:citation_small]).include?(:citation_small)

      r = Taxonifi::Model::Ref.new(
        :year => row['year'],
        :title => row['title'],
        :publication => row['publication']
      )

      # TODO: break out each of these lexes to a builder
      if row['authors'] && !row['authors'].empty?
        lexer = Taxonifi::Splitter::Lexer.new(row['authors'])
        authors = lexer.pop(Taxonifi::Splitter::Tokens::Authors)
        authors.names.each do |a|
          person = Taxonifi::Model::Person.new
          person.last_name = a[:last_name]
          person.initials = a[:initials]
          r.authors.push person
        end
      end

      if row['volume_number'] && !row['volume_number'].empty?
        lexer = Taxonifi::Splitter::Lexer.new(row['volume_number'], :volume_number)
        t = lexer.pop(Taxonifi::Splitter::Tokens::VolumeNumber)
        r.volume = t.volume
        r.number = t.number
      end

      if row['pages'] && !row['pages'].empty?
        # If our regex doesn't match dump the field into pages
        begin
          lexer = Taxonifi::Splitter::Lexer.new(row['pages'], :pages)
          t = lexer.pop(Taxonifi::Splitter::Tokens::Pages)
          r.pg_start = t.pg_start
          r.pg_end = t.pg_end
        rescue
          r.pages = row['pages']
        end
      end

      # Do some indexing.
      # The original stored the Ref for a first occurrence but the integer id
      # for a repeat; row_index now always holds the Ref.
      ref_str = r.compact_string
      unless ref_index.key?(ref_str)
        rc.add_object(r)
        ref_index[ref_str] = r
      end
      rc.row_index[i] = ref_index[ref_str]
    end
    rc
  end

  # Creates a generic Collection with Objects of GenericObject
  # Objects are assigned to parents (rank) according to the order provided in headers.
  # Objects are considered the same if they have the same name and the same parents closure, e.g.
  #
  #   a b c
  #   a b d
  #   e b f
  #
  # Will return 7 objects named in order a,b,c,d,e,b,f
  #
  # a,b b,c b,d e,b b,f are the unique parent/child relationships stored
  #
  # Raises LumperError unless +csv+ is a CSV::Table, or when +headers+ is empty.
  def self.create_hierarchical_collection(csv, headers)
    # The message used to name create_name_collection (copy/paste slip).
    require_csv_table(csv, 'create_hierarchical_collection')
    raise Taxonifi::Lumper::LumperError, 'No headers provided to create_hierarchical_collection.' if headers.empty?

    c = Taxonifi::Model::Collection.new

    # See create_name_collection
    row_index = Array.new(csv.size) { [] }
    name_index = headers.each_with_object({}) { |h, index| index[h] = {} }

    csv.each_with_index do |row, i|
      headers.each do |rank|
        name = row[rank]
        next if name.nil? || name.empty? # cell has no data

        name_id = id_with_parents(c, name_index[rank][name], row_index[i])

        if name_id.nil?
          o = Taxonifi::Model::GenericObject.new
          o.name = name
          o.rank = rank
          o.row_number = i
          o.parent = c.object_by_id(row_index[i].last) if row_index[i].size > 0 # its parent is the previous id in this row

          name_id = c.add_object(o).id
          (name_index[rank][name] ||= []).push name_id
        end

        row_index[i].push name_id
      end
    end
    c
  end

  # Return a geog collection from a csv file.
  # Raises LumperError unless +csv+ is a CSV::Table.
  def self.create_geog_collection(csv)
    require_csv_table(csv, 'create_geog_collection')
    gc = Taxonifi::Model::GeogCollection.new

    # See create_name_collection
    row_index = Array.new(csv.size) { [] }

    geog_headers = Taxonifi::Assessor::RowAssessor.geog_headers(csv.headers)
    name_index = geog_headers.each_with_object({}) { |h, index| index[h] = {} }

    # We don't have the same problems as with taxon names, i.e.
    # boo in
    #   Foo nil boo
    #   Foo bar boo
    # is the same thing wrt geography, not the case for taxon names.
    # We can use a row first loop to build as we go

    csv.each_with_index do |row, i|
      geog_headers.each do |level|
        name = row[level]
        next if name.nil? || name.empty? # cell has no data

        name_id = name_index[level][name]
        if name_id.nil? # first time this name is seen at this level
          g = Taxonifi::Model::Geog.new
          name_id = gc.add_object(g).id # added before its attributes are set, as in the original
          g.name = name
          g.rank = level
          g.parent = gc.object_by_id(row_index[i].last) if row_index[i].size > 0 # its parent is the previous id in this row
          name_index[level][name] = name_id
        end

        row_index[i].push name_id
      end
    end
    gc
  end

  # Raises LumperError unless +csv+ is a CSV::Table. +method_name+ is the
  # public method name reported in the message.
  def self.require_csv_table(csv, method_name)
    return if csv.is_a?(CSV::Table)
    raise Taxonifi::Lumper::LumperError, "Something that is not a CSV::Table was passed to Lumper.#{method_name}."
  end
  private_class_method :require_csv_table

  # Returns the first id in +candidate_ids+ whose parent id vector in
  # +collection+ equals +parent_ids+, or nil when there is none (or when
  # +candidate_ids+ is nil).
  def self.id_with_parents(collection, candidate_ids, parent_ids)
    return nil if candidate_ids.nil?
    candidate_ids.find { |id| collection.parent_id_vector(id) == parent_ids }
  end
  private_class_method :id_with_parents

end # end Lumper Module
334
+