taxonifi 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/.document +5 -0
  2. data/Gemfile +18 -0
  3. data/Gemfile.lock +30 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +155 -0
  6. data/Rakefile +53 -0
  7. data/VERSION +1 -0
  8. data/lib/assessor/assessor.rb +31 -0
  9. data/lib/assessor/base.rb +17 -0
  10. data/lib/assessor/row_assessor.rb +131 -0
  11. data/lib/export/export.rb +9 -0
  12. data/lib/export/format/base.rb +43 -0
  13. data/lib/export/format/species_file.rb +341 -0
  14. data/lib/lumper/lumper.rb +334 -0
  15. data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
  16. data/lib/models/author_year.rb +39 -0
  17. data/lib/models/base.rb +73 -0
  18. data/lib/models/collection.rb +92 -0
  19. data/lib/models/generic_object.rb +15 -0
  20. data/lib/models/geog.rb +59 -0
  21. data/lib/models/geog_collection.rb +28 -0
  22. data/lib/models/name.rb +206 -0
  23. data/lib/models/name_collection.rb +149 -0
  24. data/lib/models/person.rb +49 -0
  25. data/lib/models/ref.rb +85 -0
  26. data/lib/models/ref_collection.rb +106 -0
  27. data/lib/models/species_name.rb +85 -0
  28. data/lib/splitter/builder.rb +26 -0
  29. data/lib/splitter/lexer.rb +70 -0
  30. data/lib/splitter/parser.rb +54 -0
  31. data/lib/splitter/splitter.rb +45 -0
  32. data/lib/splitter/tokens.rb +322 -0
  33. data/lib/taxonifi.rb +36 -0
  34. data/test/file_fixtures/Lygaeoidea.csv +801 -0
  35. data/test/helper.rb +38 -0
  36. data/test/test_exporter.rb +32 -0
  37. data/test/test_lumper_geogs.rb +59 -0
  38. data/test/test_lumper_hierarchical_collection.rb +88 -0
  39. data/test/test_lumper_names.rb +119 -0
  40. data/test/test_lumper_parent_child_name_collection.rb +41 -0
  41. data/test/test_lumper_refs.rb +91 -0
  42. data/test/test_parser.rb +34 -0
  43. data/test/test_splitter.rb +27 -0
  44. data/test/test_splitter_tokens.rb +403 -0
  45. data/test/test_taxonifi.rb +11 -0
  46. data/test/test_taxonifi_accessor.rb +61 -0
  47. data/test/test_taxonifi_geog.rb +51 -0
  48. data/test/test_taxonifi_name.rb +186 -0
  49. data/test/test_taxonifi_name_collection.rb +158 -0
  50. data/test/test_taxonifi_ref.rb +90 -0
  51. data/test/test_taxonifi_ref_collection.rb +69 -0
  52. data/test/test_taxonifi_species_name.rb +95 -0
  53. metadata +167 -0
@@ -0,0 +1,341 @@
1
+
2
+ module Taxonifi::Export
3
+
4
+ # Dumps tables identical to the existing structure in SpeciesFile.
5
+ # Will only work in the pre Identity world. Will reconfigure
6
+ # as templates for Jim's work after the fact.
7
+ class SpeciesFile < Taxonifi::Export::Base
8
+
9
# Rank name => SpeciesFile RankID, transcribed from tblRanks (5/17/2012).
# IDs ascend from the lowest rank ('subspecies') to the highest ('life'),
# with 'unknown' as the final entry. tblTaxa looks names up here by rank.
SPECIES_FILE_RANKS = {
  'subspecies' => 5,
  'species' => 10,
  'species subgroup' => 11,
  'species group' => 12,
  'species series' => 14,
  'infragenus' => 16,
  'subgenus' => 18,
  'genus' => 20,
  'genus group' => 22,
  'subtribe' => 28,
  'tribe' => 30,
  'supertribe' => 32,
  'infrafamily' => 36,
  'subfamily' => 38,
  'subfamily group' => 39,
  'family' => 40,
  'epifamily' => 41,
  'superfamily' => 42,
  'superfamily group' => 44,
  'subinfraordinal group' => 45,
  'infraorder' => 46,
  'suborder' => 48, # previously 8, which placed suborder below species
  'order' => 50,
  'mirorder' => 51,
  'superorder' => 52,
  'magnorder' => 53,
  'cohort' => 54,
  'supercohort' => 55,
  'infraclass' => 56,
  'subclass' => 58,
  'class' => 60,
  'superclass' => 62,
  'infraphylum' => 66,
  'subphylum' => 68,
  'phylum' => 70,
  'superphylum' => 72,
  'infrakingdom' => 76,
  'subkingdom' => 78,
  'kingdom' => 80,
  'superkingdom' => 82,
  'life' => 90,
  'unknown' => 100
}
54
+
55
+ attr_accessor :name_collection
56
+ attr_accessor :ref_collection
57
+ attr_accessor :author_index
58
+ attr_accessor :genus_names, :species_names, :nomenclator
59
+ attr_accessor :authorized_user_id, :time
60
+
61
+ # MANIFEST order is important
62
+ MANIFEST = %w{tblTaxa tblRefs tblPeople tblRefAuthors tblGenusNames tblSpeciesNames tblNomenclator tblCites}
63
+
64
# Build a SpeciesFile exporter.
#
# options (merged over the defaults below, then forwarded to the superclass constructor):
#   :nc                 - the Taxonifi::Model::NameCollection to export (default: a new, empty one)
#   :export_folder      - default 'species_file'
#   :authorized_user_id - required; written to the ModifiedBy column of the generated tables
#
# Raises Taxonifi::Export::ExportError when :nc is not a NameCollection or
# when :authorized_user_id is nil.
def initialize(options = {})
  opts = {
    :nc => Taxonifi::Model::NameCollection.new,
    :export_folder => 'species_file',
    :authorized_user_id => nil
  }.merge!(options)

  super(opts)

  # The previous guard, `! opts[:nc].class == NameCollection`, parsed as
  # `(!opts[:nc].class) == NameCollection`, which is always false, so it never raised.
  raise Taxonifi::Export::ExportError, 'NameCollection not passed to SpeciesFile export.' unless opts[:nc].is_a?(Taxonifi::Model::NameCollection)
  raise Taxonifi::Export::ExportError, 'You must provide authorized_user_id for species_file export initialization.' if opts[:authorized_user_id].nil?

  @name_collection    = opts[:nc]
  @authorized_user_id = opts[:authorized_user_id]
  @author_index       = {}

  #
  # Careful here, at present we are just generating Reference micro-citations from our names, so the indexing "just works"
  # because it's all internal. There is a strong potential for key collisions if this pipeline is modified to
  # include references external to the initialized name_collection. See also export_references.
  #
  @by_author_reference_index = {}
  @genus_names   = {}
  @species_names = {}
  @nomenclator   = {}

  # One timestamp per exporter instance; reused for the LastUpdate columns.
  @time = Time.now.strftime("%F %T")
end
89
+
90
# Export only the ref_collection. Sidesteps the main name-centric exports.
# Note that this still uses the base @name_collection object as a starting reference,
# it just references @name_collection.ref_collection. So you can do:
#   nc = Taxonifi::Model::NameCollection.new
#   nc.ref_collection = Taxonifi::Model::RefCollection.new
#   etc.
#
# NOTE(review): `options` is accepted but not consulted. The previous body built
# a local hash of defaults ({:starting_ref_id => 0, :starting_author_id => 0})
# that was never merged with `options` nor read afterwards, so it has been removed.
#
# Writes one file per table via write_file; returns the list of table names.
def export_references(options = {})
  configure_folders
  build_author_index

  # order matters
  %w{tblPeople tblRefs tblRefAuthors sqlRefs}.each do |table|
    write_file(table, send(table))
  end
end
110
+
111
# (Re)build @author_index as a Hash of compact_string => author, taken from
# @name_collection.ref_collection.unique_authors.
# Assumes names that are the same are the same person: when two authors share a
# compact_string the later one replaces the earlier one.
# Returns the new index.
def build_author_index
  @author_index = @name_collection.ref_collection.unique_authors.each_with_object({}) do |author, index|
    index[author.compact_string] = author
  end
end
115
+
116
# Run the full name-centric export: after the superclass #export, generate the
# reference collection from the names, index authors and references, collect the
# genus-group and species-group name strings, then write every table listed in
# MANIFEST (in that order) with write_file.
def export()
  super
  @name_collection.generate_ref_collection(1)

  # Give authors unique ids
  @name_collection.ref_collection.uniquify_authors(1)
  build_author_index

  # See notes in #initialize re potential key collisions!
  @by_author_reference_index = @name_collection.ref_collection.collection.each_with_object({}) do |ref, index|
    index[ref.author_year_index] = ref
  end

  # Register every name string with a nil placeholder; the ids are assigned
  # later, when tblGenusNames / tblSpeciesNames are generated.
  [
    ['genus',      @genus_names],
    ['subgenus',   @genus_names],
    ['species',    @species_names],
    ['subspecies', @species_names]
  ].each do |rank, registry|
    @name_collection.names_at_rank(rank).each { |n| registry[n.name] = nil }
  end

  MANIFEST.each do |table|
    write_file(table, send(table))
  end
end
136
+
137
# Generate a tblTaxa CSV string (comma separated), one row per name in
# @name_collection.collection. Sets @headers and @csv_string; returns the string.
def tblTaxa
  @headers = %w{TaxonNameID TaxonNameStr RankID Name Parens AboveID RefID DataFlags AccessCode NameStatus StatusFlags OriginalGenusID LastUpdate ModifiedBy}
  @csv_string = CSV.generate do |csv|
    csv << @headers
    @name_collection.collection.each do |n|
      ref     = @by_author_reference_index[n.author_year_index]
      related = n.related_name # non-nil marks the name as a synonym of `related`
      # Only looked up for names without parens, as before.
      original_genus = n.parens ? nil : n.parent_at_rank('genus')

      above_id =
        if related.nil?
          n.parent ? n.parent.id : 0 # !! SF folks like to pre-populate with zeros
        else
          related.id
        end

      cols = {
        TaxonNameID:     n.id,
        TaxonNameStr:    n.parent_ids_sf_style, # closure -> ends with 1
        RankID:          SPECIES_FILE_RANKS[n.rank],
        Name:            n.name,
        Parens:          (n.parens ? 1 : 0),
        AboveID:         above_id,
        RefID:           (ref ? ref.id : 0),
        DataFlags:       0, # see http://software.speciesfile.org/Design/TaxaTables.aspx#Taxon, a flag populated when data is reviewed, initialize to zero
        AccessCode:      0,
        NameStatus:      (related.nil? ? 0 : 7),      # 0: valid, 7: synonym
        StatusFlags:     (related.nil? ? 0 : 262144), # 0: valid, 262144: jr. synonym
        OriginalGenusID: (original_genus ? original_genus.id : 0), # SF must be pre-configured with 0 filler (this restriction needs to go)
        LastUpdate:      @time,
        ModifiedBy:      @authorized_user_id
      }
      csv << @headers.map { |h| cols[h.to_sym] }
    end
  end
  @csv_string
end
164
+
165
# Generate a tblRefs string: tab-separated, one row per reference in
# @name_collection.ref_collection.collection.
# Sets @headers and @csv_string; returns the string.
def tblRefs
  @headers = %w{RefID ActualYear Title PubID Verbatim}
  @csv_string = CSV.generate(:col_sep => "\t") do |csv|
    csv << @headers
    @name_collection.ref_collection.collection.each do |r|
      cols = {
        RefID:      r.id,
        # An empty String (not nil) so the field is emitted rather than left blank.
        Title:      (r.title.nil? ? '' : r.title),
        PubID:      0, # Careful - assumes you have a pre-generated PubID of Zero in there, PubID table is not included in CSV imports
        ActualYear: r.year,
        Verbatim:   r.full_citation
      }
      csv << @headers.map { |h| cols[h.to_sym] }
    end
  end
  @csv_string
end
183
+
184
# Generate the tblRefs data as SQL: one INSERT per reference in
# @name_collection.ref_collection.collection, wrapped in
# BEGIN TRY / BEGIN TRANSACTION ... COMMIT, with a ROLLBACK in the CATCH block.
# Columns are listed alphabetically; every value is emitted as a single-quoted
# string with embedded single quotes doubled.
# Sets @headers; returns the statements joined with newlines.
# TODO make a standard transaction wrapper
def sqlRefs
  @headers = %w{RefID ActualYear Title PubID Verbatim}
  columns     = @headers.sort # same for every row, so sort once
  column_list = columns.join(",")

  sql = ['BEGIN TRY', 'BEGIN TRANSACTION']
  @name_collection.ref_collection.collection.each do |r|
    cols = {
      RefID:      r.id,
      Title:      (r.title.nil? ? '' : r.title),
      PubID:      0, # Careful - assumes you have a pre-generated PubID of Zero in there, PubID table is not included in CSV imports
      ActualYear: r.year,
      Verbatim:   r.full_citation
    }
    values = columns.map { |h| "'#{cols[h.to_sym].to_s.gsub(/'/, "''")}'" }
    sql << "INSERT INTO tblRefs (#{column_list}) VALUES (#{values.join(",")});"
  end
  sql.concat(['COMMIT', 'END TRY', 'BEGIN CATCH', 'ROLLBACK', 'END CATCH'])
  sql.join("\n")
end
201
+
202
# Generate tblPeople string: comma-separated, one row per entry in @author_index
# (populate it first with build_author_index).
# Sets @headers and @csv_string; returns the string.
def tblPeople
  @headers = %w{PersonID FamilyName GivenNames GivenInitials Suffix Role LastUpdate ModifiedBy}
  @csv_string = CSV.generate do |csv|
    csv << @headers
    @author_index.each_value do |a|
      cols = {
        PersonID:      a.id,
        FamilyName:    a.last_name,
        # The key must match the 'GivenNames' header; it was 'GivenName', which left the column empty.
        GivenNames:    a.first_name,
        GivenInitials: a.initials_string,
        Suffix:        a.suffix,
        Role:          1, # authors
        LastUpdate:    @time,
        ModifiedBy:    @authorized_user_id
      }
      csv << @headers.map { |h| cols[h.to_sym] }
    end
  end
  @csv_string
end
225
+
226
# Generate tblRefAuthors string: comma-separated, one row per (reference, author)
# pair, with SeqNum giving the author's 1-based position within the reference.
# Each author is resolved through @author_index by compact_string, so the
# PersonID written here is the id of the indexed author.
# Sets @headers and @csv_string; returns the string.
def tblRefAuthors
  @headers = %w{RefID PersonID SeqNum AuthorCount LastUpdate ModifiedBy}
  @csv_string = CSV.generate do |csv|
    csv << @headers
    @name_collection.ref_collection.collection.each do |r|
      author_count = r.authors.size
      r.authors.each_with_index do |author, i|
        person = @author_index[author.compact_string]
        cols = {
          RefID:       r.id,
          PersonID:    person.id,
          SeqNum:      i + 1,
          AuthorCount: author_count,
          LastUpdate:  @time,
          ModifiedBy:  @authorized_user_id
        }
        csv << @headers.map { |h| cols[h.to_sym] }
      end
    end
  end
  @csv_string
end
248
+
249
# Generate tblCites string: comma-separated, one row for every name in
# @name_collection.collection that has a reference in @by_author_reference_index
# (names without one are skipped). NomenclatorID is read from @nomenclator.
# Sets @headers and @csv_string; returns the string.
def tblCites
  @headers = %w{TaxonNameID SeqNum RefID NomenclatorID LastUpdate ModifiedBy NewNameStatus CitePages Note TypeClarification CurrentConcept ConceptChange InfoFlags InfoFlagStatus PolynomialStatus}
  @csv_string = CSV.generate do |csv|
    csv << @headers
    @name_collection.collection.each do |n|
      ref = @by_author_reference_index[n.author_year_index]
      next if ref.nil?

      cols = {
        TaxonNameID:       n.id,
        SeqNum:            1,
        RefID:             ref.id,
        NomenclatorID:     @nomenclator[n.nomenclator_name],
        LastUpdate:        @time,
        ModifiedBy:        @authorized_user_id,
        CitePages:         '', # an empty String is emitted as "" in the CSV
        NewNameStatus:     0,
        Note:              '',
        TypeClarification: 0, # We might derive more data from this
        CurrentConcept:    1, # Boolean, right?
        ConceptChange:     0, # Unspecified
        InfoFlags:         0,
        InfoFlagStatus:    1, # 1 => needs review
        PolynomialStatus:  0
      }
      csv << @headers.map { |h| cols[h.to_sym] }
    end
  end
  @csv_string
end
279
+
280
# Generate tblGenusNames string (see csv_for_genus_and_species_names_tables).
def tblGenusNames
  csv_for_genus_and_species_names_tables('Genus')
end

# Generate tblSpeciesNames string (see csv_for_genus_and_species_names_tables).
def tblSpeciesNames
  csv_for_genus_and_species_names_tables('Species')
end

# Shared generator for tblGenusNames and tblSpeciesNames.
#
# type is 'Genus' or 'Species'; it selects the id column ("#{type}NameID") and
# the registry read through the matching reader (genus_names / species_names).
# Every name string in that registry is numbered 1..n in hash order, and the
# number is written back into the registry so later tables (tblNomenclator)
# can look ids up by name.
# Sets @headers and @csv_string; returns the comma-separated string.
def csv_for_genus_and_species_names_tables(type)
  col = "#{type}NameID"
  @headers = [col, "Name", "LastUpdate", "ModifiedBy", "Italicize"]
  registry = send("#{type.downcase}_names")
  @csv_string = CSV.generate do |csv|
    csv << @headers
    registry.keys.each_with_index do |name, i|
      name_id = i + 1
      registry[name] = name_id
      cols = {
        col.to_sym => name_id,
        Name:       name,
        LastUpdate: @time,
        ModifiedBy: @authorized_user_id,
        Italicize:  1 # always true for these data
      }
      csv << @headers.map { |h| cols[h.to_sym] }
    end
  end
  @csv_string
end
310
+
311
# Generate tblNomenclator string: comma-separated, one row per name whose rank
# is not positioned before 'genus' in Taxonifi::RANKS. NomenclatorIDs are
# assigned sequentially from 1 and recorded in @nomenclator under the name's
# nomenclator_name (tblCites reads them from there).
#
# Must be called post tblGenusNames and tblSpeciesNames: the name ids are read
# from @genus_names / @species_names, and anything not found there becomes 0.
# Sets @headers and @csv_string; returns the string.
def tblNomenclator
  @headers = %w{NomenclatorID GenusNameID SubgenusNameID SpeciesNameID SubspeciesNameID LastUpdate ModifiedBy SuitableForGenus SuitableForSpecies InfrasubspeciesNameID InfrasubKind}
  genus_position = Taxonifi::RANKS.index('genus') # loop invariant
  @csv_string = CSV.generate do |csv|
    csv << @headers
    nomenclator_id = 1
    @name_collection.collection.each do |n|
      next if Taxonifi::RANKS.index(n.rank) < genus_position

      cols = {
        NomenclatorID:         nomenclator_id,
        GenusNameID:           @genus_names[n.parent_name_at_rank('genus')] || 0,
        SubgenusNameID:        @genus_names[n.parent_name_at_rank('subgenus')] || 0,
        SpeciesNameID:         @species_names[n.parent_name_at_rank('species')] || 0,
        SubspeciesNameID:      @species_names[n.parent_name_at_rank('subspecies')] || 0,
        InfrasubspeciesNameID: 0,
        InfrasubKind:          0, # this might be wrong
        LastUpdate:            @time,
        ModifiedBy:            @authorized_user_id,
        SuitableForGenus:      0, # Set in SF
        SuitableForSpecies:    0  # Set in SF
      }
      @nomenclator[n.nomenclator_name] = nomenclator_id
      nomenclator_id += 1
      csv << @headers.map { |h| cols[h.to_sym] }
    end
  end
  @csv_string
end
339
+
340
+ end
341
+ end
@@ -0,0 +1,334 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '../taxonifi'))
2
+
3
+ # The lumper lumps! Tools for recognizing and using
4
+ # combinations of column types.
5
+ module Taxonifi::Lumper
6
+
7
# Define groups of columns/fields and include
# functionality to determine whether your
# columns match a given set.
#
# Requires every .rb file found in the lumps/ directory next to this file.
module Lumps
  # Sorted so the load order does not depend on the platform's directory order.
  Dir.glob(File.expand_path('lumps/*.rb', File.dirname(__FILE__))).sort.each do |file|
    require file
  end
end
15
+
16
# Raised by the Lumper methods when they are handed input of the wrong type.
class LumperError < StandardError; end

# Columns used for species epithets.
# !! Todo: map DwC URIs to these labels (at present they largely correlate with Tokens,
# perhaps map URIs to tokens!?)
QUAD = ['genus', 'subgenus', 'species', 'subspecies']

# Columns representing author and year
AUTHOR_YEAR = ['author', 'year']

# A Hash of named column combinations
LUMPS = {
  quadrinomial:     QUAD,
  quad_author_year: QUAD + AUTHOR_YEAR,
  names:            Taxonifi::RANKS + AUTHOR_YEAR,
  # Previously `Taxonifi::RANKS - [QUAD + AUTHOR_YEAR]`: that subtracts a single
  # nested Array, which matches no rank String, so nothing was ever removed.
  higher:           Taxonifi::RANKS - (QUAD + AUTHOR_YEAR),
  species:          ['species', 'subspecies'],
  genera:           ['genus', 'subgenus'],
  citation_basic:   %w{authors year title publication volume number pages pg_start pg_end},
  citation_small:   %w{authors year title publication volume_number pages},
  basic_geog:       %w{country state county}, # add 'continent'
  eol_basic:        %w{identifier parent child rank synonyms}
}
39
+
40
# Lumps for which all columns are represented.
# Returns the LUMPS keys whose column lists are fully contained in `columns`.
# Raises Taxonifi::Lumper::LumperError unless `columns` is an Array.
# TODO: This is really an assessor method
def self.available_lumps(columns)
  raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.available_lumps.' unless columns.is_a?(Array)
  LUMPS.keys.select { |k| (LUMPS[k] - columns).empty? }
end
46
+
47
# Lumps for which any column is represented.
# Returns the LUMPS keys whose column lists share at least one entry with `columns`.
# Raises Taxonifi::Lumper::LumperError unless `columns` is an Array.
# TODO: This is really an assessor method
def self.intersecting_lumps(columns)
  raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.intersecting_lumps.' unless columns.is_a?(Array)
  LUMPS.keys.reject { |k| (LUMPS[k] & columns).empty? }
end
57
+
58
# Return a Taxonifi::Model::NameCollection from a csv file.
#
# csv must be a CSV::Table, otherwise Taxonifi::Lumper::LumperError is raised.
#
# The table is walked rank by rank (the rank columns reported by
# RowAssessor.rank_headers), and row by row within each rank. A cell creates a
# new Name unless a Name with the same string already exists at that rank with
# an identical vector of parent ids. Author/year data from the 'author_year'
# column is attached only to the Name at the rank RowAssessor.lump_name_rank
# reports for the row.
#
# NOTE(review): only nil cells are skipped here; create_hierarchical_collection
# and create_geog_collection also skip empty strings. Confirm whether an empty
# string should create a Name.
def self.create_name_collection(csv)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_name_collection.' unless csv.is_a?(CSV::Table)
  nc = Taxonifi::Model::NameCollection.new

  # The row index contains a vector of parent ids like
  #   [0, 4, 29]
  # This implies that Name with #id 29 has Parent with #id 4.
  # One (initially empty) vector per row.
  row_index = Array.new(csv.size) { [] }

  # The name_index keeps track of unique name per rank like
  #   :genus => {'Foo' => [0,2]}
  # This says that "Foo" is instantiated two times in the
  # name collection, with id 0, and id 2.
  name_index = {}

  # First pass, create and index names
  Taxonifi::Assessor::RowAssessor.rank_headers(csv.headers).each do |rank|
    name_index[rank] = {}
    csv.each_with_index do |row, i|
      row_rank = Taxonifi::Assessor::RowAssessor.lump_name_rank(row).to_s # metadata (e.g. author year) apply to this rank

      name = row[rank]
      next if name.nil? # cell has no data

      parent_ids = row_index[i] # ids already collected for this row, highest rank first

      # Reuse an existing name only when its parent id vector matches this row's.
      name_id = (name_index[rank][name] || []).find { |id| nc.parent_id_vector(id) == parent_ids }

      if name_id.nil?
        n = Taxonifi::Model::Name.new
        n.rank = rank
        n.name = name
        n.parent = nc.object_by_id(parent_ids.last) unless parent_ids.empty? # its parent is the previous id in this row
        n.row_number = i

        # Name/year needs to be standardized / cased out
        # headers are overlapping at times
        if row['author_year'] && row_rank == rank
          builder = Taxonifi::Splitter::Builder.build_author_year(row['author_year'])
          n.author = builder.people
          n.year = builder.year
          n.parens = !builder.parens
        end

        name_id = nc.add_object(n).id
        # Add the name to the index of unique names
        (name_index[rank][name] ||= []) << name_id
      end

      # build a by row vector of parent child relationships
      parent_ids << name_id
    end
  end

  nc
end
144
+
145
# Return a Taxonifi::Model::RefCollection from a CSV file.
#
# csv must be a CSV::Table, otherwise Taxonifi::Lumper::LumperError is raised.
#
# Only rows for which RowAssessor reports data in the :citation_small lump
# produce a Ref. Refs are de-duplicated on compact_string: the first occurrence
# is added to the collection, and rc.row_index maps every contributing row
# number to that Ref object (previously rows repeating a reference were mapped
# to the Ref's integer id, while first occurrences were mapped to the Ref itself).
def self.create_ref_collection(csv)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_ref_collection.' unless csv.is_a?(CSV::Table)
  rc = Taxonifi::Model::RefCollection.new

  ref_index = {} # compact_string => Ref already added to rc

  csv.each_with_index do |row, i|
    next unless Taxonifi::Assessor::RowAssessor.intersecting_lumps_with_data(row, [:citation_small]).include?(:citation_small)

    r = Taxonifi::Model::Ref.new(
      :year => row['year'],
      :title => row['title'],
      :publication => row['publication']
    )

    # TODO: break out each of these lexes to a builder
    if row['authors'] && !row['authors'].empty?
      lexer = Taxonifi::Splitter::Lexer.new(row['authors'])
      authors = lexer.pop(Taxonifi::Splitter::Tokens::Authors)
      authors.names.each do |a|
        person = Taxonifi::Model::Person.new
        person.last_name = a[:last_name]
        person.initials = a[:initials]
        r.authors.push person
      end
    end

    if row['volume_number'] && !row['volume_number'].empty?
      lexer = Taxonifi::Splitter::Lexer.new(row['volume_number'], :volume_number)
      t = lexer.pop(Taxonifi::Splitter::Tokens::VolumeNumber)
      r.volume = t.volume
      r.number = t.number
    end

    if row['pages'] && !row['pages'].empty?
      # If our regex doesn't match dump the field into pages.
      # Deliberately best-effort: any StandardError from the lexer falls back to the raw string.
      begin
        lexer = Taxonifi::Splitter::Lexer.new(row['pages'], :pages)
        t = lexer.pop(Taxonifi::Splitter::Tokens::Pages)
        r.pg_start = t.pg_start
        r.pg_end = t.pg_end
      rescue StandardError
        r.pages = row['pages']
      end
    end

    # Do some indexing.
    ref_str = r.compact_string
    unless ref_index.key?(ref_str)
      rc.add_object(r)
      ref_index[ref_str] = r
    end
    rc.row_index[i] = ref_index[ref_str]
  end
  rc
end
204
+
205
# Creates a generic Collection with Objects of GenericObject.
# Objects are assigned to parents (rank) according to the order provided in headers.
# Objects are considered the same if they have the same name and the same parents closure, e.g.
#
#   a b c
#   a b d
#   e b f
#
# Will return 7 objects named in order a,b,c,d,e,b,f
#
# a,b b,c b,d e,b b,f are the unique parent/child relationships stored
#
# csv must be a CSV::Table and headers a non-empty list of column names,
# otherwise Taxonifi::Lumper::LumperError is raised. nil and empty cells are skipped.
def self.create_hierarchical_collection(csv, headers)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_hierarchical_collection.' unless csv.is_a?(CSV::Table)
  raise Taxonifi::Lumper::LumperError, 'No headers provided to create_hierarchical_collection.' if headers.nil? || headers.empty?

  c = Taxonifi::Model::Collection.new

  # Per row: the ids collected so far, in header order
  # (same scheme as create_name_collection).
  row_index = Array.new(csv.size) { [] }

  # header => { name string => [ids of objects carrying that name] }
  name_index = {}
  headers.each { |h| name_index[h] = {} }

  csv.each_with_index do |row, i|
    parent_ids = row_index[i]
    headers.each do |rank|
      name = row[rank]
      next if name.nil? || name.empty? # cell has no data

      # Reuse an existing object only when its parent id vector matches this row's.
      name_id = (name_index[rank][name] || []).find { |id| c.parent_id_vector(id) == parent_ids }

      if name_id.nil?
        o = Taxonifi::Model::GenericObject.new
        o.name = name
        o.rank = rank
        o.row_number = i
        o.parent = c.object_by_id(parent_ids.last) unless parent_ids.empty? # its parent is the previous id in this row

        name_id = c.add_object(o).id
        (name_index[rank][name] ||= []) << name_id
      end

      parent_ids << name_id
    end
  end
  c
end
278
+
279
# Return a geog collection from a csv file.
#
# csv must be a CSV::Table, otherwise Taxonifi::Lumper::LumperError is raised.
# The columns used are those reported by RowAssessor.geog_headers.
# nil and empty cells are skipped.
def self.create_geog_collection(csv)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_geog_collection.' unless csv.is_a?(CSV::Table)
  gc = Taxonifi::Model::GeogCollection.new

  # Per row: the ids collected so far, in header order
  # (same scheme as create_name_collection).
  row_index = Array.new(csv.size) { [] }

  # level => { name string => id }; a name is unique within its level.
  name_index = {}
  geog_headers = Taxonifi::Assessor::RowAssessor.geog_headers(csv.headers)
  geog_headers.each { |h| name_index[h] = {} }

  # We don't have the same problems as with taxon names, i.e.
  # boo in
  #   Foo nil boo
  #   Foo bar boo
  # is the same thing wrt geography, not the case for taxon names.
  # We can use a row first loop to build as we go

  csv.each_with_index do |row, i|
    parent_ids = row_index[i]
    geog_headers.each do |level|
      name = row[level]
      next if name.nil? || name.empty? # cell has no data

      name_id = name_index[level][name]
      unless name_id
        # The object is added to the collection before its attributes are set,
        # matching the original order of operations.
        g = Taxonifi::Model::Geog.new
        name_id = gc.add_object(g).id
        g.name = name
        g.rank = level
        g.parent = gc.object_by_id(parent_ids.last) unless parent_ids.empty? # its parent is the previous id in this row
        name_index[level][name] = name_id
      end

      parent_ids << name_id
    end
  end
  gc
end
332
+
333
+ end # end Lumper Module
334
+