taxonifi 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/.document +5 -0
  2. data/Gemfile +18 -0
  3. data/Gemfile.lock +30 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +155 -0
  6. data/Rakefile +53 -0
  7. data/VERSION +1 -0
  8. data/lib/assessor/assessor.rb +31 -0
  9. data/lib/assessor/base.rb +17 -0
  10. data/lib/assessor/row_assessor.rb +131 -0
  11. data/lib/export/export.rb +9 -0
  12. data/lib/export/format/base.rb +43 -0
  13. data/lib/export/format/species_file.rb +341 -0
  14. data/lib/lumper/lumper.rb +334 -0
  15. data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
  16. data/lib/models/author_year.rb +39 -0
  17. data/lib/models/base.rb +73 -0
  18. data/lib/models/collection.rb +92 -0
  19. data/lib/models/generic_object.rb +15 -0
  20. data/lib/models/geog.rb +59 -0
  21. data/lib/models/geog_collection.rb +28 -0
  22. data/lib/models/name.rb +206 -0
  23. data/lib/models/name_collection.rb +149 -0
  24. data/lib/models/person.rb +49 -0
  25. data/lib/models/ref.rb +85 -0
  26. data/lib/models/ref_collection.rb +106 -0
  27. data/lib/models/species_name.rb +85 -0
  28. data/lib/splitter/builder.rb +26 -0
  29. data/lib/splitter/lexer.rb +70 -0
  30. data/lib/splitter/parser.rb +54 -0
  31. data/lib/splitter/splitter.rb +45 -0
  32. data/lib/splitter/tokens.rb +322 -0
  33. data/lib/taxonifi.rb +36 -0
  34. data/test/file_fixtures/Lygaeoidea.csv +801 -0
  35. data/test/helper.rb +38 -0
  36. data/test/test_exporter.rb +32 -0
  37. data/test/test_lumper_geogs.rb +59 -0
  38. data/test/test_lumper_hierarchical_collection.rb +88 -0
  39. data/test/test_lumper_names.rb +119 -0
  40. data/test/test_lumper_parent_child_name_collection.rb +41 -0
  41. data/test/test_lumper_refs.rb +91 -0
  42. data/test/test_parser.rb +34 -0
  43. data/test/test_splitter.rb +27 -0
  44. data/test/test_splitter_tokens.rb +403 -0
  45. data/test/test_taxonifi.rb +11 -0
  46. data/test/test_taxonifi_accessor.rb +61 -0
  47. data/test/test_taxonifi_geog.rb +51 -0
  48. data/test/test_taxonifi_name.rb +186 -0
  49. data/test/test_taxonifi_name_collection.rb +158 -0
  50. data/test/test_taxonifi_ref.rb +90 -0
  51. data/test/test_taxonifi_ref_collection.rb +69 -0
  52. data/test/test_taxonifi_species_name.rb +95 -0
  53. metadata +167 -0
@@ -0,0 +1,84 @@
1
+ # Handles DwC-esque files (e.g. as dumped by EoL), i.e. a file with columns like:
2
+ # [identifier parent child rank synonyms]
3
+ # Instantiates individual names for all names (including synonym lists) into a NameCollection.
4
+ # See 'test/test_lumper_parent_child_name_collection' for example use.
5
module Taxonifi::Lumper::Lumps::ParentChildNameCollection

  # Build a Taxonifi::Model::NameCollection from a parent/child style table.
  #
  # csv - a table responding to #headers and #each_with_index whose rows are read
  #       through the 'identifier', 'parent', 'child', 'rank' and 'synonyms' columns.
  #
  # Rows with a rank of 'species', or with no rank at all, are parsed as species
  # name strings. Every other row becomes a single Name of the given rank.
  # Each '|' separated entry in 'synonyms' is parsed as a species name string
  # and related to the row's species name (when there is one).
  #
  # Returns the populated NameCollection (ids start at 1).
  # Raises Taxonifi::Lumper::LumperError when the headers do not provide the
  # :eol_basic lump, or when a species name is read before its parent.
  def self.name_collection(csv)
    if !Taxonifi::Lumper.available_lumps(csv.headers).include?(:eol_basic)
      raise Taxonifi::Lumper::LumperError, "CSV does not have the required headers (#{Taxonifi::Lumper::LUMPS[:eol_basic].join(", ")})."
    end

    nc = Taxonifi::Model::NameCollection.new(:initial_id => 1)
    external_index = {} # external identifier => Taxonifi::Model::Name

    csv.each_with_index do |row, i|
      name = row['child']
      rank = row['rank'].downcase if !row['rank'].nil?
      parent_id = row['parent'].to_i
      external_id = row['identifier'].to_i
      valid_species_id = nil

      case rank
      when 'species', nil
        valid_species_id = add_species_names_from_string(nc, name, external_index[parent_id])
        external_index[external_id] = nc.object_by_id(valid_species_id)
      else # Just a single string, we don't have to break anything down.
        parent = external_index[parent_id]
        existing_id = find_existing_name_id(nc, rank, name, parent)

        if existing_id.nil?
          # TODO: No author, year have yet been observed for genus and higher names
          n = Taxonifi::Model::Name.new
          n.rank = rank
          n.name = name
          n.external_id = external_id
          n.row_number = i
          n.parent = parent if parent
          nc.add_object(n)
          external_index[external_id] = n
        else
          # The name is already in the collection: point this row's identifier at it
          # so that rows naming this identifier as their parent still resolve.
          external_index[external_id] = nc.object_by_id(existing_id)
        end
      end

      synonyms = row['synonyms']
      if !synonyms.nil? && synonyms.size > 0
        synonyms.split("|").each do |synonym|
          add_species_names_from_string(nc, synonym, external_index[parent_id], valid_species_id)
        end
      end
    end # end row

    nc
  end

  # Return the id of a name already in nc with this rank, name and immediate
  # parent, or nil when there is none.
  #
  # The immediate parent of an indexed name is the last element of its
  # parent id vector; it is compared with the internal id of parent
  # (nil when the row has no known parent).
  def self.find_existing_name_id(nc, rank, name, parent)
    candidate_ids = nc.by_name_index[rank][name]
    return nil if !candidate_ids

    parent_internal_id = parent.nil? ? nil : parent.id
    candidate_ids.each do |id|
      return id if nc.parent_id_vector(id).last == parent_internal_id
    end
    nil
  end
  private_class_method :find_existing_name_id

  # Add the individual names in a species epithet string. Assumes parents all previously created.
  #
  # nc         - the NameCollection to add to
  # string     - the species name string, handed to Taxonifi::Splitter::Builder.build_species_name
  # parent     - the Name to use as parent of the last name parsed from string
  # synonym_id - optional id (in nc) of the name the new name is related to
  #
  # Only the last name of the parsed species name is added to nc.
  # Returns the id assigned to that name.
  # Raises Taxonifi::Lumper::LumperError when parent is nil.
  def self.add_species_names_from_string(nc, string, parent = nil, synonym_id = nil)
    names = Taxonifi::Splitter::Builder.build_species_name(string) # A Taxonifi::Model::SpeciesName instance
    terminal = names.names.last

    if parent.nil?
      raise Taxonifi::Lumper::LumperError, "Parent of [#{names.names.last.name}] within [#{names.display_name}] not yet instantiated. \n !! To resolve: \n\t 1) If this is not a species name your file may be missing a value in the 'Rank' column (nil values are assumed to be species, all other ranks must be populated). \n\t 2) Parent names must be read before children, check that this is the case."
    end

    terminal.parent = parent
    last_id = nc.add_object(terminal).id
    nc.object_by_id(last_id).related_name = nc.object_by_id(synonym_id) if !synonym_id.nil?
    last_id
  end

end
@@ -0,0 +1,39 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), "../models/base.rb"))
2
+
3
module Taxonifi
  module Model
    # Groups a list of people with a year, plus a flag recording whether
    # the combination was cited in parentheses.
    class AuthorYear < Taxonifi::Model::Base
      # people - Array of Taxonifi::Model::People
      # year   - String
      # parens - true when the combination was cited in parentheses
      attr_accessor :people, :year, :parens

      # options - Hash, recognised keys are :people (default []),
      #           :parens (default false) and :year (default nil).
      def initialize(options = {})
        settings = { :people => [], :parens => false, :year => nil }.merge!(options)

        @parens = settings[:parens]
        @people = settings[:people]
        @year   = settings[:year]
      end

      # Return a String used in indexing: the year followed by the
      # compact_string of every person, joined with "-".
      def compact_index
        parts = [@year]
        @people.each { |person| parts << person.compact_string }
        parts.join("-")
      end

    end
  end
end
39
+
@@ -0,0 +1,73 @@
1
module Taxonifi
  class ModelError < StandardError; end
  module Model

    # A base class for all Taxonifi::Models that represent
    # "individuals" (as opposed to collections of individuals).
    #
    # Subclasses are expected to provide an ATTRIBUTES constant (read by
    # #identical?) and a #parent reader (read by #ancestors and #ancestor_ids).
    class Base
      # Parent chains longer than this are treated as a loop.
      MAX_ANCESTOR_DEPTH = 100

      # The id of this object (Integer or nil), see #id=.
      attr_reader :id
      # Optionally store the row this came from
      attr_accessor :row_number
      # Optionally store an id representing the original id used for this record.
      attr_accessor :external_id

      # Assign every attribute listed in attributes from opts, through the
      # matching writer. Attributes whose value in opts is nil are skipped.
      # !! Check validity prior to building.
      def build(attributes, opts)
        attributes.each do |c|
          self.send("#{c}=", opts[c]) if !opts[c].nil?
        end
      end

      # Set the id. Only nil or an Integer is accepted.
      # Raises Taxonifi::ModelError otherwise.
      def id=(id)
        # Integer instead of Fixnum: Fixnum is deprecated and absent from current Rubies.
        raise Taxonifi::ModelError, "Base model objects must have Integer ids." if !id.nil? && !id.is_a?(Integer)
        @id = id
      end

      # The ids only of ancestors.
      # Immediate ancestor id is in [].last
      # Raises Taxonifi::ModelError when the parent chain loops.
      def ancestor_ids
        ancestors.map { |a| a.id }
      end

      # Ancestor objects for subclasses
      # that have a parent property, root first, immediate parent last.
      # Raises Taxonifi::ModelError when more than MAX_ANCESTOR_DEPTH
      # parents are followed (taken to mean the chain loops).
      # TODO: check for parent attributes
      def ancestors
        chain = []
        depth = 0 # check for recursion
        node = parent
        while !node.nil?
          chain.unshift node
          node = node.parent
          depth += 1
          raise Taxonifi::ModelError, "Infinite recursion in parent string detected for Base model object #{id}." if depth > MAX_ANCESTOR_DEPTH
        end
        chain
      end

      # Determines identity based ONLY
      # on attributes in ATTRIBUTES (:id is never compared).
      # Raises Taxonifi::ModelError when obj is of a different class.
      def identical?(obj)
        raise Taxonifi::ModelError, "Objects are not comparible." if obj.class != self.class
        self.class::ATTRIBUTES.each do |a|
          next if a == :id # don't compare
          return false if obj.send(a) != self.send(a)
        end
        return true
      end

    end
  end
end
@@ -0,0 +1,92 @@
1
module Taxonifi
  class CollectionError < StandardError; end
  module Model

    # The base class that all collection classes are derived from.
    class Collection
      # Hash of id => object
      attr_accessor :by_id_index
      # The id that the next call to #add_object will assign
      attr_accessor :current_free_id
      # Array of all objects, in the order they were added
      attr_accessor :collection

      # options - Hash, :initial_id (default 0) is the first id handed out by #add_object.
      # Raises CollectionError when :initial_id is nil.
      def initialize(options = {})
        opts = {
          :initial_id => 0
        }.merge!(options)
        raise CollectionError, "Can not start with an initial_id of nil." if opts[:initial_id].nil?
        @collection = []
        @by_id_index = {}
        @current_free_id = opts[:initial_id]
        true
      end

      # Define the default class. Over-ridden in
      # specific collections.
      def object_class
        Taxonifi::Model::GenericObject
      end

      # Return an object in this collection by id (nil when there is none).
      def object_by_id(id)
        @by_id_index[id]
      end

      # Add an object to the collection, assigning it the next free id.
      # Returns the object.
      # Raises CollectionError when the object already has an id, or is not
      # an instance of #object_class.
      def add_object(obj)
        raise CollectionError, "#{object_class}#id may not be pre-initialized if used with #add_object, consider using #add_object_pre_indexed." if !obj.id.nil?
        object_is_allowed?(obj)
        obj.id = @current_free_id.to_i
        @current_free_id += 1
        @collection.push(obj)
        @by_id_index[obj.id] = obj
        return obj
      end

      # Add an object without setting its ID.
      # Returns the object.
      # Raises CollectionError when the object has no id, or is not
      # an instance of #object_class.
      def add_object_pre_indexed(obj)
        object_is_allowed?(obj)
        raise CollectionError, "#{object_class} does not have a pre-indexed id." if obj.id.nil?
        @collection.push(obj)
        @by_id_index[obj.id] = obj
        return obj
      end

      # Return an array of ancestor (parent) ids, root first, immediate parent last.
      # Returns [] when id is not indexed, or the indexed object has no parent.
      # Every ancestor is looked up in this collection's index, so all of
      # them must have been added to the collection.
      # TODO: deprecate?
      # More or less identical to Taxonifi::Name.ancestor_ids except
      # this checks against the indexed names in the collection
      # rather than Name->Name relationships
      # The two should be identical in all(?) conceivable cases
      def parent_id_vector(id = nil)
        vector = []
        return vector if @by_id_index[id].nil? || @by_id_index[id].parent.nil?
        id = @by_id_index[id].parent.id
        while !id.nil?
          vector.unshift id
          if @by_id_index[id].parent
            id = @by_id_index[id].parent.id
          else
            id = nil
          end
        end
        vector
      end

      # Returns an Array which represents
      # all the "root" objects.
      def objects_without_parents
        collection.select { |o| o.parent.nil? }
      end

      protected

      # Check to see that the object can be added to this collection.
      # Returns true, raises CollectionError when obj is not exactly an #object_class.
      def object_is_allowed?(obj)
        raise CollectionError, "#{object_class} not passed to Collection.add_object()." if !(obj.class == object_class)
        true
      end

    end
  end

end
@@ -0,0 +1,15 @@
1
module Taxonifi
  class ModelError < StandardError; end
  module Model

    # A generic object with three freely assignable properties:
    #   name   - String
    #   parent - parent object, same class as self
    #   rank   - String, arbitrarily assignable rank
    class GenericObject < Base
      attr_accessor :name, :parent, :rank
    end
  end
end
@@ -0,0 +1,59 @@
1
module Taxonifi
  class GeogError < StandardError; end
  module Model

    # An instance of some geopolitical unit.
    # Not fully developed yet.
    class Geog < Taxonifi::Model::Base

      # require 'geokit'
      # include Geokit::Geocoders

      # Allowed ranks. A parent must sit at a lower index than its child (see #parent=).
      GEOG_RANKS = ['country', 'state', 'county']
      # Properties assignable in Taxonifi::Model::Geog#new()
      ATTRIBUTES = [:name, :rank, :parent]

      # String
      attr_accessor :name
      # String, one of GEOG_RANKS, see #rank=
      attr_reader :rank
      # A Taxonifi::Model::Geog, see #parent=
      attr_reader :parent

      # options - Hash with any of the ATTRIBUTES as keys.
      def initialize(options = {})
        opts = {
        }.merge!(options)
        @parent = nil
        build(ATTRIBUTES - [:parent], opts)
        # NOTE(review): the parent is assigned directly here, so the rank
        # checks in #parent= are not applied to it - confirm this is intended.
        @parent = opts[:parent] if (!opts[:parent].nil? && opts[:parent].class == Taxonifi::Model::Geog)
        true
      end

      # Set the "rank" of this geographic unit.
      # The value is stringified, downcased and stripped before it is checked.
      # Raises GeogError when it is not in GEOG_RANKS.
      def rank=(rank)
        r = rank.to_s.downcase.strip
        if !GEOG_RANKS.include?(r)
          raise GeogError, "#{r} is not a valid rank."
        end
        @rank = r
      end

      # Set parent of this rank (also a Taxonifi::Model::Geog instance).
      # Raises GeogError when parent is nil, when either rank is unset, when
      # parent is not a Geog, or when parent is of the same or a lower rank.
      def parent=(parent)
        if parent.nil?
          raise GeogError, "Parent can't be set to nil in Taxonifi::Model::Geog."
        end

        if @rank.nil?
          raise Taxonifi::GeogError, "Parent of geog can not be set if rank of child is not set."
        end

        if parent.class != Taxonifi::Model::Geog
          raise GeogError, "Parent is not a Taxonifi::Model::Geog."
        end

        # Without a rank the parent has no position in GEOG_RANKS to compare.
        if parent.rank.nil?
          raise GeogError, "Parent of geog can not be set if rank of parent is not set."
        end

        if GEOG_RANKS.index(parent.rank) >= GEOG_RANKS.index(self.rank)
          raise GeogError, "Parent is same or lower rank than self (#{rank})."
        end

        @parent = parent
      end

    end
  end
end
@@ -0,0 +1,28 @@
1
module Taxonifi
  class GeogCollectionError < StandardError; end
  module Model

    # A collection whose members are Taxonifi::Model::Geog objects.
    # TODO: Consider moving the row index to the base collection (though this doesn't
    # always make sense).
    class GeogCollection < Taxonifi::Model::Collection
      # Array, read by #object_from_row with a row number as index
      attr_accessor :row_index

      # options - passed through unchanged to the parent class.
      def initialize(options = {})
        super(options)
        @row_index = []
        true
      end

      # The class of object this collection holds.
      def object_class
        Taxonifi::Model::Geog
      end

      # Return the entry stored in the row index for row_number.
      def object_from_row(row_number)
        @row_index[row_number]
      end

    end
  end
end
@@ -0,0 +1,206 @@
1
+ module Taxonifi
2
+ class NameError < StandardError; end
3
+
4
+ # A taxonomic name.
5
+ class Model::Name < Taxonifi::Model::Base
6
+
7
+ # String
8
+ attr_accessor :name
9
+ # String
10
+ attr_accessor :rank
11
+ # String, authors as originally read
12
+ attr_accessor :year
13
+ # Boolean, true if parens present (i.e. _not_ in original combination)
14
+ attr_accessor :parens
15
+ # A Taxonifi::Model::Name
16
+ attr_accessor :parent
17
+ # String
18
+ attr_accessor :author
19
+ # General purpose relationship, typically used to indicate synonymy. A Taxonifi::Model::Name
20
+ attr_accessor :related_name
21
+
22
+ # Array, contains properties assignable in Taxonifi::Model::Name#new()
23
+ ATTRIBUTES = [:name, :rank, :year, :parens, :parent, :author, :related_name]
24
+
25
+ ATTRIBUTES.each do |a|
26
+ attr_accessor a
27
+ end
28
+
29
+ # optionally parsed/index
30
+ attr_accessor :authors
31
+
32
+ # optionally parsed/index
33
+ attr_accessor :author_year_index
34
+
35
+ def initialize(options = {})
36
+ opts = {
37
+ id: nil
38
+ }.merge!(options)
39
+ @parent = nil
40
+ build(ATTRIBUTES, opts)
41
+ add_author_year(opts[:author_year]) if !opts[:author_year].nil? && opts[:author_year].size > 0
42
+ @parent = opts[:parent] if (!opts[:parent].nil? && opts[:parent].class == Taxonifi::Model::Name)
43
+ @id = opts[:id] # if !opts[:id].nil? && opts[:id].size != 0
44
+ @authors ||= []
45
+ true
46
+ end
47
+
48
+ # Returns an Array of Taxonifi::Model::Person
49
+ def add_author_year(string) # :yields: Array of Taxonifi::Model::Person
50
+ auth_yr = Taxonifi::Splitter::Builder.build_author_year(string)
51
+ @year = auth_yr.year
52
+ @authors = auth_yr.people
53
+ end
54
+
55
+ # Translates the String representation of author year to an Array of People.
56
+ # Used in indexing, when comparing Name microtations to Ref microcitations.
57
+ def derive_authors_year
58
+ add_author_year(author_year_string)
59
+ end
60
+
61
+ # Set the rank.
62
+ def rank=(rank)
63
+ r = rank.to_s.downcase.strip
64
+ if !RANKS.include?(r)
65
+ raise NameError, "#{r} is not a valid rank."
66
+ end
67
+ @rank = r
68
+ end
69
+
70
+ # Set the parent (a Taxonifi::Model::Name)
71
+ def parent=(parent)
72
+ if @rank.nil?
73
+ raise Taxonifi::NameError, "Parent of name can not be set if rank of child is not set."
74
+ end
75
+
76
+ # TODO: ICZN class over-ride
77
+ if parent.class != Taxonifi::Model::Name
78
+ raise NameError, "Parent is not a Taxonifi::Model::Name."
79
+ end
80
+
81
+ if RANKS.index(parent.rank) >= RANKS.index(self.rank)
82
+ raise NameError, "Parent is same or lower rank than self (#{rank})."
83
+ end
84
+
85
+ @parent = parent
86
+ end
87
+
88
+ # Returns a formatted string, including parens for the name
89
+ # TODO: rename to reflect parens
90
+ def author_year
91
+ au = author_year_string
92
+ if self.parens == false
93
+ "(#{au})"
94
+ else
95
+ au.size == 0 ? nil : au
96
+ end
97
+ end
98
+
99
+ # Return the author year string.
100
+ def author_year_string
101
+ au = [self.author, self.year].compact.join(", ")
102
+ end
103
+
104
+ # Return the name of a parent at a given rank.
105
+ # TODO: move method to Base?
106
+ def parent_name_at_rank(rank)
107
+ return self.name if self.rank == rank
108
+ p = @parent
109
+ i = 0
110
+ while !p.nil?
111
+ return p.name if p.rank == rank
112
+ p = p.parent
113
+ i+= 1
114
+ raise NameError, "Loop detected among parents for [#{self.display_name}]." if i > 75
115
+ end
116
+ nil
117
+ end
118
+
119
+ # Return the parent at a given rank.
120
+ # TODO: move method to Base?
121
+ def parent_at_rank(rank)
122
+ return self if self.rank == rank
123
+ p = @parent
124
+ i = 0
125
+ while !p.nil?
126
+ return p if p.rank == rank
127
+ p = p.parent
128
+ raise NameError, "Loop detected among parents fo [#{self.display_name}]" if i > 75
129
+ end
130
+ nil
131
+ end
132
+
133
+ # Return the human readable version of this name with author year (String)
134
+ def display_name
135
+ [nomenclator_name, author_year].compact.join(" ")
136
+ end
137
+
138
+ # Return the human readable version of this name, without author year (String)
139
+ def nomenclator_name
140
+ case @rank
141
+ when 'species', 'subspecies'
142
+ [parent_name_at_rank('genus'), (parent_name_at_rank('subgenus') ? "({parent_name_at_rank('subgenus')})" : nil), parent_name_at_rank('species'), @name].compact.join(" ")
143
+ when 'subgenus'
144
+ [parent_name_at_rank('genus'), "(#{@name})"].compact.join(" ")
145
+ else
146
+ [@name].compact.join(" ")
147
+ end
148
+ end
149
+
150
+ # Return a dashed "vector" of ids representing the ancestor parent closure, like:
151
+ # 0-1-14-29g-45s-99-100.
152
+ # Postfixed g means "genus", postifed s means "subgenus. As per SpecieFile usage.
153
+ def parent_ids_sf_style
154
+ ids = []
155
+ (ancestors.push self).each do |a|
156
+ case a.rank
157
+ when 'genus'
158
+ ids.push "#{a.id}g"
159
+ when 'subgenus'
160
+ ids.push "#{a.id}s"
161
+ else
162
+ ids.push a.id.to_s
163
+ end
164
+ end
165
+
166
+ ids.join("-")
167
+ end
168
+
169
+ # Return names indexed by author_year.
170
+ def author_year_index
171
+ @author_year_index ||= generate_author_year_index
172
+ end
173
+
174
+ # Generate/return the author year index.
175
+ def generate_author_year_index
176
+ @author_year_index = Taxonifi::Model::AuthorYear.new(people: @authors, year: @year).compact_index
177
+ end
178
+
179
+ end
180
+
181
+ # ICZN specific sublassing of a taxonomic name.
182
+ # !! Minimally tested and not broadly implmented.
183
+ class Model::IcznName < Taxonifi::Model::Name
184
+ def initialize
185
+ super
186
+ end
187
+
188
+ # Set the name, checks for family group restrictions.
189
+ def name=(name)
190
+ case @rank
191
+ when 'superfamily'
192
+ raise NameError, "ICZN superfamily name does not end in 'oidae'." if name[-5,5] != 'oidae'
193
+ when 'family'
194
+ raise NameError, "ICZN family name does not end in 'idae'." if name[-4,4] != 'idae'
195
+ when 'subfamily'
196
+ raise NameError, "ICZN subfamily name does not end in 'inae'." if name[-4,4] != 'inae'
197
+ when 'tribe'
198
+ raise NameError, "ICZN tribe name does not end in 'ini'." if name[-3,3] != 'ini'
199
+ when 'subtribe'
200
+ raise NameError, "ICZN subtribe name does not end in 'ina'." if name[-3,3] != 'ina'
201
+ end
202
+ @name = name
203
+ end
204
+ end
205
+
206
+ end