taxonifi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. data/.document +5 -0
  2. data/Gemfile +18 -0
  3. data/Gemfile.lock +30 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +155 -0
  6. data/Rakefile +53 -0
  7. data/VERSION +1 -0
  8. data/lib/assessor/assessor.rb +31 -0
  9. data/lib/assessor/base.rb +17 -0
  10. data/lib/assessor/row_assessor.rb +131 -0
  11. data/lib/export/export.rb +9 -0
  12. data/lib/export/format/base.rb +43 -0
  13. data/lib/export/format/species_file.rb +341 -0
  14. data/lib/lumper/lumper.rb +334 -0
  15. data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
  16. data/lib/models/author_year.rb +39 -0
  17. data/lib/models/base.rb +73 -0
  18. data/lib/models/collection.rb +92 -0
  19. data/lib/models/generic_object.rb +15 -0
  20. data/lib/models/geog.rb +59 -0
  21. data/lib/models/geog_collection.rb +28 -0
  22. data/lib/models/name.rb +206 -0
  23. data/lib/models/name_collection.rb +149 -0
  24. data/lib/models/person.rb +49 -0
  25. data/lib/models/ref.rb +85 -0
  26. data/lib/models/ref_collection.rb +106 -0
  27. data/lib/models/species_name.rb +85 -0
  28. data/lib/splitter/builder.rb +26 -0
  29. data/lib/splitter/lexer.rb +70 -0
  30. data/lib/splitter/parser.rb +54 -0
  31. data/lib/splitter/splitter.rb +45 -0
  32. data/lib/splitter/tokens.rb +322 -0
  33. data/lib/taxonifi.rb +36 -0
  34. data/test/file_fixtures/Lygaeoidea.csv +801 -0
  35. data/test/helper.rb +38 -0
  36. data/test/test_exporter.rb +32 -0
  37. data/test/test_lumper_geogs.rb +59 -0
  38. data/test/test_lumper_hierarchical_collection.rb +88 -0
  39. data/test/test_lumper_names.rb +119 -0
  40. data/test/test_lumper_parent_child_name_collection.rb +41 -0
  41. data/test/test_lumper_refs.rb +91 -0
  42. data/test/test_parser.rb +34 -0
  43. data/test/test_splitter.rb +27 -0
  44. data/test/test_splitter_tokens.rb +403 -0
  45. data/test/test_taxonifi.rb +11 -0
  46. data/test/test_taxonifi_accessor.rb +61 -0
  47. data/test/test_taxonifi_geog.rb +51 -0
  48. data/test/test_taxonifi_name.rb +186 -0
  49. data/test/test_taxonifi_name_collection.rb +158 -0
  50. data/test/test_taxonifi_ref.rb +90 -0
  51. data/test/test_taxonifi_ref_collection.rb +69 -0
  52. data/test/test_taxonifi_species_name.rb +95 -0
  53. metadata +167 -0
data/lib/splitter/tokens.rb ADDED
@@ -0,0 +1,322 @@
+ #
+ # Tokens are simple classes that return a regular expression (pattern to match).
+ # You should write a test in test_splitter_tokens.rb before defining a token.
+ # Remember to register tokens in the lists at the bottom of this file.
+ #
+ module Taxonifi::Splitter::Tokens
+ 
+   class Token
+     # This allows access to the class attribute regexp, without using a class variable.
+     class << self
+       attr_reader :regexp
+     end
+ 
+     attr_reader :value, :flag
+     def initialize(str)
+       @value = str
+     end
+   end
+ 
+   class Year < Token
+     @regexp = Regexp.new(/\A\s*(\d\d\d\d)\s*/i)
+   end
+ 
+   class LeftParen < Token
+     @regexp = Regexp.new(/\A\s*(\()\s*/i)
+   end
+ 
+   class RightParen < Token
+     @regexp = Regexp.new(/\A\s*(\))\s*/i)
+   end
+ 
+   class Comma < Token
+     @regexp = Regexp.new(/\A\s*(\,)\s*/i)
+   end
+ 
+   # A token to match an author-year combination; breaks
+   # the string into three parts.
+   class AuthorYear < Token
+     attr_reader :authors, :year, :parens
+     # This is going to hit just about everything, so it should only be used
+     # in one-off cases when you know you have such a string.
+     @regexp = Regexp.new(/\A\s*(\(?[^\+\d)]+(\d\d\d\d)?\)?)\s*/i)
+ 
+     def initialize(str)
+       str.strip!
+       # check for parens
+       if str =~ /\((.*)\)/
+         w = $1
+         @parens = true
+       else
+         w = str
+         @parens = false
+       end
+       # check for year
+       if w =~ /(\d\d\d\d)\Z/
+         @year = $1.to_i
+         w.gsub!(/\d\d\d\d\Z/, "")
+         w.strip!
+       end
+       w.gsub!(/,\s*\Z/, '')
+       @authors = w.strip
+       true
+     end
+   end
+ 
+   # Complex breakdown of author strings. Handles
+   # a wide variety of formats.
+   # See test_splitter_tokens.rb for scope. As with
+   # AuthorYear this will match just about anything when used alone.
+   # Add exceptions at will, just test using TestSplitterTokens#test_authors.
+   # TODO: Unicode the [a-z] bits?
+   class Authors < Token
+     attr_reader :names
+     @regexp = Regexp.new(/\A\s*([^\d]+)\s*/i)
+ 
+     def initialize(input)
+       str = input
+       @names = []
+       str.strip!
+       naked_and = false # look for the pattern 'Foo, Bar and Smith', i.e. no initials
+       individuals = []
+       last_individual = nil
+ 
+       # We can simplify if there is an "and" or "&"
+       if str =~ /(\s+and\s+|\&)/i
+         l, r = str.split(/\s+\,?\s*and\s+|\s+\&\s+/i, 2) # added \, \s+
+         last_individual = r
+         str = l
+         naked_and = true
+       end
+ 
+       # Look for an exception case: no initials, "and" or "&" previously present, like:
+       #   Foo, Bar and Smith
+       if naked_and && !(str =~ /\./) && str =~ /\s*([A-Z][a-z]{1,})\s*\,+\s*([A-Z][a-z]{1,})/
+         individuals.unshift str.split(/\s*\,\s*/)
+         str = nil
+       end
+ 
+       # Look for an exception case: no periods and multiple commas, like:
+       #   Foo A, Bar ZA, Smith-Blorf A
+       if str && !naked_and && (str.split(",").size > 2) && !(str =~ /\./)
+         individuals = str.split(",")
+         str = nil
+       end
+ 
+       prefix = ['van den ', 'Van ', "O'", "Mc", 'Campos ', 'Costa ']
+       pre_reg = prefix.collect{|p| "(#{Regexp.escape(p)})?"}.join
+ 
+       postfix = ['de la', 'von', 'da', 'van', ', Jr.']
+       post_reg = postfix.collect{|p| "(#{Regexp.escape(p)})?"}.join
+ 
+       # Initials second
+       m1 = Regexp.new(/^\s*(#{pre_reg}   # legal prefix words, includes space if present
+         [A-Z][a-z]+                      # a capitalized Name
+         (\-[A-Z][a-z]+)?                 # optional dashed addition
+         \s*,\s*                          # required comma
+         (\s*                             # initials, optionally surrounded by whitespace
+           (\-)?                          # optional preceding dash, hits second initials
+           [A-Z]                          # required capital initial
+           (\-)?                          # optional initial dash
+           (\-[A-Z])?                     # optional dashed initial
+           \s*\.                          # required period
+         \s*)
+         {1,}                             # repeat initials as necessary
+         #{post_reg})                     # optional legal postfixes
+         \s*/x)
+ 
+       # Initials first
+       m2 = Regexp.new(/^\s*(([A-Z]\.\s*){1,}#{pre_reg}[A-Z][a-z]+#{post_reg}),/) # (R. Watson | R.F. Watson),
+ 
+       # pick off remaining authors one at a time
+       if str
+         parsing = true
+         i = 0
+         while parsing
+           individual = ''
+           check_for_more_individuals = false
+           [m2, m1].each do |regex|
+             if str =~ regex
+               individual = $1
+               str.slice!(individual)
+               str.strip!
+               str.slice!(",")
+               individuals.push(individual)
+               check_for_more_individuals = true # at least one match, keep going
+             end
+           end
+ 
+           # puts "[#{individual}] : #{str}"
+           if !check_for_more_individuals
+             if str && str.size != 0
+               individuals.push(str)
+               parsing = false
+             end
+           end
+ 
+           i += 1
+           raise if i > 100
+           parsing = false if str.size == 0
+         end
+       end
+ 
+       # Note to remember: positive look-behind (?<= ) for future hacks
+       #   str.split(/(?<=[A-Z])\s*[;,]{1}\s*/, 2)
+ 
+       individuals.push(last_individual) if !last_individual.nil?
+       individuals.flatten!
+ 
+       # At this point we have isolated individuals. The strategy is to slice out initials; the remainder is the last name.
+       # The initials regex matches any "A-B.", "A." or " A ", "A-B" pattern (including repeats).
+       # TODO: Make a Token
+       match_initials = Regexp.new(/(((\s((\-)?[A-Z](\-[A-Z])?\s?){1,})$)|(((\-)?[A-Z](\-[A-Z|a-z]\s*)?\.\s*){1,})|(\s((\-)?[A-Z](\-[A-Z])?\s){1,}))/)
+ 
+       # TODO: merge with pre/postfix list
+       suffixes = [
+         Regexp.new(/\s(van)\s?/i),
+         Regexp.new(/\s(jr\.)/i),
+         Regexp.new(/\s(von)\s?/i),
+         Regexp.new(/\s(de la)\s?/i),
+         Regexp.new(/\s(da)\s?/i),
+       ]
+ 
+       individuals.each do |i|
+         a = {} # new author
+ 
+         initials = nil
+         last_name = nil
+         if i =~ match_initials
+           initials = $1
+           i.slice!(initials)
+           i.strip!
+           last_name = i
+         else
+           last_name = i
+         end
+ 
+         suffix = []
+         suffixes.each do |s| # .collect{|p| Regexp.escape(p)}.each do |s|
+           if last_name =~ s
+             t = $1
+             suffix.push(t)
+             last_name.slice!(t)
+           end
+         end
+         a[:suffix] = suffix.join(" ") if suffix.size > 0
+ 
+         last_name.gsub!(/\.|\,/, '')
+ 
+         a[:last_name] = last_name.strip if last_name # "if" not fully tested for consequences
+         a[:initials] = initials.strip.split(/\s|\./).collect{|v| v.strip}.select{|x| x.size > 0} if initials && initials.size > 0
+ 
+         @names << a
+       end
+     end
+   end
+ 
+   # A token to match volume-number combinations, with various possible formats.
+   class VolumeNumber < Token
+     attr_reader :volume, :number
+ 
+     @regexp = Regexp.new(/\A\s*(([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?)\s*/i)
+     # @regexp = Regexp.new(/\A\s*((\d+)\s*[:\(]?\s*(\d+)?\)?)\s*/i) <- only digits allowed in this version
+ 
+     def initialize(str)
+       str.strip!
+       str =~ /\A\s*([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?\s*/i
+       @volume = $1
+       @number = $2
+       @volume && @volume.strip!
+       @number && @number.strip!
+     end
+   end
+ 
+   # A token to match page ranges, with remainders noted.
+   class Pages < Token
+     attr_reader :pg_start, :pg_end, :remainder
+     @regexp = Regexp.new(/\A\s*((\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?)/i)
+ 
+     def initialize(str)
+       str.strip!
+       str =~ /\A\s*(\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?/i
+       @pg_start = $1
+       @pg_end = $2
+       @remainder = $3
+     end
+   end
+ 
+   # A token to match quadrinomials.
+   # Matches:
+   #   Foo
+   #   Foo (Bar)
+   #   Foo (Bar) stuff
+   #   Foo (Bar) stuff things
+   #   Foo stuff
+   #   Foo stuff things
+   # TODO: This will likely erroneously match author names that are uncapitalized, e.g.:
+   #   Foo stuff von Helsing, 1920
+   class Quadrinomial < Token
+     attr_reader :genus, :subgenus, :species, :subspecies
+     @regexp = Regexp.new(/\A\s*(([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?)\s*/)
+ 
+     def initialize(str)
+       str.strip!
+       str =~ /\A\s*([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?\s*/i
+       @genus = $1
+       @subgenus = $2
+       @species = $3
+       @subspecies = $4
+ 
+       if @subgenus =~ /\((.*)\)/
+         @subgenus = $1
+       end
+     end
+   end
+ 
+   # !! You must register token lists as symbols in
+   # !! Taxonifi::Splitter
+   #
+   # Include all tokens in the global_token_list.
+   # Tokens are matched in the order of the list. If you
+   # re-order a list, ensure that unit tests fail.
+   # Create an untested list at your own risk; any proposed
+   # ordering will be accepted as long as tests pass.
+ 
+   # All tokens.
+   def self.global_token_list
+     [
+       Taxonifi::Splitter::Tokens::Quadrinomial,
+       Taxonifi::Splitter::Tokens::LeftParen,
+       Taxonifi::Splitter::Tokens::Year,
+       Taxonifi::Splitter::Tokens::Comma,
+       Taxonifi::Splitter::Tokens::RightParen,
+       Taxonifi::Splitter::Tokens::AuthorYear,
+       Taxonifi::Splitter::Tokens::Authors,
+       Taxonifi::Splitter::Tokens::VolumeNumber,
+       Taxonifi::Splitter::Tokens::Pages,
+     ]
+   end
+ 
+   # Tokens used in breaking down volume/number ranges.
+   def self.volume_number
+     [
+       Taxonifi::Splitter::Tokens::VolumeNumber
+     ]
+   end
+ 
+   # Tokens used in breaking down page ranges.
+   def self.pages
+     [
+       Taxonifi::Splitter::Tokens::Pages
+     ]
+   end
+ 
+   # Tokens used in breaking down species names.
+   def self.species_name
+     [
+       Taxonifi::Splitter::Tokens::Quadrinomial,
+       Taxonifi::Splitter::Tokens::AuthorYear,
+     ]
+   end
+ 
+ end
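
As a rough sketch of how one of these tokens behaves on its own (the lexer in data/lib/splitter/lexer.rb normally drives the matching; the snippet below uses only the AuthorYear class shown above and an illustrative input string):

require 'taxonifi'

# The class-level regexp anchors at the start of the string and captures the
# whole parenthesized author/year chunk; the token's initializer then splits
# it into authors, year, and a parens flag.
str = '(Smith & Jones, 1920)'
if str =~ Taxonifi::Splitter::Tokens::AuthorYear.regexp
  token = Taxonifi::Splitter::Tokens::AuthorYear.new($1)
  token.authors  # => "Smith & Jones"
  token.year     # => 1920
  token.parens   # => true
end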
data/lib/taxonifi.rb ADDED
@@ -0,0 +1,36 @@
+ require 'csv'
+ 
+ # Everything in Taxonifi is in here.
+ module Taxonifi
+ 
+   # Taxonomic ranks.
+   RANKS = %w{
+     kingdom
+     phylum
+     class
+     infraclass
+     order
+     suborder
+     infraorder
+     superfamily
+     family
+     subfamily
+     tribe
+     subtribe
+     genus
+     subgenus
+     species
+     subspecies
+   }
+ 
+ 
+   require File.expand_path(File.join(File.dirname(__FILE__), 'lumper/lumper'))
+   require File.expand_path(File.join(File.dirname(__FILE__), 'splitter/splitter'))
+   require File.expand_path(File.join(File.dirname(__FILE__), 'assessor/assessor'))
+   require File.expand_path(File.join(File.dirname(__FILE__), 'export/export'))
+ 
+   Dir.glob( File.expand_path(File.join(File.dirname(__FILE__), "models/*.rb") )) do |file|
+     require file
+   end
+ 
+ end
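
And a minimal sketch of how the RANKS constant can be used once the gem is loaded (illustrative only; it relies solely on the array order shown above):

require 'taxonifi'

# RANKS is ordered from most to least inclusive, so rank indices give a
# simple depth comparison.
Taxonifi::RANKS.first                                             # => "kingdom"
Taxonifi::RANKS.index('genus') < Taxonifi::RANKS.index('species') # => true
Taxonifi::RANKS.include?('variety')                               # => false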