taxonifi 0.1.0

Files changed (53)
  1. data/.document +5 -0
  2. data/Gemfile +18 -0
  3. data/Gemfile.lock +30 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +155 -0
  6. data/Rakefile +53 -0
  7. data/VERSION +1 -0
  8. data/lib/assessor/assessor.rb +31 -0
  9. data/lib/assessor/base.rb +17 -0
  10. data/lib/assessor/row_assessor.rb +131 -0
  11. data/lib/export/export.rb +9 -0
  12. data/lib/export/format/base.rb +43 -0
  13. data/lib/export/format/species_file.rb +341 -0
  14. data/lib/lumper/lumper.rb +334 -0
  15. data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
  16. data/lib/models/author_year.rb +39 -0
  17. data/lib/models/base.rb +73 -0
  18. data/lib/models/collection.rb +92 -0
  19. data/lib/models/generic_object.rb +15 -0
  20. data/lib/models/geog.rb +59 -0
  21. data/lib/models/geog_collection.rb +28 -0
  22. data/lib/models/name.rb +206 -0
  23. data/lib/models/name_collection.rb +149 -0
  24. data/lib/models/person.rb +49 -0
  25. data/lib/models/ref.rb +85 -0
  26. data/lib/models/ref_collection.rb +106 -0
  27. data/lib/models/species_name.rb +85 -0
  28. data/lib/splitter/builder.rb +26 -0
  29. data/lib/splitter/lexer.rb +70 -0
  30. data/lib/splitter/parser.rb +54 -0
  31. data/lib/splitter/splitter.rb +45 -0
  32. data/lib/splitter/tokens.rb +322 -0
  33. data/lib/taxonifi.rb +36 -0
  34. data/test/file_fixtures/Lygaeoidea.csv +801 -0
  35. data/test/helper.rb +38 -0
  36. data/test/test_exporter.rb +32 -0
  37. data/test/test_lumper_geogs.rb +59 -0
  38. data/test/test_lumper_hierarchical_collection.rb +88 -0
  39. data/test/test_lumper_names.rb +119 -0
  40. data/test/test_lumper_parent_child_name_collection.rb +41 -0
  41. data/test/test_lumper_refs.rb +91 -0
  42. data/test/test_parser.rb +34 -0
  43. data/test/test_splitter.rb +27 -0
  44. data/test/test_splitter_tokens.rb +403 -0
  45. data/test/test_taxonifi.rb +11 -0
  46. data/test/test_taxonifi_accessor.rb +61 -0
  47. data/test/test_taxonifi_geog.rb +51 -0
  48. data/test/test_taxonifi_name.rb +186 -0
  49. data/test/test_taxonifi_name_collection.rb +158 -0
  50. data/test/test_taxonifi_ref.rb +90 -0
  51. data/test/test_taxonifi_ref_collection.rb +69 -0
  52. data/test/test_taxonifi_species_name.rb +95 -0
  53. metadata +167 -0
@@ -0,0 +1,322 @@
+ #
+ # Tokens are simple classes that return a regular expression (a pattern to match).
+ # You should write a test in test_splitter_tokens.rb before defining a token.
+ # Remember to register tokens in the lists at the bottom of this file.
+ #
+ module Taxonifi::Splitter::Tokens
+
+   class Token
+     # This allows access to the class attribute regexp without using a class variable.
+     class << self
+       attr_reader :regexp
+     end
+
+     attr_reader :value, :flag
+     def initialize(str)
+       @value = str
+     end
+   end
+
+   class Year < Token
+     @regexp = Regexp.new(/\A\s*(\d\d\d\d)\s*/i)
+   end
+
+   class LeftParen < Token
+     @regexp = Regexp.new(/\A\s*(\()\s*/i)
+   end
+
+   class RightParen < Token
+     @regexp = Regexp.new(/\A\s*(\))\s*/i)
+   end
+
+   class Comma < Token
+     @regexp = Regexp.new(/\A\s*(\,)\s*/i)
+   end
+
+   # A token to match an author-year combination; breaks
+   # the string into three parts.
+   class AuthorYear < Token
+     attr_reader :authors, :year, :parens
+     # This is going to hit just about everything; it should only be used
+     # as a one-off, when you know you have such a string.
+     @regexp = Regexp.new(/\A\s*(\(?[^\+\d)]+(\d\d\d\d)?\)?)\s*/i)
+
+     def initialize(str)
+       str.strip!
+       # check for parens
+       if str =~ /\((.*)\)/
+         w = $1
+         @parens = true
+       else
+         w = str
+         @parens = false
+       end
+       # check for year
+       if w =~ /(\d\d\d\d)\Z/
+         @year = $1.to_i
+         w.gsub!(/\d\d\d\d\Z/, "")
+         w.strip!
+       end
+       w.gsub!(/,\s*\Z/, '')
+       @authors = w.strip
+       true
+     end
+   end
+
+   # Complex breakdown of author strings. Handles
+   # a wide variety of formats.
+   # See test_splitter_tokens.rb for scope. As with
+   # AuthorYear this will match just about anything when used alone.
+   # Add exceptions at will; just test using TestSplitterTokens#test_authors.
+   # TODO: Unicode the [a-z] bits?
+   class Authors < Token
+     attr_reader :names
+     @regexp = Regexp.new(/\A\s*([^\d]+)\s*/i)
+
+     def initialize(input)
+       str = input
+       @names = []
+       str.strip!
+       naked_and = false # look for the pattern 'Foo, Bar and Smith', i.e. no initials
+       individuals = []
+       last_individual = nil
+
+       # We can simplify if there is an "and" or "&"
+       if str =~ /(\s+and\s+|\&)/i
+         l,r = str.split(/\s+\,?\s*and\s+|\s+\&\s+/i, 2) # added \, \s+
+         last_individual = r
+         str = l
+         naked_and = true
+       end
+
+       # Look for an exception case: no initials, "and" or "&" previously present, like:
+       #   Foo, Bar and Smith
+       if naked_and && !(str =~ /\./) && str =~ /s*([A-Z][a-z]{1,})\s*\,+\s*([A-Z][a-z]{1,})/
+         individuals.unshift str.split(/\s*\,\s*/)
+         str = nil
+       end
+
+       # Look for an exception case: no periods and multiple commas, like:
+       #   Foo A, Bar ZA, Smith-Blorf A
+       if str && !naked_and && (str.split(",").size > 2) && !(str =~ /\./)
+         individuals = str.split(",")
+         str = nil
+       end
+
+       prefix = ['van den ', 'Van ', "O'", "Mc", 'Campos ', 'Costa ']
+       pre_reg = prefix.collect{|p| "(#{Regexp.escape(p)})?"}.join
+
+       postfix = ['de la', 'von', 'da', 'van', ', Jr.']
+       post_reg = postfix.collect{|p| "(#{Regexp.escape(p)})?"}.join
+
+       # Initials second
+       m1 = Regexp.new(/^\s*(#{pre_reg}  # legal prefix words, includes space if present
+         [A-Z][a-z]+                     # a capitalized Name
+         (\-[A-Z][a-z]+)?                # optional dashed addition
+         \s*,\s*                         # required comma
+         (\s*                            # initials, optionally surrounded by whitespace
+           (\-)?                         # optional preceding dash, hits second initials
+           [A-Z]                         # required capital initial
+           (\-)?                         # optional initial dash
+           (\-[A-Z])?                    # optional dashed initial
+           \s*\.                         # required period
+         \s*)
+         {1,}                            # repeat initials as necessary
+         #{post_reg})                    # optional legal postfixes
+         \s*/x)
+
+       # Initials first
+       m2 = Regexp.new(/^\s*(([A-Z]\.\s*){1,}#{pre_reg}[A-Z][a-z]+#{post_reg}),/) # (R. Watson | R.F. Watson),
+
+       # pick off remaining authors one at a time
+       if str
+         parsing = true
+         i = 0
+         while parsing
+           individual = ''
+           check_for_more_individuals = false
+           [m2, m1].each do |regex|
+             if str =~ regex
+               individual = $1
+               str.slice!(individual)
+               str.strip!
+               str.slice!(",")
+               individuals.push(individual)
+               check_for_more_individuals = true # at least one match, keep going
+             end
+           end
+
+           # puts "[#{individual}] : #{str}"
+           if !check_for_more_individuals
+             if str && str.size != 0
+               individuals.push(str)
+               parsing = false
+             end
+           end
+
+           i += 1
+           raise if i > 100
+           parsing = false if str.size == 0
+         end
+       end
+
+       # Note: remember positive lookbehind (?<= ) for future hax
+       # str.split(/(?<=[A-Z])\s*[;,]{1}\s*/, 2)
+
+       individuals.push(last_individual) if !last_individual.nil?
+       individuals.flatten!
+
+       # At this point we have isolated individuals. The strategy is to slice out the initials; the remainder is the last name.
+       # The initials regex matches any "A-B.", "A.", " A ", or "A-B" pattern (including repeats).
+       # TODO: Make a Token
+       match_initials = Regexp.new(/(((\s((\-)?[A-Z](\-[A-Z])?\s?){1,})$)|(((\-)?[A-Z](\-[A-Z|a-z]\s*)?\.\s*){1,})|(\s((\-)?[A-Z](\-[A-Z])?\s){1,}))/)
+
+       # TODO: merge with pre/postfix list
+       suffixes = [
+         Regexp.new(/\s(van)\s?/i),
+         Regexp.new(/\s(jr\.)/i),
+         Regexp.new(/\s(von)\s?/i),
+         Regexp.new(/\s(de la)\s?/i),
+         Regexp.new(/\s(da)\s?/i),
+       ]
+
+       individuals.each do |i|
+         a = {} # new author
+
+         initials = nil
+         last_name = nil
+         if i =~ match_initials
+           initials = $1
+           i.slice!(initials)
+           i.strip!
+           last_name = i
+         else
+           last_name = i
+         end
+
+         suffix = []
+         suffixes.each do |s| # .collect{|p| Regexp.escape(p)}.each do |s|
+           if last_name =~ s
+             t = $1
+             suffix.push(t)
+             last_name.slice!(t)
+           end
+         end
+         a[:suffix] = suffix.join(" ") if suffix.size > 0
+
+         last_name.gsub!(/\.|\,/, '')
+
+         a[:last_name] = last_name.strip if last_name # "if" not fully tested for consequences
+         a[:initials] = initials.strip.split(/\s|\./).collect{|v| v.strip}.select{|x| x.size > 0} if initials && initials.size > 0
+
+         @names << a
+       end
+     end
+   end
+
+   # A token to match volume-number combinations, with various possible formats.
+   class VolumeNumber < Token
+     attr_reader :volume, :number
+
+     @regexp = Regexp.new(/\A\s*(([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?)\s*/i)
+     # @regexp = Regexp.new(/\A\s*((\d+)\s*[:\(]?\s*(\d+)?\)?)\s*/i) <- only digits allowed in this version
+
+     def initialize(str)
+       str.strip
+       str =~ /\A\s*([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?\s*/i
+       @volume = $1
+       @number = $2
+       @volume && @volume.strip!
+       @number && @number.strip!
+     end
+   end
+
+   # A token to match page ranges, with remainders noted.
+   class Pages < Token
+     attr_reader :pg_start, :pg_end, :remainder
+     @regexp = Regexp.new(/\A\s*((\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?)/i)
+
+     def initialize(str)
+       str.strip
+       str =~ /\A\s*(\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?/i
+       @pg_start = $1
+       @pg_end = $2
+       @remainder = $3
+     end
+   end
+
+   # A token to match quadrinomials.
+   # Matches:
+   #   Foo
+   #   Foo (Bar)
+   #   Foo (Bar) stuff
+   #   Foo (Bar) stuff things
+   #   Foo stuff
+   #   Foo stuff things
+   # TODO: This will likely erroneously match on author names that are uncapitalized, e.g.:
+   #   Foo stuff von Helsing, 1920
+   class Quadrinomial < Token
+     attr_reader :genus, :subgenus, :species, :subspecies
+     @regexp = Regexp.new(/\A\s*(([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?)\s*/)
+
+     def initialize(str)
+       str.strip
+       str =~ /\A\s*([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?\s*/i
+       @genus = $1
+       @subgenus = $2
+       @species = $3
+       @subspecies = $4
+
+       if @subgenus =~ /\((.*)\)/
+         @subgenus = $1
+       end
+     end
+   end
+
+   # !! You must register token lists as symbols in
+   # !! Taxonifi::Splitter
+   #
+   # Include all tokens in the global_token_list.
+   # Tokens are matched in the order of the list. If you
+   # re-order a list, ensure that unit tests fail (i.e., that the ordering is actually covered by tests).
+   # Create an untested list at your own risk; any proposed
+   # ordering will be accepted as long as tests pass.
+
+   # All tokens.
+   def self.global_token_list
+     [
+       Taxonifi::Splitter::Tokens::Quadrinomial,
+       Taxonifi::Splitter::Tokens::LeftParen,
+       Taxonifi::Splitter::Tokens::Year,
+       Taxonifi::Splitter::Tokens::Comma,
+       Taxonifi::Splitter::Tokens::RightParen,
+       Taxonifi::Splitter::Tokens::AuthorYear,
+       Taxonifi::Splitter::Tokens::Authors,
+       Taxonifi::Splitter::Tokens::VolumeNumber,
+       Taxonifi::Splitter::Tokens::Pages,
+     ]
+   end
+
+   # Tokens used in breaking down volume/number ranges.
+   def self.volume_number
+     [
+       Taxonifi::Splitter::Tokens::VolumeNumber
+     ]
+   end
+
+   # Tokens used in breaking down page ranges.
+   def self.pages
+     [
+       Taxonifi::Splitter::Tokens::Pages
+     ]
+   end
+
+   # Tokens used in breaking down species names.
+   def self.species_name
+     [
+       Taxonifi::Splitter::Tokens::Quadrinomial,
+       Taxonifi::Splitter::Tokens::AuthorYear,
+     ]
+   end
+
+ end
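
For orientation, here is a minimal usage sketch of the token classes above, based only on the constructors and attribute readers shown in this hunk. In practice the tokens are consumed by the splitter's lexer (lib/splitter/lexer.rb), whose exact API is not shown here, so the hand-driven calls below are illustrative assumptions.

# A sketch of driving two tokens by hand; the values follow from the
# regexps and initializers defined in this file.
ay = Taxonifi::Splitter::Tokens::AuthorYear.new('(Smith & Jones, 1920)')
ay.parens   # => true
ay.year     # => 1920
ay.authors  # => "Smith & Jones"

# The class-level regexp is exposed via the `class << self` reader on Token.
Taxonifi::Splitter::Tokens::Year.regexp =~ '1920 was a good year'  # => 0
$1                                                                 # => "1920"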
data/lib/taxonifi.rb ADDED
@@ -0,0 +1,36 @@
+ require 'csv'
+
+ # Everything in Taxonifi is in here.
+ module Taxonifi
+
+   # Taxonomic ranks.
+   RANKS = %w{
+     kingdom
+     phylum
+     class
+     infraclass
+     order
+     suborder
+     infraorder
+     superfamily
+     family
+     subfamily
+     tribe
+     subtribe
+     genus
+     subgenus
+     species
+     subspecies
+   }
+
+
+   require File.expand_path(File.join(File.dirname(__FILE__), 'lumper/lumper'))
+   require File.expand_path(File.join(File.dirname(__FILE__), 'splitter/splitter'))
+   require File.expand_path(File.join(File.dirname(__FILE__), 'assessor/assessor'))
+   require File.expand_path(File.join(File.dirname(__FILE__), 'export/export'))
+
+   Dir.glob( File.expand_path(File.join(File.dirname(__FILE__), "models/*.rb") )) do |file|
+     require file
+   end
+
+ end
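
And a minimal sketch of loading the library and reading the RANKS constant defined above; the top-level `require 'taxonifi'` is an assumption based on the standard gem layout (lib/taxonifi.rb), and only RANKS is taken from the file itself.

# Hypothetical usage; values follow from the RANKS list above.
require 'taxonifi'

Taxonifi::RANKS.first               # => "kingdom"
Taxonifi::RANKS.include?('genus')   # => true
Taxonifi::RANKS.index('species')    # => 14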