taxonifi 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +155 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/assessor/assessor.rb +31 -0
- data/lib/assessor/base.rb +17 -0
- data/lib/assessor/row_assessor.rb +131 -0
- data/lib/export/export.rb +9 -0
- data/lib/export/format/base.rb +43 -0
- data/lib/export/format/species_file.rb +341 -0
- data/lib/lumper/lumper.rb +334 -0
- data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
- data/lib/models/author_year.rb +39 -0
- data/lib/models/base.rb +73 -0
- data/lib/models/collection.rb +92 -0
- data/lib/models/generic_object.rb +15 -0
- data/lib/models/geog.rb +59 -0
- data/lib/models/geog_collection.rb +28 -0
- data/lib/models/name.rb +206 -0
- data/lib/models/name_collection.rb +149 -0
- data/lib/models/person.rb +49 -0
- data/lib/models/ref.rb +85 -0
- data/lib/models/ref_collection.rb +106 -0
- data/lib/models/species_name.rb +85 -0
- data/lib/splitter/builder.rb +26 -0
- data/lib/splitter/lexer.rb +70 -0
- data/lib/splitter/parser.rb +54 -0
- data/lib/splitter/splitter.rb +45 -0
- data/lib/splitter/tokens.rb +322 -0
- data/lib/taxonifi.rb +36 -0
- data/test/file_fixtures/Lygaeoidea.csv +801 -0
- data/test/helper.rb +38 -0
- data/test/test_exporter.rb +32 -0
- data/test/test_lumper_geogs.rb +59 -0
- data/test/test_lumper_hierarchical_collection.rb +88 -0
- data/test/test_lumper_names.rb +119 -0
- data/test/test_lumper_parent_child_name_collection.rb +41 -0
- data/test/test_lumper_refs.rb +91 -0
- data/test/test_parser.rb +34 -0
- data/test/test_splitter.rb +27 -0
- data/test/test_splitter_tokens.rb +403 -0
- data/test/test_taxonifi.rb +11 -0
- data/test/test_taxonifi_accessor.rb +61 -0
- data/test/test_taxonifi_geog.rb +51 -0
- data/test/test_taxonifi_name.rb +186 -0
- data/test/test_taxonifi_name_collection.rb +158 -0
- data/test/test_taxonifi_ref.rb +90 -0
- data/test/test_taxonifi_ref_collection.rb +69 -0
- data/test/test_taxonifi_species_name.rb +95 -0
- metadata +167 -0
@@ -0,0 +1,322 @@
|
|
1
|
+
#
|
2
|
+
# Tokens are simple classes that return a regular expression (pattern to match).
|
3
|
+
# You should write a test in test_resolver.rb before defining a token.
|
4
|
+
# Remember to register tokens in lists at the bottom of this file.
|
5
|
+
#
|
6
|
+
module Taxonifi::Splitter::Tokens
|
7
|
+
|
8
|
+
# Base class for all splitter tokens. A token wraps the text it was
# built from (#value) and exposes the pattern it matches through the
# class-level +regexp+ reader.
class Token
  class << self
    # Reader for the class instance variable @regexp that each subclass
    # assigns; using a class instance variable (not @@regexp) keeps each
    # subclass's pattern independent.
    attr_reader :regexp
  end

  attr_reader :value, :flag

  # string - the raw text this token represents.
  def initialize(string)
    @value = string
  end
end
|
19
|
+
|
20
|
+
# Matches a four-digit year, ignoring surrounding whitespace.
class Year < Token
  @regexp = /\A\s*(\d\d\d\d)\s*/i
end

# Matches a literal "(", ignoring surrounding whitespace.
class LeftParen < Token
  @regexp = /\A\s*(\()\s*/i
end

# Matches a literal ")", ignoring surrounding whitespace.
class RightParen < Token
  @regexp = /\A\s*(\))\s*/i
end

# Matches a literal ",", ignoring surrounding whitespace.
class Comma < Token
  @regexp = /\A\s*(\,)\s*/i
end
|
35
|
+
|
36
|
+
# A token to match an author-year combination, breaking the string into
# three parts: authors, year, and a flag for surrounding parentheses.
# NOTE: the pattern matches just about anything — only use it one-off,
# when you already know the string is an author/year string.
class AuthorYear < Token
  attr_reader :authors, :year, :parens

  @regexp = Regexp.new(/\A\s*(\(?[^\+\d)]+(\d\d\d\d)?\)?)\s*/i)

  def initialize(str)
    str.strip!

    # Parenthesized? If so, work only on the inner text.
    inner = str.match(/\((.*)\)/)
    @parens = !inner.nil?
    working = inner ? inner[1] : str

    # A trailing four-digit run is the year; record it and remove it.
    if working =~ /(\d\d\d\d)\Z/
      @year = $1.to_i
      working.gsub!(/\d\d\d\d\Z/, "")
      working.strip!
    end

    # Drop any dangling comma left behind by the year removal.
    working.gsub!(/,\s*\Z/, '')
    @authors = working.strip
    true
  end
end
|
65
|
+
|
66
|
+
# Complex breakdown of author strings. Handles
# a wide variety of formats.
# See test_splitter_tokens.rb for scope. As with
# AuthorYear this will match just about anything when used alone.
# Add exceptions at will, just test using TestSplittTokens#test_authors.
# TODO: Unicode the [a-z] bits?
#
# Parses a free-form author string into @names, an Array of Hashes,
# one per person. Each hash may carry:
#   :last_name - String
#   :initials  - Array of single-letter Strings (only when initials found)
#   :suffix    - String, space-joined suffix words (only when found)
class Authors < Token
  attr_reader :names
  @regexp = Regexp.new(/\A\s*([^\d]+)\s*/i)

  def initialize(input)
    # NOTE(review): `str = input` aliases the caller's string; the
    # strip!/slice! calls below mutate it in place — confirm callers
    # do not rely on the original value afterwards.
    str = input
    @names = []
    str.strip!
    naked_and = false # look for the pattern 'Foo, Bar and Smith', i.e. no initials
    individuals = []
    last_individual = nil

    # We can simplify if there is an "and" or &: split off the final
    # author and handle the remainder; remember the "and" was seen.
    if str =~ /(\s+and\s+|\&)/i
      l,r = str.split(/\s+\,?\s*and\s+|\s+\&\s+/i, 2) # added \, \s+
      last_individual = r
      str = l
      naked_and = true
    end

    # Look for an exception case, no initials, "and" or "&" previously present, like:
    #   Foo, Bar and Smith
    # NOTE(review): the leading `s*` (no backslash) matches literal 's'
    # characters, likely a typo for `\s*`; harmless here because the
    # pattern is unanchored — confirm before "fixing".
    if naked_and && not(str =~ /\./) && str =~ /s*([A-Z][a-z]{1,})\s*\,+\s*([A-Z][a-z]{1,})/
      individuals.unshift str.split(/\s*\,\s*/)
      str = nil
    end

    # Look for an exception case, no periods and multiple commas, like:
    #   Foo A, Bar ZA, Smith-Blorf A
    if str && !naked_and && (str.split(",").size > 2) && !(str =~ /\./)
      individuals = str.split(",")
      str = nil
    end

    # Optional name prefixes/postfixes, compiled into regex fragments of
    # the form (van den )?(Van )?... for interpolation below.
    prefix = ['van den ', 'Van ', "O'", "Mc", 'Campos ', 'Costa ']
    pre_reg = prefix.collect{|p| "(#{Regexp.escape(p)})?"}.join

    postfix = ['de la', 'von', 'da', 'van', ', Jr.']
    post_reg = postfix.collect{|p| "(#{Regexp.escape(p)})?"}.join

    # Initials second ("Watson, R.F.")
    m1 = Regexp.new(/^\s*(#{pre_reg} # legal prefix words, includes space if present
      [A-Z][a-z]+                    # a captialized Name
      (\-[A-Z][a-z]+)?               # optional dashed addition
      \s*,\s*                        # required comma
      (\s*                           # initials, optionally surrounded by whitescape
      (\-)?                          # optional preceeding dash, hits second initials
      [A-Z]                          # required capital initial
      (\-)?                          # optional initial dash
      (\-[A-Z])?                     # optional dashed initial
      \s*\.                          # required period
      \s*)
      {1,}                           # repeat initials as necessary
      #{post_reg})                   # optional legal postfixes
      \s*/x)

    # Initials first
    m2 = Regexp.new(/^\s*(([A-Z]\.\s*){1,}#{pre_reg}[A-Z][a-z]+#{post_reg}),/) # (R. Watson | R.F. Watson),

    # Pick off remaining authors one at a time, slicing each match out
    # of the front of str until nothing matches or str is consumed.
    if str
      parsing = true
      i = 0
      while parsing
        individual = ''
        check_for_more_individuals = false
        [m2, m1].each do |regex|
          if str =~ regex
            individual = $1
            str.slice!(individual)
            str.strip!
            str.slice!(",")
            individuals.push(individual)
            check_for_more_individuals = true # at least once match, keep going
          end
        end

        # puts "[#{individual}] : #{str}"
        # Neither pattern matched: whatever is left is one last author.
        if !check_for_more_individuals
          if str && str.size != 0
            individuals.push(str)
            parsing = false
          end
        end

        i += 1
        raise if i > 100 # guard against an infinite loop on pathological input
        parsing = false if str.size == 0
      end
    end

    # Note to remember positive look behind (?<= ) for future hax
    # str.split(/(?<=[A-Z])\s*[;,]{1}\s*/, 2)

    individuals.push(last_individual) if !last_individual.nil?
    individuals.flatten!

    # At this point we have isolated individuals. Strategy is to slice out initials and remainder is last name.
    # Initials regex matches any A-B. A. or " A ", "A-B" pattern (including repeats)
    # TODO: Make a Token
    match_initials = Regexp.new(/(((\s((\-)?[A-Z](\-[A-Z])?\s?){1,})$)|(((\-)?[A-Z](\-[A-Z|a-z]\s*)?\.\s*){1,})|(\s((\-)?[A-Z](\-[A-Z])?\s){1,}))/)

    # TODO: merge with pre/postfix list
    suffixes = [
      Regexp.new(/\s(van)\s?/i),
      Regexp.new(/\s(jr\.)/i),
      Regexp.new(/\s(von)\s?/i),
      Regexp.new(/\s(de la)\s?/i),
      Regexp.new(/\s(da)\s?/i),
    ]

    # Build one hash per individual. NOTE: the block parameter `i`
    # shadows the loop counter above and its string is mutated in place.
    individuals.each do |i|
      a = {} # new author

      initials = nil
      last_name = nil
      if i =~ match_initials
        initials = $1
        i.slice!(initials)
        i.strip!
        last_name = i
      else
        last_name = i
      end

      # Slice recognized suffix words (van/jr./von/...) out of the last name.
      suffix = []
      suffixes.each do |s| # .collect{|p| Regexp.escape(p)}.each do |s|
        if last_name =~ s
          t = $1
          suffix.push(t)
          last_name.slice!(t)
        end
      end
      a[:suffix] = suffix.join(" ") if suffix.size > 0

      last_name.gsub!(/\.|\,/, '')

      a[:last_name] = last_name.strip if last_name # "if" not fully tested for consequences
      a[:initials] = initials.strip.split(/\s|\./).collect{|v| v.strip}.select{|x| x.size > 0} if initials && initials.size > 0

      @names << a
    end
  end
end
|
216
|
+
|
217
|
+
# A token to match volume-number combinations, with various possible
# formats: "42", "42:3", "42(3)". Text before the ":" or "(" becomes
# @volume; text after it (up to ":" or ")") becomes @number (or nil).
class VolumeNumber < Token
  attr_reader :volume, :number

  @regexp = Regexp.new(/\A\s*(([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?)\s*/i)
  # @regexp = Regexp.new(/\A\s*((\d+)\s*[:\(]?\s*(\d+)?\)?)\s*/i) <- only digits allowed in this version

  def initialize(str)
    # (Removed a no-op `str.strip` whose return value was discarded;
    # leading whitespace is already consumed by \A\s* below.)
    # NOTE(review): this inline pattern duplicates the class @regexp
    # minus its outer capture group — keep the two in sync.
    str =~ /\A\s*([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?\s*/i
    @volume = $1
    @number = $2
    @volume && @volume.strip!
    @number && @number.strip!
  end
end
|
233
|
+
|
234
|
+
# A token to match page ranges ("1-10", "5", "12-14, pls. 2"), with any
# trailing text captured in @remainder. Page values are kept as Strings.
class Pages < Token
  attr_reader :pg_start, :pg_end, :remainder

  @regexp = Regexp.new(/\A\s*((\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?)/i)

  def initialize(str)
    # (Removed a no-op `str.strip` whose return value was discarded;
    # leading whitespace is already consumed by \A\s* below.)
    # NOTE(review): this inline pattern duplicates the class @regexp
    # minus its outer capture group — keep the two in sync.
    str =~ /\A\s*(\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?/i
    @pg_start  = $1 # first page
    @pg_end    = $2 # optional end of range, nil when absent
    @remainder = $3 # any trailing text after the range
  end
end
|
247
|
+
|
248
|
+
# A token to match quadrinomials.
# Matches:
#  Foo
#  Foo (Bar)
#  Foo (Bar) stuff
#  Foo (Bar) stuff things
#  Foo stuff
#  Foo stuff things
# TODO: This will likely erroneously match on author names that are
# uncapitalized, e.g.:
#  Foo stuff von Helsing, 1920
class Quadrinomial < Token
  attr_reader :genus, :subgenus, :species, :subspecies

  @regexp = Regexp.new(/\A\s*(([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?)\s*/)

  def initialize(str)
    # (Removed a no-op `str.strip` whose return value was discarded;
    # leading whitespace is already consumed by \A\s* below.)
    # NOTE(review): this inline pattern carries /i while the class-level
    # @regexp does not — confirm which case sensitivity is intended.
    str =~ /\A\s*([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?\s*/i
    @genus      = $1
    @subgenus   = $2 # still parenthesized at this point
    @species    = $3
    @subspecies = $4

    # Strip the parentheses from a captured subgenus.
    if @subgenus =~ /\((.*)\)/
      @subgenus = $1
    end
  end
end
|
275
|
+
|
276
|
+
# !! You must register token lists as symbols in
# !! Taxonifi::Splitter
#
# Include all tokens in the global_token_list.
# Tokens are tried in list order, so ordering is load-bearing: re-order
# only with the unit tests as a safety net. Any proposed ordering is
# acceptable as long as the tests pass; untested lists are at your own risk.

# All tokens, in match-priority order.
def self.global_token_list
  [ Taxonifi::Splitter::Tokens::Quadrinomial,
    Taxonifi::Splitter::Tokens::LeftParen,
    Taxonifi::Splitter::Tokens::Year,
    Taxonifi::Splitter::Tokens::Comma,
    Taxonifi::Splitter::Tokens::RightParen,
    Taxonifi::Splitter::Tokens::AuthorYear,
    Taxonifi::Splitter::Tokens::Authors,
    Taxonifi::Splitter::Tokens::VolumeNumber,
    Taxonifi::Splitter::Tokens::Pages ]
end
|
299
|
+
|
300
|
+
# Tokens used in breaking down volume/number ranges.
def self.volume_number
  [Taxonifi::Splitter::Tokens::VolumeNumber]
end
|
306
|
+
|
307
|
+
# Tokens used in breaking down page ranges.
def self.pages
  [Taxonifi::Splitter::Tokens::Pages]
end
|
313
|
+
|
314
|
+
# Tokens used in breaking down species names: the name itself first,
# then any trailing author/year text.
def self.species_name
  [ Taxonifi::Splitter::Tokens::Quadrinomial,
    Taxonifi::Splitter::Tokens::AuthorYear ]
end
|
321
|
+
|
322
|
+
end
|
data/lib/taxonifi.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'csv'

# Everything in Taxonifi is in here.
module Taxonifi

  # Taxonomic ranks, ordered from most to least inclusive.
  # Frozen so the shared constant cannot be mutated at a distance.
  RANKS = %w{
    kingdom
    phylum
    class
    infraclass
    order
    suborder
    infraorder
    superfamily
    family
    subfamily
    tribe
    subtribe
    genus
    subgenus
    species
    subspecies
  }.freeze

  # Load the major subsystems relative to this file.
  require File.expand_path(File.join(File.dirname(__FILE__), 'lumper/lumper'))
  require File.expand_path(File.join(File.dirname(__FILE__), 'splitter/splitter'))
  require File.expand_path(File.join(File.dirname(__FILE__), 'assessor/assessor'))
  require File.expand_path(File.join(File.dirname(__FILE__), 'export/export'))

  # Load every model.
  Dir.glob( File.expand_path(File.join(File.dirname(__FILE__), "models/*.rb") )) do |file|
    require file
  end

end
|