taxonifi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +155 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/assessor/assessor.rb +31 -0
- data/lib/assessor/base.rb +17 -0
- data/lib/assessor/row_assessor.rb +131 -0
- data/lib/export/export.rb +9 -0
- data/lib/export/format/base.rb +43 -0
- data/lib/export/format/species_file.rb +341 -0
- data/lib/lumper/lumper.rb +334 -0
- data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
- data/lib/models/author_year.rb +39 -0
- data/lib/models/base.rb +73 -0
- data/lib/models/collection.rb +92 -0
- data/lib/models/generic_object.rb +15 -0
- data/lib/models/geog.rb +59 -0
- data/lib/models/geog_collection.rb +28 -0
- data/lib/models/name.rb +206 -0
- data/lib/models/name_collection.rb +149 -0
- data/lib/models/person.rb +49 -0
- data/lib/models/ref.rb +85 -0
- data/lib/models/ref_collection.rb +106 -0
- data/lib/models/species_name.rb +85 -0
- data/lib/splitter/builder.rb +26 -0
- data/lib/splitter/lexer.rb +70 -0
- data/lib/splitter/parser.rb +54 -0
- data/lib/splitter/splitter.rb +45 -0
- data/lib/splitter/tokens.rb +322 -0
- data/lib/taxonifi.rb +36 -0
- data/test/file_fixtures/Lygaeoidea.csv +801 -0
- data/test/helper.rb +38 -0
- data/test/test_exporter.rb +32 -0
- data/test/test_lumper_geogs.rb +59 -0
- data/test/test_lumper_hierarchical_collection.rb +88 -0
- data/test/test_lumper_names.rb +119 -0
- data/test/test_lumper_parent_child_name_collection.rb +41 -0
- data/test/test_lumper_refs.rb +91 -0
- data/test/test_parser.rb +34 -0
- data/test/test_splitter.rb +27 -0
- data/test/test_splitter_tokens.rb +403 -0
- data/test/test_taxonifi.rb +11 -0
- data/test/test_taxonifi_accessor.rb +61 -0
- data/test/test_taxonifi_geog.rb +51 -0
- data/test/test_taxonifi_name.rb +186 -0
- data/test/test_taxonifi_name_collection.rb +158 -0
- data/test/test_taxonifi_ref.rb +90 -0
- data/test/test_taxonifi_ref_collection.rb +69 -0
- data/test/test_taxonifi_species_name.rb +95 -0
- metadata +167 -0
@@ -0,0 +1,322 @@
|
|
1
|
+
#
|
2
|
+
# Tokens are simple classes that return a regular expression (pattern to match).
|
3
|
+
# You should write a test in test_resolver.rb before defining a token.
|
4
|
+
# Remember to register tokens in lists at the bottom of this file.
|
5
|
+
#
|
6
|
+
module Taxonifi::Splitter::Tokens
|
7
|
+
|
8
|
+
# Base class for all tokens. A token wraps the raw string it was
# built from (#value); concrete subclasses provide a class-level
# @regexp describing what they match.
class Token

  attr_reader :value, :flag

  # Expose each subclass's @regexp via ClassName.regexp. A class
  # instance variable (not a class variable) is used so every
  # subclass keeps its own independent pattern.
  class << self
    attr_reader :regexp
  end

  # str - the raw matched string this token represents.
  def initialize(str)
    @value = str
  end
end
|
19
|
+
|
20
|
+
# Matches a four digit year (with surrounding whitespace allowed).
class Year < Token
  @regexp = /\A\s*(\d\d\d\d)\s*/i
end
|
23
|
+
|
24
|
+
# Matches a single opening parenthesis.
class LeftParen < Token
  @regexp = /\A\s*(\()\s*/i
end
|
27
|
+
|
28
|
+
# Matches a single closing parenthesis.
class RightParen < Token
  @regexp = /\A\s*(\))\s*/i
end
|
31
|
+
|
32
|
+
# Matches a single comma.
class Comma < Token
  @regexp = /\A\s*(\,)\s*/i
end
|
35
|
+
|
36
|
+
# A token to match an author/year combination, splitting the string
# into authors, year, and a parenthesization flag.
#
# NOTE: the regexp will hit just about anything — only use it when you
# already know the string at hand is an author-year.
class AuthorYear < Token
  attr_reader :authors, :year, :parens

  @regexp = Regexp.new(/\A\s*(\(?[^\+\d)]+(\d\d\d\d)?\)?)\s*/i)

  # Mutates str in place (strip!, and gsub! when unparenthesized),
  # mirroring the long-standing caller-visible behavior.
  def initialize(str)
    str.strip!
    # Parenthesized? If so, work on the captured interior.
    @parens = !!(str =~ /\((.*)\)/)
    working = @parens ? $1 : str
    # Trailing four-digit year, if any.
    if working =~ /(\d\d\d\d)\Z/
      @year = $1.to_i
      working.gsub!(/\d\d\d\d\Z/, "")
      working.strip!
    end
    # Drop a trailing comma left between authors and year.
    working.gsub!(/,\s*\Z/, '')
    @authors = working.strip
    true
  end
end
|
65
|
+
|
66
|
+
# Complex breakdown of author strings. Handles a wide variety of formats.
# See test_splitter_tokens.rb for scope. As with AuthorYear this will
# match just about anything when used alone.
# Add exceptions at will, just test using TestSplittTokens#test_authors.
# TODO: Unicode the [a-z] bits?
class Authors < Token
  # Array of Hashes, one per author, with key :last_name and,
  # when present, :initials (Array of Strings) and :suffix (String).
  attr_reader :names

  # Any run of non-digit characters.
  @regexp = Regexp.new(/\A\s*([^\d]+)\s*/i)

  def initialize(input)
    str = input
    @names = []
    str.strip!
    naked_and = false # flags the pattern 'Foo, Bar and Smith', i.e. no initials
    individuals = []
    last_individual = nil

    # We can simplify if there is an "and" or "&": everything to its
    # right is a single trailing author, appended at the end.
    if str =~ /(\s+and\s+|\&)/i
      l, r = str.split(/\s+\,?\s*and\s+|\s+\&\s+/i, 2) # added \, \s+
      last_individual = r
      str = l
      naked_and = true
    end

    # Exception case: no initials, "and" or "&" previously present, like:
    #   Foo, Bar and Smith
    # Fix: the original used `&& not(...) &&`, a `not`-keyword precedence
    # hazard (a syntax error after && on modern MRI); replaced with !(...).
    # Also repaired the `s*` -> `\s*` typo in the regexp — behavior is
    # unchanged because the pattern is unanchored.
    if naked_and && !(str =~ /\./) && str =~ /\s*([A-Z][a-z]{1,})\s*\,+\s*([A-Z][a-z]{1,})/
      individuals.unshift str.split(/\s*\,\s*/)
      str = nil
    end

    # Exception case: no periods and multiple commas, like:
    #   Foo A, Bar ZA, Smith-Blorf A
    if str && !naked_and && (str.split(",").size > 2) && !(str =~ /\./)
      individuals = str.split(",")
      str = nil
    end

    prefix = ['van den ', 'Van ', "O'", "Mc", 'Campos ', 'Costa ']
    pre_reg = prefix.collect{|p| "(#{Regexp.escape(p)})?"}.join

    postfix = ['de la', 'von', 'da', 'van', ', Jr.']
    post_reg = postfix.collect{|p| "(#{Regexp.escape(p)})?"}.join

    # Initials second, e.g. "Watson, R. F."
    m1 = Regexp.new(/^\s*(#{pre_reg} # legal prefix words, includes space if present
      [A-Z][a-z]+ # a captialized Name
      (\-[A-Z][a-z]+)? # optional dashed addition
      \s*,\s* # required comma
      (\s* # initials, optionally surrounded by whitescape
      (\-)? # optional preceeding dash, hits second initials
      [A-Z] # required capital initial
      (\-)? # optional initial dash
      (\-[A-Z])? # optional dashed initial
      \s*\. # required period
      \s*)
      {1,} # repeat initials as necessary
      #{post_reg}) # optional legal postfixes
      \s*/x)

    # Initials first
    m2 = Regexp.new(/^\s*(([A-Z]\.\s*){1,}#{pre_reg}[A-Z][a-z]+#{post_reg}),/) # (R. Watson | R.F. Watson),

    # Pick off remaining authors one at a time, slicing each match out of str.
    if str
      parsing = true
      i = 0
      while parsing
        individual = ''
        check_for_more_individuals = false
        [m2, m1].each do |regex|
          if str =~ regex
            individual = $1
            str.slice!(individual)
            str.strip!
            str.slice!(",")
            individuals.push(individual)
            check_for_more_individuals = true # at least one match, keep going
          end
        end

        # Neither pattern matched: whatever is left is a single author.
        if !check_for_more_individuals
          if str && str.size != 0
            individuals.push(str)
            parsing = false
          end
        end

        i += 1
        raise "Authors: failed to converge parsing [#{str}]" if i > 100 # guard against a non-terminating loop
        parsing = false if str.size == 0
      end
    end

    # Note to remember positive look behind (?<= ) for future hax
    # str.split(/(?<=[A-Z])\s*[;,]{1}\s*/, 2)

    individuals.push(last_individual) if !last_individual.nil?
    individuals.flatten!

    # At this point we have isolated individuals. Strategy is to slice out
    # initials; the remainder is the last name.
    # Initials regex matches any A-B. A. or " A ", "A-B" pattern (including repeats)
    # TODO: Make a Token
    match_initials = Regexp.new(/(((\s((\-)?[A-Z](\-[A-Z])?\s?){1,})$)|(((\-)?[A-Z](\-[A-Z|a-z]\s*)?\.\s*){1,})|(\s((\-)?[A-Z](\-[A-Z])?\s){1,}))/)

    # TODO: merge with pre/postfix list
    suffixes = [
      Regexp.new(/\s(van)\s?/i),
      Regexp.new(/\s(jr\.)/i),
      Regexp.new(/\s(von)\s?/i),
      Regexp.new(/\s(de la)\s?/i),
      Regexp.new(/\s(da)\s?/i),
    ]

    individuals.each do |indiv| # renamed from `i`, which shadowed the counter above
      a = {} # new author

      initials = nil
      last_name = nil
      if indiv =~ match_initials
        initials = $1
        indiv.slice!(initials)
        indiv.strip!
        last_name = indiv
      else
        last_name = indiv
      end

      # Peel recognized suffixes ("van", "jr.", ...) off the last name.
      suffix = []
      suffixes.each do |s|
        if last_name =~ s
          t = $1
          suffix.push(t)
          last_name.slice!(t)
        end
      end
      a[:suffix] = suffix.join(" ") if suffix.size > 0

      last_name.gsub!(/\.|\,/, '')

      a[:last_name] = last_name.strip if last_name # "if" not fully tested for consequences
      a[:initials] = initials.strip.split(/\s|\./).collect{|v| v.strip}.select{|x| x.size > 0} if initials && initials.size > 0

      @names << a
    end
  end
end
|
216
|
+
|
217
|
+
# A token to match volume-number combinations, with various possible formats,
# e.g. "42", "42:3", "42(3)". Non-digit volumes/numbers are allowed.
class VolumeNumber < Token
  attr_reader :volume, :number

  @regexp = Regexp.new(/\A\s*(([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?)\s*/i)
  # @regexp = Regexp.new(/\A\s*((\d+)\s*[:\(]?\s*(\d+)?\)?)\s*/i) <- only digits allowed in this version

  def initialize(str)
    # Fix: the original called the non-mutating `str.strip` and discarded
    # the result (a no-op). Strip a local copy instead; parsing results are
    # unchanged because the pattern already skips leading whitespace and
    # trailing whitespace is removed from the captures below.
    str = str.strip
    str =~ /\A\s*([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?\s*/i
    @volume = $1
    @number = $2
    @volume && @volume.strip!
    @number && @number.strip!
  end
end
|
233
|
+
|
234
|
+
# A token to match page ranges, with remainders noted.
# e.g. "1-10", "1-10, pls. 2", "5".
class Pages < Token
  # pg_start/pg_end are captured as Strings (or nil); remainder holds
  # any trailing text after the range.
  attr_reader :pg_start, :pg_end, :remainder

  @regexp = Regexp.new(/\A\s*((\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?)/i)

  def initialize(str)
    # Fix: the original called the non-mutating `str.strip` and discarded
    # the result (a no-op). Strip a local copy instead; the pattern already
    # tolerates leading whitespace, so parsing is unaffected.
    str = str.strip
    str =~ /\A\s*(\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?/i
    @pg_start = $1
    @pg_end = $2
    @remainder = $3
  end
end
|
247
|
+
|
248
|
+
# A token to match quadrinomials.
# Matches:
#  Foo
#  Foo (Bar)
#  Foo (Bar) stuff
#  Foo (Bar) stuff things
#  Foo stuff
#  Foo stuff things
# TODO: This will likely erroneously match on author names that are uncapitalized, e.g.:
#  Foo stuff von Helsing, 1920
class Quadrinomial < Token
  attr_reader :genus, :subgenus, :species, :subspecies

  @regexp = Regexp.new(/\A\s*(([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?)\s*/)

  def initialize(str)
    # Fix: the original called the non-mutating `str.strip` and discarded
    # the result (a no-op). Strip a local copy instead.
    str = str.strip
    # NOTE(review): this re-match carries /i while the class-level @regexp
    # does not. Harmless when the lexer gates on @regexp first, but the
    # inconsistency is worth confirming before tightening.
    str =~ /\A\s*([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?\s*/i
    @genus = $1
    @subgenus = $2
    @species = $3
    @subspecies = $4

    # Strip the wrapping parens from a captured subgenus, if present.
    if @subgenus =~ /\((.*)\)/
      @subgenus = $1
    end
  end
end
|
275
|
+
|
276
|
+
# !! You must register token lists as symbols in
# !! Taxonifi::Splitter
#
# Include all tokens in the global_token_list.
# Tokens are matched in order of the list. If you
# re-order a list ensure that unit tests fail.
# Create an untested list at your own risk, any proposed
# ordering will be accepted as long as tests pass.

# All tokens, in the order in which they are tried.
# (Short constant names resolve via the enclosing Tokens module.)
def self.global_token_list
  [
    Quadrinomial,
    LeftParen,
    Year,
    Comma,
    RightParen,
    AuthorYear,
    Authors,
    VolumeNumber,
    Pages
  ]
end
|
299
|
+
|
300
|
+
# Tokens used in breaking down volume/number ranges.
def self.volume_number
  [VolumeNumber]
end
|
306
|
+
|
307
|
+
# Tokens used in breaking down page ranges.
def self.pages
  [Pages]
end
|
313
|
+
|
314
|
+
# Tokens used in breaking down species names.
def self.species_name
  [
    Quadrinomial,
    AuthorYear
  ]
end
|
321
|
+
|
322
|
+
end
|
data/lib/taxonifi.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'csv'

# Everything in Taxonifi is in here.
module Taxonifi

  # Taxonomic ranks, ordered from most to least inclusive.
  # Fix: frozen — this is reference data; callers that need a mutable
  # copy should dup it rather than mutate the constant in place.
  RANKS = %w{
    kingdom
    phylum
    class
    infraclass
    order
    suborder
    infraorder
    superfamily
    family
    subfamily
    tribe
    subtribe
    genus
    subgenus
    species
    subspecies
  }.freeze

  # Load the major subsystems relative to this file.
  require File.expand_path(File.join(File.dirname(__FILE__), 'lumper/lumper'))
  require File.expand_path(File.join(File.dirname(__FILE__), 'splitter/splitter'))
  require File.expand_path(File.join(File.dirname(__FILE__), 'assessor/assessor'))
  require File.expand_path(File.join(File.dirname(__FILE__), 'export/export'))

  # Load every model under models/.
  Dir.glob( File.expand_path(File.join(File.dirname(__FILE__), "models/*.rb") )) do |file|
    require file
  end

end
|