dwc_agent 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d5065b40f8e665968d5731a48192a8d151db10ac
4
- data.tar.gz: de61ace04702a05a34b306f1713ffc57b4f47e87
3
+ metadata.gz: 85a83233676096ae89385fa15d9df82a8ae4fdbb
4
+ data.tar.gz: 5fba52783d9724a2c3d9b6e0011affb204c1e759
5
5
  SHA512:
6
- metadata.gz: 91458dd061c722e4cce3a9c3d806acf726a992d4daef8c321b637704feffde0229c4cd282dd25e96f0325b6ef4eaa2910f33b792186c6b9018c0d4397116aad5
7
- data.tar.gz: 0c95c9fb7d59a7d63eccd97214ecd10a0800821a8d22f14f77c1bc4de7fa584cc665a1deacb8f7ddbdf562e69ae4996b58a09713c520a4804fa074d653a32657
6
+ metadata.gz: 1d2a61e2b1d95573b8dc456fb53617f518c07b327fce7773effeba4aef9c8322468e6f8e6cbd72f6107d7f0d59c93b0624b44f54522555542f69cb9ee17a0abb
7
+ data.tar.gz: 8acc297375824ed643db7a1d2ec6b51636d80ee194e2decc8abfc54e3db8d39b4431afb12a25039b5dc27be624676dc4cbdde3fada9c2b6a9907fde0eaeb4ec6
@@ -1,298 +1,9 @@
1
- require "dwc_agent/version"
2
1
  require "capitalize_names"
3
2
  require "namae"
4
3
 
5
- class DwcAgent
6
-
7
- STRIP_OUT = %r{
8
- \b\d+\(?(?i:[[:alpha:]])\)?\b|
9
- \b[,;]?\s*(?i:et\s+al)\.?|
10
- \bu\.\s*a\.|
11
- \b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
12
- \b[,;]?\s*(?i:etc)\.?|
13
- \b[,;]?\s*(?i:on)\b|
14
- \b[,;]?\s*(?i:unkn?own)\b|
15
- \b[,;]?\s*(?i:n/a)\b|
16
- \b[,;]?\s*(?i:ann?onymous)\b|
17
- \b[,;]?\s*(?i:undetermined|indeterminable|dummy|interim)\b|
18
- \b[,;]?\s*(?i:importer)\b|
19
- \b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
20
- (?i:no\s+(data|disponible))|
21
- \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
22
- [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
23
- May|Jun|Jul|Aug|Sept?|
24
- Oct|Nov|Dec)\.?\s*[-/\s+]?\d+|
25
- \b[,;]?\s*(?i:Jan|Jan(uary|vier))[.,;]?\s*\d+|
26
- \b[,;]?\s*(?i:Feb|February|f(é|e)vrier)[.,;]?\s*\d+|
27
- \b[,;]?\s*(?i:Mar|Mar(ch|s))[.,;]?\s*\d+|
28
- \b[,;]?\s*(?i:Apr|Apri|April|avril)[.,;]?\s*\d+|
29
- \b[,;]?\s*(?i:Ma(y|i))[.,;]?\s*\d+|
30
- \b[,;]?\s*(?i:Jun|June|juin)[.,;]?\s*\d+|
31
- \b[,;]?\s*(?i:Jul|July|juillet)[.,;]?\s*\d+|
32
- \b[,;]?\s*(?i:Aug|August|ao(û|u)t)[.,;]?\s*\d+|
33
- \b[,;]?\s*(?i:Sep|Sept|Septemb(er|re))[.,;]?\s*\d+|
34
- \b[,;]?\s*(?i:Oct|Octob(er|re))[.,;]?\s*\d+|
35
- \b[,;]?\s*(?i:Nov|Novemb(er|re))[.,;]?\s*\d+|
36
- \b[,;]?\s*(?i:Dec|D(é|e)cemb(er|re))[.,;]?\s*\d+|
37
- \d+\s+(?i:Jan|Jan(uary|vier))\.?\b|
38
- \d+\s+(?i:Feb|February|f(é|e)vrier)\.?\b|
39
- \d+\s+(?i:Mar|March|mars)\.?\b|
40
- \d+\s+(?i:Apr|Apri|April|avril)\.?\b|
41
- \d+\s+(?i:Ma(y|i))\b|
42
- \d+\s+(?i:Jun|June|juin)\.?\b|
43
- \d+\s+(?i:Jul|July|juillet)\.?\b|
44
- \d+\s+(?i:Aug|August|ao(û|u)t)\.?\b|
45
- \d+\s+(?i:Sep|Septemb(er|re))t?\.?\b|
46
- \d+\s+(?i:Oct|Octob(er|re))\.?\b|
47
- \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
48
- \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
49
- (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
50
- \b\s*maybe\s*\b|
51
- \(?(?i:collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
52
- (?i:fide)\:?\s*\b|
53
- (?i:game\s+dept)\.?\s*\b|
54
- (?i:see\s+notes?\s*(inside)?)|
55
- (?i:see\s+letter\s+enclosed)|
56
- (?i:(by)?\s+correspondance)|
57
- (?i:pers\.?\s+comm\.?)|
58
- (?i:crossed\s+out)|
59
- \(?(?i:source)\(?|
60
- (?i:according\s+to)|
61
- (?i:revised|photograph|fruits\s+only)|
62
- -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
63
- -?\s*(?i:synonym(y|ie))|
64
- \b\s*\(?(?i:(fe)?male)\)?\s*\b|
65
- \b(?i:to\s+(sub)?spp?)\.?|
66
- (?i:nom\.?\s+rev\.?)|
67
- FNA|DAO|HUH|\(MT\)|(?i:\(KEW\))|
68
- (?i:uqam)|
69
- \b[,;]\s+\d+\z|
70
- [":!]|
71
- [,]?\d+|
72
- \s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+|
73
- [,;]\z|
74
- ^\w{0,2}\z|
75
- ^[A-Z]{2,}\z|
76
- \s+(?i:stet)\s*!?\s*\z|
77
- \s+(?i:prep)\.?\s*\z|
78
- \b\s*\([A-Z]{2,}\)
79
- }x
80
-
81
- SPLIT_BY = %r{
82
- [–|&+/;]|
83
- \s+-\s+|
84
- \s+a\.\s+|
85
- \b(?i:and|et|with|per)\s+|
86
- \be\s*\b|
87
- \b(?i:annotated(\s+by)?)\s*\b|
88
- \b(?i:coll\.)\s*\b|
89
- \b(?i:communicate?d(\s+to)?)\s*\b|
90
- \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
91
- \b(?i:checked?(\s+by)?)\s*\b|
92
- \b(?i:det\.?(\s+by)?)\s*\b|
93
- \b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
94
- \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
95
- \b(?i:in?dentified(\s+by)?)\s*\b|
96
- \b(?i:in\s+part(\s+by)?)\s*\b|
97
- \b(?i:or)\s+|
98
- \b(?i:prep\.?\s+(?i:by)?)\s*\b|
99
- \b(?i:redet\.?(\s+by?)?)\s*\b|
100
- \b(?i:reidentified(\s+by)?)\s*\b|
101
- \b(?i:stet)\s*\b|
102
- \b(?i:then(\s+by)?)\s+|
103
- \b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b|
104
- \b(?i:via|from)\s*\b|
105
- \b(?i:(donated)?\s*by)\s+
106
- }x
107
-
108
- CHAR_SUBS = {
109
- '|' => ' | ',
110
- '(' => ' ',
111
- ')' => ' ',
112
- '[' => ' ',
113
- ']' => ' ',
114
- '?' => '',
115
- '!' => '',
116
- '=' => '',
117
- '#' => '',
118
- '/' => ' / ',
119
- '&' => ' & ',
120
- '*' => ''
121
- }
122
-
123
- COMPLEX_SEPARATORS = %r{
124
- ^([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})\s+([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})$
125
- }x
126
-
127
- BLACKLIST = %r{
128
- (?i:abundant)|
129
- (?i:adult|juvenile)|
130
- (?i:anon)|
131
- (?i:average)|
132
- (?i:believe|unclear|illegible|none|suggested|(dis)?agrees?)|approach|
133
- (?i:barcod)|
134
- (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
135
- (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
136
- (?i:carex|salix)|
137
- (?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
138
- \b\s*(?i:help)\s*\b|
139
- (?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
140
- (?i:desconocido)|
141
- (?i:evidence)|
142
- (?i:inconn?u)|
143
- (?i:internation|gou?vern|ministry|unit|district|provincial|na(c|t)ional|military|region|environ|natur(e|al)|naturelles|division|program|direction|national)|
144
- (?i:label)|
145
- (?i:o?\.?m\.?n\.?r\.?)|
146
- (?i:measurement)|
147
- (?i:ent(o|y)mology)|
148
- (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
149
- (?i:univ\.)|
150
- (?i:graduate|student|estudantes|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
151
- (?i:non\s+pr(é|e)cis(é|e))|
152
- (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
153
- (?i:recreation|culture)|
154
- (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
155
- (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|alliance|protective|circle)|
156
- (?i:commercial|company|control|product)|
157
- (?i:size|large|colou?r)\s+|
158
- (?i:skeleton)|
159
- (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|expedi(c|t)ion|festival|generation|inventory|marine|service)|
160
- (?i:submersible)|
161
- (?i:synonymy?)|(topo|syn|holo)type|
162
- (?i:systematic|perspective)|
163
- \s+(?i:off)\s+|
164
- \s*(?i:too)\s+|\s*(?i:the)\s+|
165
- (?i:taxiderm(ies|y))|
166
- (?i:though)|
167
- (?i:toward|seen at)|
168
- (?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
169
- (?i:urn\:)|
170
- (?i:usda|ucla)|
171
- (?i:workshop|garden|farm|jardin|public)
172
- }x
173
-
174
- TITLE = /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor)(\s+|$)/i
175
-
176
- Namae.options[:prefer_comma_as_separator] = true
177
- Namae.options[:separator] = SPLIT_BY
178
- Namae.options[:title] = TITLE
179
-
180
- def self.parse(name)
181
- return [] if name.nil? || name == ""
182
- cleaned = name.gsub(STRIP_OUT, ' ')
183
- .gsub(/[#{CHAR_SUBS.keys.join('\\')}]/, CHAR_SUBS)
184
- .gsub(/([A-Z]{1}\.)([[:alpha:]]{2,})/, '\1 \2')
185
- .gsub(COMPLEX_SEPARATORS, '\1 | \2')
186
- .gsub(/,\z/, '')
187
- .squeeze(' ').strip
188
- Namae.parse(cleaned)
189
- end
190
-
191
- def self.clean(parsed_namae)
192
- blank_name = { given: nil, family: nil }
193
-
194
- if parsed_namae.family && parsed_namae.family.length < 3
195
- return blank_name
196
- end
197
- if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
198
- return blank_name
199
- end
200
- if parsed_namae.given && parsed_namae.given.length > 15
201
- return blank_name
202
- end
203
- if parsed_namae.given && parsed_namae.given.count('.') >= 3 && /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
204
- return blank_name
205
- end
206
- if parsed_namae.family && /[a-zA-Z]{2,}\.?\s+[a-zA-Z]{2,}/.match(parsed_namae.family)
207
- return blank_name
208
- end
209
- if parsed_namae.given && /[a-zA-Z]{2,}\.?\s+[a-zA-Z]{2,}/.match(parsed_namae.given)
210
- return blank_name
211
- end
212
- if parsed_namae.display_order =~ BLACKLIST
213
- return blank_name
214
- end
215
-
216
- if parsed_namae.given &&
217
- parsed_namae.family &&
218
- parsed_namae.family.count(".") > 0 &&
219
- parsed_namae.family.length - parsed_namae.family.count(".") <= 3
220
- given = parsed_namae.given
221
- family = parsed_namae.family
222
- parsed_namae.family = given
223
- parsed_namae.given = family
224
- end
225
-
226
- if parsed_namae.given &&
227
- (parsed_namae.given == parsed_namae.given.upcase ||
228
- parsed_namae.given == parsed_namae.given.downcase) &&
229
- !parsed_namae.given.include?(".") &&
230
- parsed_namae.given.length >= 4
231
- parsed_namae.given = CapitalizeNames.capitalize(parsed_namae.given)
232
- end
233
-
234
- if parsed_namae.given && /[A-Za-z]\./.match(parsed_namae.given)
235
- parsed_namae.given = CapitalizeNames.capitalize(parsed_namae.given).sub(/[a-z]\./, &:upcase)
236
- end
237
-
238
- parsed_namae.normalize_initials
239
-
240
- family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
241
- given = parsed_namae.given.strip rescue nil
242
- particle = parsed_namae.particle.strip rescue nil
243
-
244
- if family.nil? && !given.nil? && !given.include?(".")
245
- family = given
246
- given = nil
247
- end
248
-
249
- if !family.nil? && given.nil? && !particle.nil?
250
- given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
251
- particle = nil
252
- end
253
-
254
- if !family.nil? && (family == family.upcase || family == family.downcase)
255
- family = family.mb_chars.capitalize.to_s rescue nil
256
- end
257
-
258
- if !family.nil? && family.length <= 4 && family !~ /[aeiouy]/
259
- return blank_name
260
- end
261
-
262
- if !family.nil? && family.match(/[A-Z]$/)
263
- return blank_name
264
- end
265
-
266
- if !family.nil? && family.match(/^[A-Z]{2}/)
267
- return blank_name
268
- end
269
-
270
- { given: given, family: family }
271
- end
272
-
273
- def self.similarity_score(given1, given2)
274
- given1_parts = given1.gsub(/\.\s+/,".").split(/[\.\s]/)
275
- given2_parts = given2.gsub(/\.\s+/,".").split(/[\.\s]/)
276
- largest = [given1_parts,given2_parts].max
277
- smallest = [given1_parts,given2_parts].min
278
-
279
- score = 0
280
- largest.each_with_index do |val,index|
281
- if smallest[index]
282
- if val[0] == smallest[index][0]
283
- score += 1
284
- else
285
- return 0
286
- end
287
- if val.length > 1 && smallest[index].length > 1 && val != smallest[index]
288
- return 0
289
- end
290
- else
291
- score += 0.1
292
- end
293
- end
294
-
295
- score
296
- end
297
-
298
- end
4
+ require "dwc_agent/version"
5
+ require "dwc_agent/constants"
6
+ require "dwc_agent/cleaner"
7
+ require "dwc_agent/parser"
8
+ require "dwc_agent/similarity"
9
+ require "dwc_agent/utility"
@@ -0,0 +1,92 @@
1
+ module DwcAgent
2
+ class Cleaner
3
+
4
+ # Cleans the passed-in namae object from the parse method and
5
+ # re-organizes it to better match expected Darwin Core output.
6
+ #
7
+ # @param parsed_namae [Object] the namae object
8
+ # @return [Hash] the given, family hash
9
+ def self.clean(parsed_namae)
10
+ blank_name = { given: nil, family: nil }
11
+
12
+ if parsed_namae.family && parsed_namae.family.length < 3
13
+ return blank_name
14
+ end
15
+ if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
16
+ return blank_name
17
+ end
18
+ if parsed_namae.given && parsed_namae.given.length > 15
19
+ return blank_name
20
+ end
21
+ if parsed_namae.given && parsed_namae.given.count('.') >= 3 && /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
22
+ return blank_name
23
+ end
24
+ if parsed_namae.family && /[a-zA-Z]{2,}\.?\s+[a-zA-Z]{2,}/.match(parsed_namae.family)
25
+ return blank_name
26
+ end
27
+ if parsed_namae.given && /[a-zA-Z]{2,}\.?\s+[a-zA-Z]{2,}/.match(parsed_namae.given)
28
+ return blank_name
29
+ end
30
+ if parsed_namae.display_order =~ BLACKLIST
31
+ return blank_name
32
+ end
33
+
34
+ if parsed_namae.given &&
35
+ parsed_namae.family &&
36
+ parsed_namae.family.count(".") > 0 &&
37
+ parsed_namae.family.length - parsed_namae.family.count(".") <= 3
38
+ given = parsed_namae.given
39
+ family = parsed_namae.family
40
+ parsed_namae.family = given
41
+ parsed_namae.given = family
42
+ end
43
+
44
+ if parsed_namae.given &&
45
+ (parsed_namae.given == parsed_namae.given.upcase ||
46
+ parsed_namae.given == parsed_namae.given.downcase) &&
47
+ !parsed_namae.given.include?(".") &&
48
+ parsed_namae.given.length >= 4
49
+ parsed_namae.given = CapitalizeNames.capitalize(parsed_namae.given)
50
+ end
51
+
52
+ if parsed_namae.given && /[A-Za-z]\./.match(parsed_namae.given)
53
+ parsed_namae.given = CapitalizeNames.capitalize(parsed_namae.given).sub(/[a-z]\./, &:upcase)
54
+ end
55
+
56
+ parsed_namae.normalize_initials
57
+
58
+ family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
59
+ given = parsed_namae.given.strip rescue nil
60
+ particle = parsed_namae.particle.strip rescue nil
61
+
62
+ if family.nil? && !given.nil? && !given.include?(".")
63
+ family = given
64
+ given = nil
65
+ end
66
+
67
+ if !family.nil? && given.nil? && !particle.nil?
68
+ given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
69
+ particle = nil
70
+ end
71
+
72
+ if !family.nil? && (family == family.upcase || family == family.downcase)
73
+ family = family.mb_chars.capitalize.to_s rescue nil
74
+ end
75
+
76
+ if !family.nil? && family.length <= 4 && family !~ /[aeiouy]/
77
+ return blank_name
78
+ end
79
+
80
+ if !family.nil? && family.match(/[A-Z]$/)
81
+ return blank_name
82
+ end
83
+
84
+ if !family.nil? && family.match(/^[A-Z]{2}/)
85
+ return blank_name
86
+ end
87
+
88
+ { given: given, family: family }
89
+ end
90
+
91
+ end
92
+ end
@@ -0,0 +1,174 @@
1
+ module DwcAgent
2
+ STRIP_OUT = %r{
3
+ \b\d+\(?(?i:[[:alpha:]])\)?\b|
4
+ \b[,;]?\s*(?i:et\s+al)\.?|
5
+ \bu\.\s*a\.|
6
+ \b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
7
+ \b[,;]?\s*(?i:etc)\.?|
8
+ \b[,;]?\s*(?i:on)\b|
9
+ \b[,;]?\s*(?i:unkn?own)\b|
10
+ \b[,;]?\s*(?i:n/a)\b|
11
+ \b[,;]?\s*(?i:ann?onymous)\b|
12
+ \b[,;]?\s*(?i:undetermined|indeterminable|dummy|interim)\b|
13
+ \b[,;]?\s*(?i:importer)\b|
14
+ \b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
15
+ (?i:no\s+(data|disponible))|
16
+ \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
17
+ [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
18
+ May|Jun|Jul|Aug|Sept?|
19
+ Oct|Nov|Dec)\.?\s*[-/\s+]?\d+|
20
+ \b[,;]?\s*(?i:Jan|Jan(uary|vier))[.,;]?\s*\d+|
21
+ \b[,;]?\s*(?i:Feb|February|f(é|e)vrier)[.,;]?\s*\d+|
22
+ \b[,;]?\s*(?i:Mar|Mar(ch|s))[.,;]?\s*\d+|
23
+ \b[,;]?\s*(?i:Apr|Apri|April|avril)[.,;]?\s*\d+|
24
+ \b[,;]?\s*(?i:Ma(y|i))[.,;]?\s*\d+|
25
+ \b[,;]?\s*(?i:Jun|June|juin)[.,;]?\s*\d+|
26
+ \b[,;]?\s*(?i:Jul|July|juillet)[.,;]?\s*\d+|
27
+ \b[,;]?\s*(?i:Aug|August|ao(û|u)t)[.,;]?\s*\d+|
28
+ \b[,;]?\s*(?i:Sep|Sept|Septemb(er|re))[.,;]?\s*\d+|
29
+ \b[,;]?\s*(?i:Oct|Octob(er|re))[.,;]?\s*\d+|
30
+ \b[,;]?\s*(?i:Nov|Novemb(er|re))[.,;]?\s*\d+|
31
+ \b[,;]?\s*(?i:Dec|D(é|e)cemb(er|re))[.,;]?\s*\d+|
32
+ \d+\s+(?i:Jan|Jan(uary|vier))\.?\b|
33
+ \d+\s+(?i:Feb|February|f(é|e)vrier)\.?\b|
34
+ \d+\s+(?i:Mar|March|mars)\.?\b|
35
+ \d+\s+(?i:Apr|Apri|April|avril)\.?\b|
36
+ \d+\s+(?i:Ma(y|i))\b|
37
+ \d+\s+(?i:Jun|June|juin)\.?\b|
38
+ \d+\s+(?i:Jul|July|juillet)\.?\b|
39
+ \d+\s+(?i:Aug|August|ao(û|u)t)\.?\b|
40
+ \d+\s+(?i:Sep|Septemb(er|re))t?\.?\b|
41
+ \d+\s+(?i:Oct|Octob(er|re))\.?\b|
42
+ \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
43
+ \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
44
+ (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
45
+ \b\s*maybe\s*\b|
46
+ \(?(?i:collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
47
+ (?i:fide)\:?\s*\b|
48
+ (?i:game\s+dept)\.?\s*\b|
49
+ (?i:see\s+notes?\s*(inside)?)|
50
+ (?i:see\s+letter\s+enclosed)|
51
+ (?i:(by)?\s+correspondance)|
52
+ (?i:pers\.?\s+comm\.?)|
53
+ (?i:crossed\s+out)|
54
+ \(?(?i:source)\(?|
55
+ (?i:according\s+to)|
56
+ (?i:revised|photograph|fruits\s+only)|
57
+ -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
58
+ -?\s*(?i:synonym(y|ie))|
59
+ \b\s*\(?(?i:(fe)?male)\)?\s*\b|
60
+ \b(?i:to\s+(sub)?spp?)\.?|
61
+ (?i:nom\.?\s+rev\.?)|
62
+ FNA|DAO|HUH|\(MT\)|(?i:\(KEW\))|
63
+ (?i:uqam)|
64
+ \b[,;]\s+\d+\z|
65
+ [":!]|
66
+ [,]?\d+|
67
+ \s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+|
68
+ [,;]\z|
69
+ ^\w{0,2}\z|
70
+ ^[A-Z]{2,}\z|
71
+ \s+(?i:stet)\s*!?\s*\z|
72
+ \s+(?i:prep)\.?\s*\z|
73
+ \b\s*\([A-Z]{2,}\)
74
+ }x
75
+
76
+ SPLIT_BY = %r{
77
+ [–|&+/;]|
78
+ \s+-\s+|
79
+ \s+a\.\s+|
80
+ \b(?i:and|et|with|per)\s+|
81
+ \be\s*\b|
82
+ \b(?i:annotated(\s+by)?)\s*\b|
83
+ \b(?i:coll\.)\s*\b|
84
+ \b(?i:communicate?d(\s+to)?)\s*\b|
85
+ \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
86
+ \b(?i:checked?(\s+by)?)\s*\b|
87
+ \b(?i:det\.?(\s+by)?)\s*\b|
88
+ \b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
89
+ \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
90
+ \b(?i:in?dentified(\s+by)?)\s*\b|
91
+ \b(?i:in\s+part(\s+by)?)\s*\b|
92
+ \b(?i:or)\s+|
93
+ \b(?i:prep\.?\s+(?i:by)?)\s*\b|
94
+ \b(?i:redet\.?(\s+by?)?)\s*\b|
95
+ \b(?i:reidentified(\s+by)?)\s*\b|
96
+ \b(?i:stet)\s*\b|
97
+ \b(?i:then(\s+by)?)\s+|
98
+ \b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b|
99
+ \b(?i:via|from)\s*\b|
100
+ \b(?i:(donated)?\s*by)\s+
101
+ }x
102
+
103
+ CHAR_SUBS = {
104
+ '|' => ' | ',
105
+ '(' => ' ',
106
+ ')' => ' ',
107
+ '[' => ' ',
108
+ ']' => ' ',
109
+ '?' => '',
110
+ '!' => '',
111
+ '=' => '',
112
+ '#' => '',
113
+ '/' => ' / ',
114
+ '&' => ' & ',
115
+ '*' => ''
116
+ }
117
+
118
+ COMPLEX_SEPARATORS = %r{
119
+ ^([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})\s+([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})$
120
+ }x
121
+
122
+ BLACKLIST = %r{
123
+ (?i:abundant)|
124
+ (?i:adult|juvenile)|
125
+ (?i:anon)|
126
+ (?i:average)|
127
+ (?i:believe|unclear|illegible|none|suggested|(dis)?agrees?)|approach|
128
+ (?i:barcod)|
129
+ (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
130
+ (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
131
+ (?i:carex|salix)|
132
+ (?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
133
+ \b\s*(?i:help)\s*\b|
134
+ (?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
135
+ (?i:desconocido)|
136
+ (?i:evidence)|
137
+ (?i:inconn?u)|
138
+ (?i:internation|gou?vern|ministry|unit|district|provincial|na(c|t)ional|military|region|environ|natur(e|al)|naturelles|division|program|direction|national)|
139
+ (?i:label)|
140
+ (?i:o?\.?m\.?n\.?r\.?)|
141
+ (?i:measurement)|
142
+ (?i:ent(o|y)mology)|
143
+ (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
144
+ (?i:univ\.)|
145
+ (?i:graduate|student|estudantes|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
146
+ (?i:non\s+pr(é|e)cis(é|e))|
147
+ (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
148
+ (?i:recreation|culture)|
149
+ (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
150
+ (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|alliance|protective|circle)|
151
+ (?i:commercial|company|control|product)|
152
+ (?i:size|large|colou?r)\s+|
153
+ (?i:skeleton)|
154
+ (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|expedi(c|t)ion|festival|generation|inventory|marine|service)|
155
+ (?i:submersible)|
156
+ (?i:synonymy?)|(topo|syn|holo)type|
157
+ (?i:systematic|perspective)|
158
+ \s+(?i:off)\s+|
159
+ \s*(?i:too)\s+|\s*(?i:the)\s+|
160
+ (?i:taxiderm(ies|y))|
161
+ (?i:though)|
162
+ (?i:toward|seen at)|
163
+ (?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
164
+ (?i:urn\:)|
165
+ (?i:usda|ucla)|
166
+ (?i:workshop|garden|farm|jardin|public)
167
+ }x
168
+
169
+ TITLE = /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor)(\s+|$)/i
170
+
171
+ Namae.options[:prefer_comma_as_separator] = true
172
+ Namae.options[:separator] = SPLIT_BY
173
+ Namae.options[:title] = TITLE
174
+ end
@@ -0,0 +1,20 @@
1
+ module DwcAgent
2
+ class Parser
3
+
4
+ # Parses the passed-in string and returns a list of names.
5
+ #
6
+ # @param name [String] the name or names to be parsed
7
+ # @return [Array] the list of parsed names
8
+ def self.parse(name)
9
+ return [] if name.nil? || name == ""
10
+ cleaned = name.gsub(STRIP_OUT, ' ')
11
+ .gsub(/[#{CHAR_SUBS.keys.join('\\')}]/, CHAR_SUBS)
12
+ .gsub(/([A-Z]{1}\.)([[:alpha:]]{2,})/, '\1 \2')
13
+ .gsub(COMPLEX_SEPARATORS, '\1 | \2')
14
+ .gsub(/,\z/, '')
15
+ .squeeze(' ').strip
16
+ Namae.parse(cleaned)
17
+ end
18
+
19
+ end
20
+ end
@@ -0,0 +1,37 @@
1
module DwcAgent
  class Similarity

    # Produces a similarity score of two given names
    # Logic inspired by R.D.M. Page, https://orcid.org/0000-0002-7101-9767
    # At https://linen-baseball.glitch.me/
    #
    # Each given name is split into parts on periods and whitespace and the
    # parts are compared position by position:
    # * parts whose first characters differ make the names incompatible (0)
    # * two multi-character parts that are not identical do the same (0)
    # * every compatible pair of parts adds 1 to the score
    # * every part present in only the longer name adds 0.1 to the score
    #
    # @param given1 [String] one given name
    # @param given2 [String] a second given name
    # @return [Numeric] the similarity score; 0 when the names conflict
    def self.similarity_score(given1, given2)
      given1_parts = given1.gsub(/\.\s+/, ".").split(/[\.\s]/)
      given2_parts = given2.gsub(/\.\s+/, ".").split(/[\.\s]/)

      # Order the two lists by their NUMBER of parts. Array#max / Array#min
      # compare arrays element by element (lexicographically), so they would
      # pick ["John"] over ["J", "R"] as the "largest" and silently ignore
      # the trailing parts of the longer name.
      smallest, largest = given1_parts, given2_parts
      smallest, largest = largest, smallest if smallest.length > largest.length

      score = 0
      largest.each_with_index do |val, index|
        other = smallest[index]

        # No counterpart in the shorter name: small bonus only.
        if other.nil?
          score += 0.1
          next
        end

        # Differing first characters can never be the same name.
        return 0 unless val[0] == other[0]
        score += 1

        # Two spelled-out parts must match exactly; an initial matches anything
        # that starts with the same character.
        return 0 if val.length > 1 && other.length > 1 && val != other
      end

      score
    end

  end
end
@@ -0,0 +1,17 @@
1
module DwcAgent

  # Expose the methods below directly on the module, e.g. DwcAgent.parse(...)
  module_function

  # Parses a string of one or more names; delegates to Parser.parse.
  #
  # @param names [String] the name or names to be parsed
  # @return [Array] the list of parsed names
  def parse(names)
    Parser.parse(names)
  end

  # Cleans a parsed name; delegates to Cleaner.clean.
  #
  # @param parsed_namae [Object] the namae object
  # @return [Hash] the given, family hash
  def clean(parsed_namae)
    Cleaner.clean(parsed_namae)
  end

  # Scores the similarity of two given names; delegates to
  # Similarity.similarity_score (the only scoring method Similarity defines).
  #
  # @param given1 [String] one given name
  # @param given2 [String] a second given name
  # @return [Numeric] the similarity score
  def similarity(given1, given2)
    Similarity.similarity_score(given1, given2)
  end

end
@@ -1,7 +1,14 @@
1
- class DwcAgent
2
- VERSION = "0.1.0"
1
+ module DwcAgent
2
+ class Version
3
+
4
+ MAJOR = 0
5
+ MINOR = 1
6
+ PATCH = 1
7
+ BUILD = nil
8
+
9
+ def self.version
10
+ [MAJOR, MINOR, PATCH, BUILD].compact.join('.').freeze
11
+ end
3
12
 
4
- def self.version
5
- VERSION
6
13
  end
7
14
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
@@ -90,6 +90,11 @@ extra_rdoc_files: []
90
90
  files:
91
91
  - bin/dwcagent
92
92
  - lib/dwc_agent.rb
93
+ - lib/dwc_agent/cleaner.rb
94
+ - lib/dwc_agent/constants.rb
95
+ - lib/dwc_agent/parser.rb
96
+ - lib/dwc_agent/similarity.rb
97
+ - lib/dwc_agent/utility.rb
93
98
  - lib/dwc_agent/version.rb
94
99
  homepage: https://github.com/dshorthouse/dwc_agent
95
100
  licenses: