dwc_agent 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d5065b40f8e665968d5731a48192a8d151db10ac
4
- data.tar.gz: de61ace04702a05a34b306f1713ffc57b4f47e87
3
+ metadata.gz: 85a83233676096ae89385fa15d9df82a8ae4fdbb
4
+ data.tar.gz: 5fba52783d9724a2c3d9b6e0011affb204c1e759
5
5
  SHA512:
6
- metadata.gz: 91458dd061c722e4cce3a9c3d806acf726a992d4daef8c321b637704feffde0229c4cd282dd25e96f0325b6ef4eaa2910f33b792186c6b9018c0d4397116aad5
7
- data.tar.gz: 0c95c9fb7d59a7d63eccd97214ecd10a0800821a8d22f14f77c1bc4de7fa584cc665a1deacb8f7ddbdf562e69ae4996b58a09713c520a4804fa074d653a32657
6
+ metadata.gz: 1d2a61e2b1d95573b8dc456fb53617f518c07b327fce7773effeba4aef9c8322468e6f8e6cbd72f6107d7f0d59c93b0624b44f54522555542f69cb9ee17a0abb
7
+ data.tar.gz: 8acc297375824ed643db7a1d2ec6b51636d80ee194e2decc8abfc54e3db8d39b4431afb12a25039b5dc27be624676dc4cbdde3fada9c2b6a9907fde0eaeb4ec6
@@ -1,298 +1,9 @@
1
- require "dwc_agent/version"
2
1
  require "capitalize_names"
3
2
  require "namae"
4
3
 
5
- class DwcAgent
6
-
7
- STRIP_OUT = %r{
8
- \b\d+\(?(?i:[[:alpha:]])\)?\b|
9
- \b[,;]?\s*(?i:et\s+al)\.?|
10
- \bu\.\s*a\.|
11
- \b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
12
- \b[,;]?\s*(?i:etc)\.?|
13
- \b[,;]?\s*(?i:on)\b|
14
- \b[,;]?\s*(?i:unkn?own)\b|
15
- \b[,;]?\s*(?i:n/a)\b|
16
- \b[,;]?\s*(?i:ann?onymous)\b|
17
- \b[,;]?\s*(?i:undetermined|indeterminable|dummy|interim)\b|
18
- \b[,;]?\s*(?i:importer)\b|
19
- \b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
20
- (?i:no\s+(data|disponible))|
21
- \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
22
- [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
23
- May|Jun|Jul|Aug|Sept?|
24
- Oct|Nov|Dec)\.?\s*[-/\s+]?\d+|
25
- \b[,;]?\s*(?i:Jan|Jan(uary|vier))[.,;]?\s*\d+|
26
- \b[,;]?\s*(?i:Feb|February|f(é|e)vrier)[.,;]?\s*\d+|
27
- \b[,;]?\s*(?i:Mar|Mar(ch|s))[.,;]?\s*\d+|
28
- \b[,;]?\s*(?i:Apr|Apri|April|avril)[.,;]?\s*\d+|
29
- \b[,;]?\s*(?i:Ma(y|i))[.,;]?\s*\d+|
30
- \b[,;]?\s*(?i:Jun|June|juin)[.,;]?\s*\d+|
31
- \b[,;]?\s*(?i:Jul|July|juillet)[.,;]?\s*\d+|
32
- \b[,;]?\s*(?i:Aug|August|ao(û|u)t)[.,;]?\s*\d+|
33
- \b[,;]?\s*(?i:Sep|Sept|Septemb(er|re))[.,;]?\s*\d+|
34
- \b[,;]?\s*(?i:Oct|Octob(er|re))[.,;]?\s*\d+|
35
- \b[,;]?\s*(?i:Nov|Novemb(er|re))[.,;]?\s*\d+|
36
- \b[,;]?\s*(?i:Dec|D(é|e)cemb(er|re))[.,;]?\s*\d+|
37
- \d+\s+(?i:Jan|Jan(uary|vier))\.?\b|
38
- \d+\s+(?i:Feb|February|f(é|e)vrier)\.?\b|
39
- \d+\s+(?i:Mar|March|mars)\.?\b|
40
- \d+\s+(?i:Apr|Apri|April|avril)\.?\b|
41
- \d+\s+(?i:Ma(y|i))\b|
42
- \d+\s+(?i:Jun|June|juin)\.?\b|
43
- \d+\s+(?i:Jul|July|juillet)\.?\b|
44
- \d+\s+(?i:Aug|August|ao(û|u)t)\.?\b|
45
- \d+\s+(?i:Sep|Septemb(er|re))t?\.?\b|
46
- \d+\s+(?i:Oct|Octob(er|re))\.?\b|
47
- \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
48
- \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
49
- (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
50
- \b\s*maybe\s*\b|
51
- \(?(?i:collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
52
- (?i:fide)\:?\s*\b|
53
- (?i:game\s+dept)\.?\s*\b|
54
- (?i:see\s+notes?\s*(inside)?)|
55
- (?i:see\s+letter\s+enclosed)|
56
- (?i:(by)?\s+correspondance)|
57
- (?i:pers\.?\s+comm\.?)|
58
- (?i:crossed\s+out)|
59
- \(?(?i:source)\(?|
60
- (?i:according\s+to)|
61
- (?i:revised|photograph|fruits\s+only)|
62
- -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
63
- -?\s*(?i:synonym(y|ie))|
64
- \b\s*\(?(?i:(fe)?male)\)?\s*\b|
65
- \b(?i:to\s+(sub)?spp?)\.?|
66
- (?i:nom\.?\s+rev\.?)|
67
- FNA|DAO|HUH|\(MT\)|(?i:\(KEW\))|
68
- (?i:uqam)|
69
- \b[,;]\s+\d+\z|
70
- [":!]|
71
- [,]?\d+|
72
- \s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+|
73
- [,;]\z|
74
- ^\w{0,2}\z|
75
- ^[A-Z]{2,}\z|
76
- \s+(?i:stet)\s*!?\s*\z|
77
- \s+(?i:prep)\.?\s*\z|
78
- \b\s*\([A-Z]{2,}\)
79
- }x
80
-
81
- SPLIT_BY = %r{
82
- [–|&+/;]|
83
- \s+-\s+|
84
- \s+a\.\s+|
85
- \b(?i:and|et|with|per)\s+|
86
- \be\s*\b|
87
- \b(?i:annotated(\s+by)?)\s*\b|
88
- \b(?i:coll\.)\s*\b|
89
- \b(?i:communicate?d(\s+to)?)\s*\b|
90
- \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
91
- \b(?i:checked?(\s+by)?)\s*\b|
92
- \b(?i:det\.?(\s+by)?)\s*\b|
93
- \b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
94
- \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
95
- \b(?i:in?dentified(\s+by)?)\s*\b|
96
- \b(?i:in\s+part(\s+by)?)\s*\b|
97
- \b(?i:or)\s+|
98
- \b(?i:prep\.?\s+(?i:by)?)\s*\b|
99
- \b(?i:redet\.?(\s+by?)?)\s*\b|
100
- \b(?i:reidentified(\s+by)?)\s*\b|
101
- \b(?i:stet)\s*\b|
102
- \b(?i:then(\s+by)?)\s+|
103
- \b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b|
104
- \b(?i:via|from)\s*\b|
105
- \b(?i:(donated)?\s*by)\s+
106
- }x
107
-
108
- CHAR_SUBS = {
109
- '|' => ' | ',
110
- '(' => ' ',
111
- ')' => ' ',
112
- '[' => ' ',
113
- ']' => ' ',
114
- '?' => '',
115
- '!' => '',
116
- '=' => '',
117
- '#' => '',
118
- '/' => ' / ',
119
- '&' => ' & ',
120
- '*' => ''
121
- }
122
-
123
- COMPLEX_SEPARATORS = %r{
124
- ^([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})\s+([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})$
125
- }x
126
-
127
- BLACKLIST = %r{
128
- (?i:abundant)|
129
- (?i:adult|juvenile)|
130
- (?i:anon)|
131
- (?i:average)|
132
- (?i:believe|unclear|illegible|none|suggested|(dis)?agrees?)|approach|
133
- (?i:barcod)|
134
- (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
135
- (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
136
- (?i:carex|salix)|
137
- (?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
138
- \b\s*(?i:help)\s*\b|
139
- (?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
140
- (?i:desconocido)|
141
- (?i:evidence)|
142
- (?i:inconn?u)|
143
- (?i:internation|gou?vern|ministry|unit|district|provincial|na(c|t)ional|military|region|environ|natur(e|al)|naturelles|division|program|direction|national)|
144
- (?i:label)|
145
- (?i:o?\.?m\.?n\.?r\.?)|
146
- (?i:measurement)|
147
- (?i:ent(o|y)mology)|
148
- (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
149
- (?i:univ\.)|
150
- (?i:graduate|student|estudantes|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
151
- (?i:non\s+pr(é|e)cis(é|e))|
152
- (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
153
- (?i:recreation|culture)|
154
- (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
155
- (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|alliance|protective|circle)|
156
- (?i:commercial|company|control|product)|
157
- (?i:size|large|colou?r)\s+|
158
- (?i:skeleton)|
159
- (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|expedi(c|t)ion|festival|generation|inventory|marine|service)|
160
- (?i:submersible)|
161
- (?i:synonymy?)|(topo|syn|holo)type|
162
- (?i:systematic|perspective)|
163
- \s+(?i:off)\s+|
164
- \s*(?i:too)\s+|\s*(?i:the)\s+|
165
- (?i:taxiderm(ies|y))|
166
- (?i:though)|
167
- (?i:toward|seen at)|
168
- (?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
169
- (?i:urn\:)|
170
- (?i:usda|ucla)|
171
- (?i:workshop|garden|farm|jardin|public)
172
- }x
173
-
174
- TITLE = /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor)(\s+|$)/i
175
-
176
- Namae.options[:prefer_comma_as_separator] = true
177
- Namae.options[:separator] = SPLIT_BY
178
- Namae.options[:title] = TITLE
179
-
180
- def self.parse(name)
181
- return [] if name.nil? || name == ""
182
- cleaned = name.gsub(STRIP_OUT, ' ')
183
- .gsub(/[#{CHAR_SUBS.keys.join('\\')}]/, CHAR_SUBS)
184
- .gsub(/([A-Z]{1}\.)([[:alpha:]]{2,})/, '\1 \2')
185
- .gsub(COMPLEX_SEPARATORS, '\1 | \2')
186
- .gsub(/,\z/, '')
187
- .squeeze(' ').strip
188
- Namae.parse(cleaned)
189
- end
190
-
191
- def self.clean(parsed_namae)
192
- blank_name = { given: nil, family: nil }
193
-
194
- if parsed_namae.family && parsed_namae.family.length < 3
195
- return blank_name
196
- end
197
- if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
198
- return blank_name
199
- end
200
- if parsed_namae.given && parsed_namae.given.length > 15
201
- return blank_name
202
- end
203
- if parsed_namae.given && parsed_namae.given.count('.') >= 3 && /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
204
- return blank_name
205
- end
206
- if parsed_namae.family && /[a-zA-Z]{2,}\.?\s+[a-zA-Z]{2,}/.match(parsed_namae.family)
207
- return blank_name
208
- end
209
- if parsed_namae.given && /[a-zA-Z]{2,}\.?\s+[a-zA-Z]{2,}/.match(parsed_namae.given)
210
- return blank_name
211
- end
212
- if parsed_namae.display_order =~ BLACKLIST
213
- return blank_name
214
- end
215
-
216
- if parsed_namae.given &&
217
- parsed_namae.family &&
218
- parsed_namae.family.count(".") > 0 &&
219
- parsed_namae.family.length - parsed_namae.family.count(".") <= 3
220
- given = parsed_namae.given
221
- family = parsed_namae.family
222
- parsed_namae.family = given
223
- parsed_namae.given = family
224
- end
225
-
226
- if parsed_namae.given &&
227
- (parsed_namae.given == parsed_namae.given.upcase ||
228
- parsed_namae.given == parsed_namae.given.downcase) &&
229
- !parsed_namae.given.include?(".") &&
230
- parsed_namae.given.length >= 4
231
- parsed_namae.given = CapitalizeNames.capitalize(parsed_namae.given)
232
- end
233
-
234
- if parsed_namae.given && /[A-Za-z]\./.match(parsed_namae.given)
235
- parsed_namae.given = CapitalizeNames.capitalize(parsed_namae.given).sub(/[a-z]\./, &:upcase)
236
- end
237
-
238
- parsed_namae.normalize_initials
239
-
240
- family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
241
- given = parsed_namae.given.strip rescue nil
242
- particle = parsed_namae.particle.strip rescue nil
243
-
244
- if family.nil? && !given.nil? && !given.include?(".")
245
- family = given
246
- given = nil
247
- end
248
-
249
- if !family.nil? && given.nil? && !particle.nil?
250
- given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
251
- particle = nil
252
- end
253
-
254
- if !family.nil? && (family == family.upcase || family == family.downcase)
255
- family = family.mb_chars.capitalize.to_s rescue nil
256
- end
257
-
258
- if !family.nil? && family.length <= 4 && family !~ /[aeiouy]/
259
- return blank_name
260
- end
261
-
262
- if !family.nil? && family.match(/[A-Z]$/)
263
- return blank_name
264
- end
265
-
266
- if !family.nil? && family.match(/^[A-Z]{2}/)
267
- return blank_name
268
- end
269
-
270
- { given: given, family: family }
271
- end
272
-
273
- def self.similarity_score(given1, given2)
274
- given1_parts = given1.gsub(/\.\s+/,".").split(/[\.\s]/)
275
- given2_parts = given2.gsub(/\.\s+/,".").split(/[\.\s]/)
276
- largest = [given1_parts,given2_parts].max
277
- smallest = [given1_parts,given2_parts].min
278
-
279
- score = 0
280
- largest.each_with_index do |val,index|
281
- if smallest[index]
282
- if val[0] == smallest[index][0]
283
- score += 1
284
- else
285
- return 0
286
- end
287
- if val.length > 1 && smallest[index].length > 1 && val != smallest[index]
288
- return 0
289
- end
290
- else
291
- score += 0.1
292
- end
293
- end
294
-
295
- score
296
- end
297
-
298
- end
4
+ require "dwc_agent/version"
5
+ require "dwc_agent/constants"
6
+ require "dwc_agent/cleaner"
7
+ require "dwc_agent/parser"
8
+ require "dwc_agent/similarity"
9
+ require "dwc_agent/utility"
@@ -0,0 +1,92 @@
module DwcAgent
  class Cleaner

    # Cleans the passed-in namae object from the parse method and
    # re-organizes it to better match expected Darwin Core output.
    #
    # The passed-in object is modified along the way: family and given may be
    # swapped, given may be re-capitalized, and normalize_initials is called
    # on it.
    #
    # @param parsed_namae [Object] the namae object; must respond to family,
    #   given, particle, display_order, normalize_initials and the family= /
    #   given= writers
    # @return [Hash] the given, family hash; both values are nil when the
    #   name is rejected
    def self.clean(parsed_namae)
      blank_name = { given: nil, family: nil }
      return blank_name if unusable?(parsed_namae)

      swap_misplaced_initials(parsed_namae)
      recapitalize_given(parsed_namae)
      parsed_namae.normalize_initials

      # Explicit nil checks instead of `rescue nil`, so genuine errors are
      # no longer silently swallowed.
      family = parsed_namae.family
      family = family.gsub(/\.\z/, '').strip if family
      given = parsed_namae.given
      given = given.strip if given
      particle = parsed_namae.particle
      particle = particle.strip if particle

      # A lone word without initials is treated as a family name.
      if family.nil? && given && !given.include?(".")
        family = given
        given = nil
      end

      # With no given name, the particle is promoted to the given name.
      if family && given.nil? && particle
        given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { |first| first.capitalize }
      end

      if family && (family == family.upcase || family == family.downcase)
        family = capitalize_family(family)
      end

      if family
        # Short and vowel-less, ending in a capital, or starting with two
        # capitals: rejected.
        return blank_name if family.length <= 4 && family !~ /[aeiouy]/
        return blank_name if family.match(/[A-Z]$/)
        return blank_name if family.match(/^[A-Z]{2}/)
      end

      { given: given, family: family }
    end

    # True when the raw parsed name should be rejected outright.
    def self.unusable?(parsed_namae)
      family = parsed_namae.family
      given = parsed_namae.given
      multi_word = /[a-zA-Z]{2,}\.?\s+[a-zA-Z]{2,}/

      return true if family && family.length < 3
      return true if family && family.length == 3 && family.count('.') == 1
      return true if given && given.length > 15
      return true if given && given.count('.') >= 3 &&
                     /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(given)
      return true if family && multi_word.match(family)
      return true if given && multi_word.match(given)
      return true if parsed_namae.display_order =~ BLACKLIST

      false
    end
    private_class_method :unusable?

    # Swaps family and given when the family slot holds at most three
    # non-period characters and at least one period (i.e. looks like initials).
    def self.swap_misplaced_initials(parsed_namae)
      given = parsed_namae.given
      family = parsed_namae.family
      return unless given && family

      periods = family.count(".")
      return unless periods > 0 && family.length - periods <= 3

      parsed_namae.family = given
      parsed_namae.given = family
    end
    private_class_method :swap_misplaced_initials

    # Re-capitalizes a given name that is entirely upper- or lower-case, and
    # upper-cases the first lower-case initial of a given name with initials.
    def self.recapitalize_given(parsed_namae)
      given = parsed_namae.given
      if given &&
         (given == given.upcase || given == given.downcase) &&
         !given.include?(".") &&
         given.length >= 4
        parsed_namae.given = CapitalizeNames.capitalize(given)
      end

      given = parsed_namae.given
      if given && /[A-Za-z]\./.match(given)
        parsed_namae.given = CapitalizeNames.capitalize(given).sub(/[a-z]\./, &:upcase)
      end
    end
    private_class_method :recapitalize_given

    # Capitalizes a single-case family name.
    # String#mb_chars is not a core Ruby method (presumably supplied by an
    # ActiveSupport extension loaded through a dependency -- confirm). When it
    # is unavailable, fall back to String#capitalize rather than discarding
    # the family name.
    def self.capitalize_family(family)
      return family.mb_chars.capitalize.to_s if family.respond_to?(:mb_chars)

      family.capitalize
    end
    private_class_method :capitalize_family

  end
end
@@ -0,0 +1,174 @@
module DwcAgent
  # Text removed from a raw string (replaced by a space) before parsing.
  # Extended-mode regex: whitespace and newlines inside it are ignored.
  STRIP_OUT = %r{
    \b\d+\(?(?i:[[:alpha:]])\)?\b|
    \b[,;]?\s*(?i:et\s+al)\.?|
    \bu\.\s*a\.|
    \b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
    \b[,;]?\s*(?i:etc)\.?|
    \b[,;]?\s*(?i:on)\b|
    \b[,;]?\s*(?i:unkn?own)\b|
    \b[,;]?\s*(?i:n/a)\b|
    \b[,;]?\s*(?i:ann?onymous)\b|
    \b[,;]?\s*(?i:undetermined|indeterminable|dummy|interim)\b|
    \b[,;]?\s*(?i:importer)\b|
    \b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
    (?i:no\s+(data|disponible))|
    \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
    [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
    May|Jun|Jul|Aug|Sept?|
    Oct|Nov|Dec)\.?\s*[-/\s+]?\d+|
    \b[,;]?\s*(?i:Jan|Jan(uary|vier))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Feb|February|f(é|e)vrier)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Mar|Mar(ch|s))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Apr|Apri|April|avril)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Ma(y|i))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Jun|June|juin)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Jul|July|juillet)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Aug|August|ao(û|u)t)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Sep|Sept|Septemb(er|re))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Oct|Octob(er|re))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Nov|Novemb(er|re))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Dec|D(é|e)cemb(er|re))[.,;]?\s*\d+|
    \d+\s+(?i:Jan|Jan(uary|vier))\.?\b|
    \d+\s+(?i:Feb|February|f(é|e)vrier)\.?\b|
    \d+\s+(?i:Mar|March|mars)\.?\b|
    \d+\s+(?i:Apr|Apri|April|avril)\.?\b|
    \d+\s+(?i:Ma(y|i))\b|
    \d+\s+(?i:Jun|June|juin)\.?\b|
    \d+\s+(?i:Jul|July|juillet)\.?\b|
    \d+\s+(?i:Aug|August|ao(û|u)t)\.?\b|
    \d+\s+(?i:Sep|Septemb(er|re))t?\.?\b|
    \d+\s+(?i:Oct|Octob(er|re))\.?\b|
    \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
    \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
    (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
    \b\s*maybe\s*\b|
    \(?(?i:collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
    (?i:fide)\:?\s*\b|
    (?i:game\s+dept)\.?\s*\b|
    (?i:see\s+notes?\s*(inside)?)|
    (?i:see\s+letter\s+enclosed)|
    (?i:(by)?\s+correspondance)|
    (?i:pers\.?\s+comm\.?)|
    (?i:crossed\s+out)|
    \(?(?i:source)\(?|
    (?i:according\s+to)|
    (?i:revised|photograph|fruits\s+only)|
    -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
    -?\s*(?i:synonym(y|ie))|
    \b\s*\(?(?i:(fe)?male)\)?\s*\b|
    \b(?i:to\s+(sub)?spp?)\.?|
    (?i:nom\.?\s+rev\.?)|
    FNA|DAO|HUH|\(MT\)|(?i:\(KEW\))|
    (?i:uqam)|
    \b[,;]\s+\d+\z|
    [":!]|
    [,]?\d+|
    \s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+|
    [,;]\z|
    ^\w{0,2}\z|
    ^[A-Z]{2,}\z|
    \s+(?i:stet)\s*!?\s*\z|
    \s+(?i:prep)\.?\s*\z|
    \b\s*\([A-Z]{2,}\)
  }x

  # Separators between individual names; installed below as Namae's
  # :separator option.
  SPLIT_BY = %r{
    [–|&+/;]|
    \s+-\s+|
    \s+a\.\s+|
    \b(?i:and|et|with|per)\s+|
    \be\s*\b|
    \b(?i:annotated(\s+by)?)\s*\b|
    \b(?i:coll\.)\s*\b|
    \b(?i:communicate?d(\s+to)?)\s*\b|
    \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
    \b(?i:checked?(\s+by)?)\s*\b|
    \b(?i:det\.?(\s+by)?)\s*\b|
    \b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
    \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
    \b(?i:in?dentified(\s+by)?)\s*\b|
    \b(?i:in\s+part(\s+by)?)\s*\b|
    \b(?i:or)\s+|
    \b(?i:prep\.?\s+(?i:by)?)\s*\b|
    \b(?i:redet\.?(\s+by?)?)\s*\b|
    \b(?i:reidentified(\s+by)?)\s*\b|
    \b(?i:stet)\s*\b|
    \b(?i:then(\s+by)?)\s+|
    \b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b|
    \b(?i:via|from)\s*\b|
    \b(?i:(donated)?\s*by)\s+
  }x

  # Single-character substitutions applied after STRIP_OUT: separators are
  # padded with spaces, brackets become spaces, other punctuation is dropped.
  # Frozen because it is a shared lookup table that is only ever read.
  CHAR_SUBS = {
    '|' => ' | ',
    '(' => ' ',
    ')' => ' ',
    '[' => ' ',
    ']' => ' ',
    '?' => '',
    '!' => '',
    '=' => '',
    '#' => '',
    '/' => ' / ',
    '&' => ' & ',
    '*' => ''
  }.freeze

  # Two "Family, I." names separated only by whitespace; the two captures
  # are the two names.
  COMPLEX_SEPARATORS = %r{
    ^([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})\s+([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})$
  }x

  # Terms that cause a parsed name to be rejected.
  # This is an extended-mode regex, so a literal space inside it is ignored;
  # multi-word phrases must therefore use \s+ ("new\s+brunswick",
  # "seen\s+at", "no\s+agent") or they can never match text with a space.
  BLACKLIST = %r{
    (?i:abundant)|
    (?i:adult|juvenile)|
    (?i:anon)|
    (?i:average)|
    (?i:believe|unclear|illegible|none|suggested|(dis)?agrees?)|approach|
    (?i:barcod)|
    (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
    (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
    (?i:carex|salix)|
    (?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
    \b\s*(?i:help)\s*\b|
    (?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
    (?i:desconocido)|
    (?i:evidence)|
    (?i:inconn?u)|
    (?i:internation|gou?vern|ministry|unit|district|provincial|na(c|t)ional|military|region|environ|natur(e|al)|naturelles|division|program|direction|national)|
    (?i:label)|
    (?i:o?\.?m\.?n\.?r\.?)|
    (?i:measurement)|
    (?i:ent(o|y)mology)|
    (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
    (?i:univ\.)|
    (?i:graduate|student|estudantes|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
    (?i:non\s+pr(é|e)cis(é|e))|
    (?i:ontario|qu(e|é)bec|saskatchewan|new\s+brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
    (?i:recreation|culture)|
    (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
    (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|alliance|protective|circle)|
    (?i:commercial|company|control|product)|
    (?i:size|large|colou?r)\s+|
    (?i:skeleton)|
    (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|expedi(c|t)ion|festival|generation|inventory|marine|service)|
    (?i:submersible)|
    (?i:synonymy?)|(topo|syn|holo)type|
    (?i:systematic|perspective)|
    \s+(?i:off)\s+|
    \s*(?i:too)\s+|\s*(?i:the)\s+|
    (?i:taxiderm(ies|y))|
    (?i:though)|
    (?i:toward|seen\s+at)|
    (?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no\s+agent)|
    (?i:urn\:)|
    (?i:usda|ucla)|
    (?i:workshop|garden|farm|jardin|public)
  }x

  # Honorifics and titles; installed below as Namae's :title option.
  TITLE = /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor)(\s+|$)/i

  # Load-time side effect: sets Namae's global parser options.
  Namae.options[:prefer_comma_as_separator] = true
  Namae.options[:separator] = SPLIT_BY
  Namae.options[:title] = TITLE
end
@@ -0,0 +1,20 @@
1
+ module DwcAgent
2
+ class Parser
3
+
4
+ # Parses the passed-in string and returns a list of names.
5
+ #
6
+ # @param names [String] the name or names to be parsed
7
+ # @return [Array] the list of parsed names
8
+ def self.parse(name)
9
+ return [] if name.nil? || name == ""
10
+ cleaned = name.gsub(STRIP_OUT, ' ')
11
+ .gsub(/[#{CHAR_SUBS.keys.join('\\')}]/, CHAR_SUBS)
12
+ .gsub(/([A-Z]{1}\.)([[:alpha:]]{2,})/, '\1 \2')
13
+ .gsub(COMPLEX_SEPARATORS, '\1 | \2')
14
+ .gsub(/,\z/, '')
15
+ .squeeze(' ').strip
16
+ Namae.parse(cleaned)
17
+ end
18
+
19
+ end
20
+ end
@@ -0,0 +1,37 @@
module DwcAgent
  class Similarity

    # Produces a similarity score of two given names.
    # Logic inspired by R.D.M. Page, https://orcid.org/0000-0002-7101-9767
    # At https://linen-baseball.glitch.me/
    #
    # Both names are split into parts on periods and whitespace and compared
    # position by position. Each position where the first characters agree
    # adds 1; each extra part in the longer name adds 0.1. The score is 0 as
    # soon as first characters differ, or when two multi-character parts at
    # the same position are not identical.
    #
    # @param given1 [String] one given name
    # @param given2 [String] a second given name
    # @return [Numeric] the similarity score (0 when the names conflict)
    def self.similarity_score(given1, given2)
      given1_parts = given1.gsub(/\.\s+/, ".").split(/[\.\s]/)
      given2_parts = given2.gsub(/\.\s+/, ".").split(/[\.\s]/)

      # Order the two lists by number of parts. Array#max / Array#min would
      # compare the arrays element by element (lexicographically), which can
      # pick the shorter list as "largest" and skip the 0.1 bonus for
      # unmatched extra parts.
      if given1_parts.length >= given2_parts.length
        largest = given1_parts
        smallest = given2_parts
      else
        largest = given2_parts
        smallest = given1_parts
      end

      score = 0
      largest.each_with_index do |val, index|
        other = smallest[index]
        if other
          return 0 unless val[0] == other[0]

          score += 1
          # Two spelled-out parts must match exactly; an initial matches any
          # part with the same first character.
          return 0 if val.length > 1 && other.length > 1 && val != other
        else
          score += 0.1
        end
      end

      score
    end

  end
end
@@ -0,0 +1,17 @@
module DwcAgent

  module_function

  # Parses a string of one or more names; delegates to Parser.parse.
  #
  # @param names [String] the name or names to be parsed
  # @return [Array] the list of parsed names
  def parse(names)
    Parser.parse(names)
  end

  # Cleans a single parsed name; delegates to Cleaner.clean.
  #
  # @param parsed_namae [Object] the namae object
  # @return [Hash] the given, family hash
  def clean(parsed_namae)
    Cleaner.clean(parsed_namae)
  end

  # Scores the similarity of two given names.
  # Delegates to Similarity.similarity_score, the only scoring method the
  # Similarity class defines; calling Similarity.similarity would raise
  # NoMethodError.
  #
  # @param given1 [String] one given name
  # @param given2 [String] a second given name
  # @return [Numeric] the similarity score
  def similarity(given1, given2)
    Similarity.similarity_score(given1, given2)
  end

  # Same as #similarity, kept under the name DwcAgent.similarity_score that
  # release 0.1.0 exposed so existing callers keep working.
  #
  # @param given1 [String] one given name
  # @param given2 [String] a second given name
  # @return [Numeric] the similarity score
  def similarity_score(given1, given2)
    Similarity.similarity_score(given1, given2)
  end

end
@@ -1,7 +1,14 @@
module DwcAgent
  class Version

    MAJOR = 0
    MINOR = 1
    PATCH = 1
    # Optional build / pre-release segment; nil means it is omitted.
    BUILD = nil

    # Builds the dotted version string from the segments above, skipping
    # nil segments.
    #
    # @return [String] the frozen version string, e.g. "0.1.1"
    def self.version
      [MAJOR, MINOR, PATCH, BUILD].compact.join('.').freeze
    end

  end

  # Release 0.1.0 exposed the version as DwcAgent::VERSION; keep that
  # constant available so code reading it does not break.
  VERSION = Version.version
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
@@ -90,6 +90,11 @@ extra_rdoc_files: []
90
90
  files:
91
91
  - bin/dwcagent
92
92
  - lib/dwc_agent.rb
93
+ - lib/dwc_agent/cleaner.rb
94
+ - lib/dwc_agent/constants.rb
95
+ - lib/dwc_agent/parser.rb
96
+ - lib/dwc_agent/similarity.rb
97
+ - lib/dwc_agent/utility.rb
93
98
  - lib/dwc_agent/version.rb
94
99
  homepage: https://github.com/dshorthouse/dwc_agent
95
100
  licenses: