dwc_agent 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 45eaa059cce81b1bcd7efd1b8d7a88ff0f57c9e3
4
+ data.tar.gz: f3200aa768500b618c156b8fb28531670e956288
5
+ SHA512:
6
+ metadata.gz: 2eb764cf79c7650731f77f95c1fb2a40eb35d0ad784dc39ba7b0496d6264a62169490ac0543994941a9512cec0e77ea5c31f4635eaaec85cd5c8d2aa24be2096
7
+ data.tar.gz: 82ec7f273eb65eef51395d7736479d65593b064ca94a8cfff53182d8834d58bb95f417112a25842a26f514202310e7267371a97ac300079f5bed2c2eda45b4d7
data/lib/dwc_agent.rb ADDED
@@ -0,0 +1,271 @@
1
+ require "dwc_agent/version"
2
+ require "capitalize_names"
3
+ require "namae"
4
+
5
# Parses and cleans people names found in free-text agent strings
# (the gem summary names Darwin Core terms such as recordedBy and
# identifiedBy).
#
# Typical use is two steps: DwcAgent.parse turns a raw string into the
# result of Namae.parse, and DwcAgent.clean turns one parsed name into a
# plain { given:, family: } hash, or a blank hash when the value does not
# look like a person.
class DwcAgent

  # Fragments that are blanked out of the raw string before parsing:
  # numbers and dates, "et al."-style trailers, "unknown"/"anonymous"
  # placeholders, role labels such as "collector", and assorted notes.
  STRIP_OUT = %r{
    \b\d+\(?(?i:[[:alpha:]])\)?\b|
    \b[,;]?\s*(?i:et\s+al)\.?|
    \bu\.\s*a\.|
    \b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
    \b[,;]?\s*(?i:etc)\.?|
    \b[,;]?\s*(?i:on)\b|
    \b[,;]?\s*(?i:unkn?own)\b|
    \b[,;]?\s*(?i:n/a)\b|
    \b[,;]?\s*(?i:ann?onymous)\b|
    \b[,;]?\s*(?i:undetermined|indeterminable|dummy)\b|
    \b[,;]?\s*(?i:importer)\b|
    \b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
    (?i:no\s+(data|disponible))|
    \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
    [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
                          May|Jun|Jul|Aug|Sept?|
                          Oct|Nov|Dec)\.?\s*[-/\s+]?\d+|
    \b[,;]?\s*(?i:Jan|Jan(uary|vier))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Feb|February|f(é|e)vrier)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Mar|Mar(ch|s))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Apr|Apri|April|avril)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Ma(y|i))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Jun|June|juin)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Jul|July|juillet)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Aug|August|ao(û|u)t)[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Sep|Sept|Septemb(er|re))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Oct|Octob(er|re))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Nov|Novemb(er|re))[.,;]?\s*\d+|
    \b[,;]?\s*(?i:Dec|D(é|e)cemb(er|re))[.,;]?\s*\d+|
    \d+\s+(?i:Jan|Jan(uary|vier))\.?\b|
    \d+\s+(?i:Feb|February|f(é|e)vrier)\.?\b|
    \d+\s+(?i:Mar|March|mars)\.?\b|
    \d+\s+(?i:Apr|Apri|April|avril)\.?\b|
    \d+\s+(?i:Ma(y|i))\b|
    \d+\s+(?i:Jun|June|juin)\.?\b|
    \d+\s+(?i:Jul|July|juillet)\.?\b|
    \d+\s+(?i:Aug|August|ao(û|u)t)\.?\b|
    \d+\s+(?i:Sep|Septemb(er|re))t?\.?\b|
    \d+\s+(?i:Oct|Octob(er|re))\.?\b|
    \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
    \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
    (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
    \b\s*maybe\s*\b|
    \(?(?i:collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
    (?i:fide)\:?\s*\b|
    (?i:game\s+dept)\.?\s*\b|
    (?i:see\s+notes?\s*(inside)?)|
    (?i:see\s+letter\s+enclosed)|
    (?i:(by)?\s+correspondance)|
    (?i:pers\.?\s+comm\.?)|
    (?i:crossed\s+out)|
    \(?(?i:source)\(?|
    (?i:according\s+to)|
    (?i:revised|photograph|fruits\s+only)|
    -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
    -?\s*(?i:synonym(y|ie))|
    \b\s*\(?(?i:(fe)?male)\)?\s*\b|
    \b(?i:to\s+(sub)?spp?)\.?|
    (?i:nom\.?\s+rev\.?)|
    FNA|DAO|HUH|\(MT\)|(?i:\(KEW\))|
    (?i:uqam)|
    \b[,;]\s+\d+\z|
    [":!]|
    [,]?\d+|
    \s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+|
    [,;]\z|
    ^\w{0,2}\z|
    ^[A-Z]{2,}\z|
    \s+(?i:stet)\s*!?\s*\z|
    \s+(?i:prep)\.?\s*\z
  }x

  # Separators between individual names; handed to Namae as its
  # :separator option below.
  SPLIT_BY = %r{
    [–|&+/;]|
    \s+-\s+|
    \s+a\.\s+|
    \b(?i:and|et|with|per)\s+|
    \be\s*\b|
    \b(?i:annotated(\s+by)?)\s*\b|
    \b(?i:coll\.)\s*\b|
    \b(?i:communicate?d(\s+to)?)\s*\b|
    \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
    \b(?i:checked?(\s+by)?)\s*\b|
    \b(?i:det\.?(\s+by)?)\s*\b|
    \b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
    \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
    \b(?i:in?dentified(\s+by)?)\s*\b|
    \b(?i:in\s+part(\s+by)?)\s*\b|
    \b(?i:or)\s+|
    \b(?i:prep\.?\s+(?i:by)?)\s*\b|
    \b(?i:redet\.?(\s+by?)?)\s*\b|
    \b(?i:reidentified(\s+by)?)\s*\b|
    \b(?i:stet)\s*\b|
    \b(?i:then(\s+by)?)\s+|
    \b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b|
    \b(?i:via|from)\s*\b|
    \b(?i:(donated)?\s*by)\s+
  }x

  # Single-character substitutions applied after STRIP_OUT: brackets
  # become spaces, separator characters get padded with spaces, and
  # punctuation noise is dropped.
  CHAR_SUBS = {
    '|' => ' | ',
    '(' => ' ',
    ')' => ' ',
    '[' => ' ',
    ']' => ' ',
    '?' => '',
    '!' => '',
    '=' => '',
    '#' => '',
    '/' => ' / ',
    '&' => ' & ',
    '*' => ''
  }.freeze

  # Matches any key of CHAR_SUBS. Built once with Regexp.union so the keys
  # are escaped correctly and the pattern is not recompiled on every call.
  CHAR_SUBS_PATTERN = Regexp.union(CHAR_SUBS.keys)

  # Two "Family, I." style names run together with only whitespace between
  # them; parse inserts a "|" separator between the two captures.
  COMPLEX_SEPARATORS = %r{
    ^([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})\s+([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})$
  }x

  # Substrings that mark a parsed value as something other than a person
  # (institutions, places, descriptive words, ...). Checked against the
  # display-order form of the name in clean.
  #
  # This regexp uses extended (/x) mode, where literal whitespace in the
  # pattern is ignored, so multi-word alternatives must use \s+.
  BLACKLIST = %r{
    (?i:abundant)|
    (?i:adult|juvenile)|
    (?i:anon)|
    (?i:average)|
    (?i:believe|unclear|illegible|none|suggested|(dis)?agrees?)|approach|
    (?i:barcod)|
    (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
    (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
    (?i:carex|salix)|
    (?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
    \b\s*(?i:help)\s*\b|
    (?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
    (?i:desconocido)|
    (?i:evidence)|
    (?i:inconn?u)|
    (?i:internation|gou?vern|ministry|unit|district|provincial|na(c|t)ional|military|region|environ|natur(e|al)|naturelles|division|program|direction|national)|
    (?i:label)|
    (?i:o?\.?m\.?n\.?r\.?)|
    (?i:measurement)|
    (?i:ent(o|y)mology)|
    (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
    (?i:univ\.)|
    (?i:graduate|student|estudantes|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
    (?i:non\s+pr(é|e)cis(é|e))|
    (?i:ontario|qu(e|é)bec|saskatchewan|new\s+brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
    (?i:recreation|culture)|
    (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
    (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|alliance|protective|circle)|
    (?i:commercial|company|control|product)|
    (?i:size|large|colou?r)\s+|
    (?i:skeleton)|
    (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|expedi(c|t)ion|festival|generation|inventory|marine|service)|
    (?i:submersible)|
    (?i:synonymy?)|(topo|syn|holo)type|
    (?i:systematic|perspective)|
    \s+(?i:off)\s+|
    \s*(?i:too)\s+|\s*(?i:the)\s+|
    (?i:taxiderm(ies|y))|
    (?i:though)|
    (?i:toward|seen\s+at)|
    (?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no\s+agent)|
    (?i:urn\:)|
    (?i:usda|ucla)|
    (?i:workshop|garden|farm|jardin|public)
  }x

  # Honorifics and ranks; handed to Namae as its :title option below.
  TITLE = /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor)(\s+|$)/i

  # Configure Namae when this file is loaded.
  # NOTE(review): these are set on Namae.options itself, so they presumably
  # affect every other user of Namae in the same process -- confirm.
  Namae.options[:prefer_comma_as_separator] = true
  Namae.options[:separator] = SPLIT_BY
  Namae.options[:title] = TITLE

  # Scrubs a raw agent string and hands it to Namae.
  #
  # @param name [String, nil] raw text; nil is treated as an empty string
  # @return whatever Namae.parse returns for the scrubbed text
  def self.parse(name)
    cleaned = name.to_s
                  .gsub(STRIP_OUT, ' ')
                  .gsub(CHAR_SUBS_PATTERN, CHAR_SUBS)
                  .gsub(/([A-Z]{1}\.)([[:alpha:]]{2,})/, '\1 \2') # "J.Smith" -> "J. Smith"
                  .gsub(COMPLEX_SEPARATORS, '\1 | \2')
                  .gsub(/,\z/, '')
                  .squeeze(' ')
                  .strip
    Namae.parse(cleaned)
  end

  # Reduces one parsed name to a { given:, family: } hash, returning
  # { given: nil, family: nil } when the value does not look like a person.
  #
  # The argument is modified in place: given/family may be swapped or
  # re-capitalized, and normalize_initials is called on it.
  #
  # @param parsed_namae an object responding to family, given, particle,
  #   display_order, normalize_initials and the family=/given= writers
  #   (presumably one element of the DwcAgent.parse result)
  # @return [Hash] with keys :given and :family (String or nil)
  def self.clean(parsed_namae)
    blank_name = { given: nil, family: nil }
    two_words = /[a-zA-Z]{2,}\.?\s+[a-zA-Z]{2,}/

    raw_family = parsed_namae.family
    raw_given = parsed_namae.given

    # Reject family names that are too short or are a lone dotted fragment.
    return blank_name if raw_family && raw_family.length < 3
    return blank_name if raw_family && raw_family.length == 3 && raw_family.count('.') == 1

    # Reject implausibly long given names, and given names that bury a
    # whole word between initials.
    return blank_name if raw_given && raw_given.length > 15
    if raw_given && raw_given.count('.') >= 3 &&
       /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(raw_given)
      return blank_name
    end

    # Reject either part when it contains two multi-letter words.
    return blank_name if raw_family && two_words.match(raw_family)
    return blank_name if raw_given && two_words.match(raw_given)

    return blank_name if parsed_namae.display_order =~ BLACKLIST

    # A dotted "family" with at most three other characters is treated as
    # initials, so given and family are swapped.
    if parsed_namae.given &&
       parsed_namae.family &&
       parsed_namae.family.count('.') > 0 &&
       parsed_namae.family.length - parsed_namae.family.count('.') <= 3
      parsed_namae.given, parsed_namae.family = parsed_namae.family, parsed_namae.given
    end

    # Fix the casing of given names of 4+ characters, without dots, written
    # entirely in upper or lower case.
    if parsed_namae.given &&
       (parsed_namae.given == parsed_namae.given.upcase ||
        parsed_namae.given == parsed_namae.given.downcase) &&
       !parsed_namae.given.include?('.') &&
       parsed_namae.given.length >= 4
      parsed_namae.given = CapitalizeNames.capitalize(parsed_namae.given)
    end

    # Given names containing an initial: capitalize, then upcase the first
    # lower-case initial.
    if parsed_namae.given && /[A-Za-z]\./.match(parsed_namae.given)
      parsed_namae.given = CapitalizeNames.capitalize(parsed_namae.given).sub(/[a-z]\./, &:upcase)
    end

    parsed_namae.normalize_initials

    # Safe navigation instead of `rescue nil`: a nil part stays nil, but
    # genuine errors are no longer swallowed.
    family = parsed_namae.family&.gsub(/\.\z/, '')&.strip
    given = parsed_namae.given&.strip
    particle = parsed_namae.particle&.strip

    # A lone given name without initials is taken to be the family name.
    if family.nil? && !given.nil? && !given.include?('.')
      family = given
      given = nil
    end

    # With a family name but no given name, the particle stands in as the
    # given name.
    if !family.nil? && given.nil? && !particle.nil?
      given = particle.sub(/[a-z]\./, &:upcase).sub(/^./, &:capitalize)
    end

    # Core String#capitalize handles non-ASCII letters on the Ruby versions
    # this gem supports, so no ActiveSupport (mb_chars) is needed here.
    if !family.nil? && (family == family.upcase || family == family.downcase)
      family = family.capitalize
    end

    # NOTE(review): the vowel test is case-sensitive, so short names whose
    # only vowel is the leading capital (e.g. "Ash") are rejected -- confirm
    # this is intended.
    return blank_name if !family.nil? && family.length <= 4 && family !~ /[aeiouy]/
    return blank_name if !family.nil? && family.match(/[A-Z]$/)
    return blank_name if !family.nil? && family.match(/^[A-Z]{2}/)

    { given: given, family: family }
  end

end
@@ -0,0 +1,7 @@
1
# Version information for the dwc_agent gem.
class DwcAgent
  # Gem version string; frozen so it cannot be mutated by accident.
  VERSION = "0.0.1".freeze

  # @return [String] the gem version (same object as VERSION)
  def self.version
    VERSION
  end
end
metadata ADDED
@@ -0,0 +1,118 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dwc_agent
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - David P. Shorthouse
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-09-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: namae
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: capitalize-names
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '11.1'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '11.1'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.4'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.4'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.16'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.16'
83
+ description: Parses the typically messy content in Darwin Core terms that contain
84
+ people names
85
+ email: davidpshorthouse@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - lib/dwc_agent.rb
91
+ - lib/dwc_agent/version.rb
92
+ homepage: https://github.com/dshorthouse/dwc_agent
93
+ licenses:
94
+ - MIT
95
+ metadata: {}
96
+ post_install_message:
97
+ rdoc_options:
98
+ - "--encoding"
99
+ - UTF-8
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - "~>"
105
+ - !ruby/object:Gem::Version
106
+ version: '2.4'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubyforge_project:
114
+ rubygems_version: 2.6.12
115
+ signing_key:
116
+ specification_version: 4
117
+ summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
118
+ test_files: []