dwc_agent 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 45eaa059cce81b1bcd7efd1b8d7a88ff0f57c9e3
4
+ data.tar.gz: f3200aa768500b618c156b8fb28531670e956288
5
+ SHA512:
6
+ metadata.gz: 2eb764cf79c7650731f77f95c1fb2a40eb35d0ad784dc39ba7b0496d6264a62169490ac0543994941a9512cec0e77ea5c31f4635eaaec85cd5c8d2aa24be2096
7
+ data.tar.gz: 82ec7f273eb65eef51395d7736479d65593b064ca94a8cfff53182d8834d58bb95f417112a25842a26f514202310e7267371a97ac300079f5bed2c2eda45b4d7
data/lib/dwc_agent.rb ADDED
@@ -0,0 +1,271 @@
1
+ require "dwc_agent/version"
2
+ require "capitalize_names"
3
+ require "namae"
4
+
5
+ class DwcAgent
6
+
7
# STRIP_OUT: extended-mode (x flag) regex of fragments that DwcAgent.parse
# replaces with a single space before the string is handed to Namae.
# Its alternatives include: digit+letter tokens, "et al." / "and others" / "etc"
# trailers, placeholder words (unknown, anonymous, n/a, undetermined, ...),
# English and French month/date expressions, role and annotation words
# (collector, source, pers. comm., according to, ...), fixed tokens such as
# FNA / DAO / HUH, the characters " : !, any run of digits, a trailing , or ;,
# and strings consisting only of 0-2 word characters or of 2+ capital letters.
# Alternation is ordered: at a given position the first alternative that
# matches wins, so reordering lines can change what gets stripped.
+ STRIP_OUT = %r{
8
+ \b\d+\(?(?i:[[:alpha:]])\)?\b|
9
+ \b[,;]?\s*(?i:et\s+al)\.?|
10
+ \bu\.\s*a\.|
11
+ \b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
12
+ \b[,;]?\s*(?i:etc)\.?|
13
+ \b[,;]?\s*(?i:on)\b|
14
+ \b[,;]?\s*(?i:unkn?own)\b|
15
+ \b[,;]?\s*(?i:n/a)\b|
16
+ \b[,;]?\s*(?i:ann?onymous)\b|
17
+ \b[,;]?\s*(?i:undetermined|indeterminable|dummy)\b|
18
+ \b[,;]?\s*(?i:importer)\b|
19
+ \b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
20
+ (?i:no\s+(data|disponible))|
21
+ \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
22
+ [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
23
+ May|Jun|Jul|Aug|Sept?|
24
+ Oct|Nov|Dec)\.?\s*[-/\s+]?\d+|
25
+ \b[,;]?\s*(?i:Jan|Jan(uary|vier))[.,;]?\s*\d+|
26
+ \b[,;]?\s*(?i:Feb|February|f(é|e)vrier)[.,;]?\s*\d+|
27
+ \b[,;]?\s*(?i:Mar|Mar(ch|s))[.,;]?\s*\d+|
28
+ \b[,;]?\s*(?i:Apr|Apri|April|avril)[.,;]?\s*\d+|
29
+ \b[,;]?\s*(?i:Ma(y|i))[.,;]?\s*\d+|
30
+ \b[,;]?\s*(?i:Jun|June|juin)[.,;]?\s*\d+|
31
+ \b[,;]?\s*(?i:Jul|July|juillet)[.,;]?\s*\d+|
32
+ \b[,;]?\s*(?i:Aug|August|ao(û|u)t)[.,;]?\s*\d+|
33
+ \b[,;]?\s*(?i:Sep|Sept|Septemb(er|re))[.,;]?\s*\d+|
34
+ \b[,;]?\s*(?i:Oct|Octob(er|re))[.,;]?\s*\d+|
35
+ \b[,;]?\s*(?i:Nov|Novemb(er|re))[.,;]?\s*\d+|
36
+ \b[,;]?\s*(?i:Dec|D(é|e)cemb(er|re))[.,;]?\s*\d+|
37
+ \d+\s+(?i:Jan|Jan(uary|vier))\.?\b|
38
+ \d+\s+(?i:Feb|February|f(é|e)vrier)\.?\b|
39
+ \d+\s+(?i:Mar|March|mars)\.?\b|
40
+ \d+\s+(?i:Apr|Apri|April|avril)\.?\b|
41
+ \d+\s+(?i:Ma(y|i))\b|
42
+ \d+\s+(?i:Jun|June|juin)\.?\b|
43
+ \d+\s+(?i:Jul|July|juillet)\.?\b|
44
+ \d+\s+(?i:Aug|August|ao(û|u)t)\.?\b|
45
+ \d+\s+(?i:Sep|Septemb(er|re))t?\.?\b|
46
+ \d+\s+(?i:Oct|Octob(er|re))\.?\b|
47
+ \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
48
+ \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
49
+ (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
50
+ \b\s*maybe\s*\b|
51
+ \(?(?i:collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
52
+ (?i:fide)\:?\s*\b|
53
+ (?i:game\s+dept)\.?\s*\b|
54
+ (?i:see\s+notes?\s*(inside)?)|
55
+ (?i:see\s+letter\s+enclosed)|
56
+ (?i:(by)?\s+correspondance)|
57
+ (?i:pers\.?\s+comm\.?)|
58
+ (?i:crossed\s+out)|
59
+ \(?(?i:source)\(?|
60
+ (?i:according\s+to)|
61
+ (?i:revised|photograph|fruits\s+only)|
62
+ -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
63
+ -?\s*(?i:synonym(y|ie))|
64
+ \b\s*\(?(?i:(fe)?male)\)?\s*\b|
65
+ \b(?i:to\s+(sub)?spp?)\.?|
66
+ (?i:nom\.?\s+rev\.?)|
67
+ FNA|DAO|HUH|\(MT\)|(?i:\(KEW\))|
68
+ (?i:uqam)|
69
+ \b[,;]\s+\d+\z|
70
+ [":!]|
71
+ [,]?\d+|
72
+ \s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+|
73
+ [,;]\z|
74
+ ^\w{0,2}\z|
75
+ ^[A-Z]{2,}\z|
76
+ \s+(?i:stet)\s*!?\s*\z|
77
+ \s+(?i:prep)\.?\s*\z
78
+ }x
79
+
80
# SPLIT_BY: extended-mode (x flag) regex that this file assigns to
# Namae.options[:separator] (see the Namae.options lines after the constants).
# Alternatives include the characters – | & + / ;, a hyphen surrounded by
# whitespace, conjunctions (and, et, with, per, or, then), and annotation
# phrases such as "det. by", "conf. by", "verified by", "identified by",
# "via", "from" and "donated by".
# NOTE(review): "\be\s*\b" is the only word alternative without (?i:...), and
# "redet\.?(\s+by?)?" makes the "y" optional — confirm both are intentional.
+ SPLIT_BY = %r{
81
+ [–|&+/;]|
82
+ \s+-\s+|
83
+ \s+a\.\s+|
84
+ \b(?i:and|et|with|per)\s+|
85
+ \be\s*\b|
86
+ \b(?i:annotated(\s+by)?)\s*\b|
87
+ \b(?i:coll\.)\s*\b|
88
+ \b(?i:communicate?d(\s+to)?)\s*\b|
89
+ \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
90
+ \b(?i:checked?(\s+by)?)\s*\b|
91
+ \b(?i:det\.?(\s+by)?)\s*\b|
92
+ \b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
93
+ \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
94
+ \b(?i:in?dentified(\s+by)?)\s*\b|
95
+ \b(?i:in\s+part(\s+by)?)\s*\b|
96
+ \b(?i:or)\s+|
97
+ \b(?i:prep\.?\s+(?i:by)?)\s*\b|
98
+ \b(?i:redet\.?(\s+by?)?)\s*\b|
99
+ \b(?i:reidentified(\s+by)?)\s*\b|
100
+ \b(?i:stet)\s*\b|
101
+ \b(?i:then(\s+by)?)\s+|
102
+ \b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b|
103
+ \b(?i:via|from)\s*\b|
104
+ \b(?i:(donated)?\s*by)\s+
105
+ }x
106
+
107
# Per-character replacements that DwcAgent.parse applies through
# String#gsub(pattern, hash): each key is a single character and its value is
# the text that replaces it. Separator-like characters (| / &) are padded with
# spaces, brackets become a space, and ? ! = # * are removed.
# Frozen so the shared table cannot be modified at runtime.
CHAR_SUBS = {
  '|' => ' | ',
  '(' => ' ',
  ')' => ' ',
  '[' => ' ',
  ']' => ' ',
  '?' => '',
  '!' => '',
  '=' => '',
  '#' => '',
  '/' => ' / ',
  '&' => ' & ',
  '*' => ''
}.freeze
121
+
122
# COMPLEX_SEPARATORS: matches a whole line made of two "Family, I." groups
# (4 or more ASCII letters, a comma, whitespace, then one or more capital
# initials each followed by a period) separated only by whitespace.
# DwcAgent.parse rewrites a match as '\1 | \2', i.e. inserts " | " between
# the two captured groups.
+ COMPLEX_SEPARATORS = %r{
123
+ ^([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})\s+([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})$
124
+ }x
125
+
126
# BLACKLIST: extended-mode (x flag) regex of words and word stems; DwcAgent.clean
# returns the blank name when the parsed name's display_order matches it.
# Most alternatives are case-insensitive and carry no word-boundary anchors, so
# they also match inside longer words (e.g. "unit", "team", "zoo", "class").
# NOTE(review): "measurement" and "national" each appear twice; the duplicates
# do not change what matches.
+ BLACKLIST = %r{
127
+ (?i:abundant)|
128
+ (?i:adult|juvenile)|
129
+ (?i:anon)|
130
+ (?i:average)|
131
+ (?i:believe|unclear|illegible|none|suggested|(dis)?agrees?)|approach|
132
+ (?i:barcod)|
133
+ (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
134
+ (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
135
+ (?i:carex|salix)|
136
+ (?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
137
+ \b\s*(?i:help)\s*\b|
138
+ (?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
139
+ (?i:desconocido)|
140
+ (?i:evidence)|
141
+ (?i:inconn?u)|
142
+ (?i:internation|gou?vern|ministry|unit|district|provincial|na(c|t)ional|military|region|environ|natur(e|al)|naturelles|division|program|direction|national)|
143
+ (?i:label)|
144
+ (?i:o?\.?m\.?n\.?r\.?)|
145
+ (?i:measurement)|
146
+ (?i:ent(o|y)mology)|
147
+ (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
148
+ (?i:univ\.)|
149
+ (?i:graduate|student|estudantes|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
150
+ (?i:non\s+pr(é|e)cis(é|e))|
151
+ (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
152
+ (?i:recreation|culture)|
153
+ (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
154
+ (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|alliance|protective|circle)|
155
+ (?i:commercial|company|control|product)|
156
+ (?i:size|large|colou?r)\s+|
157
+ (?i:skeleton)|
158
+ (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|expedi(c|t)ion|festival|generation|inventory|marine|service)|
159
+ (?i:submersible)|
160
+ (?i:synonymy?)|(topo|syn|holo)type|
161
+ (?i:systematic|perspective)|
162
+ \s+(?i:off)\s+|
163
+ \s*(?i:too)\s+|\s*(?i:the)\s+|
164
+ (?i:taxiderm(ies|y))|
165
+ (?i:though)|
166
+ (?i:toward|seen at)|
167
+ (?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
168
+ (?i:urn\:)|
169
+ (?i:usda|ucla)|
170
+ (?i:workshop|garden|farm|jardin|public)
171
+ }x
172
+
173
# TITLE: case-insensitive pattern of honorifics, ranks and religious titles
# (sir, lord, dr, prof, capt, rev, mme, frère, père, ...), each followed by
# whitespace or end of line. Assigned to Namae.options[:title] in this file.
+ TITLE = /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor)(\s+|$)/i
174
+
175
# Configure Namae when this file is loaded: prefer commas as separators, and
# use this class's SPLIT_BY and TITLE patterns for separators and titles.
# NOTE(review): these write to Namae.options itself, so presumably they affect
# every Namae caller in the same process — confirm against Namae's docs.
+ Namae.options[:prefer_comma_as_separator] = true
176
+ Namae.options[:separator] = SPLIT_BY
177
+ Namae.options[:title] = TITLE
178
+
179
# Cleans a raw Darwin Core agent string and hands it to Namae for parsing.
#
# Cleaning steps, in order:
#   1. replace every STRIP_OUT match with a space;
#   2. apply the per-character replacements in CHAR_SUBS;
#   3. insert a space between a capital initial and a word glued to it
#      ("J.Smith" becomes "J. Smith");
#   4. insert " | " between the two groups matched by COMPLEX_SEPARATORS;
#   5. drop a trailing comma, collapse repeated spaces and trim the ends.
#
# @param name [String, nil] raw agent text; nil is treated as an empty string
# @return the result of Namae.parse for the cleaned string
def self.parse(name)
  # Regexp.union escapes every key on its own, so CHAR_SUBS may gain keys such
  # as '^', '-' or ']' without corrupting a hand-assembled character class.
  char_pattern = Regexp.union(CHAR_SUBS.keys)

  cleaned = name.to_s
                .gsub(STRIP_OUT, ' ')
                .gsub(char_pattern, CHAR_SUBS)
                .gsub(/([A-Z]{1}\.)([[:alpha:]]{2,})/, '\1 \2')
                .gsub(COMPLEX_SEPARATORS, '\1 | \2')
                .gsub(/,\z/, '')
                .squeeze(' ')
                .strip

  Namae.parse(cleaned)
end
188
+
189
# Vets one parsed name and normalizes its given and family parts.
#
# Returns the blank name ({ given: nil, family: nil }) when any of these hold:
#   * family is shorter than 3 characters, or is 3 characters with one period;
#   * given is longer than 15 characters, or has 3+ periods and contains a
#     4+ letter word followed by another dotted token;
#   * family or given contains two words of 2+ letters separated by whitespace;
#   * display_order matches BLACKLIST;
#   * after normalization, family is 4 characters or fewer with no lowercase
#     vowel (aeiouy), ends a line with a capital, or starts a line with two
#     capitals.
#
# Otherwise given and family are swapped when family looks like initials
# (contains a period and has at most 3 other characters), all-upper or
# all-lower values are re-capitalized, a lone given name without periods is
# moved to family, and a particle is promoted to given when given is missing.
#
# NOTE: parsed_namae is modified in place (given/family may be reassigned and
# normalize_initials is called on it).
#
# @param parsed_namae an object responding to given, family, particle,
#   display_order, given=, family= and normalize_initials (presumably a name
#   returned by Namae.parse — verify against callers)
# @return [Hash] { given: String or nil, family: String or nil }
def self.clean(parsed_namae)
  blank_name = { given: nil, family: nil }
  word_pair = /[a-zA-Z]{2,}\.?\s+[a-zA-Z]{2,}/

  raw_family = parsed_namae.family
  raw_given = parsed_namae.given

  # Rejections that depend only on the untouched input.
  if raw_family
    return blank_name if raw_family.length < 3
    return blank_name if raw_family.length == 3 && raw_family.count('.') == 1
    return blank_name if word_pair.match?(raw_family)
  end
  if raw_given
    return blank_name if raw_given.length > 15
    return blank_name if raw_given.count('.') >= 3 &&
                         /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match?(raw_given)
    return blank_name if word_pair.match?(raw_given)
  end
  return blank_name if BLACKLIST.match?(parsed_namae.display_order)

  # A "family" that is really a set of initials means the parts are reversed.
  if raw_given && raw_family && raw_family.count('.') > 0 &&
     raw_family.length - raw_family.count('.') <= 3
    parsed_namae.family = raw_given
    parsed_namae.given = raw_family
  end

  # Re-capitalize a given name written entirely in one case (no initials).
  current_given = parsed_namae.given
  if current_given && current_given.length >= 4 && !current_given.include?('.') &&
     (current_given == current_given.upcase || current_given == current_given.downcase)
    parsed_namae.given = CapitalizeNames.capitalize(current_given)
  end

  # Given names holding initials: capitalize, then restore the first lowercase
  # initial to upper case.
  if parsed_namae.given && /[A-Za-z]\./.match?(parsed_namae.given)
    parsed_namae.given = CapitalizeNames.capitalize(parsed_namae.given).sub(/[a-z]\./, &:upcase)
  end

  parsed_namae.normalize_initials

  # Safe navigation replaces the former blanket `rescue nil`, so only a
  # missing (nil) part yields nil; genuine errors are no longer swallowed.
  family = parsed_namae.family&.gsub(/\.\z/, '')&.strip
  given = parsed_namae.given&.strip
  particle = parsed_namae.particle&.strip

  # A lone given name without initials is treated as the family name.
  if family.nil? && given && !given.include?('.')
    family = given
    given = nil
  end

  # With no given name, reuse the particle as the given name.
  if family && given.nil? && particle
    given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
  end

  # String#capitalize is Unicode-aware on the Ruby versions this gem supports,
  # so no ActiveSupport `mb_chars` is needed; previously a missing `mb_chars`
  # was rescued and silently turned the family name into nil.
  if family && (family == family.upcase || family == family.downcase)
    family = family.capitalize
  end

  if family
    return blank_name if family.length <= 4 && family !~ /[aeiouy]/
    return blank_name if family.match?(/[A-Z]$/)
    return blank_name if family.match?(/^[A-Z]{2}/)
  end

  { given: given, family: family }
end
270
+
271
+ end
data/lib/dwc_agent/version.rb ADDED
@@ -0,0 +1,7 @@
1
# Version information for the dwc_agent gem.
class DwcAgent
  # Gem version string; frozen so the shared constant cannot be mutated.
  VERSION = "0.0.1".freeze

  # @return [String] the gem version (the VERSION constant itself)
  def self.version
    VERSION
  end
end
metadata ADDED
@@ -0,0 +1,118 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dwc_agent
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - David P. Shorthouse
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-09-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: namae
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: capitalize-names
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '11.1'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '11.1'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.4'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.4'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.16'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.16'
83
+ description: Parses the typically messy content in Darwin Core terms that contain
84
+ people names
85
+ email: davidpshorthouse@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - lib/dwc_agent.rb
91
+ - lib/dwc_agent/version.rb
92
+ homepage: https://github.com/dshorthouse/dwc_agent
93
+ licenses:
94
+ - MIT
95
+ metadata: {}
96
+ post_install_message:
97
+ rdoc_options:
98
+ - "--encoding"
99
+ - UTF-8
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - "~>"
105
+ - !ruby/object:Gem::Version
106
+ version: '2.4'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubyforge_project:
114
+ rubygems_version: 2.6.12
115
+ signing_key:
116
+ specification_version: 4
117
+ summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
118
+ test_files: []