dwc_agent 0.4.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7bf17473ddabfee14ba9e7a8d451486ef8ebb058c9d1d0214b0ed9cbae996e48
4
- data.tar.gz: cb404e507a9f9de35a5ddeebf94a6bdd4e0ad7b245e08909b1d71f50bd7595da
3
+ metadata.gz: dcdfb4922038d07715bd064b74a516203bcce44ef824683ac9766586764bb98f
4
+ data.tar.gz: d9113f199abc420d7c608bfbead6e70f1f6192437a82e43541f77eb187752c2a
5
5
  SHA512:
6
- metadata.gz: a25e5863c6fa384604e399815ec1c4c9befadaf1ebdefb4ef52abdb1e9a296ace16718bab6020a32c3f1ed9c0728c46e76fdfa8c5b3133e98e48b0478c155858
7
- data.tar.gz: 1243c70e9d479ca15dfc931e823939216873f536c3441f77e90bc9461e639e2d408e335d31418e0e31616686d24914ceffd43ae97fe9388f2ccfab443b102f1f
6
+ metadata.gz: 403049856dbfbc83c984b5175615718c738b46312ecced4d3b5853fb4584a080d1e868492ee476187c8cf7238481845c0f77bd181e6ceaeebc22bae8b7aef127
7
+ data.tar.gz: 01e8bc4beb84140a80763c6c0dc8ff6ee6f967e8382cbacf04a230aa3405f2394ec1a90570471e113d90264c1e202c5452428858eb5aff6bc499f4e795dc3158
@@ -16,13 +16,9 @@ module DwcAgent
16
16
  # @param parsed_namae [Object] the namae object
17
17
  # @return [Hash] the given, family hash
18
18
  def clean(parsed_namae)
19
- blank_name = { given: nil, family: nil }
19
+ blank_name = { given: nil, family: nil, particle: nil }
20
20
 
21
- if parsed_namae.family && FAMILY_BLACKLIST.include?(parsed_namae.family)
22
- return blank_name
23
- end
24
-
25
- if parsed_namae.family && parsed_namae.family.length < 2 && parsed_namae.family.count('.') == 0
21
+ if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
26
22
  return blank_name
27
23
  end
28
24
 
@@ -99,6 +95,10 @@ module DwcAgent
99
95
  particle = nil
100
96
  end
101
97
 
98
+ if !particle.nil? && particle.include?(".")
99
+ particle = nil
100
+ end
101
+
102
102
  if !family.nil? && (family == family.upcase || family == family.downcase)
103
103
  family = NameCase(family)
104
104
  end
@@ -111,7 +111,11 @@ module DwcAgent
111
111
  return blank_name
112
112
  end
113
113
 
114
- { given: given, family: family }
114
+ if !family.nil? && FAMILY_BLACKLIST.any?{ |s| s.casecmp(family) == 0 }
115
+ return blank_name
116
+ end
117
+
118
+ { given: given, family: family, particle: particle }
115
119
  end
116
120
 
117
121
  end
@@ -20,7 +20,6 @@ module DwcAgent
20
20
  \b[,;]?\s*(?i:person\s*string)\b|
21
21
  \b[,;]?\s*(?i:colls)\.(\b|\z)|
22
22
  \b[,;]?\s*(?i:colln?)[:.]?(\b|\z)|
23
- (?i:no\s+(data|disponible))|
24
23
  \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
25
24
  [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
26
25
  May|Jun|Jul|Aug|Sept?|
@@ -78,10 +77,11 @@ module DwcAgent
78
77
  \b\s*\(?(?i:(fe)?male)\)?\s*\b|
79
78
  \b(?i:to\s+(sub)?spp?)\.?|
80
79
  (?i:nom\.?\s+rev\.?)|
81
- FNA|DAO|HUH|FDNMB|
80
+ FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
82
81
  AFSC\/POLISH\s+SORTING\s+CTR\.?|
83
82
  (?i:university|museum|exhibits?)|
84
83
  (?i:uqam)|
84
+ (?i:sem\s+(colec?tor|data))|
85
85
  \b[,;]\s+\d+\z|
86
86
  ["!@?]|
87
87
  [,]?\d+|
@@ -147,7 +147,8 @@ module DwcAgent
147
147
  '{' => '',
148
148
  '}' => '',
149
149
  '@' => '',
150
- '%' => ''
150
+ '%' => '',
151
+ '\\' => ''
151
152
  }
152
153
 
153
154
  PHRASE_SUBS = {
@@ -158,12 +159,23 @@ module DwcAgent
158
159
  }
159
160
 
160
161
  COMPLEX_SEPARATORS = %r{
161
- ^([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})\s+([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})$
162
+ ^(\S{4,},\s+(?:\S\.\s*){1,})\s+(\S{4,},\s+(?:\S\.\s*){1,})$
162
163
  }x
163
164
 
165
+ # Was used in 1.1.0 but it sunk performance so threw it back to a WIP
166
+ #
167
+ # @contracted_list_regex = Regexp.new CONTRACTED_LIST.to_s
168
+ #
169
+ # name.gsub!(@contracted_list_regex, '\1 \3 | \2 \3')
170
+ #
171
+ # CONTRACTED_LIST = %r{
172
+ # ^(\S{1,}\.?)+\s+(?i:and|&)\s+(\S{1,}\.?)+\s*(.*)$
173
+ # }x
174
+
164
175
  BLACKLIST = %r{
165
176
  (?i:abundant)|
166
177
  (?i:adult|juvenile)|
178
+ (?i:administra(d|t)or)|
167
179
  (?i:anon)|
168
180
  (?i:australian?)|
169
181
  (?i:average)|
@@ -172,9 +184,10 @@ module DwcAgent
172
184
  (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
173
185
  (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
174
186
  (?i:carex|salix)|
175
- (?:catalog)|
187
+ (?i:catalog(ue)?)|
176
188
  (?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
177
189
  \b\s*(?i:help)\s*\b|
190
+ (?i:data\s+not\s+captured)|
178
191
  (?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
179
192
  (?i:desconocido)|
180
193
  (?i:exc?s?icc?at(a|i))|
@@ -192,7 +205,9 @@ module DwcAgent
192
205
  (?i:univ\.)|
193
206
  (?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
194
207
  (?i:non\s+pr(é|e)cis(é|e))|
195
- (?i:not?\s+stated)|
208
+ (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
209
+ (?i:not?\s+(entered|stated))|
210
+ (?i:nomenclatur(e|al)\s+adjustment)|
196
211
  (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
197
212
  (?i:recreation|culture)|
198
213
  (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
@@ -220,7 +235,12 @@ module DwcAgent
220
235
  FAMILY_BLACKLIST = [
221
236
  "der",
222
237
  "van",
223
- "von"
238
+ "von",
239
+ "the",
240
+ "of",
241
+ "curators",
242
+ "nomenclatural",
243
+ "adjustment"
224
244
  ]
225
245
 
226
246
  TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
@@ -20,7 +20,7 @@ module DwcAgent
20
20
  @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join('\\')].to_s
21
21
  @phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
22
22
  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
23
- @add_separators_regex = Regexp.new %r{([A-Z]{1}\.)([[:alpha:]]{2,})}.to_s
23
+ @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
24
24
  end
25
25
 
26
26
  # Parses the passed-in string and returns a list of names.
@@ -1,9 +1,9 @@
1
1
  module DwcAgent
2
2
  class Version
3
3
 
4
- MAJOR = 0
5
- MINOR = 4
6
- PATCH = 1
4
+ MAJOR = 1
5
+ MINOR = 2
6
+ PATCH = 0
7
7
  BUILD = nil
8
8
 
9
9
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-26 00:00:00.000000000 Z
11
+ date: 2019-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae