dwc_agent 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +19 -0
- data/lib/dwc_agent/constants.rb +17 -6
- data/lib/dwc_agent/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7bf17473ddabfee14ba9e7a8d451486ef8ebb058c9d1d0214b0ed9cbae996e48
|
4
|
+
data.tar.gz: cb404e507a9f9de35a5ddeebf94a6bdd4e0ad7b245e08909b1d71f50bd7595da
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a25e5863c6fa384604e399815ec1c4c9befadaf1ebdefb4ef52abdb1e9a296ace16718bab6020a32c3f1ed9c0728c46e76fdfa8c5b3133e98e48b0478c155858
|
7
|
+
data.tar.gz: 1243c70e9d479ca15dfc931e823939216873f536c3441f77e90bc9461e639e2d408e335d31418e0e31616686d24914ceffd43ae97fe9388f2ccfab443b102f1f
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -18,18 +18,26 @@ module DwcAgent
|
|
18
18
|
def clean(parsed_namae)
|
19
19
|
blank_name = { given: nil, family: nil }
|
20
20
|
|
21
|
+
if parsed_namae.family && FAMILY_BLACKLIST.include?(parsed_namae.family)
|
22
|
+
return blank_name
|
23
|
+
end
|
24
|
+
|
21
25
|
if parsed_namae.family && parsed_namae.family.length < 2 && parsed_namae.family.count('.') == 0
|
22
26
|
return blank_name
|
23
27
|
end
|
28
|
+
|
24
29
|
if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
|
25
30
|
return blank_name
|
26
31
|
end
|
32
|
+
|
27
33
|
if parsed_namae.given && parsed_namae.given.length > 25
|
28
34
|
return blank_name
|
29
35
|
end
|
36
|
+
|
30
37
|
if parsed_namae.given && parsed_namae.given.count('.') >= 3 && /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
|
31
38
|
return blank_name
|
32
39
|
end
|
40
|
+
|
33
41
|
if parsed_namae.display_order =~ BLACKLIST
|
34
42
|
return blank_name
|
35
43
|
end
|
@@ -44,6 +52,17 @@ module DwcAgent
|
|
44
52
|
parsed_namae.given = family
|
45
53
|
end
|
46
54
|
|
55
|
+
if parsed_namae.given &&
|
56
|
+
parsed_namae.family &&
|
57
|
+
parsed_namae.family.length <=3 &&
|
58
|
+
parsed_namae.family == parsed_namae.family.upcase &&
|
59
|
+
parsed_namae.given[-1] != "."
|
60
|
+
given = parsed_namae.given
|
61
|
+
family = parsed_namae.family
|
62
|
+
parsed_namae.family = given
|
63
|
+
parsed_namae.given = family
|
64
|
+
end
|
65
|
+
|
47
66
|
if parsed_namae.given &&
|
48
67
|
(parsed_namae.given == parsed_namae.given.upcase ||
|
49
68
|
parsed_namae.given == parsed_namae.given.downcase) &&
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module DwcAgent
|
2
2
|
STRIP_OUT = %r{
|
3
|
+
^[\[{(]|
|
4
|
+
[\]})]$|
|
3
5
|
\s*?\d+\.\d+|
|
4
6
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
5
7
|
\b[,;]?\s*(?i:et\.?\s+al)\.?|
|
@@ -11,7 +13,7 @@ module DwcAgent
|
|
11
13
|
\b[,;]?\s*(?i:unkn?own)\b|
|
12
14
|
\b[,;]?\s*(?i:n/a)\b|
|
13
15
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
14
|
-
\b[,;]?\s
|
16
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|illegible|scripsit)\)?\b|
|
15
17
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
16
18
|
\b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
|
17
19
|
\b[,;]?\s*(?i:string)\b|
|
@@ -81,7 +83,7 @@ module DwcAgent
|
|
81
83
|
(?i:university|museum|exhibits?)|
|
82
84
|
(?i:uqam)|
|
83
85
|
\b[,;]\s+\d+\z|
|
84
|
-
["
|
86
|
+
["!@?]|
|
85
87
|
[,]?\d+|
|
86
88
|
\s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+|
|
87
89
|
[,;]\z|
|
@@ -97,7 +99,8 @@ module DwcAgent
|
|
97
99
|
^[-,.\s;*\d]+\s?|
|
98
100
|
-\d?\z|
|
99
101
|
\s*?-{2,}\s*?|
|
100
|
-
^(?i:exc?p?)[:.]\s
|
102
|
+
^(?i:exc?p?)[:.]\s*|
|
103
|
+
\s+de\s*$
|
101
104
|
}x
|
102
105
|
|
103
106
|
SPLIT_BY = %r{
|
@@ -164,7 +167,7 @@ module DwcAgent
|
|
164
167
|
(?i:anon)|
|
165
168
|
(?i:australian?)|
|
166
169
|
(?i:average)|
|
167
|
-
(?i:believe|unclear|
|
170
|
+
(?i:believe|unclear|ill?egible|none|suggested|(dis)?agrees?)|approach|
|
168
171
|
(?i:barcod)|
|
169
172
|
(?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
|
170
173
|
(?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
|
@@ -189,6 +192,7 @@ module DwcAgent
|
|
189
192
|
(?i:univ\.)|
|
190
193
|
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
191
194
|
(?i:non\s+pr(é|e)cis(é|e))|
|
195
|
+
(?i:not?\s+stated)|
|
192
196
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
193
197
|
(?i:recreation|culture)|
|
194
198
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
@@ -209,9 +213,16 @@ module DwcAgent
|
|
209
213
|
(?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
|
210
214
|
(?i:urn\:)|
|
211
215
|
(?i:usda|ucla)|
|
212
|
-
(?i:workshop|garden|farm|jardin|public)
|
216
|
+
(?i:workshop|garden|farm|jardin|public)|
|
217
|
+
^\s*?de\s*?$
|
213
218
|
}x
|
214
219
|
|
215
|
-
|
220
|
+
FAMILY_BLACKLIST = [
|
221
|
+
"der",
|
222
|
+
"van",
|
223
|
+
"von"
|
224
|
+
]
|
225
|
+
|
226
|
+
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
216
227
|
|
217
228
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-09-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|