dwc_agent 0.3.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +23 -0
- data/lib/dwc_agent/constants.rb +19 -6
- data/lib/dwc_agent/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f54434352d90dff97e7c733fbc2fb53117380d5f04d6841d9796f48a5603dc18
|
4
|
+
data.tar.gz: 1ca4ae5b5bf104b51fa030cfe3984e428ab4325df266a6c90de50dff134038c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bd1a44b5926abba1a7124493980fcf8e91aebc3bf854bca8c92da973299a345f8eb79cd3a02696e93d1c4426a6fa3bd5ff3414239c884566ed5de0679b0e3760
|
7
|
+
data.tar.gz: 2dcea697b1c7d589227469e5a69a3da40f71886c01e29a714557276bcdc0d3a2755cc0ff89bf36dcd70388fd640917384b776ace8de4bd3352cdea217d08ae3e
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -18,18 +18,26 @@ module DwcAgent
|
|
18
18
|
def clean(parsed_namae)
|
19
19
|
blank_name = { given: nil, family: nil }
|
20
20
|
|
21
|
+
if parsed_namae.family && FAMILY_BLACKLIST.include?(parsed_namae.family)
|
22
|
+
return blank_name
|
23
|
+
end
|
24
|
+
|
21
25
|
if parsed_namae.family && parsed_namae.family.length < 2 && parsed_namae.family.count('.') == 0
|
22
26
|
return blank_name
|
23
27
|
end
|
28
|
+
|
24
29
|
if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
|
25
30
|
return blank_name
|
26
31
|
end
|
32
|
+
|
27
33
|
if parsed_namae.given && parsed_namae.given.length > 25
|
28
34
|
return blank_name
|
29
35
|
end
|
36
|
+
|
30
37
|
if parsed_namae.given && parsed_namae.given.count('.') >= 3 && /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
|
31
38
|
return blank_name
|
32
39
|
end
|
40
|
+
|
33
41
|
if parsed_namae.display_order =~ BLACKLIST
|
34
42
|
return blank_name
|
35
43
|
end
|
@@ -44,6 +52,17 @@ module DwcAgent
|
|
44
52
|
parsed_namae.given = family
|
45
53
|
end
|
46
54
|
|
55
|
+
if parsed_namae.given &&
|
56
|
+
parsed_namae.family &&
|
57
|
+
parsed_namae.family.length <=3 &&
|
58
|
+
parsed_namae.family == parsed_namae.family.upcase &&
|
59
|
+
parsed_namae.given[-1] != "."
|
60
|
+
given = parsed_namae.given
|
61
|
+
family = parsed_namae.family
|
62
|
+
parsed_namae.family = given
|
63
|
+
parsed_namae.given = family
|
64
|
+
end
|
65
|
+
|
47
66
|
if parsed_namae.given &&
|
48
67
|
(parsed_namae.given == parsed_namae.given.upcase ||
|
49
68
|
parsed_namae.given == parsed_namae.given.downcase) &&
|
@@ -92,6 +111,10 @@ module DwcAgent
|
|
92
111
|
return blank_name
|
93
112
|
end
|
94
113
|
|
114
|
+
if !family.nil? && FAMILY_BLACKLIST.include?(family)
|
115
|
+
return blank_name
|
116
|
+
end
|
117
|
+
|
95
118
|
{ given: given, family: family }
|
96
119
|
end
|
97
120
|
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -83,7 +83,7 @@ module DwcAgent
|
|
83
83
|
(?i:university|museum|exhibits?)|
|
84
84
|
(?i:uqam)|
|
85
85
|
\b[,;]\s+\d+\z|
|
86
|
-
["
|
86
|
+
["!@?]|
|
87
87
|
[,]?\d+|
|
88
88
|
\s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+|
|
89
89
|
[,;]\z|
|
@@ -99,7 +99,8 @@ module DwcAgent
|
|
99
99
|
^[-,.\s;*\d]+\s?|
|
100
100
|
-\d?\z|
|
101
101
|
\s*?-{2,}\s*?|
|
102
|
-
^(?i:exc?p?)[:.]\s
|
102
|
+
^(?i:exc?p?)[:.]\s*|
|
103
|
+
\s+de\s*$
|
103
104
|
}x
|
104
105
|
|
105
106
|
SPLIT_BY = %r{
|
@@ -146,7 +147,8 @@ module DwcAgent
|
|
146
147
|
'{' => '',
|
147
148
|
'}' => '',
|
148
149
|
'@' => '',
|
149
|
-
'%' => ''
|
150
|
+
'%' => '',
|
151
|
+
'\\' => ''
|
150
152
|
}
|
151
153
|
|
152
154
|
PHRASE_SUBS = {
|
@@ -166,7 +168,7 @@ module DwcAgent
|
|
166
168
|
(?i:anon)|
|
167
169
|
(?i:australian?)|
|
168
170
|
(?i:average)|
|
169
|
-
(?i:believe|unclear|
|
171
|
+
(?i:believe|unclear|ill?egible|none|suggested|(dis)?agrees?)|approach|
|
170
172
|
(?i:barcod)|
|
171
173
|
(?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
|
172
174
|
(?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
|
@@ -191,6 +193,7 @@ module DwcAgent
|
|
191
193
|
(?i:univ\.)|
|
192
194
|
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
193
195
|
(?i:non\s+pr(é|e)cis(é|e))|
|
196
|
+
(?i:not?\s+stated)|
|
194
197
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
195
198
|
(?i:recreation|culture)|
|
196
199
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
@@ -211,9 +214,19 @@ module DwcAgent
|
|
211
214
|
(?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
|
212
215
|
(?i:urn\:)|
|
213
216
|
(?i:usda|ucla)|
|
214
|
-
(?i:workshop|garden|farm|jardin|public)
|
217
|
+
(?i:workshop|garden|farm|jardin|public)|
|
218
|
+
^\s*?de\s*?$
|
215
219
|
}x
|
216
220
|
|
217
|
-
|
221
|
+
FAMILY_BLACKLIST = [
|
222
|
+
"der",
|
223
|
+
"van",
|
224
|
+
"von",
|
225
|
+
"the",
|
226
|
+
"The",
|
227
|
+
"Catalog"
|
228
|
+
]
|
229
|
+
|
230
|
+
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
218
231
|
|
219
232
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-09-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|