dwc_agent 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +19 -0
- data/lib/dwc_agent/constants.rb +17 -7
- data/lib/dwc_agent/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 988e79daa81edb97377cdaf65ad2e6aa79b6e524cd3c9ab940d7b4cc1c1c9885
|
4
|
+
data.tar.gz: 7b6969b8da5e772858e68313e24954bd352c78f62e552d0b5b16a328b3d20e0c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5794e0e8b1d779a72e6d8ba599e85b196f870f16494e663830f3c265c3ee39101bf729e6670f0f8f7d5762dadd438b3843aab5d7bcd450bd1212d4c95f0c6183
|
7
|
+
data.tar.gz: 91fca40528d0f20a698ea66602de134ba6c442e6ab7e51118003ba0afdf49eca1247caa33fcf025f9cced537dc6f2ecaad8a440f07d7d34232d1326bb2eaa213
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -18,18 +18,26 @@ module DwcAgent
|
|
18
18
|
def clean(parsed_namae)
|
19
19
|
blank_name = { given: nil, family: nil }
|
20
20
|
|
21
|
+
if parsed_namae.family && FAMILY_BLACKLIST.include?(parsed_namae.family)
|
22
|
+
return blank_name
|
23
|
+
end
|
24
|
+
|
21
25
|
if parsed_namae.family && parsed_namae.family.length < 2 && parsed_namae.family.count('.') == 0
|
22
26
|
return blank_name
|
23
27
|
end
|
28
|
+
|
24
29
|
if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
|
25
30
|
return blank_name
|
26
31
|
end
|
32
|
+
|
27
33
|
if parsed_namae.given && parsed_namae.given.length > 25
|
28
34
|
return blank_name
|
29
35
|
end
|
36
|
+
|
30
37
|
if parsed_namae.given && parsed_namae.given.count('.') >= 3 && /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
|
31
38
|
return blank_name
|
32
39
|
end
|
40
|
+
|
33
41
|
if parsed_namae.display_order =~ BLACKLIST
|
34
42
|
return blank_name
|
35
43
|
end
|
@@ -44,6 +52,17 @@ module DwcAgent
|
|
44
52
|
parsed_namae.given = family
|
45
53
|
end
|
46
54
|
|
55
|
+
if parsed_namae.given &&
|
56
|
+
parsed_namae.family &&
|
57
|
+
parsed_namae.family.length <=3 &&
|
58
|
+
parsed_namae.family == parsed_namae.family.upcase &&
|
59
|
+
parsed_namae.given[-1] != "."
|
60
|
+
given = parsed_namae.given
|
61
|
+
family = parsed_namae.family
|
62
|
+
parsed_namae.family = given
|
63
|
+
parsed_namae.given = family
|
64
|
+
end
|
65
|
+
|
47
66
|
if parsed_namae.given &&
|
48
67
|
(parsed_namae.given == parsed_namae.given.upcase ||
|
49
68
|
parsed_namae.given == parsed_namae.given.downcase) &&
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module DwcAgent
|
2
2
|
STRIP_OUT = %r{
|
3
|
+
^[\[{(]|
|
4
|
+
[\]})]$|
|
3
5
|
\s*?\d+\.\d+|
|
4
6
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
5
7
|
\b[,;]?\s*(?i:et\.?\s+al)\.?|
|
@@ -11,7 +13,7 @@ module DwcAgent
|
|
11
13
|
\b[,;]?\s*(?i:unkn?own)\b|
|
12
14
|
\b[,;]?\s*(?i:n/a)\b|
|
13
15
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
14
|
-
\b[,;]?\s
|
16
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|illegible|scripsit)\)?\b|
|
15
17
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
16
18
|
\b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
|
17
19
|
\b[,;]?\s*(?i:string)\b|
|
@@ -50,7 +52,7 @@ module DwcAgent
|
|
50
52
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
51
53
|
\b\s*(?i:maybe)\s*\b|
|
52
54
|
\b\s*(?i:prob)\.\s*\b|
|
53
|
-
\(?[
|
55
|
+
\(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
54
56
|
(?i:fide)\:?\s*\b|
|
55
57
|
(?i:game\s+dept)\.?\s*\b|
|
56
58
|
(?i:see\s+notes?\s*(inside)?)|
|
@@ -81,7 +83,7 @@ module DwcAgent
|
|
81
83
|
(?i:university|museum|exhibits?)|
|
82
84
|
(?i:uqam)|
|
83
85
|
\b[,;]\s+\d+\z|
|
84
|
-
["
|
86
|
+
["!@?]|
|
85
87
|
[,]?\d+|
|
86
88
|
\s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+|
|
87
89
|
[,;]\z|
|
@@ -97,7 +99,8 @@ module DwcAgent
|
|
97
99
|
^[-,.\s;*\d]+\s?|
|
98
100
|
-\d?\z|
|
99
101
|
\s*?-{2,}\s*?|
|
100
|
-
^(?i:exc?p?)[:.]\s
|
102
|
+
^(?i:exc?p?)[:.]\s*|
|
103
|
+
\s+de\s*$
|
101
104
|
}x
|
102
105
|
|
103
106
|
SPLIT_BY = %r{
|
@@ -164,7 +167,7 @@ module DwcAgent
|
|
164
167
|
(?i:anon)|
|
165
168
|
(?i:australian?)|
|
166
169
|
(?i:average)|
|
167
|
-
(?i:believe|unclear|
|
170
|
+
(?i:believe|unclear|ill?egible|none|suggested|(dis)?agrees?)|approach|
|
168
171
|
(?i:barcod)|
|
169
172
|
(?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
|
170
173
|
(?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
|
@@ -209,9 +212,16 @@ module DwcAgent
|
|
209
212
|
(?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
|
210
213
|
(?i:urn\:)|
|
211
214
|
(?i:usda|ucla)|
|
212
|
-
(?i:workshop|garden|farm|jardin|public)
|
215
|
+
(?i:workshop|garden|farm|jardin|public)|
|
216
|
+
^\s*?de\s*?$
|
213
217
|
}x
|
214
218
|
|
215
|
-
|
219
|
+
FAMILY_BLACKLIST = [
|
220
|
+
"der",
|
221
|
+
"van",
|
222
|
+
"von"
|
223
|
+
]
|
224
|
+
|
225
|
+
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
216
226
|
|
217
227
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07
|
11
|
+
date: 2019-09-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|