dwc_agent 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/dwcagent +1 -1
- data/lib/dwc_agent/cleaner.rb +4 -1
- data/lib/dwc_agent/constants.rb +1 -1
- data/lib/dwc_agent/parser.rb +18 -14
- data/lib/dwc_agent/similarity.rb +9 -4
- data/lib/dwc_agent/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5175936a78d6dc64f17a2827dccfaa80c66f07242b17639b3668e6403e9a19b7
|
4
|
+
data.tar.gz: a13ead08f2756db93e5abaff55d2cd9316b7ee0739926571045d52abfc4243c8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 368ea5fd8755f19dafd4b651fc68ef89a0e34246deedd6261b4cd97ec3dbf6b3f4eb13bcc3e7552ce46d77f25b721e9269e4082b861681486dfb12a797e8c254
|
7
|
+
data.tar.gz: f61d5cc267f5b210a327431eec44da687d03d991455e6c7797aa6d1aac1fca47c2664e8579143e46951c1413cafb068acc5d8fb13ad9c0c76442ccc578581833
|
data/bin/dwcagent
CHANGED
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -7,6 +7,9 @@ module DwcAgent
|
|
7
7
|
end
|
8
8
|
end
|
9
9
|
|
10
|
+
def initialize
|
11
|
+
end
|
12
|
+
|
10
13
|
# Cleans the passed-in namae object from the parse method and
|
11
14
|
# re-organizes it to better match expected Darwin Core output.
|
12
15
|
#
|
@@ -45,7 +48,7 @@ module DwcAgent
|
|
45
48
|
(parsed_namae.given == parsed_namae.given.upcase ||
|
46
49
|
parsed_namae.given == parsed_namae.given.downcase) &&
|
47
50
|
!parsed_namae.given.include?(".") &&
|
48
|
-
parsed_namae.given.
|
51
|
+
parsed_namae.given.tr(".","").length >= 4
|
49
52
|
parsed_namae.given = NameCase(parsed_namae.given)
|
50
53
|
end
|
51
54
|
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -50,7 +50,7 @@ module DwcAgent
|
|
50
50
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
51
51
|
\b\s*(?i:maybe)\s*\b|
|
52
52
|
\b\s*(?i:prob)\.\s*\b|
|
53
|
-
\(?(?i:collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
53
|
+
\(?[,;]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
54
54
|
(?i:fide)\:?\s*\b|
|
55
55
|
(?i:game\s+dept)\.?\s*\b|
|
56
56
|
(?i:see\s+notes?\s*(inside)?)|
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -7,6 +7,15 @@ module DwcAgent
|
|
7
7
|
end
|
8
8
|
end
|
9
9
|
|
10
|
+
def initialize
|
11
|
+
options = {
|
12
|
+
prefer_comma_as_separator: true,
|
13
|
+
separator: SPLIT_BY,
|
14
|
+
title: TITLE
|
15
|
+
}
|
16
|
+
@namae = Namae::Parser.new(options)
|
17
|
+
end
|
18
|
+
|
10
19
|
# Parses the passed-in string and returns a list of names.
|
11
20
|
#
|
12
21
|
# @param names [String] the name or names to be parsed
|
@@ -14,20 +23,15 @@ module DwcAgent
|
|
14
23
|
def parse(name)
|
15
24
|
return [] if name.nil? || name == ""
|
16
25
|
residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
separator: SPLIT_BY,
|
27
|
-
title: TITLE
|
28
|
-
}
|
29
|
-
namae = Namae::Parser.new(options)
|
30
|
-
namae.parse(cleaned)
|
26
|
+
name.gsub!(STRIP_OUT, ' ')
|
27
|
+
name.gsub!(/[#{CHAR_SUBS.keys.join('\\')}]/, CHAR_SUBS)
|
28
|
+
name.gsub!(/(#{PHRASE_SUBS.keys.join('|')})/, PHRASE_SUBS)
|
29
|
+
name.gsub!(/([A-Z]{1}\.)([[:alpha:]]{2,})/, '\1 \2')
|
30
|
+
name.gsub!(COMPLEX_SEPARATORS, '\1 | \2')
|
31
|
+
name.gsub!(residual_terminators_regex, '')
|
32
|
+
name.squeeze!(' ')
|
33
|
+
name.strip!
|
34
|
+
@namae.parse(name)
|
31
35
|
end
|
32
36
|
|
33
37
|
end
|
data/lib/dwc_agent/similarity.rb
CHANGED
@@ -7,6 +7,9 @@ module DwcAgent
|
|
7
7
|
end
|
8
8
|
end
|
9
9
|
|
10
|
+
def initialize
|
11
|
+
end
|
12
|
+
|
10
13
|
# Produces a similarity score of two given names
|
11
14
|
# Logic inspired by R.D.M. Page, https://orcid.org/0000-0002-7101-9767
|
12
15
|
# At https://linen-baseball.glitch.me/
|
@@ -15,10 +18,12 @@ module DwcAgent
|
|
15
18
|
# @param given2 [String] a second given name
|
16
19
|
# @return [Float] the similarity score
|
17
20
|
def similarity_score(given1, given2)
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
21
|
+
given1.gsub!(/\.\s+/,".")
|
22
|
+
g1_arr = given1.split(/[\.\s]/)
|
23
|
+
given2.gsub!(/\.\s+/,".")
|
24
|
+
g2_arr = given2.split(/[\.\s]/)
|
25
|
+
largest = [g1_arr,g2_arr].max
|
26
|
+
smallest = [g1_arr,g2_arr].min
|
22
27
|
|
23
28
|
score = 0
|
24
29
|
largest.each_with_index do |val,index|
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-07-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|