dwc_agent 1.4.8 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/lib/dwc_agent/cleaner.rb +5 -2
 - data/lib/dwc_agent/constants.rb +13 -8
 - data/lib/dwc_agent/parser.rb +4 -2
 - data/lib/dwc_agent/version.rb +2 -2
 - metadata +3 -3
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 87cfd6b6ab9ee83156f9503e48669691d486f143f64a623f93a40f064d2dc7f7
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 7f40287aa6bf7e7b90408d72594fd9ab5498401c5ee402da6f2a78163f011a9a
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 4cb91ffc7530bd1a93fbbc70c716ad96d49cab139bf95507e3a688566ce997cb71510b13013bb07cb52dffe2ce7419d294392028eedd28f1caba8b032eb98c9a
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: ae226c63106a2b41899b6d4c85c92aa372d8d1cbce8a90023cf9b8d1a4c153a28bd33a9eb8064c492fa79d5d4229ec8979d5ca4f751d97fc1ec09132b0e46ca3
         
     | 
    
        data/lib/dwc_agent/cleaner.rb
    CHANGED
    
    | 
         @@ -16,7 +16,7 @@ module DwcAgent 
     | 
|
| 
       16 
16 
     | 
    
         
             
                # @param parsed_namae [Object] the namae object
         
     | 
| 
       17 
17 
     | 
    
         
             
                # @return [Hash] the given, family hash
         
     | 
| 
       18 
18 
     | 
    
         
             
                def clean(parsed_namae)
         
     | 
| 
       19 
     | 
    
         
            -
                  blank_name = { given: nil, family: nil,  
     | 
| 
      
 19 
     | 
    
         
            +
                  blank_name = { title: nil, appellation: nil, given: nil, particle: nil, family: nil, suffix: nil }
         
     | 
| 
       20 
20 
     | 
    
         | 
| 
       21 
21 
     | 
    
         
             
                  if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
         
     | 
| 
       22 
22 
     | 
    
         
             
                    return blank_name
         
     | 
| 
         @@ -84,6 +84,9 @@ module DwcAgent 
     | 
|
| 
       84 
84 
     | 
    
         
             
                  family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
         
     | 
| 
       85 
85 
     | 
    
         
             
                  given = parsed_namae.given.strip rescue nil
         
     | 
| 
       86 
86 
     | 
    
         
             
                  particle = parsed_namae.particle.strip rescue nil
         
     | 
| 
      
 87 
     | 
    
         
            +
                  appellation = parsed_namae.appellation.strip rescue nil
         
     | 
| 
      
 88 
     | 
    
         
            +
                  suffix = parsed_names.suffix.strip rescue nil
         
     | 
| 
      
 89 
     | 
    
         
            +
                  title = parsed_names.title.strip rescue nil
         
     | 
| 
       87 
90 
     | 
    
         | 
| 
       88 
91 
     | 
    
         
             
                  if !given.nil? && given.match(/[A-Z]\.[A-Za-z]{2,}/)
         
     | 
| 
       89 
92 
     | 
    
         
             
                    given = given.gsub(".", ". ").strip
         
     | 
| 
         @@ -123,7 +126,7 @@ module DwcAgent 
     | 
|
| 
       123 
126 
     | 
    
         
             
                    return blank_name
         
     | 
| 
       124 
127 
     | 
    
         
             
                  end
         
     | 
| 
       125 
128 
     | 
    
         | 
| 
       126 
     | 
    
         
            -
                  { given: given, family: family,  
     | 
| 
      
 129 
     | 
    
         
            +
                  { title: nil, appellation: nil, given: given, particle: particle, family: family, suffix: nil }
         
     | 
| 
       127 
130 
     | 
    
         
             
                end
         
     | 
| 
       128 
131 
     | 
    
         | 
| 
       129 
132 
     | 
    
         
             
              end
         
     | 
    
        data/lib/dwc_agent/constants.rb
    CHANGED
    
    | 
         @@ -14,7 +14,7 @@ module DwcAgent 
     | 
|
| 
       14 
14 
     | 
    
         
             
                \b[,;]?\s*(?i:unkn?own)\b|
         
     | 
| 
       15 
15 
     | 
    
         
             
                \b[,;]?\s*(?i:n/a)\b|
         
     | 
| 
       16 
16 
     | 
    
         
             
                \b[,;]?\s*(?i:ann?onymous)\b|
         
     | 
| 
       17 
     | 
    
         
            -
                \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
         
     | 
| 
      
 17 
     | 
    
         
            +
                \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed?)\)?\b|
         
     | 
| 
       18 
18 
     | 
    
         
             
                \b[,;]?\s*(?i:importer|gift)\:?\b|
         
     | 
| 
       19 
19 
     | 
    
         
             
                \b[,;]?\s*(?i:string)\b|
         
     | 
| 
       20 
20 
     | 
    
         
             
                \b[,;]?\s*(?i:person\s*string)\b|
         
     | 
| 
         @@ -49,11 +49,13 @@ module DwcAgent 
     | 
|
| 
       49 
49 
     | 
    
         
             
                \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
         
     | 
| 
       50 
50 
     | 
    
         
             
                \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
         
     | 
| 
       51 
51 
     | 
    
         
             
                \b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
         
     | 
| 
      
 52 
     | 
    
         
            +
                \b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|China|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Denmark|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|France|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|Georgia|Germany|Ghana|Gibraltar|Greece|Greenland|Grenada|Guadeloupe|Guam|Guatemala|Guernsey|Guinea|Guinea-Bissau|Guyana|Haiti|Heard Island and McDonald Islands|Holy See|Honduras|Hong Kong|Hungary|Iceland|India|Indonesia|Iran \(Islamic Republic of\)|Iraq|Ireland|Isle of Man|Israel|Italy|Jamaica|Japan|Jersey|Jordan|Kazakhstan|Kenya|Kiribati|Korea \(Democratic People\'s Republic of\)|Korea \(Republic of\)|Kuwait|Kyrgyzstan|Lao People\'s Democratic Republic|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Macao|Macedonia (the former Yugoslav Republic of)|Madagascar|Malawi|Malaysia|Maldives|Mali|Malta|Marshall Islands|Martinique|Mauritania|Mauritius|Mayotte|Mexico|Micronesia \(Federated States of\)|Moldova \(Republic of\)|Monaco|Mongolia|Montenegro|Montserrat|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Caledonia|New Zealand|Nicaragua|Niger|Nigeria|Niue|Norfolk Island|Northern Mariana Islands|Norway|Oman|Pakistan|Palau|Palestine, State 
of|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Pitcairn|Poland|Portugal|Puerto Rico|Qatar|Réunion|Romania|Russian Federation|Rwanda|Saint Barthélemy|Saint Helena, Ascension and Tristan da Cunha|Saint Kitts and Nevis|Saint Lucia|Saint Martin \(French part\)|Saint Pierre and Miquelon|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Sint Maarten \(Dutch part\)|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Georgia and the South Sandwich Islands|South Sudan|Spain|Sri Lanka|Sudan|Suriname|Svalbard and Jan Mayen|Swaziland|Sweden|Switzerland|Syrian Arab Republic|Taiwan|Tajikistan|Tanzania, United Republic of|Thailand|Timor-Leste|Togo|Tokelau|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Turks and Caicos Islands|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom of Great Britain and Northern Ireland|United States of America|United States Minor Outlying Islands|Uruguay|Uzbekistan|Vanuatu|Venezuela \(Bolivarian Republic of\)|Viet Nam|Virgin Islands \(British\)|Virgin Islands \(U\.S\.\)|Wallis and Futuna|Western Sahara|Yemen|Zambia|Zimbabwe)|
         
     | 
| 
       52 
53 
     | 
    
         
             
                (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
         
     | 
| 
       53 
54 
     | 
    
         
             
                \b\s*(?i:maybe)\s*\b|
         
     | 
| 
       54 
55 
     | 
    
         
             
                \b\s*(?i:prob)\.\s*\b|
         
     | 
| 
       55 
56 
     | 
    
         
             
                \(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
         
     | 
| 
       56 
57 
     | 
    
         
             
                \b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
         
     | 
| 
      
 58 
     | 
    
         
            +
                (?i:Engº|Agrº|Fcº|Drº|Mº|Profº|Dº|Fº)|
         
     | 
| 
       57 
59 
     | 
    
         
             
                (?i:fide)\:?\s*\b|
         
     | 
| 
       58 
60 
     | 
    
         
             
                (?i:game\s+dept)\.?\s*\b|
         
     | 
| 
       59 
61 
     | 
    
         
             
                (?i:see\s+notes?\s*(inside)?)|
         
     | 
| 
         @@ -137,7 +139,6 @@ module DwcAgent 
     | 
|
| 
       137 
139 
     | 
    
         
             
                '|' => ' | ',
         
     | 
| 
       138 
140 
     | 
    
         
             
                'ǀ' => ' | ',
         
     | 
| 
       139 
141 
     | 
    
         
             
                '∣' => ' | ',
         
     | 
| 
       140 
     | 
    
         
            -
                '|' => ' | ',
         
     | 
| 
       141 
142 
     | 
    
         
             
                '│' => ' | ',
         
     | 
| 
       142 
143 
     | 
    
         
             
                '(' => ' ',
         
     | 
| 
       143 
144 
     | 
    
         
             
                ')' => ' ',
         
     | 
| 
         @@ -158,12 +159,12 @@ module DwcAgent 
     | 
|
| 
       158 
159 
     | 
    
         
             
              }
         
     | 
| 
       159 
160 
     | 
    
         | 
| 
       160 
161 
     | 
    
         
             
              PHRASE_SUBS = {
         
     | 
| 
       161 
     | 
    
         
            -
                'dr\.' => 'Dr. ',
         
     | 
| 
       162 
     | 
    
         
            -
                'mr\.' => 'Mr. ',
         
     | 
| 
       163 
     | 
    
         
            -
                'mrs\.' => 'Mrs. ',
         
     | 
| 
       164 
162 
     | 
    
         
             
                'prof\.' => 'Prof. ',
         
     | 
| 
       165 
163 
     | 
    
         
             
                '\, ph\.d\.' => ' Ph.D.',
         
     | 
| 
       166 
     | 
    
         
            -
                '\, bro\.' => ' Bro.'
         
     | 
| 
      
 164 
     | 
    
         
            +
                '\, bro\.' => ' Bro.',
         
     | 
| 
      
 165 
     | 
    
         
            +
                ' jr\.\,' => ' Jr.;',
         
     | 
| 
      
 166 
     | 
    
         
            +
                ' jr\,' => ' Jr.;',
         
     | 
| 
      
 167 
     | 
    
         
            +
                '\-jr' => ' Jr.'
         
     | 
| 
       167 
168 
     | 
    
         
             
              }
         
     | 
| 
       168 
169 
     | 
    
         | 
| 
       169 
170 
     | 
    
         
             
              COMPLEX_SEPARATORS = %r{
         
     | 
| 
         @@ -202,7 +203,7 @@ module DwcAgent 
     | 
|
| 
       202 
203 
     | 
    
         
             
                (?i:geographic)|
         
     | 
| 
       203 
204 
     | 
    
         
             
                (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
         
     | 
| 
       204 
205 
     | 
    
         
             
                (?i:univ\.)|
         
     | 
| 
       205 
     | 
    
         
            -
                (?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
         
     | 
| 
      
 206 
     | 
    
         
            +
                (?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker|gamekeeper)|
         
     | 
| 
       206 
207 
     | 
    
         
             
                (?i:non\s+pr(é|e)cis(é|e))|
         
     | 
| 
       207 
208 
     | 
    
         
             
                (?i:no\s+consta)|
         
     | 
| 
       208 
209 
     | 
    
         
             
                (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
         
     | 
| 
         @@ -267,6 +268,10 @@ module DwcAgent 
     | 
|
| 
       267 
268 
     | 
    
         
             
                "has not"
         
     | 
| 
       268 
269 
     | 
    
         
             
              ]
         
     | 
| 
       269 
270 
     | 
    
         | 
| 
       270 
     | 
    
         
            -
              TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
         
     | 
| 
      
 271 
     | 
    
         
            +
              TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
         
     | 
| 
      
 272 
     | 
    
         
            +
             
     | 
| 
      
 273 
     | 
    
         
            +
              APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
         
     | 
| 
      
 274 
     | 
    
         
            +
             
     | 
| 
      
 275 
     | 
    
         
            +
              SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/
         
     | 
| 
       271 
276 
     | 
    
         | 
| 
       272 
277 
     | 
    
         
             
            end
         
     | 
    
        data/lib/dwc_agent/parser.rb
    CHANGED
    
    | 
         @@ -11,14 +11,16 @@ module DwcAgent 
     | 
|
| 
       11 
11 
     | 
    
         
             
                  options = {
         
     | 
| 
       12 
12 
     | 
    
         
             
                    prefer_comma_as_separator: true,
         
     | 
| 
       13 
13 
     | 
    
         
             
                    separator: SPLIT_BY,
         
     | 
| 
       14 
     | 
    
         
            -
                    title: TITLE
         
     | 
| 
      
 14 
     | 
    
         
            +
                    title: TITLE,
         
     | 
| 
      
 15 
     | 
    
         
            +
                    appellation: APPELLATION,
         
     | 
| 
      
 16 
     | 
    
         
            +
                    suffix: SUFFIX
         
     | 
| 
       15 
17 
     | 
    
         
             
                  }
         
     | 
| 
       16 
18 
     | 
    
         
             
                  @namae = Namae::Parser.new(options)
         
     | 
| 
       17 
19 
     | 
    
         | 
| 
       18 
20 
     | 
    
         
             
                  @strip_out_regex = Regexp.new STRIP_OUT.to_s
         
     | 
| 
       19 
21 
     | 
    
         
             
                  @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
         
     | 
| 
       20 
22 
     | 
    
         
             
                  @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
         
     | 
| 
       21 
     | 
    
         
            -
                  @phrase_subs_regex = Regexp.new 
     | 
| 
      
 23 
     | 
    
         
            +
                  @phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.join('|').to_s, Regexp::IGNORECASE
         
     | 
| 
       22 
24 
     | 
    
         
             
                  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
         
     | 
| 
       23 
25 
     | 
    
         
             
                  @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
         
     | 
| 
       24 
26 
     | 
    
         
             
                end
         
     | 
    
        data/lib/dwc_agent/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: dwc_agent
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 1.4.8
     | 
| 
      
 4 
     | 
    
         
            +
              version: 1.5.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - David P. Shorthouse
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2020- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2020-08-06 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: namae
         
     | 
| 
         @@ -98,7 +98,7 @@ files: 
     | 
|
| 
       98 
98 
     | 
    
         
             
            - lib/dwc_agent/similarity.rb
         
     | 
| 
       99 
99 
     | 
    
         
             
            - lib/dwc_agent/utility.rb
         
     | 
| 
       100 
100 
     | 
    
         
             
            - lib/dwc_agent/version.rb
         
     | 
| 
       101 
     | 
    
         
            -
            homepage: https://github.com/ 
     | 
| 
      
 101 
     | 
    
         
            +
            homepage: https://github.com/bionomia/dwc_agent
         
     | 
| 
       102 
102 
     | 
    
         
             
            licenses:
         
     | 
| 
       103 
103 
     | 
    
         
             
            - MIT
         
     | 
| 
       104 
104 
     | 
    
         
             
            metadata: {}
         
     |