dwc_agent 1.5.0.2 → 1.5.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +5 -5
- data/lib/dwc_agent/constants.rb +55 -18
- data/lib/dwc_agent/parser.rb +1 -1
- data/lib/dwc_agent/version.rb +1 -1
- metadata +6 -6
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: a63a84b3c095994d4b5053ddb389cfc1e7e0375f51b6cbb2668742bc4381a0da
         | 
| 4 | 
            +
              data.tar.gz: 4f4cc7668f64196458bc126e8dc81753afd92ee77ce729f241f96fa5df39315c
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: da8f649cbc9d4ddf82b66e044e0f9e6a6354d5f613ecc2bf1e5cf900a37868178e3056f9c84311050738ec3b3dd7b7466df2ea25b5cebd3595821c35f634eb29
         | 
| 7 | 
            +
              data.tar.gz: e52cec918b39c1aceb010e55ba0d5feaa7891234368fbdb56abd48c36c0945a5de6db5e89ee1371ee8240c3030caf4bae268257b0a780036df54f9521c3a6f1e
         | 
    
        data/lib/dwc_agent/cleaner.rb
    CHANGED
    
    | @@ -18,10 +18,6 @@ module DwcAgent | |
| 18 18 | 
             
                def clean(parsed_namae)
         | 
| 19 19 | 
             
                  blank_name = { title: nil, appellation: nil, given: nil, particle: nil, family: nil, suffix: nil }
         | 
| 20 20 |  | 
| 21 | 
            -
                  if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
         | 
| 22 | 
            -
                    return blank_name
         | 
| 23 | 
            -
                  end
         | 
| 24 | 
            -
             | 
| 25 21 | 
             
                  if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
         | 
| 26 22 | 
             
                    return blank_name
         | 
| 27 23 | 
             
                  end
         | 
| @@ -30,7 +26,7 @@ module DwcAgent | |
| 30 26 | 
             
                    return blank_name
         | 
| 31 27 | 
             
                  end
         | 
| 32 28 |  | 
| 33 | 
            -
                  if parsed_namae.given && parsed_namae.given.length >  | 
| 29 | 
            +
                  if parsed_namae.given && parsed_namae.given.length > 35
         | 
| 34 30 | 
             
                    return blank_name
         | 
| 35 31 | 
             
                  end
         | 
| 36 32 |  | 
| @@ -79,6 +75,10 @@ module DwcAgent | |
| 79 75 | 
             
                    parsed_namae.given = NameCase(parsed_namae.given)
         | 
| 80 76 | 
             
                  end
         | 
| 81 77 |  | 
| 78 | 
            +
                  if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
         | 
| 79 | 
            +
                    return blank_name
         | 
| 80 | 
            +
                  end
         | 
| 81 | 
            +
             | 
| 82 82 | 
             
                  parsed_namae.normalize_initials
         | 
| 83 83 |  | 
| 84 84 | 
             
                  family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
         | 
    
        data/lib/dwc_agent/constants.rb
    CHANGED
    
    | @@ -3,18 +3,20 @@ module DwcAgent | |
| 3 3 | 
             
                ^[\[{(]|
         | 
| 4 4 | 
             
                [\]})]\??$|
         | 
| 5 5 | 
             
                (?i:acc\s?\#)|
         | 
| 6 | 
            +
                [,;]?\s*(?i:1st|2nd|3rd|[4-9]th)|
         | 
| 6 7 | 
             
                \s*?\d+\.\d+|
         | 
| 7 8 | 
             
                \b\d+\(?(?i:[[:alpha:]])\)?\b|
         | 
| 8 9 | 
             
                \b[,;]?\s*(?i:et\.?\s+al|&\s+al)\.?|
         | 
| 10 | 
            +
                \b[,;]?\s*(?i:etal)\.?|
         | 
| 9 11 | 
             
                \b\s+(bis|ter)(\b|\z)|
         | 
| 10 12 | 
             
                \bu\.\s*a\.|
         | 
| 11 | 
            -
                \b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
         | 
| 13 | 
            +
                \b[,;]?\s*(?i:and|&)?\s*(?i:others|party)\s*\b|
         | 
| 12 14 | 
             
                \b[,;]?\s*(?i:etc)\.?|
         | 
| 13 15 | 
             
                \b[,;]?\s*(?i:on)\b|
         | 
| 14 16 | 
             
                \b[,;]?\s*(?i:unkn?own)\b|
         | 
| 15 17 | 
             
                \b[,;]?\s*(?i:n/a)\b|
         | 
| 16 18 | 
             
                \b[,;]?\s*(?i:ann?onymous)\b|
         | 
| 17 | 
            -
                \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed | 
| 19 | 
            +
                \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed?|presumably)\)?\b|
         | 
| 18 20 | 
             
                \b[,;]?\s*(?i:importer|gift)\:?\b|
         | 
| 19 21 | 
             
                \b[,;]?\s*(?i:string)\b|
         | 
| 20 22 | 
             
                \b[,;]?\s*(?i:person\s*string)\b|
         | 
| @@ -53,29 +55,32 @@ module DwcAgent | |
| 53 55 | 
             
                (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
         | 
| 54 56 | 
             
                \b\s*(?i:maybe)\s*\b|
         | 
| 55 57 | 
             
                \b\s*(?i:prob)\.\s*\b|
         | 
| 56 | 
            -
                \(?[,]?\s*?(?i:(local)?\s? | 
| 58 | 
            +
                \(?[,]?\s*?(?i:(local)?\s?collectors?|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
         | 
| 57 59 | 
             
                \b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
         | 
| 58 60 | 
             
                (?i:Engº|Agrº|Fcº|Drº|Mº|Profº|Dº|Fº)|
         | 
| 59 61 | 
             
                (?i:fide)\:?\s*\b|
         | 
| 62 | 
            +
                (?i:first\s+name\s+unknown)|
         | 
| 60 63 | 
             
                (?i:game\s+dept)\.?\s*\b|
         | 
| 61 64 | 
             
                (?i:see\s+notes?\s*(inside)?)|
         | 
| 62 65 | 
             
                (?i:see\s+letter\s+enclosed)|
         | 
| 63 66 | 
             
                (?i:(by)?\s+correspondance)|
         | 
| 64 | 
            -
                (?i:pers\.?\s | 
| 67 | 
            +
                (?i:pers\.?\s*comm\.?)|
         | 
| 65 68 | 
             
                (?i:crossed\s+out)|
         | 
| 66 69 | 
             
                \(?(?i:source)\(?|
         | 
| 67 70 | 
             
                (?i:according\s+to)|
         | 
| 68 71 | 
             
                (?i:lanuv)\d+|
         | 
| 72 | 
            +
                \b\s*name\b|
         | 
| 73 | 
            +
                \b\s*lost\b|
         | 
| 69 74 | 
             
                (?i:nswobs)|
         | 
| 70 75 | 
             
                ORCID|
         | 
| 71 76 | 
             
                MRI(\s|-)PAS|
         | 
| 72 77 | 
             
                urn\:qm\.qld\.gov\.au\:collector|
         | 
| 73 78 | 
             
                (?i:University\s+of\s+(Southern\s+)?California(,\s+Berkeley)?)|
         | 
| 74 | 
            -
                (?i: | 
| 75 | 
            -
                (?i: | 
| 79 | 
            +
                (?i:field\s+museum\s+of\s+natural\s+history)|
         | 
| 80 | 
            +
                (?i:american\s+museum\s+of\s+natural\s+history)|
         | 
| 76 81 | 
             
                (?i:The\s+Paleontological\s+Research\s+Institution)|
         | 
| 77 82 | 
             
                (?i:museums?\s+victoria)|
         | 
| 78 | 
            -
                \b\s*(?i: | 
| 83 | 
            +
                \b\s*(?i:united\s+states|russia)\s*\b|
         | 
| 79 84 | 
             
                (?i:revised|photograph|fruits\s+only)|
         | 
| 80 85 | 
             
                -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
         | 
| 81 86 | 
             
                -?\s*(?i:synonym(y|ie))|
         | 
| @@ -83,11 +88,14 @@ module DwcAgent | |
| 83 88 | 
             
                \b(?i:to\s+(sub)?spp?)\.?|
         | 
| 84 89 | 
             
                (?i:nom\.?\s+rev\.?)|
         | 
| 85 90 | 
             
                FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|CSIRO|ACAD|USGS|NAWQA|
         | 
| 91 | 
            +
                \b,?\s*(?i:para|topo|syn)?(?i:type)|
         | 
| 86 92 | 
             
                AFSC\/POLISH\s+SORTING\s+CTR\.?|
         | 
| 87 93 | 
             
                (?i:university|museum|exhibits?)|
         | 
| 88 94 | 
             
                (?i:uqam)|
         | 
| 89 95 | 
             
                (?i:sem\s+(colec?tor|data))|
         | 
| 90 96 | 
             
                (?i:no\s+coll\.?(ector)?)|
         | 
| 97 | 
            +
                (?i:not?)\s+(?i:name|date|details?|specific)?\s*?(?i:given|name|date|noted)|
         | 
| 98 | 
            +
                (?i:non?)\s+(?i:specificato)|
         | 
| 91 99 | 
             
                \b[,;]\s+\d+\z|
         | 
| 92 100 | 
             
                ["!@?]|
         | 
| 93 101 | 
             
                [,]?\d+|
         | 
| @@ -113,19 +121,22 @@ module DwcAgent | |
| 113 121 | 
             
                [–|ǀ∣|│&+\/;:]|
         | 
| 114 122 | 
             
                \s+-\s+|
         | 
| 115 123 | 
             
                \s+a\.\s+|
         | 
| 116 | 
            -
                \b(e|y|i|en|et|or|per|for)\s*\b|
         | 
| 124 | 
            +
                \b(con|e|y|i|en|et|or|per|for)\s*\b|
         | 
| 117 125 | 
             
                \b(?i:and|with)\s*\b|
         | 
| 118 126 | 
             
                \b(?i:annotated(\s+by)?)\s*\b|
         | 
| 119 127 | 
             
                \b(?i:coll\.)\s*\b|
         | 
| 120 128 | 
             
                \b(?i:communicate?d(\s+to)?)\s*\b|
         | 
| 121 129 | 
             
                \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
         | 
| 130 | 
            +
                \b(?i:confirmada)(\s+por)?\s*\b|
         | 
| 122 131 | 
             
                \b(?i:checked?(\s+by)?)\s*\b|
         | 
| 123 132 | 
             
                \b(?i:det\.?(\s+by)?)\s*\b|
         | 
| 124 133 | 
             
                \b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
         | 
| 125 134 | 
             
                \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
         | 
| 126 135 | 
             
                \b(?i:in?dentified(\s+by)?)\s*\b|
         | 
| 127 136 | 
             
                \b(?i:in\s+part(\s+by)?)\s*\b|
         | 
| 137 | 
            +
                \b(?i:och)\s*\b|
         | 
| 128 138 | 
             
                \b(?i:prep\.?\s+(?i:by)?)\s*\b|
         | 
| 139 | 
            +
                \b(?i:purchased?)(\s+by)?\s*\b|
         | 
| 129 140 | 
             
                \b(?i:redet\.?(\s+by?)?)\s*\b|
         | 
| 130 141 | 
             
                \b(?i:reidentified(\s+by)?)\s*\b|
         | 
| 131 142 | 
             
                \b(?i:stet)\s*\b|
         | 
| @@ -155,16 +166,24 @@ module DwcAgent | |
| 155 166 | 
             
                '}' => '',
         | 
| 156 167 | 
             
                '@' => '',
         | 
| 157 168 | 
             
                '%' => '',
         | 
| 158 | 
            -
                '\\' => ''
         | 
| 169 | 
            +
                '\\' => '',
         | 
| 170 | 
            +
                '´' => '\'',
         | 
| 171 | 
            +
                '+' => ' | '
         | 
| 159 172 | 
             
              }
         | 
| 160 173 |  | 
| 161 174 | 
             
              PHRASE_SUBS = {
         | 
| 162 | 
            -
                ' | 
| 163 | 
            -
                ' | 
| 164 | 
            -
                ' | 
| 165 | 
            -
                '  | 
| 166 | 
            -
                '  | 
| 167 | 
            -
                ' | 
| 175 | 
            +
                ', ph.d.' => ' Ph.D.',
         | 
| 176 | 
            +
                ', Ph.D.' => ' Ph.D.',
         | 
| 177 | 
            +
                ', bro.' => ' Bro.',
         | 
| 178 | 
            +
                ', Jr.,' => ' Jr.;',
         | 
| 179 | 
            +
                ', Jr.' => ' Jr.',
         | 
| 180 | 
            +
                ',Jr.' => ' Jr.',
         | 
| 181 | 
            +
                ', Sr.' => ' Sr.',
         | 
| 182 | 
            +
                ',Sr.' => ' Sr.',
         | 
| 183 | 
            +
                ' jr.,' => ' Jr.;',
         | 
| 184 | 
            +
                ' jr,' => ' Jr.;',
         | 
| 185 | 
            +
                '-jr' => ' Jr.',
         | 
| 186 | 
            +
                '-Jr' => ' Jr.'
         | 
| 168 187 | 
             
              }
         | 
| 169 188 |  | 
| 170 189 | 
             
              COMPLEX_SEPARATORS = %r{
         | 
| @@ -229,7 +248,7 @@ module DwcAgent | |
| 229 248 | 
             
                (?i:though)|
         | 
| 230 249 | 
             
                (?i:texas\s+instruments?)\s*?(for)?|
         | 
| 231 250 | 
             
                (?:tropical)|
         | 
| 232 | 
            -
                (?i:toward|seen | 
| 251 | 
            +
                (?i:toward|seen\s+at)|
         | 
| 233 252 | 
             
                (?i:unidentified|unspecified|unk?nown?|unnamed|unread|unmistak|no agent)|
         | 
| 234 253 | 
             
                (?i:urn\:)|
         | 
| 235 254 | 
             
                (?i:usda|ucla)|
         | 
| @@ -238,29 +257,47 @@ module DwcAgent | |
| 238 257 | 
             
              }x
         | 
| 239 258 |  | 
| 240 259 | 
             
              FAMILY_BLACKLIST = [
         | 
| 260 | 
            +
                "ap",
         | 
| 241 261 | 
             
                "da",
         | 
| 262 | 
            +
                "de",
         | 
| 242 263 | 
             
                "de'",
         | 
| 243 264 | 
             
                "del",
         | 
| 244 265 | 
             
                "der",
         | 
| 266 | 
            +
                "di",
         | 
| 267 | 
            +
                "do",
         | 
| 268 | 
            +
                "dos",
         | 
| 245 269 | 
             
                "du",
         | 
| 246 270 | 
             
                "el",
         | 
| 271 | 
            +
                "le",
         | 
| 272 | 
            +
                "la",
         | 
| 247 273 | 
             
                "van",
         | 
| 248 274 | 
             
                "von",
         | 
| 249 275 | 
             
                "the",
         | 
| 250 276 | 
             
                "of",
         | 
| 251 277 | 
             
                "adjustment",
         | 
| 278 | 
            +
                "annotator",
         | 
| 252 279 | 
             
                "available",
         | 
| 253 280 | 
             
                "arachnology",
         | 
| 254 281 | 
             
                "catalogue",
         | 
| 255 282 | 
             
                "curators",
         | 
| 256 283 | 
             
                "data",
         | 
| 284 | 
            +
                "details",
         | 
| 285 | 
            +
                "determiner",
         | 
| 257 286 | 
             
                "determination",
         | 
| 258 287 | 
             
                "dissection",
         | 
| 259 288 | 
             
                "entered",
         | 
| 289 | 
            +
                "erased",
         | 
| 290 | 
            +
                "indecipherable",
         | 
| 260 291 | 
             
                "nomenclatural",
         | 
| 261 292 | 
             
                "orig",
         | 
| 262 293 | 
             
                "registration",
         | 
| 263 | 
            -
                "science"
         | 
| 294 | 
            +
                "science",
         | 
| 295 | 
            +
                "wg",
         | 
| 296 | 
            +
                "wm",
         | 
| 297 | 
            +
                "wn",
         | 
| 298 | 
            +
                "zw",
         | 
| 299 | 
            +
                "zz",
         | 
| 300 | 
            +
                "z-"
         | 
| 264 301 | 
             
              ]
         | 
| 265 302 |  | 
| 266 303 | 
             
              GIVEN_BLACKLIST = [
         | 
| @@ -268,7 +305,7 @@ module DwcAgent | |
| 268 305 | 
             
                "has not"
         | 
| 269 306 | 
             
              ]
         | 
| 270 307 |  | 
| 271 | 
            -
              TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt| | 
| 308 | 
            +
              TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|major|capt|cmdr|lt|sgt|cpl|pvt|proff?|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
         | 
| 272 309 |  | 
| 273 310 | 
             
              APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
         | 
| 274 311 |  | 
    
        data/lib/dwc_agent/parser.rb
    CHANGED
    
    | @@ -20,7 +20,7 @@ module DwcAgent | |
| 20 20 | 
             
                  @strip_out_regex = Regexp.new STRIP_OUT.to_s
         | 
| 21 21 | 
             
                  @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
         | 
| 22 22 | 
             
                  @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
         | 
| 23 | 
            -
                  @phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.join('|').to_s | 
| 23 | 
            +
                  @phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
         | 
| 24 24 | 
             
                  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
         | 
| 25 25 | 
             
                  @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
         | 
| 26 26 | 
             
                end
         | 
    
        data/lib/dwc_agent/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: dwc_agent
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 1.5.0. | 
| 4 | 
            +
              version: 1.5.0.7
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - David P. Shorthouse
         | 
| 8 | 
            -
            autorequire: | 
| 8 | 
            +
            autorequire:
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2020- | 
| 11 | 
            +
            date: 2020-09-15 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: namae
         | 
| @@ -102,7 +102,7 @@ homepage: https://github.com/bionomia/dwc_agent | |
| 102 102 | 
             
            licenses:
         | 
| 103 103 | 
             
            - MIT
         | 
| 104 104 | 
             
            metadata: {}
         | 
| 105 | 
            -
            post_install_message: | 
| 105 | 
            +
            post_install_message:
         | 
| 106 106 | 
             
            rdoc_options:
         | 
| 107 107 | 
             
            - "--encoding"
         | 
| 108 108 | 
             
            - UTF-8
         | 
| @@ -119,8 +119,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 119 119 | 
             
                - !ruby/object:Gem::Version
         | 
| 120 120 | 
             
                  version: '0'
         | 
| 121 121 | 
             
            requirements: []
         | 
| 122 | 
            -
            rubygems_version: 3. | 
| 123 | 
            -
            signing_key: | 
| 122 | 
            +
            rubygems_version: 3.1.2
         | 
| 123 | 
            +
            signing_key:
         | 
| 124 124 | 
             
            specification_version: 4
         | 
| 125 125 | 
             
            summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
         | 
| 126 126 | 
             
            test_files: []
         |