confidential_info_redactor_lite 0.0.14 → 0.0.15
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 456160b2c0748728398a7a5804f40389d141a56a
         | 
| 4 | 
            +
              data.tar.gz: 29194c0181768c1f35b9558e5934c6133a376a41
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: f8d4a592e035ccadef1c8bd595978d998dfa297064fef08abe396d7b309441b54dfbd5cd02a0aeca231765f25d5316a8d334af5018d3a6244ec40e493db5c653
         | 
| 7 | 
            +
              data.tar.gz: 22d0eea27d9a10fb5f42c8076b713995490ba95918761816cdeee4aa43acfb37248133944c4adcb396959f30bb7cbeb69624c8119abc5fea21c419043b4abfa4
         | 
| @@ -13,19 +13,19 @@ module ConfidentialInfoRedactorLite | |
| 13 13 | 
             
                def extract
         | 
| 14 14 | 
             
                  extracted_terms = []
         | 
| 15 15 | 
             
                  PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
         | 
| 16 | 
            -
                    initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '')) }.compact
         | 
| 16 | 
            +
                    initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
         | 
| 17 17 | 
             
                    initial_extracted_terms.each do |ngram|
         | 
| 18 18 | 
             
                      ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
         | 
| 19 | 
            -
                        if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
         | 
| 20 | 
            -
                          extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip.split(' ')[1])
         | 
| 19 | 
            +
                        if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
         | 
| 20 | 
            +
                          extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
         | 
| 21 21 | 
             
                        else
         | 
| 22 22 | 
             
                          tracker = true
         | 
| 23 | 
            -
                          unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
         | 
| 24 | 
            -
                            t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
         | 
| 23 | 
            +
                          unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
         | 
| 24 | 
            +
                            t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
         | 
| 25 25 | 
             
                              tracker = false if corpus.include?(token.downcase)
         | 
| 26 26 | 
             
                            end
         | 
| 27 27 | 
             
                          end
         | 
| 28 | 
            -
                          extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip) || !tracker
         | 
| 28 | 
            +
                          extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker
         | 
| 29 29 | 
             
                        end
         | 
| 30 30 | 
             
                      end
         | 
| 31 31 | 
             
                    end
         | 
| @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            require 'spec_helper'
         | 
| 2 2 |  | 
| 3 3 | 
             
            RSpec.describe ConfidentialInfoRedactorLite::Extractor do
         | 
| 4 | 
            -
              let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy'] }
         | 
| 4 | 
            +
              let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing'] }
         | 
| 5 5 | 
             
              describe '#extract' do
         | 
| 6 6 | 
             
                context 'English (en)' do
         | 
| 7 7 | 
             
                  it 'extracts the proper nouns from a text #001' do
         | 
| @@ -119,6 +119,21 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do | |
| 119 119 | 
             
                    text = 'GOOD CARBS VS. BAD CARBS'
         | 
| 120 120 | 
             
                    expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
         | 
| 121 121 | 
             
                  end
         | 
| 122 | 
            +
             | 
| 123 | 
            +
                  it 'extracts the proper nouns from a text #013' do
         | 
| 124 | 
            +
                    text = 'Reducing”'
         | 
| 125 | 
            +
                    expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
         | 
| 126 | 
            +
                  end
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                  it 'extracts the proper nouns from a text #014' do
         | 
| 129 | 
            +
                    text = '”'
         | 
| 130 | 
            +
                    expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
         | 
| 131 | 
            +
                  end
         | 
| 132 | 
            +
             | 
| 133 | 
            +
                  it 'extracts the proper nouns from a text #015' do
         | 
| 134 | 
            +
                    text = '“Reducing'
         | 
| 135 | 
            +
                    expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
         | 
| 136 | 
            +
                  end
         | 
| 122 137 | 
             
                end
         | 
| 123 138 |  | 
| 124 139 | 
             
                context 'German (de)' do
         |