RubyGems - confidential_info_redactor_lite - Versions diffs - 1.0.10 → 1.0.11 - Mend

confidential_info_redactor_lite 1.0.10 → 1.0.11

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml +4 -4
data/lib/confidential_info_redactor_lite/extractor.rb +7 -8
data/lib/confidential_info_redactor_lite/version.rb +1 -1
data/spec/confidential_info_redactor_lite/extractor_spec.rb +6 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8912d3e15f41e1f844eaa419ab85cb2ea6a17c62
-  data.tar.gz: 5ddbf4fa910e5a439c43b8f0734f981ab1a18249
+  metadata.gz: 6a5ec9e9eeb82984f87cba0c612404b7a6e9f4b4
+  data.tar.gz: 5e410055fe45224d70930f7e6c8f2858eeda4bc0
 SHA512:
-  metadata.gz: 61b8a8408644671cc111edc75b1faca3b0d92eee0faf1f9d119b4a29b95546764fe8648fd93f952bcdc7024493461aefe05da7c86b9cd3dae0ca1212703e3598
-  data.tar.gz: 832ba89f209760c18833b7d3bd300496700890231c6053ecb50536fd97d47c796a8f12792e620c05c34f6e39f40513144825f93f830b1ae27c6d3fb0a7da9724
+  metadata.gz: 5e3c14787d5013846629bbab16ef5117fe96a0e8d1193be266e1325f7d21ff978100f4af666289d933869f744d7bd0383e7c136f81d6bac733e72dabde40f766
+  data.tar.gz: ef113dda2f3a5928f1281d0fb96fd9d99bc8d4d8d7bc0c5aa5d227b77e3182b05934e498aa8c7481bb1c80ee51be7a1132c3740207b33e19264c9005bf8d5b77

data/lib/confidential_info_redactor_lite/extractor.rb CHANGED Viewed

@@ -15,7 +15,6 @@ module ConfidentialInfoRedactorLite
       extracted_terms = []
       PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
         initial_extracted_terms = extract_preliminary_terms(segment)
-        next if initial_extracted_terms.length.eql?(segment.split(' ').length) && !in_corpus?(initial_extracted_terms)
         search_ngrams(initial_extracted_terms, extracted_terms)
       end
       extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
@@ -27,21 +26,21 @@ module ConfidentialInfoRedactorLite
       segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
     end
-    def in_corpus?(tokens)
-      tokens.map { |token| token.split(PUNCTUATION_REGEX).map { |t| return true if corpus.include?(clean_token(t.downcase)) } }
-    end
     def clean_token(token)
       token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
     end
     def non_confidential_token?(token, includes_confidential)
-      corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
+      corpus.include?(token) || !includes_confidential || stem_in_corpus?(token)
     end
-    def singular_in_corpus?(token)
+    def stem_in_corpus?(token)
       corpus.include?(token[0...-1]) &&
-        token[-1].eql?('s')
+        token[-1].eql?('s') ||
+        corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
+        corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
+        corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
+        corpus.include?(token[0...-1]) && token[-1].eql?('n')
     end
     def includes_confidential?(token)

data/lib/confidential_info_redactor_lite/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module ConfidentialInfoRedactorLite
-  VERSION = "1.0.10"
+  VERSION = "1.0.11"
 end

data/spec/confidential_info_redactor_lite/extractor_spec.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 require 'spec_helper'
 RSpec.describe ConfidentialInfoRedactorLite::Extractor do
-  let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing'] }
+  let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing', 'ich', 'routine', 'studium', 'uni', 'tag', 'mahlzeit'] }
   describe '#extract' do
     context 'English (en)' do
       it 'extracts the proper nouns from a text #001' do
@@ -196,6 +196,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
         text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
         expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
       end
+      it 'extracts the proper nouns from a text #008' do
+        text = 'Ich behielt diese Routine während und sogar während des Studiums an der Uni bei, und ich war damals froh, wenn ich pro Tag zwei ganze Mahlzeiten zu mir nahm.'
+        expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq([])
+      end
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: confidential_info_redactor_lite
 version: !ruby/object:Gem::Version
-  version: 1.0.10
+  version: 1.0.11
 platform: ruby
 authors:
 - Kevin S. Dias