RubyGems - confidential_info_redactor_lite - Versions diffs - 0.0.18 → 0.0.19 - Mend

confidential_info_redactor_lite 0.0.18 → 0.0.19

Files changed (5)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 52f6dc75062fd5b96bf6ebddda6091ece0984a5f
-  data.tar.gz: f1e59cf4d26237d79076474ca22ee5eaeb27ae41
+  metadata.gz: ecfaf02e5c062c12167515680d1ce6e4a0f73d62
+  data.tar.gz: 069f0d297228e1fa76ab940776b43398a0ca130e
 SHA512:
-  metadata.gz: 9295558573274c81494ad53a53f61da538cba25a8bf5b32035a0af7baa78f8e2b83b63ecd3516902fc2457acfda037e5a45bf9106a4976d8c7489554182ebb34
-  data.tar.gz: 3b881aef75c2df2c7507034feb57ab02deba5ba4735bf50ca00c63d83eee034fd0c92a53b80a0fdf77203a8a19434c70850847cf08b9b3adc1b9796acd0e21af
+  metadata.gz: fe912ff29029746481b880e81090fb8298f3b0581270bb954f903afd8a9c6ff7d0e6da0269af0eb2a8ad444562811d68d0faee43eadd329e3dedd59eb6330400
+  data.tar.gz: 4dd204b8f0d34e0d13beaa1795806c4a39c406d01ee258367a7c877cf144e8ea7c3866c65b4910dc6a13b4352a669b11c61fb9df9813fe64de5546a89b97005c

@@ -16,6 +16,7 @@ module ConfidentialInfoRedactorLite
         initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
         initial_extracted_terms.each do |ngram|
           ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
+            next if !(t !~ /.*\d+.*/)
             if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
               extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
             else

@@ -1,3 +1,3 @@
 module ConfidentialInfoRedactorLite
-  VERSION = "0.0.18"
+  VERSION = "0.0.19"
 end

@@ -134,6 +134,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
         text = '“Reducing'
         expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
       end
+      it 'extracts the proper nouns from a text #016' do
+        text = 'Corrigendum to Council Regulation (EC) No 85/2009 of 19 January 2009 amending Regulation (EC) No 1083/2006 laying down general provisions on the European Regional Development Fund, the European Social Fund and the Cohesion Fund concerning certain provisions relating to financial management'
+        expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Corrigendum", "Council Regulation", "No", "January", "Regulation", "European Regional Development Fund", "European Social Fund", "Cohesion Fund"])
+      end
     end
     context 'German (de)' do

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: confidential_info_redactor_lite
 version: !ruby/object:Gem::Version
-  version: 0.0.18
+  version: 0.0.19
 platform: ruby
 authors:
 - Kevin S. Dias