RubyGems - confidential_info_redactor_lite - Versions diffs - 0.0.31 → 0.0.32 - Mend

confidential_info_redactor_lite 0.0.31 → 0.0.32

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/lib/confidential_info_redactor_lite/extractor.rb +12 -10
data/lib/confidential_info_redactor_lite/hyperlink.rb +1 -12
data/lib/confidential_info_redactor_lite/version.rb +1 -1
data/spec/confidential_info_redactor_lite/extractor_spec.rb +20 -0
data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +0 -44
data/spec/confidential_info_redactor_lite/redactor_spec.rb +5 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e4392706f3a5e307a5453009fadcf4010a7162da
-  data.tar.gz: 107759c2fd4bbe553f6389bb4c419698c0513285
+  metadata.gz: 69347cc82e199d28d8bf542fba216c7b8c6d3410
+  data.tar.gz: ceebbfa4046ba5fd1349d47c6eff0111ad18b091
 SHA512:
-  metadata.gz: 77ea54f659ac2a4e340fa55ac5afc558d0e6807630c1b646554da52a3a0a88f138c9db932d92b86e06b7334b79ea91b97b93011d1afea055a427953d148d55cf
-  data.tar.gz: 62c0fd961e7ab69fb571f5ab5edae83d736feb1bf81a77d33195ad7c3fdfe7e5cd99ccb2a2dc5c65e0e49e6a0da0c009ce362ac854aa93ffe0b576ccd91f0369
+  metadata.gz: 13b2789ed7b91d38fb3963a554b5f7570b13f7ce39891f4a43be479fc59770794d513f74c1c3ea66c1d312a0b3b919644c096ec4a726b529ffd4e23913cf32d1
+  data.tar.gz: f2a713b84a11c782f452bc2b4279bf5142e2b1ecbc7eb8374f77f924937bd759932cf870a4ecad2416d80adf6bbcbade97b8ff17de249a379e46c754197923ba

data/lib/confidential_info_redactor_lite/extractor.rb CHANGED Viewed

@@ -2,7 +2,9 @@ module ConfidentialInfoRedactorLite
   # This class extracts proper nouns from a text
   class Extractor
     # Rubular: http://rubular.com/r/qE0g4r9zR7
-    EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
+    EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
+    PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
     attr_reader :text, :language, :corpus
     def initialize(text:, corpus:, **args)
       @text = text.gsub(/[’‘]/, "'")
@@ -13,29 +15,29 @@ module ConfidentialInfoRedactorLite
     def extract
       extracted_terms = []
       PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
-        initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
+        initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
         in_corpus = true
         initial_extracted_terms.each do |ngram|
-          ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
-            unless corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip)
+          ngram.split(PUNCTUATION_REGEX).each do |t|
+            unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
               in_corpus = false
             end
           end
         end
         next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
         initial_extracted_terms.each do |ngram|
-          ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
+          ngram.split(PUNCTUATION_REGEX).each do |t|
             next if !(t !~ /.*\d+.*/)
-            if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
-              extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
+            if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
+              extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
             else
               tracker = true
-              unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
-                t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
+              unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
+                t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
                   tracker = false if corpus.include?(token.downcase)
                 end
               end
-              extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('n'))
+              extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
             end
           end
         end

data/lib/confidential_info_redactor_lite/hyperlink.rb CHANGED Viewed

@@ -1,8 +1,5 @@
-require 'uri'
 module ConfidentialInfoRedactorLite
   class Hyperlink
-    NON_HYPERLINK_REGEX = /\A\w+:$/
     # Rubular: http://rubular.com/r/fXa4lp0gfS
     HYPERLINK_REGEX = /(http|https|www)(\.|:)/
@@ -12,18 +9,10 @@ module ConfidentialInfoRedactorLite
       @string = string
     end
-    def hyperlink?
-      !(string !~ URI.regexp) && string !~ NON_HYPERLINK_REGEX && !(string !~ HYPERLINK_REGEX)
-    end
     def replace
       new_string = string.dup
       string.split(/\s+/).each do |token|
-        if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
-          new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
-        elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
-          new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
-        end
+        new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
       end
       new_string
     end

data/lib/confidential_info_redactor_lite/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module ConfidentialInfoRedactorLite
-  VERSION = "0.0.31"
+  VERSION = "0.0.32"
 end

data/spec/confidential_info_redactor_lite/extractor_spec.rb CHANGED Viewed

@@ -161,6 +161,26 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
         text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
         expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
       end
+       it 'extracts the proper nouns from a text #004' do
+        text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
+        expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
+      end
+      it 'extracts the proper nouns from a text #005' do
+        text = 'Viele de Mitarbeiters der «Deutsche Bank» suchen eine andere Arbeitsstelle.'
+        expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
+      end
+      it 'extracts the proper nouns from a text #006' do
+        text = 'Viele de Mitarbeiters der ‹Deutsche Bank› suchen eine andere Arbeitsstelle.'
+        expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
+      end
+      it 'extracts the proper nouns from a text #007' do
+        text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
+        expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
+      end
     end
   end
 end

data/spec/confidential_info_redactor_lite/hyperlink_spec.rb CHANGED Viewed

@@ -1,50 +1,6 @@
 require 'spec_helper'
 RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
-  context '#hyperlink?' do
-    it 'returns true if the string is a hyperlink #001' do
-      string = "http://www.example.com/this-IS-a_test/hello.html"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(true)
-    end
-    it 'returns true if the string is a hyperlink #002' do
-      string = "http://www.google.co.uk"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(true)
-    end
-    it 'returns true if the string is a hyperlink #003' do
-      string = "https://google.co.uk"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(true)
-    end
-    it 'returns false if the string is not a hyperlink #004' do
-      string = "hello"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(false)
-    end
-    it 'returns false if the string is not a hyperlink #005' do
-      string = "john@gmail.com"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(false)
-    end
-    it 'returns false if the string is not a hyperlink #006' do
-      string = "date:"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(false)
-    end
-    it 'returns false if the string is not a hyperlink #007' do
-      string = 'The file location is c:\Users\johndoe.'
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(false)
-    end
-  end
   context '#replace' do
     it 'replaces the hyperlinks in a string with regular tokens #001' do
       string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"

data/spec/confidential_info_redactor_lite/redactor_spec.rb CHANGED Viewed

@@ -129,6 +129,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
       text = 'Visit https://www.tm-town.com for more info.'
       expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
     end
+    it 'redacts hyperlinks from a text #002' do
+      text = 'Visit www.tm-town.com for more info.'
+      expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
+    end
   end
   describe '#hyperlinks_html' do

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: confidential_info_redactor_lite
 version: !ruby/object:Gem::Version
-  version: 0.0.31
+  version: 0.0.32
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-04-27 00:00:00.000000000 Z
+date: 2015-05-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler