RubyGems - confidential_info_redactor_lite - Versions diffs - 0.0.31 → 0.0.32 - Mend

confidential_info_redactor_lite 0.0.31 → 0.0.32

Files changed (8) hide show

checksums.yaml +4 -4
data/lib/confidential_info_redactor_lite/extractor.rb +12 -10
data/lib/confidential_info_redactor_lite/hyperlink.rb +1 -12
data/lib/confidential_info_redactor_lite/version.rb +1 -1
data/spec/confidential_info_redactor_lite/extractor_spec.rb +20 -0
data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +0 -44
data/spec/confidential_info_redactor_lite/redactor_spec.rb +5 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e4392706f3a5e307a5453009fadcf4010a7162da
-  data.tar.gz: 107759c2fd4bbe553f6389bb4c419698c0513285
+  metadata.gz: 69347cc82e199d28d8bf542fba216c7b8c6d3410
+  data.tar.gz: ceebbfa4046ba5fd1349d47c6eff0111ad18b091
 SHA512:
-  metadata.gz: 77ea54f659ac2a4e340fa55ac5afc558d0e6807630c1b646554da52a3a0a88f138c9db932d92b86e06b7334b79ea91b97b93011d1afea055a427953d148d55cf
-  data.tar.gz: 62c0fd961e7ab69fb571f5ab5edae83d736feb1bf81a77d33195ad7c3fdfe7e5cd99ccb2a2dc5c65e0e49e6a0da0c009ce362ac854aa93ffe0b576ccd91f0369
+  metadata.gz: 13b2789ed7b91d38fb3963a554b5f7570b13f7ce39891f4a43be479fc59770794d513f74c1c3ea66c1d312a0b3b919644c096ec4a726b529ffd4e23913cf32d1
+  data.tar.gz: f2a713b84a11c782f452bc2b4279bf5142e2b1ecbc7eb8374f77f924937bd759932cf870a4ecad2416d80adf6bbcbade97b8ff17de249a379e46c754197923ba

data/lib/confidential_info_redactor_lite/extractor.rb CHANGED Viewed

@@ -2,7 +2,9 @@ module ConfidentialInfoRedactorLite
   # This class extracts proper nouns from a text
   class Extractor
     # Rubular: http://rubular.com/r/qE0g4r9zR7
-    EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
+    EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
+    PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
     attr_reader :text, :language, :corpus
     def initialize(text:, corpus:, **args)
       @text = text.gsub(/[’‘]/, "'")
@@ -13,29 +15,29 @@ module ConfidentialInfoRedactorLite
     def extract
       extracted_terms = []
       PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
-        initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
+        initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
         in_corpus = true
         initial_extracted_terms.each do |ngram|
-          ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
-            unless corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip)
+          ngram.split(PUNCTUATION_REGEX).each do |t|
+            unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
               in_corpus = false
             end
           end
         end
         next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
         initial_extracted_terms.each do |ngram|
-          ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
+          ngram.split(PUNCTUATION_REGEX).each do |t|
             next if !(t !~ /.*\d+.*/)
-            if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
-              extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
+            if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
+              extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
             else
               tracker = true
-              unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
-                t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
+              unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
+                t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
                   tracker = false if corpus.include?(token.downcase)
                 end
               end
-              extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('n'))
+              extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
             end
           end
         end

data/lib/confidential_info_redactor_lite/hyperlink.rb CHANGED Viewed

@@ -1,8 +1,5 @@
-require 'uri'
 module ConfidentialInfoRedactorLite
   class Hyperlink
-    NON_HYPERLINK_REGEX = /\A\w+:$/
     # Rubular: http://rubular.com/r/fXa4lp0gfS
     HYPERLINK_REGEX = /(http|https|www)(\.|:)/
@@ -12,18 +9,10 @@ module ConfidentialInfoRedactorLite
       @string = string
     end
-    def hyperlink?
-      !(string !~ URI.regexp) && string !~ NON_HYPERLINK_REGEX && !(string !~ HYPERLINK_REGEX)
-    end
     def replace
       new_string = string.dup
       string.split(/\s+/).each do |token|
-        if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
-          new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
-        elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
-          new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
-        end
+        new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
       end
       new_string
     end

data/lib/confidential_info_redactor_lite/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module ConfidentialInfoRedactorLite
-  VERSION = "0.0.31"
+  VERSION = "0.0.32"
 end

data/spec/confidential_info_redactor_lite/extractor_spec.rb CHANGED Viewed

@@ -161,6 +161,26 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
         text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
         expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
       end
+       it 'extracts the proper nouns from a text #004' do
+        text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
+        expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
+      end
+      it 'extracts the proper nouns from a text #005' do
+        text = 'Viele de Mitarbeiters der «Deutsche Bank» suchen eine andere Arbeitsstelle.'
+        expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
+      end
+      it 'extracts the proper nouns from a text #006' do
+        text = 'Viele de Mitarbeiters der ‹Deutsche Bank› suchen eine andere Arbeitsstelle.'
+        expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
+      end
+      it 'extracts the proper nouns from a text #007' do
+        text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
+        expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
+      end
     end
   end
 end

data/spec/confidential_info_redactor_lite/hyperlink_spec.rb CHANGED Viewed

@@ -1,50 +1,6 @@
 require 'spec_helper'
 RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
-  context '#hyperlink?' do
-    it 'returns true if the string is a hyperlink #001' do
-      string = "http://www.example.com/this-IS-a_test/hello.html"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(true)
-    end
-    it 'returns true if the string is a hyperlink #002' do
-      string = "http://www.google.co.uk"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(true)
-    end
-    it 'returns true if the string is a hyperlink #003' do
-      string = "https://google.co.uk"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(true)
-    end
-    it 'returns false if the string is not a hyperlink #004' do
-      string = "hello"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(false)
-    end
-    it 'returns false if the string is not a hyperlink #005' do
-      string = "john@gmail.com"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(false)
-    end
-    it 'returns false if the string is not a hyperlink #006' do
-      string = "date:"
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(false)
-    end
-    it 'returns false if the string is not a hyperlink #007' do
-      string = 'The file location is c:\Users\johndoe.'
-      ws = described_class.new(string: string)
-      expect(ws.hyperlink?).to eq(false)
-    end
-  end
   context '#replace' do
     it 'replaces the hyperlinks in a string with regular tokens #001' do
       string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"

data/spec/confidential_info_redactor_lite/redactor_spec.rb CHANGED Viewed

@@ -129,6 +129,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
       text = 'Visit https://www.tm-town.com for more info.'
       expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
     end
+    it 'redacts hyperlinks from a text #002' do
+      text = 'Visit www.tm-town.com for more info.'
+      expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
+    end
   end
   describe '#hyperlinks_html' do

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: confidential_info_redactor_lite
 version: !ruby/object:Gem::Version
-  version: 0.0.31
+  version: 0.0.32
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-04-27 00:00:00.000000000 Z
+date: 2015-05-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler