confidential_info_redactor 0.0.18 → 1.0.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/README.md +11 -11
 - data/lib/confidential_info_redactor/date.rb +98 -118
 - data/lib/confidential_info_redactor/extractor.rb +66 -33
 - data/lib/confidential_info_redactor/hyperlink.rb +5 -17
 - data/lib/confidential_info_redactor/redactor.rb +13 -13
 - data/lib/confidential_info_redactor/version.rb +1 -1
 - data/lib/confidential_info_redactor/word_lists.rb +2 -2
 - data/lib/confidential_info_redactor.rb +2 -1
 - data/spec/confidential_info_redactor/date_spec.rb +88 -88
 - data/spec/confidential_info_redactor/extractor_spec.rb +20 -20
 - data/spec/confidential_info_redactor/hyperlink_spec.rb +18 -18
 - data/spec/confidential_info_redactor/redactor_spec.rb +22 -22
 - metadata +2 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: e5b5d282da6e72d809ac29188c1423775d513931
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 09252aaf2b2e49f1360d03470d38c1d82a6e968d
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 50c47839331dba86546a92929140d5156599a8699a0d0ecb79f2f0cb767837c206ac14784c661818a1519d2f06b7c0323561477e868ee63aae94580acc191929
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: acfc20dd05bf76f7ed8b719643afb3017a302fd6d1cd1eb1f68060513a1059d1efb19c68efdb935c40860afea8d42c63bdc9a28ae99df52eaef0a4a5cdcc59dd
         
     | 
    
        data/README.md
    CHANGED
    
    | 
         @@ -40,45 +40,45 @@ gem 'confidential_info_redactor' 
     | 
|
| 
       40 
40 
     | 
    
         
             
            ```ruby
         
     | 
| 
       41 
41 
     | 
    
         
             
            text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
         
     | 
| 
       42 
42 
     | 
    
         | 
| 
       43 
     | 
    
         
            -
            tokens = ConfidentialInfoRedactor::Extractor.new(text 
     | 
| 
      
 43 
     | 
    
         
            +
            tokens = ConfidentialInfoRedactor::Extractor.new.extract(text)
         
     | 
| 
       44 
44 
     | 
    
         
             
            # => ["Coca-Cola", "Pepsi", "John Smith"]
         
     | 
| 
       45 
45 
     | 
    
         | 
| 
       46 
     | 
    
         
            -
            ConfidentialInfoRedactor::Redactor.new( 
     | 
| 
      
 46 
     | 
    
         
            +
            ConfidentialInfoRedactor::Redactor.new(tokens: tokens).redact(text)
         
     | 
| 
       47 
47 
     | 
    
         
             
            # => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.'
         
     | 
| 
       48 
48 
     | 
    
         | 
| 
       49 
49 
     | 
    
         
             
            # You can also just use a specific redactor
         
     | 
| 
       50 
     | 
    
         
            -
            ConfidentialInfoRedactor::Redactor.new(text 
     | 
| 
      
 50 
     | 
    
         
            +
            ConfidentialInfoRedactor::Redactor.new.dates(text)
         
     | 
| 
       51 
51 
     | 
    
         
             
            # => 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
         
     | 
| 
       52 
52 
     | 
    
         | 
| 
       53 
     | 
    
         
            -
            ConfidentialInfoRedactor::Redactor.new(text 
     | 
| 
      
 53 
     | 
    
         
            +
            ConfidentialInfoRedactor::Redactor.new.numbers(text)
         
     | 
| 
       54 
54 
     | 
    
         
             
            # => 'Coca-Cola announced a merger with Pepsi that will happen on December <redacted number>, <redacted number> for <redacted number>. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
         
     | 
| 
       55 
55 
     | 
    
         | 
| 
       56 
     | 
    
         
            -
            ConfidentialInfoRedactor::Redactor.new(text 
     | 
| 
      
 56 
     | 
    
         
            +
            ConfidentialInfoRedactor::Redactor.new.emails(text)
         
     | 
| 
       57 
57 
     | 
    
         
             
            # => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at <redacted> or visit http://www.super-fake-merger.com.'
         
     | 
| 
       58 
58 
     | 
    
         | 
| 
       59 
     | 
    
         
            -
            ConfidentialInfoRedactor::Redactor.new(text 
     | 
| 
      
 59 
     | 
    
         
            +
            ConfidentialInfoRedactor::Redactor.new.hyperlinks(text)
         
     | 
| 
       60 
60 
     | 
    
         
             
            # => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit <redacted>.'
         
     | 
| 
       61 
61 
     | 
    
         | 
| 
       62 
     | 
    
         
            -
            ConfidentialInfoRedactor::Redactor.new( 
     | 
| 
      
 62 
     | 
    
         
            +
            ConfidentialInfoRedactor::Redactor.new(tokens: tokens).proper_nouns(text)
         
     | 
| 
       63 
63 
     | 
    
         
             
            # => '<redacted> announced a merger with <redacted> that will happen on December 15th, 2020 for $200,000,000,000. Please contact <redacted> at j.smith@example.com or visit http://www.super-fake-merger.com.'
         
     | 
| 
       64 
64 
     | 
    
         | 
| 
       65 
65 
     | 
    
         
             
            # It is possible to 'turn off' any of the specific redactors
         
     | 
| 
       66 
     | 
    
         
            -
            ConfidentialInfoRedactor::Redactor.new( 
     | 
| 
      
 66 
     | 
    
         
            +
            ConfidentialInfoRedactor::Redactor.new(tokens: tokens, ignore_numbers: true).redact(text)
         
     | 
| 
       67 
67 
     | 
    
         
             
            # => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.'
         
     | 
| 
       68 
68 
     | 
    
         | 
| 
       69 
69 
     | 
    
         
             
            # German Example
         
     | 
| 
       70 
70 
     | 
    
         
             
            text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.'
         
     | 
| 
       71 
71 
     | 
    
         | 
| 
       72 
     | 
    
         
            -
            tokens = ConfidentialInfoRedactor::Extractor.new( 
     | 
| 
      
 72 
     | 
    
         
            +
            tokens = ConfidentialInfoRedactor::Extractor.new(language: 'de').extract(text)
         
     | 
| 
       73 
73 
     | 
    
         
             
            # => ['Deutschen Bank']
         
     | 
| 
       74 
74 
     | 
    
         | 
| 
       75 
     | 
    
         
            -
            ConfidentialInfoRedactor::Redactor.new( 
     | 
| 
      
 75 
     | 
    
         
            +
            ConfidentialInfoRedactor::Redactor.new(language: 'de', tokens: tokens).redact(text)
         
     | 
| 
       76 
76 
     | 
    
         
             
            # => 'Viele Mitarbeiter der <redacted> suchen eine andere Arbeitsstelle.'
         
     | 
| 
       77 
77 
     | 
    
         | 
| 
       78 
78 
     | 
    
         
             
            # It is also possible to change the redaction text
         
     | 
| 
       79 
79 
     | 
    
         
             
            text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
         
     | 
| 
       80 
80 
     | 
    
         
             
            tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
         
     | 
| 
       81 
     | 
    
         
            -
            ConfidentialInfoRedactor::Redactor.new( 
     | 
| 
      
 81 
     | 
    
         
            +
            ConfidentialInfoRedactor::Redactor.new(tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact(text)
         
     | 
| 
       82 
82 
     | 
    
         
             
            # => '***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.'
         
     | 
| 
       83 
83 
     | 
    
         
             
            ```
         
     | 
| 
       84 
84 
     | 
    
         | 
| 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module ConfidentialInfoRedactor
         
     | 
| 
       2 
2 
     | 
    
         
             
              class Date
         
     | 
| 
       3 
     | 
    
         
            -
                EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday)
         
     | 
| 
       4 
     | 
    
         
            -
                EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun)
         
     | 
| 
       5 
     | 
    
         
            -
                EN_MONTHS = %w(january february march april may june july august september october november december)
         
     | 
| 
       6 
     | 
    
         
            -
                EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
         
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
                DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend)
         
     | 
| 
       9 
     | 
    
         
            -
                DE_DOW_ABBR = %w(mo di mi do fr sa so)
         
     | 
| 
       10 
     | 
    
         
            -
                DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember)
         
     | 
| 
       11 
     | 
    
         
            -
                DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez)
         
     | 
| 
      
 3 
     | 
    
         
            +
                EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday).freeze
         
     | 
| 
      
 4 
     | 
    
         
            +
                EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun).freeze
         
     | 
| 
      
 5 
     | 
    
         
            +
                EN_MONTHS = %w(january february march april may june july august september october november december).freeze
         
     | 
| 
      
 6 
     | 
    
         
            +
                EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec).freeze
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
                DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend).freeze
         
     | 
| 
      
 9 
     | 
    
         
            +
                DE_DOW_ABBR = %w(mo di mi do fr sa so).freeze
         
     | 
| 
      
 10 
     | 
    
         
            +
                DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember).freeze
         
     | 
| 
      
 11 
     | 
    
         
            +
                DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez).freeze
         
     | 
| 
       12 
12 
     | 
    
         
             
                # Rubular: http://rubular.com/r/73CZ2HU0q6
         
     | 
| 
       13 
13 
     | 
    
         
             
                DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}/
         
     | 
| 
       14 
14 
     | 
    
         | 
| 
         @@ -21,9 +21,8 @@ module ConfidentialInfoRedactor 
     | 
|
| 
       21 
21 
     | 
    
         
             
                # Rubular: http://rubular.com/r/mpVSeaKwdY
         
     | 
| 
       22 
22 
     | 
    
         
             
                DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D/
         
     | 
| 
       23 
23 
     | 
    
         | 
| 
       24 
     | 
    
         
            -
                attr_reader : 
     | 
| 
       25 
     | 
    
         
            -
                def initialize( 
     | 
| 
       26 
     | 
    
         
            -
                  @string = string
         
     | 
| 
      
 24 
     | 
    
         
            +
                attr_reader :language, :dow, :dow_abbr, :months, :months_abbr
         
     | 
| 
      
 25 
     | 
    
         
            +
                def initialize(language:)
         
     | 
| 
       27 
26 
     | 
    
         
             
                  @language = language
         
     | 
| 
       28 
27 
     | 
    
         
             
                  case language
         
     | 
| 
       29 
28 
     | 
    
         
             
                  when 'en'
         
     | 
| 
         @@ -44,133 +43,114 @@ module ConfidentialInfoRedactor 
     | 
|
| 
       44 
43 
     | 
    
         
             
                  end
         
     | 
| 
       45 
44 
     | 
    
         
             
                end
         
     | 
| 
       46 
45 
     | 
    
         | 
| 
       47 
     | 
    
         
            -
                def includes_date?
         
     | 
| 
       48 
     | 
    
         
            -
                   
     | 
| 
      
 46 
     | 
    
         
            +
                def includes_date?(text)
         
     | 
| 
      
 47 
     | 
    
         
            +
                  includes_long_date?(text) || includes_number_only_date?(text)
         
     | 
| 
       49 
48 
     | 
    
         
             
                end
         
     | 
| 
       50 
49 
     | 
    
         | 
| 
       51 
     | 
    
         
            -
                def replace
         
     | 
| 
       52 
     | 
    
         
            -
                   
     | 
| 
      
 50 
     | 
    
         
            +
                def replace(text)
         
     | 
| 
      
 51 
     | 
    
         
            +
                  return text unless is_an_array?
         
     | 
| 
       53 
52 
     | 
    
         
             
                  counter = 0
         
     | 
| 
       54 
     | 
    
         
            -
                  dow_abbr. 
     | 
| 
       55 
     | 
    
         
            -
             
     | 
| 
       56 
     | 
    
         
            -
                   
     | 
| 
      
 53 
     | 
    
         
            +
                  dow_abbr.map { |day| counter +=1 if text.include?('day') }
         
     | 
| 
      
 54 
     | 
    
         
            +
                  text = redact_dates(counter, text)
         
     | 
| 
      
 55 
     | 
    
         
            +
                  redact_regex(text)
         
     | 
| 
      
 56 
     | 
    
         
            +
                end
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
                def occurences(text)
         
     | 
| 
      
 59 
     | 
    
         
            +
                  replace(text).scan(/<redacted date>/).size
         
     | 
| 
      
 60 
     | 
    
         
            +
                end
         
     | 
| 
      
 61 
     | 
    
         
            +
             
     | 
| 
      
 62 
     | 
    
         
            +
                def replace_number_only_date(text)
         
     | 
| 
      
 63 
     | 
    
         
            +
                  text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
         
     | 
| 
      
 64 
     | 
    
         
            +
                      .gsub(YMD_YDM_REGEX, ' <redacted date> ')
         
     | 
| 
      
 65 
     | 
    
         
            +
                      .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
         
     | 
| 
      
 66 
     | 
    
         
            +
                      .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
         
     | 
| 
      
 67 
     | 
    
         
            +
                end
         
     | 
| 
      
 68 
     | 
    
         
            +
             
     | 
| 
      
 69 
     | 
    
         
            +
                private
         
     | 
| 
      
 70 
     | 
    
         
            +
             
     | 
| 
      
 71 
     | 
    
         
            +
                def is_an_array?
         
     | 
| 
      
 72 
     | 
    
         
            +
                  dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
         
     | 
| 
      
 73 
     | 
    
         
            +
                end
         
     | 
| 
      
 74 
     | 
    
         
            +
             
     | 
| 
      
 75 
     | 
    
         
            +
                def redact_dates(counter, text)
         
     | 
| 
       57 
76 
     | 
    
         
             
                  if counter > 0
         
     | 
| 
       58 
     | 
    
         
            -
                     
     | 
| 
       59 
     | 
    
         
            -
             
     | 
| 
       60 
     | 
    
         
            -
                        new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       61 
     | 
    
         
            -
                      end
         
     | 
| 
       62 
     | 
    
         
            -
                      months_abbr.each do |month|
         
     | 
| 
       63 
     | 
    
         
            -
                        new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       64 
     | 
    
         
            -
                      end
         
     | 
| 
       65 
     | 
    
         
            -
                    end
         
     | 
| 
       66 
     | 
    
         
            -
                    dow.each do |day|
         
     | 
| 
       67 
     | 
    
         
            -
                      months.each do |month|
         
     | 
| 
       68 
     | 
    
         
            -
                        new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       69 
     | 
    
         
            -
                                               .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
         
     | 
| 
       70 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       71 
     | 
    
         
            -
                                               .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       72 
     | 
    
         
            -
                                               .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
         
     | 
| 
       73 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       74 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       75 
     | 
    
         
            -
                      end
         
     | 
| 
       76 
     | 
    
         
            -
                      months_abbr.each do |month|
         
     | 
| 
       77 
     | 
    
         
            -
                        new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       78 
     | 
    
         
            -
                                               .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
         
     | 
| 
       79 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       80 
     | 
    
         
            -
                                               .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       81 
     | 
    
         
            -
                                               .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
         
     | 
| 
       82 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       83 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       84 
     | 
    
         
            -
                      end
         
     | 
| 
       85 
     | 
    
         
            -
                    end
         
     | 
| 
      
 77 
     | 
    
         
            +
                    text = redact_dow_abbr(text)
         
     | 
| 
      
 78 
     | 
    
         
            +
                    text = redact_dow(text)
         
     | 
| 
       86 
79 
     | 
    
         
             
                  else
         
     | 
| 
       87 
     | 
    
         
            -
                     
     | 
| 
       88 
     | 
    
         
            -
             
     | 
| 
       89 
     | 
    
         
            -
                        new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       90 
     | 
    
         
            -
                                               .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
         
     | 
| 
       91 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       92 
     | 
    
         
            -
                                               .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       93 
     | 
    
         
            -
                                               .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
         
     | 
| 
       94 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       95 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       96 
     | 
    
         
            -
                      end
         
     | 
| 
       97 
     | 
    
         
            -
                      months_abbr.each do |month|
         
     | 
| 
       98 
     | 
    
         
            -
                        new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       99 
     | 
    
         
            -
                                               .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
         
     | 
| 
       100 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       101 
     | 
    
         
            -
                                               .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       102 
     | 
    
         
            -
                                               .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
         
     | 
| 
       103 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       104 
     | 
    
         
            -
                                               .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
       105 
     | 
    
         
            -
                      end
         
     | 
| 
       106 
     | 
    
         
            -
                    end
         
     | 
| 
       107 
     | 
    
         
            -
                    dow_abbr.each do |day|
         
     | 
| 
       108 
     | 
    
         
            -
                      months.each do |month|
         
     | 
| 
       109 
     | 
    
         
            -
                        new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       110 
     | 
    
         
            -
                      end
         
     | 
| 
       111 
     | 
    
         
            -
                      months_abbr.each do |month|
         
     | 
| 
       112 
     | 
    
         
            -
                        new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
       113 
     | 
    
         
            -
                      end
         
     | 
| 
       114 
     | 
    
         
            -
                    end
         
     | 
| 
      
 80 
     | 
    
         
            +
                    text = redact_dow(text)
         
     | 
| 
      
 81 
     | 
    
         
            +
                    text = redact_dow_abbr(text)
         
     | 
| 
       115 
82 
     | 
    
         
             
                  end
         
     | 
| 
       116 
     | 
    
         
            -
                   
     | 
| 
       117 
     | 
    
         
            -
                                 .gsub(YMD_YDM_REGEX, ' <redacted date> ')
         
     | 
| 
       118 
     | 
    
         
            -
                                 .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
         
     | 
| 
       119 
     | 
    
         
            -
                                 .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
         
     | 
| 
      
 83 
     | 
    
         
            +
                  text
         
     | 
| 
       120 
84 
     | 
    
         
             
                end
         
     | 
| 
       121 
85 
     | 
    
         | 
| 
       122 
     | 
    
         
            -
                def  
     | 
| 
       123 
     | 
    
         
            -
                   
     | 
| 
      
 86 
     | 
    
         
            +
                def redact_regex(text)
         
     | 
| 
      
 87 
     | 
    
         
            +
                  text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
         
     | 
| 
      
 88 
     | 
    
         
            +
                      .gsub(YMD_YDM_REGEX, ' <redacted date> ')
         
     | 
| 
      
 89 
     | 
    
         
            +
                      .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
         
     | 
| 
      
 90 
     | 
    
         
            +
                      .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
         
     | 
| 
       124 
91 
     | 
    
         
             
                end
         
     | 
| 
       125 
92 
     | 
    
         | 
| 
       126 
     | 
    
         
            -
                def  
     | 
| 
       127 
     | 
    
         
            -
                   
     | 
| 
       128 
     | 
    
         
            -
             
     | 
| 
       129 
     | 
    
         
            -
             
     | 
| 
       130 
     | 
    
         
            -
             
     | 
| 
      
 93 
     | 
    
         
            +
                def redact_dow(text)
         
     | 
| 
      
 94 
     | 
    
         
            +
                  dow.each do |day|
         
     | 
| 
      
 95 
     | 
    
         
            +
                    months.map { |month| text = redact_date(text, day, month) }
         
     | 
| 
      
 96 
     | 
    
         
            +
                    months_abbr.map { |month| text = redact_date(text, day, month) }
         
     | 
| 
      
 97 
     | 
    
         
            +
                  end
         
     | 
| 
      
 98 
     | 
    
         
            +
                  text
         
     | 
| 
       131 
99 
     | 
    
         
             
                end
         
     | 
| 
       132 
100 
     | 
    
         | 
| 
       133 
     | 
    
         
            -
                 
     | 
| 
      
 101 
     | 
    
         
            +
                def redact_dow_abbr(text)
         
     | 
| 
      
 102 
     | 
    
         
            +
                  dow_abbr.each do |day|
         
     | 
| 
      
 103 
     | 
    
         
            +
                    months.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
         
     | 
| 
      
 104 
     | 
    
         
            +
                    months_abbr.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
         
     | 
| 
      
 105 
     | 
    
         
            +
                  end
         
     | 
| 
      
 106 
     | 
    
         
            +
                  text
         
     | 
| 
      
 107 
     | 
    
         
            +
                end
         
     | 
| 
       134 
108 
     | 
    
         | 
| 
       135 
     | 
    
         
            -
                def  
     | 
| 
       136 
     | 
    
         
            -
                   
     | 
| 
      
 109 
     | 
    
         
            +
                def redact_date(text, day, month)
         
     | 
| 
      
 110 
     | 
    
         
            +
                  text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
      
 111 
     | 
    
         
            +
                                   .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
      
 112 
     | 
    
         
            +
                                   .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
         
     | 
| 
      
 113 
     | 
    
         
            +
                                   .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
         
     | 
| 
      
 114 
     | 
    
         
            +
                                   .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
      
 115 
     | 
    
         
            +
                                   .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
         
     | 
| 
      
 116 
     | 
    
         
            +
                                   .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
      
 117 
     | 
    
         
            +
                                   .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
         
     | 
| 
      
 118 
     | 
    
         
            +
                end
         
     | 
| 
      
 119 
     | 
    
         
            +
             
     | 
| 
      
 120 
     | 
    
         
            +
                def includes_long_date?(text)
         
     | 
| 
      
 121 
     | 
    
         
            +
                  includes_long_date_1?(text) || includes_long_date_2?(text)
         
     | 
| 
      
 122 
     | 
    
         
            +
                end
         
     | 
| 
      
 123 
     | 
    
         
            +
             
     | 
| 
      
 124 
     | 
    
         
            +
                def includes_long_date_1?(text)
         
     | 
| 
       137 
125 
     | 
    
         
             
                  dow.each do |day|
         
     | 
| 
       138 
     | 
    
         
            -
                    months. 
     | 
| 
       139 
     | 
    
         
            -
             
     | 
| 
       140 
     | 
    
         
            -
                      match_found = check_for_matches(day, month)
         
     | 
| 
       141 
     | 
    
         
            -
                    end
         
     | 
| 
       142 
     | 
    
         
            -
                    months_abbr.each do |month|
         
     | 
| 
       143 
     | 
    
         
            -
                      break if match_found
         
     | 
| 
       144 
     | 
    
         
            -
                      match_found = check_for_matches(day, month)
         
     | 
| 
       145 
     | 
    
         
            -
                    end
         
     | 
| 
      
 126 
     | 
    
         
            +
                    months.map { |month| return true if check_for_matches(day, month, text) }
         
     | 
| 
      
 127 
     | 
    
         
            +
                    months_abbr.map { |month| return true if check_for_matches(day, month, text) }
         
     | 
| 
       146 
128 
     | 
    
         
             
                  end
         
     | 
| 
      
 129 
     | 
    
         
            +
                  false
         
     | 
| 
      
 130 
     | 
    
         
            +
                end
         
     | 
| 
      
 131 
     | 
    
         
            +
             
     | 
| 
      
 132 
     | 
    
         
            +
                def includes_long_date_2?(text)
         
     | 
| 
       147 
133 
     | 
    
         
             
                  dow_abbr.each do |day|
         
     | 
| 
       148 
     | 
    
         
            -
                    months. 
     | 
| 
       149 
     | 
    
         
            -
             
     | 
| 
       150 
     | 
    
         
            -
                      match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
         
     | 
| 
       151 
     | 
    
         
            -
                    end
         
     | 
| 
       152 
     | 
    
         
            -
                    months_abbr.each do |month|
         
     | 
| 
       153 
     | 
    
         
            -
                      break if match_found
         
     | 
| 
       154 
     | 
    
         
            -
                      match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
         
     | 
| 
       155 
     | 
    
         
            -
                    end
         
     | 
| 
      
 134 
     | 
    
         
            +
                    months.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
         
     | 
| 
      
 135 
     | 
    
         
            +
                    months_abbr.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
         
     | 
| 
       156 
136 
     | 
    
         
             
                  end
         
     | 
| 
       157 
     | 
    
         
            -
                   
     | 
| 
      
 137 
     | 
    
         
            +
                  false
         
     | 
| 
       158 
138 
     | 
    
         
             
                end
         
     | 
| 
       159 
139 
     | 
    
         | 
| 
       160 
     | 
    
         
            -
                def  
     | 
| 
       161 
     | 
    
         
            -
                  !( 
     | 
| 
       162 
     | 
    
         
            -
                  !( 
     | 
| 
       163 
     | 
    
         
            -
                  !( 
     | 
| 
       164 
     | 
    
         
            -
                  !( 
     | 
| 
      
 140 
     | 
    
         
            +
                def includes_number_only_date?(text)
         
     | 
| 
      
 141 
     | 
    
         
            +
                  !(text !~ DMY_MDY_REGEX) ||
         
     | 
| 
      
 142 
     | 
    
         
            +
                  !(text !~ YMD_YDM_REGEX) ||
         
     | 
| 
      
 143 
     | 
    
         
            +
                  !(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
         
     | 
| 
      
 144 
     | 
    
         
            +
                  !(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
         
     | 
| 
       165 
145 
     | 
    
         
             
                end
         
     | 
| 
       166 
146 
     | 
    
         | 
| 
       167 
     | 
    
         
            -
                def check_for_matches(day, month)
         
     | 
| 
       168 
     | 
    
         
            -
                  !( 
     | 
| 
       169 
     | 
    
         
            -
                  !( 
     | 
| 
       170 
     | 
    
         
            -
                  !( 
     | 
| 
       171 
     | 
    
         
            -
                  !( 
     | 
| 
       172 
     | 
    
         
            -
                  !( 
     | 
| 
       173 
     | 
    
         
            -
                  !( 
     | 
| 
      
 147 
     | 
    
         
            +
                def check_for_matches(day, month, text)
         
     | 
| 
      
 148 
     | 
    
         
            +
                  !(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
         
     | 
| 
      
 149 
     | 
    
         
            +
                  !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
         
     | 
| 
      
 150 
     | 
    
         
            +
                  !(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
         
     | 
| 
      
 151 
     | 
    
         
            +
                  !(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
         
     | 
| 
      
 152 
     | 
    
         
            +
                  !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
         
     | 
| 
      
 153 
     | 
    
         
            +
                  !(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
         
     | 
| 
       174 
154 
     | 
    
         
             
                end
         
     | 
| 
       175 
155 
     | 
    
         
             
              end
         
     | 
| 
       176 
156 
     | 
    
         
             
            end
         
     | 
| 
         @@ -4,10 +4,12 @@ module ConfidentialInfoRedactor 
     | 
|
| 
       4 
4 
     | 
    
         
             
              # This class extracts proper nouns from a text
         
     | 
| 
       5 
5 
     | 
    
         
             
              class Extractor
         
     | 
| 
       6 
6 
     | 
    
         
             
                # Rubular: http://rubular.com/r/qE0g4r9zR7
         
     | 
| 
       7 
     | 
    
         
            -
                EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
         
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
                 
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
      
 7 
     | 
    
         
            +
                EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
                PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
                attr_reader :language, :corpus
         
     | 
| 
      
 12 
     | 
    
         
            +
                def initialize(**args)
         
     | 
| 
       11 
13 
     | 
    
         
             
                  @language = args[:language] || 'en'
         
     | 
| 
       12 
14 
     | 
    
         
             
                  case @language
         
     | 
| 
       13 
15 
     | 
    
         
             
                  when 'en'
         
     | 
| 
         @@ -19,38 +21,69 @@ module ConfidentialInfoRedactor 
     | 
|
| 
       19 
21 
     | 
    
         
             
                  end
         
     | 
| 
       20 
22 
     | 
    
         
             
                end
         
     | 
| 
       21 
23 
     | 
    
         | 
| 
       22 
     | 
    
         
            -
                def extract
         
     | 
| 
      
 24 
     | 
    
         
            +
                def extract(text)
         
     | 
| 
       23 
25 
     | 
    
         
             
                  extracted_terms = []
         
     | 
| 
       24 
     | 
    
         
            -
                  PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
         
     | 
| 
       25 
     | 
    
         
            -
                    initial_extracted_terms = segment 
     | 
| 
       26 
     | 
    
         
            -
                     
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
       34 
     | 
    
         
            -
             
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
       36 
     | 
    
         
            -
             
     | 
| 
       37 
     | 
    
         
            -
             
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
             
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
             
     | 
| 
       46 
     | 
    
         
            -
             
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
       48 
     | 
    
         
            -
             
     | 
| 
       49 
     | 
    
         
            -
             
     | 
| 
       50 
     | 
    
         
            -
                     
     | 
| 
      
 26 
     | 
    
         
            +
                  PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
         
     | 
| 
      
 27 
     | 
    
         
            +
                    initial_extracted_terms = extract_preliminary_terms(segment)
         
     | 
| 
      
 28 
     | 
    
         
            +
                    search_ngrams(initial_extracted_terms, extracted_terms)
         
     | 
| 
      
 29 
     | 
    
         
            +
                  end
         
     | 
| 
      
 30 
     | 
    
         
            +
                  extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
         
     | 
| 
      
 31 
     | 
    
         
            +
                end
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
      
 33 
     | 
    
         
            +
                private
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
                def extract_preliminary_terms(segment)
         
     | 
| 
      
 36 
     | 
    
         
            +
                  segment.to_s.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
         
     | 
| 
      
 37 
     | 
    
         
            +
                end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                def clean_token(token)
         
     | 
| 
      
 40 
     | 
    
         
            +
                  token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
         
     | 
| 
      
 41 
     | 
    
         
            +
                end
         
     | 
| 
      
 42 
     | 
    
         
            +
             
     | 
| 
      
 43 
     | 
    
         
            +
                def non_confidential_token?(token, includes_confidential)
         
     | 
| 
      
 44 
     | 
    
         
            +
                  corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
         
     | 
| 
      
 45 
     | 
    
         
            +
                end
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
                def singular_in_corpus?(token)
         
     | 
| 
      
 48 
     | 
    
         
            +
                  corpus.include?(token[0...-1]) &&
         
     | 
| 
      
 49 
     | 
    
         
            +
                    token[-1].eql?('s') ||
         
     | 
| 
      
 50 
     | 
    
         
            +
                    corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
         
     | 
| 
      
 51 
     | 
    
         
            +
                    corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
         
     | 
| 
      
 52 
     | 
    
         
            +
                    corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
         
     | 
| 
      
 53 
     | 
    
         
            +
                    corpus.include?(token[0...-1]) && token[-1].eql?('n')
         
     | 
| 
      
 54 
     | 
    
         
            +
                end
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
                def includes_confidential?(token)
         
     | 
| 
      
 57 
     | 
    
         
            +
                  token.split(' ').map { |t| return false if corpus.include?(t.downcase) } unless token.split(' ').length.eql?(2) && token.split(' ')[1].downcase.eql?('bank')
         
     | 
| 
      
 58 
     | 
    
         
            +
                  true
         
     | 
| 
      
 59 
     | 
    
         
            +
                end
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
                def matching_first_token?(tokens)
         
     | 
| 
      
 62 
     | 
    
         
            +
                  corpus.include?(tokens[0]) &&
         
     | 
| 
      
 63 
     | 
    
         
            +
                    tokens[0] != 'the' &&
         
     | 
| 
      
 64 
     | 
    
         
            +
                    tokens[0] != 'deutsche' &&
         
     | 
| 
      
 65 
     | 
    
         
            +
                    tokens.length.eql?(2)
         
     | 
| 
      
 66 
     | 
    
         
            +
                end
         
     | 
| 
      
 67 
     | 
    
         
            +
             
     | 
| 
      
 68 
     | 
    
         
            +
                def find_extracted_terms(string, extracted_terms)
         
     | 
| 
      
 69 
     | 
    
         
            +
                  cleaned_token_downcased = clean_token(string.downcase)
         
     | 
| 
      
 70 
     | 
    
         
            +
                  cleaned_token = clean_token(string)
         
     | 
| 
      
 71 
     | 
    
         
            +
                  tokens = cleaned_token_downcased.split(' ')
         
     | 
| 
      
 72 
     | 
    
         
            +
                  if matching_first_token?(tokens)
         
     | 
| 
      
 73 
     | 
    
         
            +
                    extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
         
     | 
| 
      
 74 
     | 
    
         
            +
                  else
         
     | 
| 
      
 75 
     | 
    
         
            +
                    extracted_terms << cleaned_token unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
         
     | 
| 
       51 
76 
     | 
    
         
             
                  end
         
     | 
| 
      
 77 
     | 
    
         
            +
                  extracted_terms
         
     | 
| 
      
 78 
     | 
    
         
            +
                end
         
     | 
| 
       52 
79 
     | 
    
         | 
| 
       53 
     | 
    
         
            -
             
     | 
| 
      
 80 
     | 
    
         
            +
                def search_ngrams(tokens, extracted_terms)
         
     | 
| 
      
 81 
     | 
    
         
            +
                  tokens.each do |ngram|
         
     | 
| 
      
 82 
     | 
    
         
            +
                    ngram.split(PUNCTUATION_REGEX).each do |t|
         
     | 
| 
      
 83 
     | 
    
         
            +
                      next if !(t !~ /.*\d+.*/)
         
     | 
| 
      
 84 
     | 
    
         
            +
                      extracted_terms = find_extracted_terms(t, extracted_terms)
         
     | 
| 
      
 85 
     | 
    
         
            +
                    end
         
     | 
| 
      
 86 
     | 
    
         
            +
                  end
         
     | 
| 
       54 
87 
     | 
    
         
             
                end
         
     | 
| 
       55 
88 
     | 
    
         
             
              end
         
     | 
| 
       56 
89 
     | 
    
         
             
            end
         
     | 
| 
         @@ -7,25 +7,13 @@ module ConfidentialInfoRedactor 
     | 
|
| 
       7 
7 
     | 
    
         
             
                # Rubular: http://rubular.com/r/fXa4lp0gfS
         
     | 
| 
       8 
8 
     | 
    
         
             
                HYPERLINK_REGEX = /(http|https|www)(\.|:)/
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
                 
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
                  @string = string
         
     | 
| 
      
 10 
     | 
    
         
            +
                def hyperlink?(text)
         
     | 
| 
      
 11 
     | 
    
         
            +
                  !(text !~ URI.regexp) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX)
         
     | 
| 
       13 
12 
     | 
    
         
             
                end
         
     | 
| 
       14 
13 
     | 
    
         | 
| 
       15 
     | 
    
         
            -
                def  
     | 
| 
       16 
     | 
    
         
            -
                   
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
                def replace
         
     | 
| 
       20 
     | 
    
         
            -
                  new_string = string.dup
         
     | 
| 
       21 
     | 
    
         
            -
                  string.split(/\s+/).each do |token|
         
     | 
| 
       22 
     | 
    
         
            -
                    if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
         
     | 
| 
       23 
     | 
    
         
            -
                      new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted> ')
         
     | 
| 
       24 
     | 
    
         
            -
                    elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
         
     | 
| 
       25 
     | 
    
         
            -
                      new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ')
         
     | 
| 
       26 
     | 
    
         
            -
                    end
         
     | 
| 
       27 
     | 
    
         
            -
                  end
         
     | 
| 
       28 
     | 
    
         
            -
                  new_string
         
     | 
| 
      
 14 
     | 
    
         
            +
                def replace(text)
         
     | 
| 
      
 15 
     | 
    
         
            +
                  text.split(/\s+/).map { |token| text = text.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ') if !(token !~ HYPERLINK_REGEX) }
         
     | 
| 
      
 16 
     | 
    
         
            +
                  text
         
     | 
| 
       29 
17 
     | 
    
         
             
                end
         
     | 
| 
       30 
18 
     | 
    
         
             
              end
         
     | 
| 
       31 
19 
     | 
    
         
             
            end
         
     |