confidential_info_redactor_lite 0.0.31 → 0.0.32

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e4392706f3a5e307a5453009fadcf4010a7162da
4
- data.tar.gz: 107759c2fd4bbe553f6389bb4c419698c0513285
3
+ metadata.gz: 69347cc82e199d28d8bf542fba216c7b8c6d3410
4
+ data.tar.gz: ceebbfa4046ba5fd1349d47c6eff0111ad18b091
5
5
  SHA512:
6
- metadata.gz: 77ea54f659ac2a4e340fa55ac5afc558d0e6807630c1b646554da52a3a0a88f138c9db932d92b86e06b7334b79ea91b97b93011d1afea055a427953d148d55cf
7
- data.tar.gz: 62c0fd961e7ab69fb571f5ab5edae83d736feb1bf81a77d33195ad7c3fdfe7e5cd99ccb2a2dc5c65e0e49e6a0da0c009ce362ac854aa93ffe0b576ccd91f0369
6
+ metadata.gz: 13b2789ed7b91d38fb3963a554b5f7570b13f7ce39891f4a43be479fc59770794d513f74c1c3ea66c1d312a0b3b919644c096ec4a726b529ffd4e23913cf32d1
7
+ data.tar.gz: f2a713b84a11c782f452bc2b4279bf5142e2b1ecbc7eb8374f77f924937bd759932cf870a4ecad2416d80adf6bbcbade97b8ff17de249a379e46c754197923ba
@@ -2,7 +2,9 @@ module ConfidentialInfoRedactorLite
2
2
  # This class extracts proper nouns from a text
3
3
  class Extractor
4
4
  # Rubular: http://rubular.com/r/qE0g4r9zR7
5
- EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
5
+ EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
6
+
7
+ PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
6
8
  attr_reader :text, :language, :corpus
7
9
  def initialize(text:, corpus:, **args)
8
10
  @text = text.gsub(/[’‘]/, "'")
@@ -13,29 +15,29 @@ module ConfidentialInfoRedactorLite
13
15
  def extract
14
16
  extracted_terms = []
15
17
  PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
16
- initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
18
+ initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
17
19
  in_corpus = true
18
20
  initial_extracted_terms.each do |ngram|
19
- ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
20
- unless corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip)
21
+ ngram.split(PUNCTUATION_REGEX).each do |t|
22
+ unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
21
23
  in_corpus = false
22
24
  end
23
25
  end
24
26
  end
25
27
  next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
26
28
  initial_extracted_terms.each do |ngram|
27
- ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
29
+ ngram.split(PUNCTUATION_REGEX).each do |t|
28
30
  next if !(t !~ /.*\d+.*/)
29
- if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
30
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
31
+ if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
32
+ extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
31
33
  else
32
34
  tracker = true
33
- unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
34
- t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
35
+ unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
36
+ t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
35
37
  tracker = false if corpus.include?(token.downcase)
36
38
  end
37
39
  end
38
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('n'))
40
+ extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
39
41
  end
40
42
  end
41
43
  end
@@ -1,8 +1,5 @@
1
- require 'uri'
2
-
3
1
  module ConfidentialInfoRedactorLite
4
2
  class Hyperlink
5
- NON_HYPERLINK_REGEX = /\A\w+:$/
6
3
 
7
4
  # Rubular: http://rubular.com/r/fXa4lp0gfS
8
5
  HYPERLINK_REGEX = /(http|https|www)(\.|:)/
@@ -12,18 +9,10 @@ module ConfidentialInfoRedactorLite
12
9
  @string = string
13
10
  end
14
11
 
15
- def hyperlink?
16
- !(string !~ URI.regexp) && string !~ NON_HYPERLINK_REGEX && !(string !~ HYPERLINK_REGEX)
17
- end
18
-
19
12
  def replace
20
13
  new_string = string.dup
21
14
  string.split(/\s+/).each do |token|
22
- if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
23
- new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
24
- elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
25
- new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
26
- end
15
+ new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
27
16
  end
28
17
  new_string
29
18
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.31"
2
+ VERSION = "0.0.32"
3
3
  end
@@ -161,6 +161,26 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
161
161
  text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
162
162
  expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
163
163
  end
164
+
165
+ it 'extracts the proper nouns from a text #004' do
166
+ text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
167
+ expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
168
+ end
169
+
170
+ it 'extracts the proper nouns from a text #005' do
171
+ text = 'Viele de Mitarbeiters der «Deutsche Bank» suchen eine andere Arbeitsstelle.'
172
+ expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
173
+ end
174
+
175
+ it 'extracts the proper nouns from a text #006' do
176
+ text = 'Viele de Mitarbeiters der ‹Deutsche Bank› suchen eine andere Arbeitsstelle.'
177
+ expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
178
+ end
179
+
180
+ it 'extracts the proper nouns from a text #007' do
181
+ text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
182
+ expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
183
+ end
164
184
  end
165
185
  end
166
186
  end
@@ -1,50 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
4
- context '#hyperlink?' do
5
- it 'returns true if the string is a hyperlink #001' do
6
- string = "http://www.example.com/this-IS-a_test/hello.html"
7
- ws = described_class.new(string: string)
8
- expect(ws.hyperlink?).to eq(true)
9
- end
10
-
11
- it 'returns true if the string is a hyperlink #002' do
12
- string = "http://www.google.co.uk"
13
- ws = described_class.new(string: string)
14
- expect(ws.hyperlink?).to eq(true)
15
- end
16
-
17
- it 'returns true if the string is a hyperlink #003' do
18
- string = "https://google.co.uk"
19
- ws = described_class.new(string: string)
20
- expect(ws.hyperlink?).to eq(true)
21
- end
22
-
23
- it 'returns false if the string is not a hyperlink #004' do
24
- string = "hello"
25
- ws = described_class.new(string: string)
26
- expect(ws.hyperlink?).to eq(false)
27
- end
28
-
29
- it 'returns false if the string is not a hyperlink #005' do
30
- string = "john@gmail.com"
31
- ws = described_class.new(string: string)
32
- expect(ws.hyperlink?).to eq(false)
33
- end
34
-
35
- it 'returns false if the string is not a hyperlink #006' do
36
- string = "date:"
37
- ws = described_class.new(string: string)
38
- expect(ws.hyperlink?).to eq(false)
39
- end
40
-
41
- it 'returns false if the string is not a hyperlink #007' do
42
- string = 'The file location is c:\Users\johndoe.'
43
- ws = described_class.new(string: string)
44
- expect(ws.hyperlink?).to eq(false)
45
- end
46
- end
47
-
48
4
  context '#replace' do
49
5
  it 'replaces the hyperlinks in a string with regular tokens #001' do
50
6
  string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
@@ -129,6 +129,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
129
129
  text = 'Visit https://www.tm-town.com for more info.'
130
130
  expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
131
131
  end
132
+
133
+ it 'redacts hyperlinks from a text #002' do
134
+ text = 'Visit www.tm-town.com for more info.'
135
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
136
+ end
132
137
  end
133
138
 
134
139
  describe '#hyperlinks_html' do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.31
4
+ version: 0.0.32
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-27 00:00:00.000000000 Z
11
+ date: 2015-05-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler