confidential_info_redactor_lite 0.0.31 → 0.0.32

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e4392706f3a5e307a5453009fadcf4010a7162da
4
- data.tar.gz: 107759c2fd4bbe553f6389bb4c419698c0513285
3
+ metadata.gz: 69347cc82e199d28d8bf542fba216c7b8c6d3410
4
+ data.tar.gz: ceebbfa4046ba5fd1349d47c6eff0111ad18b091
5
5
  SHA512:
6
- metadata.gz: 77ea54f659ac2a4e340fa55ac5afc558d0e6807630c1b646554da52a3a0a88f138c9db932d92b86e06b7334b79ea91b97b93011d1afea055a427953d148d55cf
7
- data.tar.gz: 62c0fd961e7ab69fb571f5ab5edae83d736feb1bf81a77d33195ad7c3fdfe7e5cd99ccb2a2dc5c65e0e49e6a0da0c009ce362ac854aa93ffe0b576ccd91f0369
6
+ metadata.gz: 13b2789ed7b91d38fb3963a554b5f7570b13f7ce39891f4a43be479fc59770794d513f74c1c3ea66c1d312a0b3b919644c096ec4a726b529ffd4e23913cf32d1
7
+ data.tar.gz: f2a713b84a11c782f452bc2b4279bf5142e2b1ecbc7eb8374f77f924937bd759932cf870a4ecad2416d80adf6bbcbade97b8ff17de249a379e46c754197923ba
@@ -2,7 +2,9 @@ module ConfidentialInfoRedactorLite
2
2
  # This class extracts proper nouns from a text
3
3
  class Extractor
4
4
  # Rubular: http://rubular.com/r/qE0g4r9zR7
5
- EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
5
+ EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
6
+
7
+ PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
6
8
  attr_reader :text, :language, :corpus
7
9
  def initialize(text:, corpus:, **args)
8
10
  @text = text.gsub(/[’‘]/, "'")
@@ -13,29 +15,29 @@ module ConfidentialInfoRedactorLite
13
15
  def extract
14
16
  extracted_terms = []
15
17
  PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
16
- initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
18
+ initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
17
19
  in_corpus = true
18
20
  initial_extracted_terms.each do |ngram|
19
- ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
20
- unless corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip)
21
+ ngram.split(PUNCTUATION_REGEX).each do |t|
22
+ unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
21
23
  in_corpus = false
22
24
  end
23
25
  end
24
26
  end
25
27
  next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
26
28
  initial_extracted_terms.each do |ngram|
27
- ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
29
+ ngram.split(PUNCTUATION_REGEX).each do |t|
28
30
  next if !(t !~ /.*\d+.*/)
29
- if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
30
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
31
+ if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
32
+ extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
31
33
  else
32
34
  tracker = true
33
- unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
34
- t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
35
+ unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
36
+ t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
35
37
  tracker = false if corpus.include?(token.downcase)
36
38
  end
37
39
  end
38
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('n'))
40
+ extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
39
41
  end
40
42
  end
41
43
  end
@@ -1,8 +1,5 @@
1
- require 'uri'
2
-
3
1
  module ConfidentialInfoRedactorLite
4
2
  class Hyperlink
5
- NON_HYPERLINK_REGEX = /\A\w+:$/
6
3
 
7
4
  # Rubular: http://rubular.com/r/fXa4lp0gfS
8
5
  HYPERLINK_REGEX = /(http|https|www)(\.|:)/
@@ -12,18 +9,10 @@ module ConfidentialInfoRedactorLite
12
9
  @string = string
13
10
  end
14
11
 
15
- def hyperlink?
16
- !(string !~ URI.regexp) && string !~ NON_HYPERLINK_REGEX && !(string !~ HYPERLINK_REGEX)
17
- end
18
-
19
12
  def replace
20
13
  new_string = string.dup
21
14
  string.split(/\s+/).each do |token|
22
- if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
23
- new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
24
- elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
25
- new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
26
- end
15
+ new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
27
16
  end
28
17
  new_string
29
18
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.31"
2
+ VERSION = "0.0.32"
3
3
  end
@@ -161,6 +161,26 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
161
161
  text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
162
162
  expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
163
163
  end
164
+
165
+ it 'extracts the proper nouns from a text #004' do
166
+ text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
167
+ expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
168
+ end
169
+
170
+ it 'extracts the proper nouns from a text #005' do
171
+ text = 'Viele de Mitarbeiters der «Deutsche Bank» suchen eine andere Arbeitsstelle.'
172
+ expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
173
+ end
174
+
175
+ it 'extracts the proper nouns from a text #006' do
176
+ text = 'Viele de Mitarbeiters der ‹Deutsche Bank› suchen eine andere Arbeitsstelle.'
177
+ expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
178
+ end
179
+
180
+ it 'extracts the proper nouns from a text #007' do
181
+ text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
182
+ expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
183
+ end
164
184
  end
165
185
  end
166
186
  end
@@ -1,50 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
4
- context '#hyperlink?' do
5
- it 'returns true if the string is a hyperlink #001' do
6
- string = "http://www.example.com/this-IS-a_test/hello.html"
7
- ws = described_class.new(string: string)
8
- expect(ws.hyperlink?).to eq(true)
9
- end
10
-
11
- it 'returns true if the string is a hyperlink #002' do
12
- string = "http://www.google.co.uk"
13
- ws = described_class.new(string: string)
14
- expect(ws.hyperlink?).to eq(true)
15
- end
16
-
17
- it 'returns true if the string is a hyperlink #003' do
18
- string = "https://google.co.uk"
19
- ws = described_class.new(string: string)
20
- expect(ws.hyperlink?).to eq(true)
21
- end
22
-
23
- it 'returns false if the string is not a hyperlink #004' do
24
- string = "hello"
25
- ws = described_class.new(string: string)
26
- expect(ws.hyperlink?).to eq(false)
27
- end
28
-
29
- it 'returns false if the string is not a hyperlink #005' do
30
- string = "john@gmail.com"
31
- ws = described_class.new(string: string)
32
- expect(ws.hyperlink?).to eq(false)
33
- end
34
-
35
- it 'returns false if the string is not a hyperlink #006' do
36
- string = "date:"
37
- ws = described_class.new(string: string)
38
- expect(ws.hyperlink?).to eq(false)
39
- end
40
-
41
- it 'returns false if the string is not a hyperlink #007' do
42
- string = 'The file location is c:\Users\johndoe.'
43
- ws = described_class.new(string: string)
44
- expect(ws.hyperlink?).to eq(false)
45
- end
46
- end
47
-
48
4
  context '#replace' do
49
5
  it 'replaces the hyperlinks in a string with regular tokens #001' do
50
6
  string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
@@ -129,6 +129,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
129
129
  text = 'Visit https://www.tm-town.com for more info.'
130
130
  expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
131
131
  end
132
+
133
+ it 'redacts hyperlinks from a text #002' do
134
+ text = 'Visit www.tm-town.com for more info.'
135
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
136
+ end
132
137
  end
133
138
 
134
139
  describe '#hyperlinks_html' do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.31
4
+ version: 0.0.32
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-27 00:00:00.000000000 Z
11
+ date: 2015-05-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler