confidential_info_redactor_lite 0.0.31 → 0.0.32
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/confidential_info_redactor_lite/extractor.rb +12 -10
- data/lib/confidential_info_redactor_lite/hyperlink.rb +1 -12
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/extractor_spec.rb +20 -0
- data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +0 -44
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69347cc82e199d28d8bf542fba216c7b8c6d3410
|
4
|
+
data.tar.gz: ceebbfa4046ba5fd1349d47c6eff0111ad18b091
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 13b2789ed7b91d38fb3963a554b5f7570b13f7ce39891f4a43be479fc59770794d513f74c1c3ea66c1d312a0b3b919644c096ec4a726b529ffd4e23913cf32d1
|
7
|
+
data.tar.gz: f2a713b84a11c782f452bc2b4279bf5142e2b1ecbc7eb8374f77f924937bd759932cf870a4ecad2416d80adf6bbcbade97b8ff17de249a379e46c754197923ba
|
@@ -2,7 +2,9 @@ module ConfidentialInfoRedactorLite
|
|
2
2
|
# This class extracts proper nouns from a text
|
3
3
|
class Extractor
|
4
4
|
# Rubular: http://rubular.com/r/qE0g4r9zR7
|
5
|
-
EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
|
5
|
+
EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
|
6
|
+
|
7
|
+
PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
|
6
8
|
attr_reader :text, :language, :corpus
|
7
9
|
def initialize(text:, corpus:, **args)
|
8
10
|
@text = text.gsub(/[’‘]/, "'")
|
@@ -13,29 +15,29 @@ module ConfidentialInfoRedactorLite
|
|
13
15
|
def extract
|
14
16
|
extracted_terms = []
|
15
17
|
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
16
|
-
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(
|
18
|
+
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
17
19
|
in_corpus = true
|
18
20
|
initial_extracted_terms.each do |ngram|
|
19
|
-
ngram.split(
|
20
|
-
unless corpus.include?(t.downcase.gsub(
|
21
|
+
ngram.split(PUNCTUATION_REGEX).each do |t|
|
22
|
+
unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
|
21
23
|
in_corpus = false
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
25
27
|
next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
|
26
28
|
initial_extracted_terms.each do |ngram|
|
27
|
-
ngram.split(
|
29
|
+
ngram.split(PUNCTUATION_REGEX).each do |t|
|
28
30
|
next if !(t !~ /.*\d+.*/)
|
29
|
-
if corpus.include?(t.downcase.gsub(
|
30
|
-
extracted_terms << t.gsub(
|
31
|
+
if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
|
32
|
+
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
|
31
33
|
else
|
32
34
|
tracker = true
|
33
|
-
unless t.gsub(
|
34
|
-
t.gsub(
|
35
|
+
unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
|
36
|
+
t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
|
35
37
|
tracker = false if corpus.include?(token.downcase)
|
36
38
|
end
|
37
39
|
end
|
38
|
-
extracted_terms << t.gsub(
|
40
|
+
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
|
39
41
|
end
|
40
42
|
end
|
41
43
|
end
|
@@ -1,8 +1,5 @@
|
|
1
|
-
require 'uri'
|
2
|
-
|
3
1
|
module ConfidentialInfoRedactorLite
|
4
2
|
class Hyperlink
|
5
|
-
NON_HYPERLINK_REGEX = /\A\w+:$/
|
6
3
|
|
7
4
|
# Rubular: http://rubular.com/r/fXa4lp0gfS
|
8
5
|
HYPERLINK_REGEX = /(http|https|www)(\.|:)/
|
@@ -12,18 +9,10 @@ module ConfidentialInfoRedactorLite
|
|
12
9
|
@string = string
|
13
10
|
end
|
14
11
|
|
15
|
-
def hyperlink?
|
16
|
-
!(string !~ URI.regexp) && string !~ NON_HYPERLINK_REGEX && !(string !~ HYPERLINK_REGEX)
|
17
|
-
end
|
18
|
-
|
19
12
|
def replace
|
20
13
|
new_string = string.dup
|
21
14
|
string.split(/\s+/).each do |token|
|
22
|
-
|
23
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
|
24
|
-
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
25
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
|
26
|
-
end
|
15
|
+
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
|
27
16
|
end
|
28
17
|
new_string
|
29
18
|
end
|
@@ -161,6 +161,26 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
161
161
|
text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
162
162
|
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
163
163
|
end
|
164
|
+
|
165
|
+
it 'extracts the proper nouns from a text #004' do
|
166
|
+
text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
167
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
168
|
+
end
|
169
|
+
|
170
|
+
it 'extracts the proper nouns from a text #005' do
|
171
|
+
text = 'Viele de Mitarbeiters der «Deutsche Bank» suchen eine andere Arbeitsstelle.'
|
172
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
173
|
+
end
|
174
|
+
|
175
|
+
it 'extracts the proper nouns from a text #006' do
|
176
|
+
text = 'Viele de Mitarbeiters der ‹Deutsche Bank› suchen eine andere Arbeitsstelle.'
|
177
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
178
|
+
end
|
179
|
+
|
180
|
+
it 'extracts the proper nouns from a text #007' do
|
181
|
+
text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
|
182
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
183
|
+
end
|
164
184
|
end
|
165
185
|
end
|
166
186
|
end
|
@@ -1,50 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
|
4
|
-
context '#hyperlink?' do
|
5
|
-
it 'returns true if the string is a hyperlink #001' do
|
6
|
-
string = "http://www.example.com/this-IS-a_test/hello.html"
|
7
|
-
ws = described_class.new(string: string)
|
8
|
-
expect(ws.hyperlink?).to eq(true)
|
9
|
-
end
|
10
|
-
|
11
|
-
it 'returns true if the string is a hyperlink #002' do
|
12
|
-
string = "http://www.google.co.uk"
|
13
|
-
ws = described_class.new(string: string)
|
14
|
-
expect(ws.hyperlink?).to eq(true)
|
15
|
-
end
|
16
|
-
|
17
|
-
it 'returns true if the string is a hyperlink #003' do
|
18
|
-
string = "https://google.co.uk"
|
19
|
-
ws = described_class.new(string: string)
|
20
|
-
expect(ws.hyperlink?).to eq(true)
|
21
|
-
end
|
22
|
-
|
23
|
-
it 'returns false if the string is not a hyperlink #004' do
|
24
|
-
string = "hello"
|
25
|
-
ws = described_class.new(string: string)
|
26
|
-
expect(ws.hyperlink?).to eq(false)
|
27
|
-
end
|
28
|
-
|
29
|
-
it 'returns false if the string is not a hyperlink #005' do
|
30
|
-
string = "john@gmail.com"
|
31
|
-
ws = described_class.new(string: string)
|
32
|
-
expect(ws.hyperlink?).to eq(false)
|
33
|
-
end
|
34
|
-
|
35
|
-
it 'returns false if the string is not a hyperlink #006' do
|
36
|
-
string = "date:"
|
37
|
-
ws = described_class.new(string: string)
|
38
|
-
expect(ws.hyperlink?).to eq(false)
|
39
|
-
end
|
40
|
-
|
41
|
-
it 'returns false if the string is not a hyperlink #007' do
|
42
|
-
string = 'The file location is c:\Users\johndoe.'
|
43
|
-
ws = described_class.new(string: string)
|
44
|
-
expect(ws.hyperlink?).to eq(false)
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
4
|
context '#replace' do
|
49
5
|
it 'replaces the hyperlinks in a string with regular tokens #001' do
|
50
6
|
string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
|
@@ -129,6 +129,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
129
129
|
text = 'Visit https://www.tm-town.com for more info.'
|
130
130
|
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
|
131
131
|
end
|
132
|
+
|
133
|
+
it 'redacts hyperlinks from a text #002' do
|
134
|
+
text = 'Visit www.tm-town.com for more info.'
|
135
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
|
136
|
+
end
|
132
137
|
end
|
133
138
|
|
134
139
|
describe '#hyperlinks_html' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: confidential_info_redactor_lite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.31
|
4
|
+
version: 0.0.32
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-05-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|