confidential_info_redactor_lite 0.0.31 → 0.0.32
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/confidential_info_redactor_lite/extractor.rb +12 -10
- data/lib/confidential_info_redactor_lite/hyperlink.rb +1 -12
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/extractor_spec.rb +20 -0
- data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +0 -44
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69347cc82e199d28d8bf542fba216c7b8c6d3410
|
4
|
+
data.tar.gz: ceebbfa4046ba5fd1349d47c6eff0111ad18b091
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 13b2789ed7b91d38fb3963a554b5f7570b13f7ce39891f4a43be479fc59770794d513f74c1c3ea66c1d312a0b3b919644c096ec4a726b529ffd4e23913cf32d1
|
7
|
+
data.tar.gz: f2a713b84a11c782f452bc2b4279bf5142e2b1ecbc7eb8374f77f924937bd759932cf870a4ecad2416d80adf6bbcbade97b8ff17de249a379e46c754197923ba
|
@@ -2,7 +2,9 @@ module ConfidentialInfoRedactorLite
|
|
2
2
|
# This class extracts proper nouns from a text
|
3
3
|
class Extractor
|
4
4
|
# Rubular: http://rubular.com/r/qE0g4r9zR7
|
5
|
-
EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
|
5
|
+
EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
|
6
|
+
|
7
|
+
PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
|
6
8
|
attr_reader :text, :language, :corpus
|
7
9
|
def initialize(text:, corpus:, **args)
|
8
10
|
@text = text.gsub(/[’‘]/, "'")
|
@@ -13,29 +15,29 @@ module ConfidentialInfoRedactorLite
|
|
13
15
|
def extract
|
14
16
|
extracted_terms = []
|
15
17
|
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
16
|
-
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(
|
18
|
+
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
17
19
|
in_corpus = true
|
18
20
|
initial_extracted_terms.each do |ngram|
|
19
|
-
ngram.split(
|
20
|
-
unless corpus.include?(t.downcase.gsub(
|
21
|
+
ngram.split(PUNCTUATION_REGEX).each do |t|
|
22
|
+
unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
|
21
23
|
in_corpus = false
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
25
27
|
next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
|
26
28
|
initial_extracted_terms.each do |ngram|
|
27
|
-
ngram.split(
|
29
|
+
ngram.split(PUNCTUATION_REGEX).each do |t|
|
28
30
|
next if !(t !~ /.*\d+.*/)
|
29
|
-
if corpus.include?(t.downcase.gsub(
|
30
|
-
extracted_terms << t.gsub(
|
31
|
+
if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
|
32
|
+
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
|
31
33
|
else
|
32
34
|
tracker = true
|
33
|
-
unless t.gsub(
|
34
|
-
t.gsub(
|
35
|
+
unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
|
36
|
+
t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
|
35
37
|
tracker = false if corpus.include?(token.downcase)
|
36
38
|
end
|
37
39
|
end
|
38
|
-
extracted_terms << t.gsub(
|
40
|
+
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
|
39
41
|
end
|
40
42
|
end
|
41
43
|
end
|
@@ -1,8 +1,5 @@
|
|
1
|
-
require 'uri'
|
2
|
-
|
3
1
|
module ConfidentialInfoRedactorLite
|
4
2
|
class Hyperlink
|
5
|
-
NON_HYPERLINK_REGEX = /\A\w+:$/
|
6
3
|
|
7
4
|
# Rubular: http://rubular.com/r/fXa4lp0gfS
|
8
5
|
HYPERLINK_REGEX = /(http|https|www)(\.|:)/
|
@@ -12,18 +9,10 @@ module ConfidentialInfoRedactorLite
|
|
12
9
|
@string = string
|
13
10
|
end
|
14
11
|
|
15
|
-
def hyperlink?
|
16
|
-
!(string !~ URI.regexp) && string !~ NON_HYPERLINK_REGEX && !(string !~ HYPERLINK_REGEX)
|
17
|
-
end
|
18
|
-
|
19
12
|
def replace
|
20
13
|
new_string = string.dup
|
21
14
|
string.split(/\s+/).each do |token|
|
22
|
-
|
23
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
|
24
|
-
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
25
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
|
26
|
-
end
|
15
|
+
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
|
27
16
|
end
|
28
17
|
new_string
|
29
18
|
end
|
@@ -161,6 +161,26 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
161
161
|
text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
162
162
|
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
163
163
|
end
|
164
|
+
|
165
|
+
it 'extracts the proper nouns from a text #004' do
|
166
|
+
text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
167
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
168
|
+
end
|
169
|
+
|
170
|
+
it 'extracts the proper nouns from a text #005' do
|
171
|
+
text = 'Viele de Mitarbeiters der «Deutsche Bank» suchen eine andere Arbeitsstelle.'
|
172
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
173
|
+
end
|
174
|
+
|
175
|
+
it 'extracts the proper nouns from a text #006' do
|
176
|
+
text = 'Viele de Mitarbeiters der ‹Deutsche Bank› suchen eine andere Arbeitsstelle.'
|
177
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
178
|
+
end
|
179
|
+
|
180
|
+
it 'extracts the proper nouns from a text #007' do
|
181
|
+
text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
|
182
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
|
183
|
+
end
|
164
184
|
end
|
165
185
|
end
|
166
186
|
end
|
@@ -1,50 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
|
4
|
-
context '#hyperlink?' do
|
5
|
-
it 'returns true if the string is a hyperlink #001' do
|
6
|
-
string = "http://www.example.com/this-IS-a_test/hello.html"
|
7
|
-
ws = described_class.new(string: string)
|
8
|
-
expect(ws.hyperlink?).to eq(true)
|
9
|
-
end
|
10
|
-
|
11
|
-
it 'returns true if the string is a hyperlink #002' do
|
12
|
-
string = "http://www.google.co.uk"
|
13
|
-
ws = described_class.new(string: string)
|
14
|
-
expect(ws.hyperlink?).to eq(true)
|
15
|
-
end
|
16
|
-
|
17
|
-
it 'returns true if the string is a hyperlink #003' do
|
18
|
-
string = "https://google.co.uk"
|
19
|
-
ws = described_class.new(string: string)
|
20
|
-
expect(ws.hyperlink?).to eq(true)
|
21
|
-
end
|
22
|
-
|
23
|
-
it 'returns false if the string is not a hyperlink #004' do
|
24
|
-
string = "hello"
|
25
|
-
ws = described_class.new(string: string)
|
26
|
-
expect(ws.hyperlink?).to eq(false)
|
27
|
-
end
|
28
|
-
|
29
|
-
it 'returns false if the string is not a hyperlink #005' do
|
30
|
-
string = "john@gmail.com"
|
31
|
-
ws = described_class.new(string: string)
|
32
|
-
expect(ws.hyperlink?).to eq(false)
|
33
|
-
end
|
34
|
-
|
35
|
-
it 'returns false if the string is not a hyperlink #006' do
|
36
|
-
string = "date:"
|
37
|
-
ws = described_class.new(string: string)
|
38
|
-
expect(ws.hyperlink?).to eq(false)
|
39
|
-
end
|
40
|
-
|
41
|
-
it 'returns false if the string is not a hyperlink #007' do
|
42
|
-
string = 'The file location is c:\Users\johndoe.'
|
43
|
-
ws = described_class.new(string: string)
|
44
|
-
expect(ws.hyperlink?).to eq(false)
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
4
|
context '#replace' do
|
49
5
|
it 'replaces the hyperlinks in a string with regular tokens #001' do
|
50
6
|
string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
|
@@ -129,6 +129,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
129
129
|
text = 'Visit https://www.tm-town.com for more info.'
|
130
130
|
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
|
131
131
|
end
|
132
|
+
|
133
|
+
it 'redacts hyperlinks from a text #002' do
|
134
|
+
text = 'Visit www.tm-town.com for more info.'
|
135
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
|
136
|
+
end
|
132
137
|
end
|
133
138
|
|
134
139
|
describe '#hyperlinks_html' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: confidential_info_redactor_lite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.31
|
4
|
+
version: 0.0.32
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-05-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|