confidential_info_redactor 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2aba4fe814eb5d71124ad7a1b5f53840f1831d23
|
4
|
+
data.tar.gz: f682003e08e11ef4747255034fb40baed2fcf639
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e39b8a938438e920cdfc2c158048dd779168fa16ff7a47ba29e5d7bc604c1d12f71ea626875aa805b83c12143a20fa256da33d594e332c279b99d5e049c07e99
|
7
|
+
data.tar.gz: 6b0096fe0d4b85068edc78326274d1a3d4a3b8bafe1f32bfe8d847623a9af65ff058d53dc004a28457e2eb33057065a62ce4277ef6b79ad83876d82c41334a47
|
@@ -22,19 +22,19 @@ module ConfidentialInfoRedactor
|
|
22
22
|
def extract
|
23
23
|
extracted_terms = []
|
24
24
|
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
25
|
-
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '')) }.compact
|
25
|
+
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
|
26
26
|
initial_extracted_terms.each do |ngram|
|
27
27
|
ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
|
28
|
-
if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
|
29
|
-
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip.split(' ')[1])
|
28
|
+
if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
|
29
|
+
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
|
30
30
|
else
|
31
31
|
tracker = true
|
32
|
-
unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
|
33
|
-
t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
|
32
|
+
unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
|
33
|
+
t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
|
34
34
|
tracker = false if corpus.include?(token.downcase)
|
35
35
|
end
|
36
36
|
end
|
37
|
-
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip) || !tracker
|
37
|
+
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker
|
38
38
|
end
|
39
39
|
end
|
40
40
|
end
|
@@ -118,6 +118,21 @@ RSpec.describe ConfidentialInfoRedactor::Extractor do
|
|
118
118
|
text = 'GOOD CARBS VS. BAD CARBS'
|
119
119
|
expect(described_class.new(text: text, language: 'en').extract).to eq([])
|
120
120
|
end
|
121
|
+
|
122
|
+
it 'extracts the proper nouns from a text #013' do
|
123
|
+
text = 'Reducing”'
|
124
|
+
expect(described_class.new(text: text, language: 'en').extract).to eq([])
|
125
|
+
end
|
126
|
+
|
127
|
+
it 'extracts the proper nouns from a text #014' do
|
128
|
+
text = '”'
|
129
|
+
expect(described_class.new(text: text, language: 'en').extract).to eq([])
|
130
|
+
end
|
131
|
+
|
132
|
+
it 'extracts the proper nouns from a text #015' do
|
133
|
+
text = '“Reducing'
|
134
|
+
expect(described_class.new(text: text, language: 'en').extract).to eq([])
|
135
|
+
end
|
121
136
|
end
|
122
137
|
|
123
138
|
context 'German (de)' do
|