confidential_info_redactor_lite 1.0.10 → 1.0.11
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6a5ec9e9eeb82984f87cba0c612404b7a6e9f4b4
|
4
|
+
data.tar.gz: 5e410055fe45224d70930f7e6c8f2858eeda4bc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e3c14787d5013846629bbab16ef5117fe96a0e8d1193be266e1325f7d21ff978100f4af666289d933869f744d7bd0383e7c136f81d6bac733e72dabde40f766
|
7
|
+
data.tar.gz: ef113dda2f3a5928f1281d0fb96fd9d99bc8d4d8d7bc0c5aa5d227b77e3182b05934e498aa8c7481bb1c80ee51be7a1132c3740207b33e19264c9005bf8d5b77
|
@@ -15,7 +15,6 @@ module ConfidentialInfoRedactorLite
|
|
15
15
|
extracted_terms = []
|
16
16
|
PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
|
17
17
|
initial_extracted_terms = extract_preliminary_terms(segment)
|
18
|
-
next if initial_extracted_terms.length.eql?(segment.split(' ').length) && !in_corpus?(initial_extracted_terms)
|
19
18
|
search_ngrams(initial_extracted_terms, extracted_terms)
|
20
19
|
end
|
21
20
|
extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
|
@@ -27,21 +26,21 @@ module ConfidentialInfoRedactorLite
|
|
27
26
|
segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
28
27
|
end
|
29
28
|
|
30
|
-
def in_corpus?(tokens)
|
31
|
-
tokens.map { |token| token.split(PUNCTUATION_REGEX).map { |t| return true if corpus.include?(clean_token(t.downcase)) } }
|
32
|
-
end
|
33
|
-
|
34
29
|
def clean_token(token)
|
35
30
|
token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
|
36
31
|
end
|
37
32
|
|
38
33
|
def non_confidential_token?(token, includes_confidential)
|
39
|
-
corpus.include?(token) || !includes_confidential ||
|
34
|
+
corpus.include?(token) || !includes_confidential || stem_in_corpus?(token)
|
40
35
|
end
|
41
36
|
|
42
|
-
def
|
37
|
+
def stem_in_corpus?(token)
|
43
38
|
corpus.include?(token[0...-1]) &&
|
44
|
-
token[-1].eql?('s')
|
39
|
+
token[-1].eql?('s') ||
|
40
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
|
41
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
|
42
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
|
43
|
+
corpus.include?(token[0...-1]) && token[-1].eql?('n')
|
45
44
|
end
|
46
45
|
|
47
46
|
def includes_confidential?(token)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
4
|
-
let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing'] }
|
4
|
+
let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing', 'ich', 'routine', 'studium', 'uni', 'tag', 'mahlzeit'] }
|
5
5
|
describe '#extract' do
|
6
6
|
context 'English (en)' do
|
7
7
|
it 'extracts the proper nouns from a text #001' do
|
@@ -196,6 +196,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
196
196
|
text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
|
197
197
|
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
198
198
|
end
|
199
|
+
|
200
|
+
it 'extracts the proper nouns from a text #008' do
|
201
|
+
text = 'Ich behielt diese Routine während und sogar während des Studiums an der Uni bei, und ich war damals froh, wenn ich pro Tag zwei ganze Mahlzeiten zu mir nahm.'
|
202
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq([])
|
203
|
+
end
|
199
204
|
end
|
200
205
|
end
|
201
206
|
end
|