confidential_info_redactor_lite 1.0.10 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6a5ec9e9eeb82984f87cba0c612404b7a6e9f4b4
|
4
|
+
data.tar.gz: 5e410055fe45224d70930f7e6c8f2858eeda4bc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e3c14787d5013846629bbab16ef5117fe96a0e8d1193be266e1325f7d21ff978100f4af666289d933869f744d7bd0383e7c136f81d6bac733e72dabde40f766
|
7
|
+
data.tar.gz: ef113dda2f3a5928f1281d0fb96fd9d99bc8d4d8d7bc0c5aa5d227b77e3182b05934e498aa8c7481bb1c80ee51be7a1132c3740207b33e19264c9005bf8d5b77
|
@@ -15,7 +15,6 @@ module ConfidentialInfoRedactorLite
|
|
15
15
|
extracted_terms = []
|
16
16
|
PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
|
17
17
|
initial_extracted_terms = extract_preliminary_terms(segment)
|
18
|
-
next if initial_extracted_terms.length.eql?(segment.split(' ').length) && !in_corpus?(initial_extracted_terms)
|
19
18
|
search_ngrams(initial_extracted_terms, extracted_terms)
|
20
19
|
end
|
21
20
|
extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
|
@@ -27,21 +26,21 @@ module ConfidentialInfoRedactorLite
|
|
27
26
|
segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
28
27
|
end
|
29
28
|
|
30
|
-
def in_corpus?(tokens)
|
31
|
-
tokens.map { |token| token.split(PUNCTUATION_REGEX).map { |t| return true if corpus.include?(clean_token(t.downcase)) } }
|
32
|
-
end
|
33
|
-
|
34
29
|
# Returns a cleaned copy of +token+. In order: every PUNCTUATION_REGEX match
# is removed, then an apostrophe sitting at the end of a line (/\'$/), then a
# period at the very end of the string (/\.\z/), and finally leading and
# trailing whitespace is stripped.
# NOTE(review): the period is removed before +strip+ runs, so a period that is
# followed by trailing whitespace is left in place.
def clean_token(token)
|
35
30
|
token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
|
36
31
|
end
|
37
32
|
|
38
33
|
# Predicate: true when +token+ is a corpus entry, when +includes_confidential+
# is falsy, or (row added in this diff) when stem_in_corpus?(token) is true.
# The row marked "-" is the previous version of the body line; its last
# operand appears cut off in this rendering.
def non_confidential_token?(token, includes_confidential)
|
39
|
-
corpus.include?(token) || !includes_confidential ||
|
34
|
+
corpus.include?(token) || !includes_confidential || stem_in_corpus?(token)
|
40
35
|
end
|
41
36
|
|
42
|
-
def
|
37
|
+
# Predicate: true when +token+ ends in one of the suffixes 's', 'en', 'es',
# 'er' or 'n' AND the token with that suffix cut off is a corpus entry.
#
# `&&` binds tighter than `||`, so the added rows form five independent
# "remainder in corpus && token ends with suffix" checks joined by `||`.
# token[0...-1] / token[0...-2] drop the last one / two characters;
# token[-1] / token[-2..-1] read the last one / two characters.
# The comparison uses +token+ exactly as passed in; no case change is made
# inside this method.
def stem_in_corpus?(token)
|
43
38
|
corpus.include?(token[0...-1]) &&
|
44
|
-
token[-1].eql?('s')
|
39
|
+
token[-1].eql?('s') ||
|
40
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
|
41
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
|
42
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
|
43
|
+
corpus.include?(token[0...-1]) && token[-1].eql?('n')
|
45
44
|
end
|
46
45
|
|
47
46
|
def includes_confidential?(token)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
4
|
-
let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing'] }
|
4
|
+
# Word list handed to the extractor as `corpus:` in the examples of this spec.
# Compared with the removed row, this version appends 'ich', 'routine',
# 'studium', 'uni', 'tag' and 'mahlzeit'.
let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing', 'ich', 'routine', 'studium', 'uni', 'tag', 'mahlzeit'] }
|
5
5
|
describe '#extract' do
|
6
6
|
context 'English (en)' do
|
7
7
|
it 'extracts the proper nouns from a text #001' do
|
@@ -196,6 +196,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
196
196
|
text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
|
197
197
|
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
198
198
|
end
|
199
|
+
|
200
|
+
# German example added in this version: the extractor is built with
# language 'de' and the shared corpus, and is expected to return an empty
# list for this sentence.
# NOTE(review): presumably exercises suffix handling, since the sentence has
# 'Studiums' and 'Mahlzeiten' while the corpus lists 'studium' and 'mahlzeit';
# confirm against the extractor implementation.
it 'extracts the proper nouns from a text #008' do
|
201
|
+
text = 'Ich behielt diese Routine während und sogar während des Studiums an der Uni bei, und ich war damals froh, wenn ich pro Tag zwei ganze Mahlzeiten zu mir nahm.'
|
202
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq([])
|
203
|
+
end
|
199
204
|
end
|
200
205
|
end
|
201
206
|
end
|