confidential_info_redactor 0.0.15 → 0.0.16
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd49179c366c68c4563353b5f11b5a547086c1e5
|
4
|
+
data.tar.gz: 91155c7b65e5267084e049b5f6876f5287e3df31
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b6b5797b7113e41ad1bda8e1991b6e41939bff11f4ac48b0d0785552d1859b0db0da97551553ef56e97589e93d0446c07f5d747e81fec87692bac8933df211f5
|
7
|
+
data.tar.gz: f04627d38fd8de849da500afed42194204b6baa2a5fae8505afecebb766bc8f9a425e2281c7be579257711319752e081fe02dc86a47dbc0fa7dc644c0a578c86
|
@@ -23,7 +23,15 @@ module ConfidentialInfoRedactor
|
|
23
23
|
extracted_terms = []
|
24
24
|
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
25
25
|
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
|
26
|
-
|
26
|
+
in_corpus = true
|
27
|
+
initial_extracted_terms.each do |ngram|
|
28
|
+
ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
|
29
|
+
unless corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip)
|
30
|
+
in_corpus = false
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
|
27
35
|
initial_extracted_terms.each do |ngram|
|
28
36
|
ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
|
29
37
|
next if !(t !~ /.*\d+.*/)
|
@@ -141,7 +141,7 @@ RSpec.describe ConfidentialInfoRedactor::Extractor do
|
|
141
141
|
|
142
142
|
it 'extracts the proper nouns from a text #017' do
|
143
143
|
text = 'John'
|
144
|
-
expect(described_class.new(text: text, language: 'en').extract).to eq([])
|
144
|
+
expect(described_class.new(text: text, language: 'en').extract).to eq(['John'])
|
145
145
|
end
|
146
146
|
end
|
147
147
|
|