confidential_info_redactor_lite 0.0.20 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3f4f178e9f12c2b63b517cb83c476e4bba526f60
|
4
|
+
data.tar.gz: 3724ad4b679f8c56f27a23893d6a6b2e58f47d27
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 370acfe9773d924906cd8fc5bb133cab78d605ced79cdfd317dbc7fe78d441c138e1c9df93710b9e4e608487f3647e18c4e1dc4a009fed8ca79762e0afd4d2b6
|
7
|
+
data.tar.gz: 51eda51e3bb37741e950be881473335680c2269c08378a42a63e25fc8d632527c0351c72f4a114fab43c97dc963f39c677dc7a6a266c1e65472cba405fc45f5a
|
@@ -14,7 +14,15 @@ module ConfidentialInfoRedactorLite
|
|
14
14
|
extracted_terms = []
|
15
15
|
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
16
16
|
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
|
17
|
-
|
17
|
+
in_corpus = true
|
18
|
+
initial_extracted_terms.each do |ngram|
|
19
|
+
ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
|
20
|
+
unless corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip)
|
21
|
+
in_corpus = false
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
|
18
26
|
initial_extracted_terms.each do |ngram|
|
19
27
|
ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
|
20
28
|
next if !(t !~ /.*\d+.*/)
|
@@ -142,7 +142,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
142
142
|
|
143
143
|
it 'extracts the proper nouns from a text #017' do
|
144
144
|
text = 'John'
|
145
|
-
expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
|
145
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(['John'])
|
146
146
|
end
|
147
147
|
end
|
148
148
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
4
|
-
let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please'] }
|
4
|
+
let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'prizes'] }
|
5
5
|
let(:en_dow) { %w(monday tuesday wednesday thursday friday saturday sunday) }
|
6
6
|
let(:en_dow_abbr) { %w(mon tu tue tues wed th thu thur thurs fri sat sun) }
|
7
7
|
let(:en_months) { %w(january february march april may june july august september october november december) }
|