confidential_info_redactor_lite 0.0.18 → 0.0.19
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ecfaf02e5c062c12167515680d1ce6e4a0f73d62
|
4
|
+
data.tar.gz: 069f0d297228e1fa76ab940776b43398a0ca130e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fe912ff29029746481b880e81090fb8298f3b0581270bb954f903afd8a9c6ff7d0e6da0269af0eb2a8ad444562811d68d0faee43eadd329e3dedd59eb6330400
|
7
|
+
data.tar.gz: 4dd204b8f0d34e0d13beaa1795806c4a39c406d01ee258367a7c877cf144e8ea7c3866c65b4910dc6a13b4352a669b11c61fb9df9813fe64de5546a89b97005c
|
@@ -16,6 +16,7 @@ module ConfidentialInfoRedactorLite
|
|
16
16
|
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
|
17
17
|
initial_extracted_terms.each do |ngram|
|
18
18
|
ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
|
19
|
+
next if !(t !~ /.*\d+.*/)
|
19
20
|
if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
|
20
21
|
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
|
21
22
|
else
|
@@ -134,6 +134,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
134
134
|
text = '“Reducing'
|
135
135
|
expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
|
136
136
|
end
|
137
|
+
|
138
|
+
it 'extracts the proper nouns from a text #016' do
|
139
|
+
text = 'Corrigendum to Council Regulation (EC) No 85/2009 of 19 January 2009 amending Regulation (EC) No 1083/2006 laying down general provisions on the European Regional Development Fund, the European Social Fund and the Cohesion Fund concerning certain provisions relating to financial management'
|
140
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Corrigendum", "Council Regulation", "No", "January", "Regulation", "European Regional Development Fund", "European Social Fund", "Cohesion Fund"])
|
141
|
+
end
|
137
142
|
end
|
138
143
|
|
139
144
|
context 'German (de)' do
|