confidential_info_redactor 0.0.13 → 0.0.14
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f372184466d5b6452bc24fcba0cc4b7f6754d5c8
|
4
|
+
data.tar.gz: e6cc96f50cb2ff83e4d74b5e9fb201c2c0f93a17
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3752768a77fd3514e3717363c9c23cfccfe62b6acbdba59a540d9eb1506a55573a582dc581b7f433c94822c43cbfc9d82d27ca754a0b51751306f8cfdc9d8ea7
|
7
|
+
data.tar.gz: 2de4f5514ea01869ae0f552d9bdefeba79c60c79cbb62167644f47c2a3a3d0213fb546c5c70fd3466e0afd8d421a3c712e163a6cfbc3e82629e44e8501798d7a
|
@@ -25,6 +25,7 @@ module ConfidentialInfoRedactor
|
|
25
25
|
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
|
26
26
|
initial_extracted_terms.each do |ngram|
|
27
27
|
ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
|
28
|
+
next if !(t !~ /.*\d+.*/)
|
28
29
|
if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
|
29
30
|
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
|
30
31
|
else
|
@@ -133,6 +133,11 @@ RSpec.describe ConfidentialInfoRedactor::Extractor do
|
|
133
133
|
text = '“Reducing'
|
134
134
|
expect(described_class.new(text: text, language: 'en').extract).to eq([])
|
135
135
|
end
|
136
|
+
|
137
|
+
it 'extracts the proper nouns from a text #015' do
|
138
|
+
text = 'Corrigendum to Council Regulation (EC) No 85/2009 of 19 January 2009 amending Regulation (EC) No 1083/2006 laying down general provisions on the European Regional Development Fund, the European Social Fund and the Cohesion Fund concerning certain provisions relating to financial management'
|
139
|
+
expect(described_class.new(text: text, language: 'en').extract).to eq(["Corrigendum"])
|
140
|
+
end
|
136
141
|
end
|
137
142
|
|
138
143
|
context 'German (de)' do
|