confidential_info_redactor_lite 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 216e96faca24f56d0c98efd9fe537d443ebf99ee
|
4
|
+
data.tar.gz: 79fd2f8c443c6d3875298b860a81ba84b4d68113
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f82751b04aa5c32af2de0fc4048a3fa79a82dbf0a9fdbe63b8118371172c62fc0b0d9cffa9eb036e1529e2b66b1ff37ce77e1cb4ee849fe45cff55cfd0b5ab5d
|
7
|
+
data.tar.gz: 7b2e643f54eb7bcfd624f16fc64ea2301f99cbad9c19a8b11c10faa1a3bd9282a319fbbd506ece94a3ea42246e8f310337a73ff2a5611632b8d0e9f04aab83e6
|
@@ -16,7 +16,7 @@ module ConfidentialInfoRedactorLite
|
|
16
16
|
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '')) }.compact
|
17
17
|
initial_extracted_terms.each do |ngram|
|
18
18
|
ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
|
19
|
-
if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the'
|
19
|
+
if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
|
20
20
|
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip.split(' ')[1])
|
21
21
|
else
|
22
22
|
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip)
|
@@ -25,14 +25,7 @@ module ConfidentialInfoRedactorLite
|
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
|
-
|
29
|
-
extracted_terms.delete_if do |token|
|
30
|
-
corpus.include?(token.split(' ')[0].downcase.strip) &&
|
31
|
-
token.split(' ')[0].downcase.strip != 'deutsche'
|
32
|
-
end.uniq.reject(&:empty?)
|
33
|
-
else
|
34
|
-
extracted_terms.uniq.reject(&:empty?)
|
35
|
-
end
|
28
|
+
extracted_terms.uniq.reject(&:empty?)
|
36
29
|
end
|
37
30
|
end
|
38
31
|
end
|
@@ -87,7 +87,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
87
87
|
|
88
88
|
Don’t forget to use your imagination and creativity!
|
89
89
|
EOF
|
90
|
-
expect(described_class.new(text: text, corpus: corpus).extract).to eq(["Putter King Miniature Golf Scavenger Hunt", "Putter King", "Annual Miniature Golf Scavenger Hunt", "The Official List", "Nostalgic Miniature Golf Obstacles", "Putter King Hole Design Contest", "World Heritage Site", "PGA", "iTunes", "Gift Card", "Putter King Scavenger Hunt Trophy", "Engraved Crystal Trophy", "The Putter King", "The U.S. Government", "Putter King Scavenger Hunt Submission", "YouTube", "Flickr", "Picasa", "Photobucket"])
|
90
|
+
expect(described_class.new(text: text, corpus: corpus).extract).to eq(["Putter King Miniature Golf Scavenger Hunt", "Putter King", "Annual Miniature Golf Scavenger Hunt", "The Official List", "Nostalgic Miniature Golf Obstacles", "Putter King Hole Design Contest", "World Heritage Site", "PGA", "iTunes", "Gift Card", "Putter King Scavenger Hunt Trophy", "Engraved Crystal Trophy", "Picture Coming Soon", "The Putter King", "The U.S. Government", "Putter King Scavenger Hunt Submission", "YouTube", "Flickr", "Picasa", "Photobucket"])
|
91
91
|
end
|
92
92
|
|
93
93
|
it 'extracts the proper nouns from a text #007' do
|