confidential_info_redactor_lite 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a00a35159e096f07346d94b3ffd1ebf139ef9561
|
4
|
+
data.tar.gz: adacee6c75993572f2343bfef71fa94b70ab885a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8958fef084a98d839334725773f4fa1a5a4ec0de746d0826a02902398ac83184052b9cc12bd9e22bfc819133a33d12b1f8891731801fcc3d49a717cac605711b
|
7
|
+
data.tar.gz: eb78915b7d9f8ccdb57d377b1af1be2afa754a91f12a222a7946b532b590212dd788eefcc3e8f0a808eebd8a63f5017f1df79bbac23c293c51ade05143e82c7e
|
@@ -16,7 +16,11 @@ module ConfidentialInfoRedactorLite
|
|
16
16
|
initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '')) }.compact
|
17
17
|
initial_extracted_terms.each do |ngram|
|
18
18
|
ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
|
19
|
-
|
19
|
+
if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the'
|
20
|
+
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip.split(' ')[1])
|
21
|
+
else
|
22
|
+
extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip)
|
23
|
+
end
|
20
24
|
end
|
21
25
|
end
|
22
26
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
4
|
-
let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to'] }
|
4
|
+
let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot'] }
|
5
5
|
describe '#extract' do
|
6
6
|
context 'English (en)' do
|
7
7
|
it 'extracts the proper nouns from a text #001' do
|
@@ -87,7 +87,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
87
87
|
|
88
88
|
Don’t forget to use your imagination and creativity!
|
89
89
|
EOF
|
90
|
-
expect(described_class.new(text: text, corpus: corpus).extract).to eq(["Putter King Miniature Golf Scavenger Hunt", "Putter King", "Annual Miniature Golf Scavenger Hunt", "The Official List", "Nostalgic Miniature Golf Obstacles", "Putter King Hole Design Contest", "
|
90
|
+
expect(described_class.new(text: text, corpus: corpus).extract).to eq(["Putter King Miniature Golf Scavenger Hunt", "Putter King", "Annual Miniature Golf Scavenger Hunt", "The Official List", "Nostalgic Miniature Golf Obstacles", "Putter King Hole Design Contest", "World Heritage Site", "PGA", "iTunes", "Gift Card", "Putter King Scavenger Hunt Trophy", "Engraved Crystal Trophy", "The Putter King", "The U.S. Government", "Putter King Scavenger Hunt Submission", "YouTube", "Flickr", "Picasa", "Photobucket"])
|
91
91
|
end
|
92
92
|
|
93
93
|
it 'extracts the proper nouns from a text #007' do
|
@@ -99,6 +99,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
99
99
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
100
100
|
expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Coca-Cola", "Pepsi", "John Smith"])
|
101
101
|
end
|
102
|
+
|
103
|
+
it 'extracts the proper nouns from a text #009' do
|
104
|
+
text = 'Then Peter went to the store.'
|
105
|
+
expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Peter"])
|
106
|
+
end
|
102
107
|
end
|
103
108
|
|
104
109
|
context 'German (de)' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: confidential_info_redactor_lite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|