confidential_info_redactor 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d4d140024c6ad95745f30e4ce6e9ecccce14bef8
4
- data.tar.gz: 94510a7ae535f5b1d537cb64f0ddcb0cb76c8cef
3
+ metadata.gz: 2aba4fe814eb5d71124ad7a1b5f53840f1831d23
4
+ data.tar.gz: f682003e08e11ef4747255034fb40baed2fcf639
5
5
  SHA512:
6
- metadata.gz: 5fb3c0593c24ce7e924da8906505040fc856d43bd6a422b15be892507c452cdc50dde8158f648942f911778a54447c204262def5914b4a728f541709519509c6
7
- data.tar.gz: 8de04ce27bbeb9ae51c4793a52d518ccb4cf000b07f0ef893b54d8e099f11f8d08970a35be57cb4b445bb37b54a6dd1d53e7cfb36d189c4888d5e560189feeb8
6
+ metadata.gz: e39b8a938438e920cdfc2c158048dd779168fa16ff7a47ba29e5d7bc604c1d12f71ea626875aa805b83c12143a20fa256da33d594e332c279b99d5e049c07e99
7
+ data.tar.gz: 6b0096fe0d4b85068edc78326274d1a3d4a3b8bafe1f32bfe8d847623a9af65ff058d53dc004a28457e2eb33057065a62ce4277ef6b79ad83876d82c41334a47
@@ -22,19 +22,19 @@ module ConfidentialInfoRedactor
22
22
  def extract
23
23
  extracted_terms = []
24
24
  PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
25
- initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '')) }.compact
25
+ initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
26
26
  initial_extracted_terms.each do |ngram|
27
27
  ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
28
- if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
29
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip.split(' ')[1])
28
+ if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
29
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
30
30
  else
31
31
  tracker = true
32
- unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
33
- t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
32
+ unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
33
+ t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
34
34
  tracker = false if corpus.include?(token.downcase)
35
35
  end
36
36
  end
37
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip) || !tracker
37
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker
38
38
  end
39
39
  end
40
40
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactor
2
- VERSION = "0.0.9"
2
+ VERSION = "0.0.10"
3
3
  end
@@ -118,6 +118,21 @@ RSpec.describe ConfidentialInfoRedactor::Extractor do
118
118
  text = 'GOOD CARBS VS. BAD CARBS'
119
119
  expect(described_class.new(text: text, language: 'en').extract).to eq([])
120
120
  end
121
+
122
+ it 'extracts the proper nouns from a text #013' do
123
+ text = 'Reducing”'
124
+ expect(described_class.new(text: text, language: 'en').extract).to eq([])
125
+ end
126
+
127
+ it 'extracts the proper nouns from a text #014' do
128
+ text = '”'
129
+ expect(described_class.new(text: text, language: 'en').extract).to eq([])
130
+ end
131
+
132
+ it 'extracts the proper nouns from a text #015' do
133
+ text = '“Reducing'
134
+ expect(described_class.new(text: text, language: 'en').extract).to eq([])
135
+ end
121
136
  end
122
137
 
123
138
  context 'German (de)' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias