confidential_info_redactor 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d4d140024c6ad95745f30e4ce6e9ecccce14bef8
4
- data.tar.gz: 94510a7ae535f5b1d537cb64f0ddcb0cb76c8cef
3
+ metadata.gz: 2aba4fe814eb5d71124ad7a1b5f53840f1831d23
4
+ data.tar.gz: f682003e08e11ef4747255034fb40baed2fcf639
5
5
  SHA512:
6
- metadata.gz: 5fb3c0593c24ce7e924da8906505040fc856d43bd6a422b15be892507c452cdc50dde8158f648942f911778a54447c204262def5914b4a728f541709519509c6
7
- data.tar.gz: 8de04ce27bbeb9ae51c4793a52d518ccb4cf000b07f0ef893b54d8e099f11f8d08970a35be57cb4b445bb37b54a6dd1d53e7cfb36d189c4888d5e560189feeb8
6
+ metadata.gz: e39b8a938438e920cdfc2c158048dd779168fa16ff7a47ba29e5d7bc604c1d12f71ea626875aa805b83c12143a20fa256da33d594e332c279b99d5e049c07e99
7
+ data.tar.gz: 6b0096fe0d4b85068edc78326274d1a3d4a3b8bafe1f32bfe8d847623a9af65ff058d53dc004a28457e2eb33057065a62ce4277ef6b79ad83876d82c41334a47
@@ -22,19 +22,19 @@ module ConfidentialInfoRedactor
22
22
  def extract
23
23
  extracted_terms = []
24
24
  PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
25
- initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '')) }.compact
25
+ initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
26
26
  initial_extracted_terms.each do |ngram|
27
27
  ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
28
- if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
29
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip.split(' ')[1])
28
+ if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
29
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
30
30
  else
31
31
  tracker = true
32
- unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
33
- t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
32
+ unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
33
+ t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
34
34
  tracker = false if corpus.include?(token.downcase)
35
35
  end
36
36
  end
37
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip) || !tracker
37
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker
38
38
  end
39
39
  end
40
40
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactor
2
- VERSION = "0.0.9"
2
+ VERSION = "0.0.10"
3
3
  end
@@ -118,6 +118,21 @@ RSpec.describe ConfidentialInfoRedactor::Extractor do
118
118
  text = 'GOOD CARBS VS. BAD CARBS'
119
119
  expect(described_class.new(text: text, language: 'en').extract).to eq([])
120
120
  end
121
+
122
+ it 'extracts the proper nouns from a text #013' do
123
+ text = 'Reducing”'
124
+ expect(described_class.new(text: text, language: 'en').extract).to eq([])
125
+ end
126
+
127
+ it 'extracts the proper nouns from a text #014' do
128
+ text = '”'
129
+ expect(described_class.new(text: text, language: 'en').extract).to eq([])
130
+ end
131
+
132
+ it 'extracts the proper nouns from a text #015' do
133
+ text = '“Reducing'
134
+ expect(described_class.new(text: text, language: 'en').extract).to eq([])
135
+ end
121
136
  end
122
137
 
123
138
  context 'German (de)' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias