confidential_info_redactor_lite 0.0.14 → 0.0.15

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7d546836b7298bd773fad876c9af181000f3867b
4
- data.tar.gz: b66109b97cf689dabf3f92fa4445969aa0b24f70
3
+ metadata.gz: 456160b2c0748728398a7a5804f40389d141a56a
4
+ data.tar.gz: 29194c0181768c1f35b9558e5934c6133a376a41
5
5
  SHA512:
6
- metadata.gz: 4c986381225e75100ea16ac1c8c53fc9de21cdfcf0302adb0f3cc55c36a63e6d35b5474e3990f92495b7977cbd3618383517d876dfc11431c25ee178f7aa34ae
7
- data.tar.gz: 5084bd86c4c02ae26a7be74850d9c7431c99deb9373c71f0904afcfa63148eaf6bd1fc8f55636d73347f67ae8c3c081d96b6e1340d7f9fe076d0a84b516bc1a2
6
+ metadata.gz: f8d4a592e035ccadef1c8bd595978d998dfa297064fef08abe396d7b309441b54dfbd5cd02a0aeca231765f25d5316a8d334af5018d3a6244ec40e493db5c653
7
+ data.tar.gz: 22d0eea27d9a10fb5f42c8076b713995490ba95918761816cdeee4aa43acfb37248133944c4adcb396959f30bb7cbeb69624c8119abc5fea21c419043b4abfa4
@@ -13,19 +13,19 @@ module ConfidentialInfoRedactorLite
13
13
  def extract
14
14
  extracted_terms = []
15
15
  PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
16
- initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '')) }.compact
16
+ initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
17
17
  initial_extracted_terms.each do |ngram|
18
18
  ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
19
- if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
20
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip.split(' ')[1])
19
+ if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
20
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
21
21
  else
22
22
  tracker = true
23
- unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
24
- t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
23
+ unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
24
+ t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
25
25
  tracker = false if corpus.include?(token.downcase)
26
26
  end
27
27
  end
28
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip) || !tracker
28
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker
29
29
  end
30
30
  end
31
31
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.14"
2
+ VERSION = "0.0.15"
3
3
  end
@@ -1,7 +1,7 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe ConfidentialInfoRedactorLite::Extractor do
4
- let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy'] }
4
+ let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing'] }
5
5
  describe '#extract' do
6
6
  context 'English (en)' do
7
7
  it 'extracts the proper nouns from a text #001' do
@@ -119,6 +119,21 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
119
119
  text = 'GOOD CARBS VS. BAD CARBS'
120
120
  expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
121
121
  end
122
+
123
+ it 'extracts the proper nouns from a text #013' do
124
+ text = 'Reducing”'
125
+ expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
126
+ end
127
+
128
+ it 'extracts the proper nouns from a text #014' do
129
+ text = '”'
130
+ expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
131
+ end
132
+
133
+ it 'extracts the proper nouns from a text #015' do
134
+ text = '“Reducing'
135
+ expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
136
+ end
122
137
  end
123
138
 
124
139
  context 'German (de)' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.14
4
+ version: 0.0.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias