confidential_info_redactor_lite 0.0.14 → 0.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7d546836b7298bd773fad876c9af181000f3867b
4
- data.tar.gz: b66109b97cf689dabf3f92fa4445969aa0b24f70
3
+ metadata.gz: 456160b2c0748728398a7a5804f40389d141a56a
4
+ data.tar.gz: 29194c0181768c1f35b9558e5934c6133a376a41
5
5
  SHA512:
6
- metadata.gz: 4c986381225e75100ea16ac1c8c53fc9de21cdfcf0302adb0f3cc55c36a63e6d35b5474e3990f92495b7977cbd3618383517d876dfc11431c25ee178f7aa34ae
7
- data.tar.gz: 5084bd86c4c02ae26a7be74850d9c7431c99deb9373c71f0904afcfa63148eaf6bd1fc8f55636d73347f67ae8c3c081d96b6e1340d7f9fe076d0a84b516bc1a2
6
+ metadata.gz: f8d4a592e035ccadef1c8bd595978d998dfa297064fef08abe396d7b309441b54dfbd5cd02a0aeca231765f25d5316a8d334af5018d3a6244ec40e493db5c653
7
+ data.tar.gz: 22d0eea27d9a10fb5f42c8076b713995490ba95918761816cdeee4aa43acfb37248133944c4adcb396959f30bb7cbeb69624c8119abc5fea21c419043b4abfa4
@@ -13,19 +13,19 @@ module ConfidentialInfoRedactorLite
13
13
  def extract
14
14
  extracted_terms = []
15
15
  PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
16
- initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '')) }.compact
16
+ initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
17
17
  initial_extracted_terms.each do |ngram|
18
18
  ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
19
- if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
20
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip.split(' ')[1])
19
+ if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
20
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
21
21
  else
22
22
  tracker = true
23
- unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
24
- t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
23
+ unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
24
+ t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
25
25
  tracker = false if corpus.include?(token.downcase)
26
26
  end
27
27
  end
28
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip) || !tracker
28
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker
29
29
  end
30
30
  end
31
31
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.14"
2
+ VERSION = "0.0.15"
3
3
  end
@@ -1,7 +1,7 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe ConfidentialInfoRedactorLite::Extractor do
4
- let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy'] }
4
+ let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing'] }
5
5
  describe '#extract' do
6
6
  context 'English (en)' do
7
7
  it 'extracts the proper nouns from a text #001' do
@@ -119,6 +119,21 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
119
119
  text = 'GOOD CARBS VS. BAD CARBS'
120
120
  expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
121
121
  end
122
+
123
+ it 'extracts the proper nouns from a text #013' do
124
+ text = 'Reducing”'
125
+ expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
126
+ end
127
+
128
+ it 'extracts the proper nouns from a text #014' do
129
+ text = '”'
130
+ expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
131
+ end
132
+
133
+ it 'extracts the proper nouns from a text #015' do
134
+ text = '“Reducing'
135
+ expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
136
+ end
122
137
  end
123
138
 
124
139
  context 'German (de)' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.14
4
+ version: 0.0.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias