confidential_info_redactor_lite 1.0.10 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8912d3e15f41e1f844eaa419ab85cb2ea6a17c62
4
- data.tar.gz: 5ddbf4fa910e5a439c43b8f0734f981ab1a18249
3
+ metadata.gz: 6a5ec9e9eeb82984f87cba0c612404b7a6e9f4b4
4
+ data.tar.gz: 5e410055fe45224d70930f7e6c8f2858eeda4bc0
5
5
  SHA512:
6
- metadata.gz: 61b8a8408644671cc111edc75b1faca3b0d92eee0faf1f9d119b4a29b95546764fe8648fd93f952bcdc7024493461aefe05da7c86b9cd3dae0ca1212703e3598
7
- data.tar.gz: 832ba89f209760c18833b7d3bd300496700890231c6053ecb50536fd97d47c796a8f12792e620c05c34f6e39f40513144825f93f830b1ae27c6d3fb0a7da9724
6
+ metadata.gz: 5e3c14787d5013846629bbab16ef5117fe96a0e8d1193be266e1325f7d21ff978100f4af666289d933869f744d7bd0383e7c136f81d6bac733e72dabde40f766
7
+ data.tar.gz: ef113dda2f3a5928f1281d0fb96fd9d99bc8d4d8d7bc0c5aa5d227b77e3182b05934e498aa8c7481bb1c80ee51be7a1132c3740207b33e19264c9005bf8d5b77
@@ -15,7 +15,6 @@ module ConfidentialInfoRedactorLite
15
15
  extracted_terms = []
16
16
  PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
17
17
  initial_extracted_terms = extract_preliminary_terms(segment)
18
- next if initial_extracted_terms.length.eql?(segment.split(' ').length) && !in_corpus?(initial_extracted_terms)
19
18
  search_ngrams(initial_extracted_terms, extracted_terms)
20
19
  end
21
20
  extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
@@ -27,21 +26,21 @@ module ConfidentialInfoRedactorLite
27
26
  segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
28
27
  end
29
28
 
30
- def in_corpus?(tokens)
31
- tokens.map { |token| token.split(PUNCTUATION_REGEX).map { |t| return true if corpus.include?(clean_token(t.downcase)) } }
32
- end
33
-
34
29
  def clean_token(token)
35
30
  token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
36
31
  end
37
32
 
38
33
  def non_confidential_token?(token, includes_confidential)
39
- corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
34
+ corpus.include?(token) || !includes_confidential || stem_in_corpus?(token)
40
35
  end
41
36
 
42
- def singular_in_corpus?(token)
37
+ def stem_in_corpus?(token)
43
38
  corpus.include?(token[0...-1]) &&
44
- token[-1].eql?('s')
39
+ token[-1].eql?('s') ||
40
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
41
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
42
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
43
+ corpus.include?(token[0...-1]) && token[-1].eql?('n')
45
44
  end
46
45
 
47
46
  def includes_confidential?(token)
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "1.0.10"
2
+ VERSION = "1.0.11"
3
3
  end
@@ -1,7 +1,7 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe ConfidentialInfoRedactorLite::Extractor do
4
- let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing'] }
4
+ let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing', 'ich', 'routine', 'studium', 'uni', 'tag', 'mahlzeit'] }
5
5
  describe '#extract' do
6
6
  context 'English (en)' do
7
7
  it 'extracts the proper nouns from a text #001' do
@@ -196,6 +196,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
196
196
  text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
197
197
  expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
198
198
  end
199
+
200
+ it 'extracts the proper nouns from a text #008' do
201
+ text = 'Ich behielt diese Routine während und sogar während des Studiums an der Uni bei, und ich war damals froh, wenn ich pro Tag zwei ganze Mahlzeiten zu mir nahm.'
202
+ expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq([])
203
+ end
199
204
  end
200
205
  end
201
206
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.10
4
+ version: 1.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias