confidential_info_redactor_lite 1.0.10 → 1.0.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8912d3e15f41e1f844eaa419ab85cb2ea6a17c62
4
- data.tar.gz: 5ddbf4fa910e5a439c43b8f0734f981ab1a18249
3
+ metadata.gz: 6a5ec9e9eeb82984f87cba0c612404b7a6e9f4b4
4
+ data.tar.gz: 5e410055fe45224d70930f7e6c8f2858eeda4bc0
5
5
  SHA512:
6
- metadata.gz: 61b8a8408644671cc111edc75b1faca3b0d92eee0faf1f9d119b4a29b95546764fe8648fd93f952bcdc7024493461aefe05da7c86b9cd3dae0ca1212703e3598
7
- data.tar.gz: 832ba89f209760c18833b7d3bd300496700890231c6053ecb50536fd97d47c796a8f12792e620c05c34f6e39f40513144825f93f830b1ae27c6d3fb0a7da9724
6
+ metadata.gz: 5e3c14787d5013846629bbab16ef5117fe96a0e8d1193be266e1325f7d21ff978100f4af666289d933869f744d7bd0383e7c136f81d6bac733e72dabde40f766
7
+ data.tar.gz: ef113dda2f3a5928f1281d0fb96fd9d99bc8d4d8d7bc0c5aa5d227b77e3182b05934e498aa8c7481bb1c80ee51be7a1132c3740207b33e19264c9005bf8d5b77
@@ -15,7 +15,6 @@ module ConfidentialInfoRedactorLite
15
15
  extracted_terms = []
16
16
  PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
17
17
  initial_extracted_terms = extract_preliminary_terms(segment)
18
- next if initial_extracted_terms.length.eql?(segment.split(' ').length) && !in_corpus?(initial_extracted_terms)
19
18
  search_ngrams(initial_extracted_terms, extracted_terms)
20
19
  end
21
20
  extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
@@ -27,21 +26,21 @@ module ConfidentialInfoRedactorLite
27
26
  segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
28
27
  end
29
28
 
30
- def in_corpus?(tokens)
31
- tokens.map { |token| token.split(PUNCTUATION_REGEX).map { |t| return true if corpus.include?(clean_token(t.downcase)) } }
32
- end
33
-
34
29
  def clean_token(token)
35
30
  token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
36
31
  end
37
32
 
38
33
  def non_confidential_token?(token, includes_confidential)
39
- corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
34
+ corpus.include?(token) || !includes_confidential || stem_in_corpus?(token)
40
35
  end
41
36
 
42
- def singular_in_corpus?(token)
37
+ def stem_in_corpus?(token)
43
38
  corpus.include?(token[0...-1]) &&
44
- token[-1].eql?('s')
39
+ token[-1].eql?('s') ||
40
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
41
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
42
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
43
+ corpus.include?(token[0...-1]) && token[-1].eql?('n')
45
44
  end
46
45
 
47
46
  def includes_confidential?(token)
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "1.0.10"
2
+ VERSION = "1.0.11"
3
3
  end
@@ -1,7 +1,7 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe ConfidentialInfoRedactorLite::Extractor do
4
- let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing'] }
4
+ let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot', 'putter', 'king', 'miniature', 'good', 'bad', 'vs.', 'carbs', 'all', 'natural', 'peanut', 'butter', 'world', 'heritage', 'site', 'gift', 'card', 'engraved', 'crystal', 'trophy', 'reducing', 'ich', 'routine', 'studium', 'uni', 'tag', 'mahlzeit'] }
5
5
  describe '#extract' do
6
6
  context 'English (en)' do
7
7
  it 'extracts the proper nouns from a text #001' do
@@ -196,6 +196,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
196
196
  text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
197
197
  expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
198
198
  end
199
+
200
+ it 'extracts the proper nouns from a text #008' do
201
+ text = 'Ich behielt diese Routine während und sogar während des Studiums an der Uni bei, und ich war damals froh, wenn ich pro Tag zwei ganze Mahlzeiten zu mir nahm.'
202
+ expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq([])
203
+ end
199
204
  end
200
205
  end
201
206
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.10
4
+ version: 1.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias