confidential_info_redactor_lite 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ae3ba12bb0731420494dd385bff38b9315ff938d
4
- data.tar.gz: 5482bf82d5ea1551b205885e6aaec908f328910f
3
+ metadata.gz: a00a35159e096f07346d94b3ffd1ebf139ef9561
4
+ data.tar.gz: adacee6c75993572f2343bfef71fa94b70ab885a
5
5
  SHA512:
6
- metadata.gz: 187274627b9d905463e45e6c24465cb5d55cf7e8c84207568e3feaa237a6e96d31035e20c21a623e177a3ca8a60c5a157fa25ac878271e853ffcd630bb361164
7
- data.tar.gz: c96104ef54fc174b547591e7569c6c404d1d8c6d077e2cd4e87db0673817956050df526104b2b38d920c873a8d993294993d897765988b36362760ffe0af20c4
6
+ metadata.gz: 8958fef084a98d839334725773f4fa1a5a4ec0de746d0826a02902398ac83184052b9cc12bd9e22bfc819133a33d12b1f8891731801fcc3d49a717cac605711b
7
+ data.tar.gz: eb78915b7d9f8ccdb57d377b1af1be2afa754a91f12a222a7946b532b590212dd788eefcc3e8f0a808eebd8a63f5017f1df79bbac23c293c51ade05143e82c7e
@@ -16,7 +16,11 @@ module ConfidentialInfoRedactorLite
16
16
  initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '')) }.compact
17
17
  initial_extracted_terms.each do |ngram|
18
18
  ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
19
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip)
19
+ if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the'
20
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip.split(' ')[1])
21
+ else
22
+ extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').strip)
23
+ end
20
24
  end
21
25
  end
22
26
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.7"
2
+ VERSION = "0.0.8"
3
3
  end
@@ -1,7 +1,7 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe ConfidentialInfoRedactorLite::Extractor do
4
- let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to'] }
4
+ let(:corpus) { ['i', 'in', 'you', 'top', 'so', 'are', 'december', 'please', 'viele', 'mitarbeiter', 'arbeitsstelle', 'some', 'there', 'king', 'by', "don't", 'dec', 'at', 'dot', 'and', 'project', 'activity', 'complete', 'prizes', 'build', 'video', 'many', 'autographs', 'picture', 'the', 'each', 'submit', 'to', 'then', 'coming', 'screenshot'] }
5
5
  describe '#extract' do
6
6
  context 'English (en)' do
7
7
  it 'extracts the proper nouns from a text #001' do
@@ -87,7 +87,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
87
87
 
88
88
  Don’t forget to use your imagination and creativity!
89
89
  EOF
90
- expect(described_class.new(text: text, corpus: corpus).extract).to eq(["Putter King Miniature Golf Scavenger Hunt", "Putter King", "Annual Miniature Golf Scavenger Hunt", "The Official List", "Nostalgic Miniature Golf Obstacles", "Putter King Hole Design Contest", "Screenshot", "World Heritage Site", "PGA", "iTunes", "Gift Card", "Putter King Scavenger Hunt Trophy", "Engraved Crystal Trophy", "Picture Coming Soon", "The Putter King", "The U.S. Government", "Putter King Scavenger Hunt Submission", "YouTube", "Flickr", "Picasa", "Photobucket"])
90
+ expect(described_class.new(text: text, corpus: corpus).extract).to eq(["Putter King Miniature Golf Scavenger Hunt", "Putter King", "Annual Miniature Golf Scavenger Hunt", "The Official List", "Nostalgic Miniature Golf Obstacles", "Putter King Hole Design Contest", "World Heritage Site", "PGA", "iTunes", "Gift Card", "Putter King Scavenger Hunt Trophy", "Engraved Crystal Trophy", "The Putter King", "The U.S. Government", "Putter King Scavenger Hunt Submission", "YouTube", "Flickr", "Picasa", "Photobucket"])
91
91
  end
92
92
 
93
93
  it 'extracts the proper nouns from a text #007' do
@@ -99,6 +99,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
99
99
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
100
100
  expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Coca-Cola", "Pepsi", "John Smith"])
101
101
  end
102
+
103
+ it 'extracts the proper nouns from a text #009' do
104
+ text = 'Then Peter went to the store.'
105
+ expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Peter"])
106
+ end
102
107
  end
103
108
 
104
109
  context 'German (de)' do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-17 00:00:00.000000000 Z
11
+ date: 2015-04-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler