confidential_info_redactor_lite 0.0.18 → 0.0.19

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 52f6dc75062fd5b96bf6ebddda6091ece0984a5f
4
- data.tar.gz: f1e59cf4d26237d79076474ca22ee5eaeb27ae41
3
+ metadata.gz: ecfaf02e5c062c12167515680d1ce6e4a0f73d62
4
+ data.tar.gz: 069f0d297228e1fa76ab940776b43398a0ca130e
5
5
  SHA512:
6
- metadata.gz: 9295558573274c81494ad53a53f61da538cba25a8bf5b32035a0af7baa78f8e2b83b63ecd3516902fc2457acfda037e5a45bf9106a4976d8c7489554182ebb34
7
- data.tar.gz: 3b881aef75c2df2c7507034feb57ab02deba5ba4735bf50ca00c63d83eee034fd0c92a53b80a0fdf77203a8a19434c70850847cf08b9b3adc1b9796acd0e21af
6
+ metadata.gz: fe912ff29029746481b880e81090fb8298f3b0581270bb954f903afd8a9c6ff7d0e6da0269af0eb2a8ad444562811d68d0faee43eadd329e3dedd59eb6330400
7
+ data.tar.gz: 4dd204b8f0d34e0d13beaa1795806c4a39c406d01ee258367a7c877cf144e8ea7c3866c65b4910dc6a13b4352a669b11c61fb9df9813fe64de5546a89b97005c
@@ -16,6 +16,7 @@ module ConfidentialInfoRedactorLite
16
16
  initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
17
17
  initial_extracted_terms.each do |ngram|
18
18
  ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
19
+ next if !(t !~ /.*\d+.*/)
19
20
  if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
20
21
  extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
21
22
  else
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.18"
2
+ VERSION = "0.0.19"
3
3
  end
@@ -134,6 +134,11 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
134
134
  text = '“Reducing'
135
135
  expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
136
136
  end
137
+
138
+ it 'extracts the proper nouns from a text #016' do
139
+ text = 'Corrigendum to Council Regulation (EC) No 85/2009 of 19 January 2009 amending Regulation (EC) No 1083/2006 laying down general provisions on the European Regional Development Fund, the European Social Fund and the Cohesion Fund concerning certain provisions relating to financial management'
140
+ expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Corrigendum", "Council Regulation", "No", "January", "Regulation", "European Regional Development Fund", "European Social Fund", "Cohesion Fund"])
141
+ end
137
142
  end
138
143
 
139
144
  context 'German (de)' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.18
4
+ version: 0.0.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias