confidential_info_redactor_lite 0.0.33 → 0.0.34

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3663d0bbfe01b799ee393dbd5c7fdf1124c87d42
4
- data.tar.gz: 978f762e6c496064cfb59995c737895009dd2bcc
3
+ metadata.gz: 05d26fbe6fe84f3a1f695b05a47ff9ae555cbb06
4
+ data.tar.gz: 9be6e91bf1e96e8f237820ac53de547ce534c9d6
5
5
  SHA512:
6
- metadata.gz: 73cbcfad95c1100dd4362469b9ddff18852554a05a9542fb25adb8ade73c8ab6980360b39f63c4482fca7cffd8b358c6a41c89e5f038481164c69d3b9254c3c9
7
- data.tar.gz: 90249a5a9272046f357523970d2e0f9a28dda314b444c122fd60b12f52b8b07b58fa3b807e16890838dc9629006a33d2e962b5b84b70980bc25945f759f702d0
6
+ metadata.gz: 528f42365aadf05514ec5d56d088e838392473c3cb80a8210302fc8a256bcbb9ed98817c2fd6d3034f99f0aef4b47f4ed67a393c654cdaecb84bc034f0eddd3d
7
+ data.tar.gz: d1c900aefe94e6a45a1c7c28d8dd61ac23b39f46f6f0ea1076e126312148254aa84349527b0183221e3c4519997a6d865993ddb573e94ee3dfd37e45506042d4
@@ -22,5 +22,6 @@ Gem::Specification.new do |spec|
22
22
  spec.add_development_dependency "bundler", "~> 1.6"
23
23
  spec.add_development_dependency "rake", "~> 10.0"
24
24
  spec.add_development_dependency "rspec"
25
- spec.add_runtime_dependency "pragmatic_segmenter"
25
+ spec.add_development_dependency "stackprof"
26
+ spec.add_runtime_dependency "pragmatic_segmenter", "~> 0.3.7"
26
27
  end
@@ -7,43 +7,57 @@ module ConfidentialInfoRedactorLite
7
7
  PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
8
8
  attr_reader :text, :language, :corpus
9
9
  def initialize(text:, corpus:, **args)
10
- @text = text.gsub(/[’‘]/, "'")
11
- @corpus = corpus
10
+ @text = text.gsub(/[’‘]/, "'").freeze
11
+ @corpus = Set.new(corpus).freeze
12
12
  @language = args[:language] || 'en'
13
13
  end
14
14
 
15
15
  def extract
16
16
  extracted_terms = []
17
17
  PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
18
- initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
19
- in_corpus = true
20
- initial_extracted_terms.each do |ngram|
21
- ngram.split(PUNCTUATION_REGEX).each do |t|
22
- unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
23
- in_corpus = false
24
- end
18
+ initial_extracted_terms = extract_preliminary_terms(segment)
19
+ next if initial_extracted_terms.length.eql?(segment.split(' ').length) && search_for_ngrams(initial_extracted_terms)
20
+ search_ngrams(initial_extracted_terms, extracted_terms)
21
+ end
22
+ extracted_terms.uniq.reject(&:empty?)
23
+ end
24
+
25
+ private
26
+
27
+ def extract_preliminary_terms(segment)
28
+ segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
29
+ end
30
+
31
+ def search_for_ngrams(tokens)
32
+ in_corpus = true
33
+ tokens.each do |ngram|
34
+ ngram.split(PUNCTUATION_REGEX).each do |t|
35
+ unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
36
+ in_corpus = false
25
37
  end
26
38
  end
27
- next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
28
- initial_extracted_terms.each do |ngram|
29
- ngram.split(PUNCTUATION_REGEX).each do |t|
30
- next if !(t !~ /.*\d+.*/)
31
- if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
32
- extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
33
- else
34
- tracker = true
35
- unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
36
- t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
37
- tracker = false if corpus.include?(token.downcase)
38
- end
39
+ end
40
+ in_corpus
41
+ end
42
+
43
+ def search_ngrams(tokens, extracted_terms)
44
+ tokens.each do |ngram|
45
+ ngram.split(PUNCTUATION_REGEX).each do |t|
46
+ next if !(t !~ /.*\d+.*/)
47
+ if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
48
+ extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
49
+ else
50
+ tracker = true
51
+ unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
52
+ t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
53
+ tracker = false if corpus.include?(token.downcase)
39
54
  end
40
- extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
41
55
  end
56
+ extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
42
57
  end
43
58
  end
44
59
  end
45
-
46
- extracted_terms.uniq.reject(&:empty?)
60
+ extracted_terms
47
61
  end
48
62
  end
49
63
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.33"
2
+ VERSION = "0.0.34"
3
3
  end
@@ -1,4 +1,5 @@
1
1
  require 'confidential_info_redactor_lite/version'
2
2
  require 'confidential_info_redactor_lite/extractor'
3
3
  require 'confidential_info_redactor_lite/redactor'
4
- require 'pragmatic_segmenter'
4
+ require 'pragmatic_segmenter'
5
+ require 'set'