confidential_info_redactor_lite 0.0.33 → 0.0.34
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/confidential_info_redactor_lite.gemspec +2 -1
- data/lib/confidential_info_redactor_lite/extractor.rb +38 -24
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/lib/confidential_info_redactor_lite.rb +2 -1
- data/spec/confidential_info_redactor_lite/performance_spec.rb +58 -0
- metadata +20 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 05d26fbe6fe84f3a1f695b05a47ff9ae555cbb06
|
4
|
+
data.tar.gz: 9be6e91bf1e96e8f237820ac53de547ce534c9d6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 528f42365aadf05514ec5d56d088e838392473c3cb80a8210302fc8a256bcbb9ed98817c2fd6d3034f99f0aef4b47f4ed67a393c654cdaecb84bc034f0eddd3d
|
7
|
+
data.tar.gz: d1c900aefe94e6a45a1c7c28d8dd61ac23b39f46f6f0ea1076e126312148254aa84349527b0183221e3c4519997a6d865993ddb573e94ee3dfd37e45506042d4
|
@@ -22,5 +22,6 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.add_development_dependency "bundler", "~> 1.6"
|
23
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
24
24
|
spec.add_development_dependency "rspec"
|
25
|
-
spec.
|
25
|
+
spec.add_development_dependency "stackprof"
|
26
|
+
spec.add_runtime_dependency "pragmatic_segmenter", "~> 0.3.7"
|
26
27
|
end
|
@@ -7,43 +7,57 @@ module ConfidentialInfoRedactorLite
|
|
7
7
|
PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
|
8
8
|
attr_reader :text, :language, :corpus
|
9
9
|
def initialize(text:, corpus:, **args)
|
10
|
-
@text = text.gsub(/[’‘]/, "'")
|
11
|
-
@corpus = corpus
|
10
|
+
@text = text.gsub(/[’‘]/, "'").freeze
|
11
|
+
@corpus = Set.new(corpus).freeze
|
12
12
|
@language = args[:language] || 'en'
|
13
13
|
end
|
14
14
|
|
15
15
|
def extract
|
16
16
|
extracted_terms = []
|
17
17
|
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
18
|
-
initial_extracted_terms = segment
|
19
|
-
|
20
|
-
initial_extracted_terms
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
18
|
+
initial_extracted_terms = extract_preliminary_terms(segment)
|
19
|
+
next if initial_extracted_terms.length.eql?(segment.split(' ').length) && search_for_ngrams(initial_extracted_terms)
|
20
|
+
search_ngrams(initial_extracted_terms, extracted_terms)
|
21
|
+
end
|
22
|
+
extracted_terms.uniq.reject(&:empty?)
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def extract_preliminary_terms(segment)
|
28
|
+
segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
29
|
+
end
|
30
|
+
|
31
|
+
def search_for_ngrams(tokens)
|
32
|
+
in_corpus = true
|
33
|
+
tokens.each do |ngram|
|
34
|
+
ngram.split(PUNCTUATION_REGEX).each do |t|
|
35
|
+
unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
|
36
|
+
in_corpus = false
|
25
37
|
end
|
26
38
|
end
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
+
end
|
40
|
+
in_corpus
|
41
|
+
end
|
42
|
+
|
43
|
+
def search_ngrams(tokens, extracted_terms)
|
44
|
+
tokens.each do |ngram|
|
45
|
+
ngram.split(PUNCTUATION_REGEX).each do |t|
|
46
|
+
next if !(t !~ /.*\d+.*/)
|
47
|
+
if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
|
48
|
+
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
|
49
|
+
else
|
50
|
+
tracker = true
|
51
|
+
unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
|
52
|
+
t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
|
53
|
+
tracker = false if corpus.include?(token.downcase)
|
39
54
|
end
|
40
|
-
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
|
41
55
|
end
|
56
|
+
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
|
42
57
|
end
|
43
58
|
end
|
44
59
|
end
|
45
|
-
|
46
|
-
extracted_terms.uniq.reject(&:empty?)
|
60
|
+
extracted_terms
|
47
61
|
end
|
48
62
|
end
|
49
63
|
end
|