stockade 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -1
- data/Gemfile.lock +22 -6
- data/README.md +37 -0
- data/Rakefile +5 -3
- data/bin/load +6 -32
- data/data/firstnames.dump +0 -0
- data/data/firstnames.txt +5496 -0
- data/data/lastnames.dump +0 -0
- data/data/lastnames.txt +240470 -0
- data/data/words.dump +0 -0
- data/data/words.txt +370099 -0
- data/lib/stockade/lexemes/base.rb +47 -0
- data/lib/stockade/lexemes/date.rb +49 -0
- data/lib/stockade/lexemes/dict.rb +42 -0
- data/lib/stockade/lexemes/email.rb +18 -0
- data/lib/stockade/lexemes/firstname.rb +14 -0
- data/lib/stockade/lexemes/lastname.rb +14 -0
- data/lib/stockade/lexemes/phone.rb +51 -0
- data/lib/stockade/lexemes/word.rb +17 -0
- data/lib/stockade/lexer.rb +61 -0
- data/lib/stockade/parser.rb +92 -0
- data/lib/stockade/version.rb +3 -1
- data/lib/stockade.rb +22 -107
- data/stockade.gemspec +13 -7
- metadata +50 -10
- data/data/firstnames/1.csv +0 -5496
- data/data/surnames/1.csv +0 -151671
- data/data/surnames/2.csv +0 -88799
- data/data/surnames.dump +0 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
|
4
|
+
module Lexemes
|
5
|
+
# Base class for all lexemes
|
6
|
+
#
|
7
|
+
# Lexer extracts lexeme candidates from text using the `.regex` of
|
8
|
+
# the corresponding subclass, instantiates it and then further calls
|
9
|
+
# its `#valid?` to verify that this is indeed a valid lexeme.
|
10
|
+
#
|
11
|
+
class Base
  # raw_value - the matched text exactly as it appeared in the input
  # start_pos - offset of the match in the input (nil when unknown)
  attr_reader :raw_value, :start_pos

  # Pattern the lexer scans with. Abstract here (returns nil); every
  # concrete lexeme overrides it.
  def self.regex; end

  # value     - matched text
  # start_pos - offset of the match; optional, but `end_pos` and `range`
  #             need it to be an Integer
  def initialize(value, start_pos = nil)
    @raw_value = value
    @start_pos = start_pos
  end

  # Normalised form of the match (lower-cased, surrounding whitespace
  # removed) used for comparisons and dictionary lookups.
  def value
    raw_value.downcase.strip
  end

  # Offset just past the last character of the match.
  def end_pos
    start_pos + raw_value.size
  end

  # Hook for subclasses that need more than a regex match to accept a
  # candidate. The base implementation accepts everything.
  def valid?
    true
  end

  # Two lexemes are equal when they are of the same class and have the
  # same normalised value; positions are ignored.
  #
  # The class is compared first so that comparing with nil or any other
  # non-lexeme returns false instead of raising NoMethodError.
  def ==(other)
    self.class == other.class && value == other.value
  end

  # Span of the match, from `start_pos` to `end_pos`.
  def range
    start_pos..end_pos
  end

  # Replacement text: one `*` per character of the raw match.
  def mask
    '*' * raw_value.size
  end
end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
|
4
|
+
module Lexemes
|
5
|
+
# Date lexeme
|
6
|
+
class Date < Base
  class << self
    # Three groups of 1-4 digits separated by a delimiter, not adjacent
    # to any other digit on either side.
    def regex
      /
        (?<!\d)
        (\d{1,4})
        #{delim}
        (\d{1,4})
        #{delim}
        (\d{1,4})
        (?!\d)
      /x
    end

    # Single character allowed between the numeric parts:
    # whitespace, dot, dash or slash.
    def delim
      %r{[\s\.\-\/]}
    end
  end

  # A candidate is accepted when its three numbers form at least one
  # real calendar date in some order and none of those dates is in the
  # future.
  #
  # NOTE(review): relies on the stdlib `date` library being loaded
  # already; no `require 'date'` is visible in this file - confirm.
  def valid?
    dates = possible_dates
    return false if dates.empty?

    today = ::Date.today
    dates.all? { |date| date <= today }
  end

  private

  # Every ordering of the three parts that is a valid (year, month, day)
  # triple, as Date objects. Validity is checked up front rather than by
  # rescuing the ArgumentError that `Date.new` raises.
  def possible_dates
    parts
      .permutation
      .select { |year, month, day| ::Date.valid_date?(year, month, day) }
      .map { |year, month, day| ::Date.new(year, month, day) }
  end

  # The three numeric groups of the match as integers.
  def parts
    self.class.regex.match(value).captures.map(&:to_i)
  end
end
|
48
|
+
end
|
49
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
|
4
|
+
module Lexemes
|
5
|
+
# Abstract Dictionary lexeme
|
6
|
+
#
|
7
|
+
# Dictionary lexemes are those that can only be verified by
|
8
|
+
# checking the corresponding dictionary
|
9
|
+
#
|
10
|
+
class Dict < Base
  class << self
    extend Memoist

    # Candidates are plain runs of ASCII letters.
    def regex
      /
        [a-zA-Z]+
      /x
    end

    # Base name of the dump file holding the dictionary. Abstract here
    # (returns nil); subclasses override it.
    def dict_name; end

    # Loads the trie for this dictionary, once per class.
    #
    # NOTE(review): the path is relative to the process working
    # directory, so this presumably only works when run from the gem
    # root - confirm.
    def dict
      Rambling::Trie.load("data/#{dict_name}.dump")
    end
    memoize :dict
  end

  # True when the normalised value is present in the dictionary.
  def valid?
    dictionary = self.class.dict
    return false unless dictionary

    dictionary.include?(value)
  end

  # Abstract: always raises RuntimeError ('Abstract').
  def name
    raise 'Abstract'
  end

  # True when the value is also found in the common-words dictionary.
  #
  # `Base#initialize` takes the value positionally; passing
  # `value: value` would hand it a Hash and make `#value` raise.
  def common_word?
    Word.new(value).valid?
  end
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
|
4
|
+
module Lexemes
|
5
|
+
# Phone lexeme
|
6
|
+
class Phone < Base
  # Less noisy phone mask syntax compared to regexes:
  # `#` stands for one digit, every other character for itself.
  MASKS = [
    '#-###-###-####',
    '+#-###-###-####',
    '+##-###-###-####',
    '+###-###-###-####',
    '###-###-####',
    '### ### ####',
    '(## ##) #### ####',
    '##########',
    '(##) #### ####',
    '(##) ## #### ####',
    '###-###-###-####',
    '###-####',
    '(###) ###-####'
  ].freeze

  class << self
    # Alternation of all MASKS converted to regex fragments.
    #
    # MASKS is a frozen constant, so the pattern never changes; it is
    # built once and cached instead of being rebuilt on every call.
    def regex
      @regex ||= /
        #{MASKS.map { |mask| to_re(mask) }.join(" |\n")}
      /x
    end

    private

    # Convert phone number mask to its regex
    # ### ### #### => (?:\d{3}\s\d{3}\s\d{4})
    #
    # Parentheses additionally tolerate optional whitespace on their
    # inner side, and `+` is escaped to match literally.
    def to_re(mask)
      pattern =
        mask
        .gsub('+', '\\\+')
        .gsub(/(#+)/) { |m| "\\d{#{m.size}}" }
        .gsub(' ', '\s')
        .gsub('(', '\(\s*')
        .gsub(')', '\\s*\)')

      "(?:#{pattern})"
    end
  end
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
|
4
|
+
module Lexemes
|
5
|
+
# A word found in the common words dictionary
|
6
|
+
class Word < Dict
  class << self
    # Base name of the dump file backing this dictionary.
    def dict_name
      'words'
    end
  end

  # Common dictionary words are considered safe, so they are left
  # unmasked: the original text is returned as is.
  def mask
    raw_value
  end
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rambling-trie'
require 'memoist'
require 'strscan'

# pry-byebug is declared only as a development dependency in the gemspec,
# so it is not installed for consumers of the gem. Load it when available
# and carry on without it otherwise, instead of failing with LoadError.
begin
  require 'pry-byebug'
rescue LoadError
  nil
end
|
7
|
+
|
8
|
+
module Stockade
|
9
|
+
# Class Lexer
|
10
|
+
#
|
11
|
+
# Usage `Stockade::Lexer.call(context)`
|
12
|
+
#
|
13
|
+
# Returns list of found lexemes.
|
14
|
+
#
|
15
|
+
class Lexer
  extend Memoist

  # Private copy of the text being scanned.
  attr_reader :context

  # Convenience entry point: `Stockade::Lexer.call(text)`.
  def self.call(context)
    new(context).call
  end

  # context - text to scan; duplicated so the caller's object is never
  #           shared with the lexer.
  def initialize(context)
    @context = context.dup
  end

  # Lexeme types the text is scanned for, in scan order.
  def lexeme_classes
    [
      Stockade::Lexemes::Date,
      Stockade::Lexemes::Email,
      Stockade::Lexemes::Phone,
      Stockade::Lexemes::Word,
      Stockade::Lexemes::Firstname,
      Stockade::Lexemes::Lastname
    ]
  end

  # Scans the whole text once per lexeme class and returns all valid
  # lexemes as one flat list, grouped by class in `lexeme_classes` order.
  def call
    lexeme_classes.flat_map { |lexeme_class| tokenize(lexeme_class) }
  end

  private

  # Collects every match of `lexeme_class.regex` in the text, wraps each
  # one in a lexeme and keeps those whose `#valid?` is true.
  def tokenize(lexeme_class)
    # Fetch the pattern once; some classes build it on every call.
    regex = lexeme_class.regex
    scanner = StringScanner.new(context)
    lexemes = []

    while scanner.scan_until(regex)
      matched = scanner.matched
      # `charpos` counts characters, like `String#size`; `pos` counts
      # bytes and would give wrong offsets for multibyte text.
      lexemes << lexeme_class.new(matched, scanner.charpos - matched.size)
    end

    lexemes.select(&:valid?)
  end
end
|
61
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
|
4
|
+
# Parser
|
5
|
+
#
|
6
|
+
# Takes a raw list (a set really) of lexemes returned by Lexer
|
7
|
+
# and further processes them. Currently this means:
|
8
|
+
# 1. Deduping - >1 ambiguous lexemes matching exactly same fragment
|
9
|
+
# are deduped according to their priority. Only the highest priority
|
10
|
+
# lexeme is kept.
|
11
|
+
# 2. Removing covered lexemes - lexeme that matches a string that is a
|
12
|
+
# substring of another matched string is removed.
|
13
|
+
#
|
14
|
+
class Parser
  extend Memoist

  # Raw lexemes as returned by the lexer.
  attr_reader :lexemes

  # Convenience entry point: `Stockade::Parser.call(lexemes)`.
  def self.call(lexemes)
    new(lexemes).call
  end

  def initialize(lexemes)
    @lexemes = lexemes
  end

  # Returns the lexemes left after deduping and removing covered ones.
  def call
    non_covered_lexemes
  end

  private

  # Lexemes bucketed by the exact span they occupy.
  def grouped
    lexemes.group_by(&:range).values
  end
  memoize :grouped

  # One lexeme per span: the one with the highest priority.
  def deduped
    grouped.map do |group|
      group.max_by { |lexeme| priority(lexeme) }
    end
  end
  memoize :deduped

  # Lexemes sorted by where they start in the text.
  def ordered
    lexemes.sort_by(&:start_pos)
  end
  memoize :ordered

  # Deduped lexemes whose span is not contained in the span of any other
  # deduped lexeme.
  #
  # Each lexeme is checked against all others regardless of list order,
  # and nothing is removed via `Array#delete`: that compares with `==`,
  # which for lexemes ignores position and would also drop uncovered
  # occurrences of the same text elsewhere in the input.
  def non_covered_lexemes
    deduped.reject do |lexeme|
      deduped.any? { |other| covers?(other, lexeme) }
    end
  end

  # True when `head` is a different object than `lex` and its span fully
  # contains the span of `lex`. Identity is used because lexeme equality
  # ignores positions.
  def covers?(head, lex)
    !head.equal?(lex) &&
      head.start_pos <= lex.start_pos &&
      head.end_pos >= lex.end_pos
  end

  # Rank of the lexeme's class; a higher number wins when deduping.
  def priority(lexeme)
    lexeme_priorities.index(lexeme.class)
  end

  # Lexeme classes from lowest to highest priority.
  def lexeme_priorities
    [
      Lexemes::Word,
      Lexemes::Firstname,
      Lexemes::Lastname,
      Lexemes::Phone,
      Lexemes::Date,
      Lexemes::Email
    ]
  end
end
|
92
|
+
end
|
data/lib/stockade/version.rb
CHANGED
data/lib/stockade.rb
CHANGED
@@ -1,114 +1,29 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'bloomfilter-rb'
|
4
3
|
require 'memoist'
|
5
|
-
require 'strscan'
|
6
4
|
|
5
|
+
require 'stockade/version'
|
6
|
+
require 'stockade/lexer'
|
7
|
+
require 'stockade/parser'
|
8
|
+
require 'stockade/lexemes/base'
|
9
|
+
require 'stockade/lexemes/date'
|
10
|
+
require 'stockade/lexemes/email'
|
11
|
+
require 'stockade/lexemes/phone'
|
12
|
+
require 'stockade/lexemes/dict'
|
13
|
+
require 'stockade/lexemes/word'
|
14
|
+
require 'stockade/lexemes/lastname'
|
15
|
+
require 'stockade/lexemes/firstname'
|
16
|
+
|
17
|
+
# Stockade module
|
7
18
|
module Stockade
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
def self.call(datum)
|
18
|
-
new(datum).call
|
19
|
-
end
|
20
|
-
|
21
|
-
# order is important - from most specific to least
|
22
|
-
# the first one that matches stops the scan
|
23
|
-
def patterns
|
24
|
-
{
|
25
|
-
email: email_regex,
|
26
|
-
phone: phone_regex,
|
27
|
-
name: name_regex,
|
28
|
-
}
|
29
|
-
end
|
30
|
-
|
31
|
-
def scanner
|
32
|
-
StringScanner.new(datum)
|
33
|
-
end
|
34
|
-
memoize :scanner
|
35
|
-
|
36
|
-
def call
|
37
|
-
res = []
|
38
|
-
|
39
|
-
patterns.each do |name, regex|
|
40
|
-
scanner = StringScanner.new(datum)
|
41
|
-
|
42
|
-
loop do
|
43
|
-
break unless scanner.scan_until(regex)
|
44
|
-
value = scanner.matched
|
45
|
-
|
46
|
-
lexeme = name
|
47
|
-
if lexeme == :name
|
48
|
-
lexeme = :surname if surname?(value)
|
49
|
-
lexeme = :firstname if firstname?(value)
|
50
|
-
end
|
51
|
-
next if lexeme == :name
|
52
|
-
|
53
|
-
res << {
|
54
|
-
lexeme: lexeme,
|
55
|
-
value: scanner.matched
|
56
|
-
}
|
57
|
-
|
58
|
-
@datum = @datum[0..scanner.pos-scanner.matched.size] +
|
59
|
-
'*' * scanner.matched.size +
|
60
|
-
@datum[scanner.pos..-1]
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
res
|
65
|
-
end
|
66
|
-
|
67
|
-
def name_regex
|
68
|
-
/\w+/
|
69
|
-
end
|
70
|
-
|
71
|
-
def word_regex
|
72
|
-
/\W+/
|
73
|
-
end
|
74
|
-
|
75
|
-
private def email_regex
|
76
|
-
/
|
77
|
-
[\w+\-\.\+]+
|
78
|
-
@
|
79
|
-
[a-z\d\-]+
|
80
|
-
(\.[a-z]+)*
|
81
|
-
\.[a-z]+ # TLD
|
82
|
-
/x
|
83
|
-
end
|
84
|
-
|
85
|
-
private def email_address?
|
86
|
-
datum =~ email_regex
|
87
|
-
end
|
88
|
-
|
89
|
-
private def phone_number?
|
90
|
-
datum =~ phone_number_regex
|
91
|
-
end
|
92
|
-
|
93
|
-
private def phone_regex
|
94
|
-
/\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\b/
|
95
|
-
end
|
96
|
-
|
97
|
-
private def surname?(value)
|
98
|
-
found?('surnames', value)
|
99
|
-
end
|
100
|
-
|
101
|
-
private def firstname?(value)
|
102
|
-
found?('firstnames', value)
|
103
|
-
end
|
104
|
-
|
105
|
-
private def found?(db, value)
|
106
|
-
db(db).include?(value.downcase)
|
107
|
-
end
|
108
|
-
|
109
|
-
private def db(name)
|
110
|
-
Marshal.load(File.read("data/#{name}.dump"))
|
19
|
+
# Mask all PII in `text` with `*`
|
20
|
+
#
|
21
|
+
def self.mask(text)
  # Lex the text, then let the parser drop duplicate and covered lexemes.
  found = Parser.call(Lexer.call(text))

  # Splice each lexeme's mask over its span, one lexeme at a time,
  # building a new string on every step.
  found.reduce(text) do |masked, lexeme|
    head = masked[0, lexeme.start_pos]
    tail = masked[lexeme.end_pos..-1]
    [head, lexeme.mask, tail].join
  end
end
|
114
29
|
end
|
data/stockade.gemspec
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
2
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
5
|
require 'stockade/version'
|
4
6
|
|
@@ -8,23 +10,27 @@ Gem::Specification.new do |spec|
|
|
8
10
|
spec.authors = ['Stan Mazhara']
|
9
11
|
spec.email = ['akmegran@gmail.com']
|
10
12
|
|
11
|
-
spec.summary =
|
12
|
-
spec.description =
|
13
|
+
spec.summary = 'Stockade is a lexer for PII'
|
14
|
+
spec.description = '
|
13
15
|
Stockade is a lexer that reads unstructured text information (from files,
|
14
16
|
logs, databases etc.) and tokenizes pieces that look like personally
|
15
17
|
identifiable information (PII).
|
16
|
-
|
18
|
+
'
|
17
19
|
spec.homepage = 'https://github.com/smazhara/stockade'
|
18
20
|
spec.license = 'MIT'
|
19
21
|
|
20
|
-
spec.files = Dir.chdir(File.expand_path(
|
21
|
-
`git ls-files`.split(/\n/).reject
|
22
|
+
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
23
|
+
`git ls-files`.split(/\n/).reject do |f|
|
24
|
+
f.match(%r{^(test|spec|features)/})
|
25
|
+
end
|
22
26
|
end
|
23
27
|
|
24
28
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
29
|
+
spec.add_development_dependency 'pry-byebug', '~> 3.0'
|
25
30
|
spec.add_development_dependency 'rake', '~> 10.0'
|
26
31
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
32
|
+
spec.add_development_dependency 'rubocop', '~> 0.5'
|
27
33
|
|
28
|
-
spec.add_runtime_dependency 'bloomfilter-rb', '~> 2.0'
|
29
34
|
spec.add_runtime_dependency 'memoist', '~> 0.1'
|
35
|
+
spec.add_runtime_dependency 'rambling-trie', '~> 2.0'
|
30
36
|
end
|