stockade 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -1
- data/Gemfile.lock +22 -6
- data/README.md +37 -0
- data/Rakefile +5 -3
- data/bin/load +6 -32
- data/data/firstnames.dump +0 -0
- data/data/firstnames.txt +5496 -0
- data/data/lastnames.dump +0 -0
- data/data/lastnames.txt +240470 -0
- data/data/words.dump +0 -0
- data/data/words.txt +370099 -0
- data/lib/stockade/lexemes/base.rb +47 -0
- data/lib/stockade/lexemes/date.rb +49 -0
- data/lib/stockade/lexemes/dict.rb +42 -0
- data/lib/stockade/lexemes/email.rb +18 -0
- data/lib/stockade/lexemes/firstname.rb +14 -0
- data/lib/stockade/lexemes/lastname.rb +14 -0
- data/lib/stockade/lexemes/phone.rb +51 -0
- data/lib/stockade/lexemes/word.rb +17 -0
- data/lib/stockade/lexer.rb +61 -0
- data/lib/stockade/parser.rb +92 -0
- data/lib/stockade/version.rb +3 -1
- data/lib/stockade.rb +22 -107
- data/stockade.gemspec +13 -7
- metadata +50 -10
- data/data/firstnames/1.csv +0 -5496
- data/data/surnames/1.csv +0 -151671
- data/data/surnames/2.csv +0 -88799
- data/data/surnames.dump +0 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
  module Lexemes
    # Base class for all lexemes.
    #
    # The lexer extracts lexeme candidates from text using the `.regex` of
    # the corresponding subclass, instantiates that subclass and then calls
    # its `#valid?` to verify that the candidate is indeed a valid lexeme.
    #
    class Base
      attr_reader :raw_value, :start_pos

      # value     - the text exactly as it was matched
      # start_pos - offset of the match inside the scanned text (optional)
      def initialize(value, start_pos = nil)
        @raw_value = value
        @start_pos = start_pos
      end

      # Normalized form of the match: downcased, surrounding whitespace
      # stripped.
      def value
        raw_value.downcase.strip
      end

      # Offset just past the last character of the match.
      # Requires `start_pos` to have been given to the constructor.
      def end_pos
        start_pos + raw_value.size
      end

      # Pattern used to find candidates; subclasses override it.
      # The base class has no pattern and returns nil.
      def self.regex; end

      # Subclasses override this to reject false positives.
      def valid?
        true
      end

      # Two lexemes are equal when they are of the same class and have the
      # same normalized value (position is not compared). The class is
      # checked first so that comparing with an unrelated object (nil,
      # a String, ...) returns false instead of raising NoMethodError.
      def ==(other)
        self.class == other.class &&
          value == other.value
      end

      # `start_pos..end_pos`, used as the key that identifies the matched
      # fragment.
      def range
        start_pos..end_pos
      end

      # Replacement text: one `*` per character of the raw match.
      def mask
        '*' * raw_value.size
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
  module Lexemes
    # Date lexeme
    #
    # A candidate is three groups of 1-4 digits separated by whitespace,
    # `.`, `-` or `/`. The order of year, month and day is not known, so
    # every ordering of the three numbers is considered.
    class Date < Base
      class << self
        def regex
          /
            (?<!\d)
            (\d{1,4})
            #{delim}
            (\d{1,4})
            #{delim}
            (\d{1,4})
            (?!\d)
          /x
        end

        # Single separator character allowed between the date parts.
        def delim
          %r{[\s\.\-\/]}
        end
      end

      # Valid when at least one ordering of the parts forms a real calendar
      # date and none of the real dates lies in the future.
      def valid?
        dates = possible_dates
        return false if dates.empty?

        # Computed once so every candidate is compared to the same day.
        today = ::Date.today
        dates.all? { |date| date <= today }
      end

      private

      # Every (year, month, day) ordering of the parts that is a real
      # calendar date.
      def possible_dates
        numbers = parts
        # Without exactly three parts there is nothing to permute; an empty
        # list would otherwise yield one empty permutation.
        return [] unless numbers.size == 3

        numbers
          .permutation
          .select { |year, month, day| ::Date.valid_date?(year, month, day) }
          .map { |year, month, day| ::Date.new(year, month, day) }
      end

      # The three numeric groups of the match, or an empty list when the
      # value does not match the date pattern at all.
      def parts
        match = self.class.regex.match(value)
        match ? match.captures.map(&:to_i) : []
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
  module Lexemes
    # Abstract Dictionary lexeme
    #
    # Dictionary lexemes are those that can only be verified by
    # checking the corresponding dictionary
    #
    class Dict < Base
      # Candidates are runs of ASCII letters.
      def self.regex
        /
          [a-zA-Z]+
        /x
      end

      # True when the normalized value is present in the class dictionary.
      # False when the class has no dictionary.
      def valid?
        dictionary = self.class.dict
        return false unless dictionary

        dictionary.include?(value)
      end

      def name
        raise 'Abstract'
      end

      # True when the value is found in the common words dictionary.
      # `Base#initialize` takes the value positionally; passing a keyword
      # hash would make `value` call `downcase` on a Hash and raise.
      def common_word?
        Word.new(value).valid?
      end

      class << self
        extend Memoist

        # Base name of the dictionary dump; concrete subclasses override it.
        def dict_name; end

        # Loads `data/<dict_name>.dump`. The path is resolved relative to
        # this file (three levels up from lib/stockade/lexemes) rather than
        # the process working directory, so the lookup does not depend on
        # where the program was started from.
        # Returns nil when the class declares no dictionary name.
        def dict
          return unless dict_name

          Rambling::Trie.load(
            File.expand_path("../../../data/#{dict_name}.dump", __dir__)
          )
        end
        memoize :dict
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
  module Lexemes
    # Phone lexeme
    class Phone < Base
      # Phone formats written as masks, which are less noisy than regexes.
      # In a mask `#` stands for one digit; `+`, `-`, spaces and
      # parentheses stand for themselves.
      MASKS = [
        '#-###-###-####',
        '+#-###-###-####',
        '+##-###-###-####',
        '+###-###-###-####',
        '###-###-####',
        '### ### ####',
        '(## ##) #### ####',
        '##########',
        '(##) #### ####',
        '(##) ## #### ####',
        '###-###-###-####',
        '###-####',
        '(###) ###-####'
      ].freeze

      class << self
        # Alternation of all masks, tried in the order they are listed.
        def regex
          alternatives = MASKS.map { |mask| to_re(mask) }.join(" |\n")

          /
            #{alternatives}
          /x
        end

        private

        # Convert phone number mask to its regex
        # ### ### #### => (?:\d{3}\s\d{3}\s\d{4})
        #
        # Each mask token is translated in a single pass:
        #   `+`       => escaped plus
        #   run of #  => \d{run length}
        #   space     => \s
        #   `(`, `)`  => escaped parenthesis with optional inner whitespace
        def to_re(mask)
          pattern = mask.gsub(/\+|\#+|[ ()]/) do |token|
            case token
            when '+' then '\\+'
            when ' ' then '\\s'
            when '(' then '\\(\\s*'
            when ')' then '\\s*\\)'
            else "\\d{#{token.size}}"
            end
          end

          "(?:#{pattern})"
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
  module Lexemes
    # A word found in the common words dictionary
    class Word < Dict
      class << self
        # Base name of the dictionary dump used by this lexeme.
        def dict_name
          'words'
        end
      end

      # Common dictionary words are considered safe, so the "mask" is the
      # original text itself rather than asterisks.
      def mask
        raw_value
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'date'
require 'memoist'
require 'rambling-trie'
require 'strscan'

# pry-byebug is declared only as a development dependency in the gemspec,
# so it may not be installed where the gem is used.
begin
  require 'pry-byebug'
rescue LoadError
  nil
end
|
7
|
+
|
8
|
+
module Stockade
  # Class Lexer
  #
  # Usage `Stockade::Lexer.call(context)`
  #
  # Returns list of found lexemes.
  #
  class Lexer
    extend Memoist

    attr_reader :context

    # context - the text to scan; a copy is kept so the caller's string is
    #           never touched.
    def initialize(context)
      @context = context.dup
    end

    def self.call(context)
      new(context).call
    end

    # Lexeme classes the text is scanned for, in scan order.
    def lexeme_classes
      [
        Stockade::Lexemes::Date,
        Stockade::Lexemes::Email,
        Stockade::Lexemes::Phone,
        Stockade::Lexemes::Word,
        Stockade::Lexemes::Firstname,
        Stockade::Lexemes::Lastname
      ]
    end

    # Scans the whole text once per lexeme class and returns all valid
    # lexemes in a single flat list.
    def call
      lexeme_classes.flat_map { |lexeme_class| tokenize(lexeme_class) }
    end

    private

    # Collects every match of the class pattern and keeps the ones the
    # lexeme itself reports as valid.
    def tokenize(lexeme_class)
      # Built once per scan instead of once per match.
      regex = lexeme_class.regex
      scanner = StringScanner.new(context)
      lexemes = []

      while scanner.scan_until(regex)
        matched = scanner.matched
        # `charpos` (characters) rather than `pos` (bytes): the start
        # offset is combined with `String#size` and later used to index
        # the text by character, so a byte offset would be wrong after any
        # multibyte character.
        lexemes << lexeme_class.new(matched, scanner.charpos - matched.size)
      end

      lexemes.select(&:valid?)
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Stockade
  # Parser
  #
  # Takes a raw list (a set really) of lexemes returned by Lexer
  # and further processes them. Currently this means:
  # 1. Deduping - >1 ambiguous lexemes matching exactly the same fragment
  # are deduped according to their priority. Only the highest priority
  # lexeme is kept.
  # 2. Removing covered lexemes - a lexeme that matches a string that is a
  # substring of another matched string is removed.
  #
  class Parser
    extend Memoist

    attr_reader :lexemes

    def initialize(lexemes)
      @lexemes = lexemes
    end

    def self.call(lexemes)
      new(lexemes).call
    end

    def call
      non_covered_lexemes
    end

    private

    # Lexemes bucketed by the exact fragment (range) they matched.
    def grouped
      lexemes.group_by(&:range).values
    end
    memoize :grouped

    # One lexeme per fragment: the one with the highest priority.
    def deduped
      grouped.map do |group|
        group.max_by { |lexeme| priority(lexeme) }
      end
    end
    memoize :deduped

    def ordered
      lexemes.sort_by(&:start_pos)
    end
    memoize :ordered

    # Deduped lexemes that are not contained in another deduped lexeme.
    #
    # Lexemes are rejected individually. Deleting by `==` would also drop
    # lexemes with the same value and class at other, uncovered positions,
    # because lexeme equality ignores position. The check also does not
    # depend on the order of the list.
    def non_covered_lexemes
      candidates = deduped

      candidates.reject do |lex|
        candidates.any? { |head| covers?(head, lex) }
      end
    end

    # True when `head` is a different lexeme object whose fragment fully
    # contains the fragment of `lex`.
    def covers?(head, lex)
      !head.equal?(lex) &&
        head.start_pos <= lex.start_pos &&
        head.end_pos >= lex.end_pos
    end

    # Position of the lexeme class in the priority list (higher wins).
    # Classes missing from the list rank below all listed ones.
    def priority(lexeme)
      lexeme_priorities.index(lexeme.class) || -1
    end

    # Lexeme classes from lowest to highest priority.
    def lexeme_priorities
      [
        Lexemes::Word,
        Lexemes::Firstname,
        Lexemes::Lastname,
        Lexemes::Phone,
        Lexemes::Date,
        Lexemes::Email
      ]
    end
  end
end
|
data/lib/stockade/version.rb
CHANGED
data/lib/stockade.rb
CHANGED
@@ -1,114 +1,29 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'bloomfilter-rb'
|
4
3
|
require 'memoist'
|
5
|
-
require 'strscan'
|
6
4
|
|
5
|
+
require 'stockade/version'
|
6
|
+
require 'stockade/lexer'
|
7
|
+
require 'stockade/parser'
|
8
|
+
require 'stockade/lexemes/base'
|
9
|
+
require 'stockade/lexemes/date'
|
10
|
+
require 'stockade/lexemes/email'
|
11
|
+
require 'stockade/lexemes/phone'
|
12
|
+
require 'stockade/lexemes/dict'
|
13
|
+
require 'stockade/lexemes/word'
|
14
|
+
require 'stockade/lexemes/lastname'
|
15
|
+
require 'stockade/lexemes/firstname'
|
16
|
+
|
17
|
+
# Stockade module
|
7
18
|
module Stockade
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
def self.call(datum)
|
18
|
-
new(datum).call
|
19
|
-
end
|
20
|
-
|
21
|
-
# order is important - from most specific to least
|
22
|
-
# the first one that matches stops the scan
|
23
|
-
def patterns
|
24
|
-
{
|
25
|
-
email: email_regex,
|
26
|
-
phone: phone_regex,
|
27
|
-
name: name_regex,
|
28
|
-
}
|
29
|
-
end
|
30
|
-
|
31
|
-
def scanner
|
32
|
-
StringScanner.new(datum)
|
33
|
-
end
|
34
|
-
memoize :scanner
|
35
|
-
|
36
|
-
def call
|
37
|
-
res = []
|
38
|
-
|
39
|
-
patterns.each do |name, regex|
|
40
|
-
scanner = StringScanner.new(datum)
|
41
|
-
|
42
|
-
loop do
|
43
|
-
break unless scanner.scan_until(regex)
|
44
|
-
value = scanner.matched
|
45
|
-
|
46
|
-
lexeme = name
|
47
|
-
if lexeme == :name
|
48
|
-
lexeme = :surname if surname?(value)
|
49
|
-
lexeme = :firstname if firstname?(value)
|
50
|
-
end
|
51
|
-
next if lexeme == :name
|
52
|
-
|
53
|
-
res << {
|
54
|
-
lexeme: lexeme,
|
55
|
-
value: scanner.matched
|
56
|
-
}
|
57
|
-
|
58
|
-
@datum = @datum[0..scanner.pos-scanner.matched.size] +
|
59
|
-
'*' * scanner.matched.size +
|
60
|
-
@datum[scanner.pos..-1]
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
res
|
65
|
-
end
|
66
|
-
|
67
|
-
def name_regex
|
68
|
-
/\w+/
|
69
|
-
end
|
70
|
-
|
71
|
-
def word_regex
|
72
|
-
/\W+/
|
73
|
-
end
|
74
|
-
|
75
|
-
private def email_regex
|
76
|
-
/
|
77
|
-
[\w+\-\.\+]+
|
78
|
-
@
|
79
|
-
[a-z\d\-]+
|
80
|
-
(\.[a-z]+)*
|
81
|
-
\.[a-z]+ # TLD
|
82
|
-
/x
|
83
|
-
end
|
84
|
-
|
85
|
-
private def email_address?
|
86
|
-
datum =~ email_regex
|
87
|
-
end
|
88
|
-
|
89
|
-
private def phone_number?
|
90
|
-
datum =~ phone_number_regex
|
91
|
-
end
|
92
|
-
|
93
|
-
private def phone_regex
|
94
|
-
/\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\b/
|
95
|
-
end
|
96
|
-
|
97
|
-
private def surname?(value)
|
98
|
-
found?('surnames', value)
|
99
|
-
end
|
100
|
-
|
101
|
-
private def firstname?(value)
|
102
|
-
found?('firstnames', value)
|
103
|
-
end
|
104
|
-
|
105
|
-
private def found?(db, value)
|
106
|
-
db(db).include?(value.downcase)
|
107
|
-
end
|
108
|
-
|
109
|
-
private def db(name)
|
110
|
-
Marshal.load(File.read("data/#{name}.dump"))
|
19
|
+
# Replace every lexeme found in `text` with that lexeme's mask.
#
# Lexemes are applied one after another to the running result. The text
# before and after each lexeme is kept and the lexeme's own span is
# swapped for `lexeme.mask`.
def self.mask(text)
  found = Parser.call(Lexer.call(text))

  found.reduce(text) do |masked, lexeme|
    head = masked[0, lexeme.start_pos]
    tail = masked[lexeme.end_pos..-1]
    "#{head}#{lexeme.mask}#{tail}"
  end
end
|
114
29
|
end
|
data/stockade.gemspec
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
2
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
5
|
require 'stockade/version'
|
4
6
|
|
@@ -8,23 +10,27 @@ Gem::Specification.new do |spec|
|
|
8
10
|
spec.authors = ['Stan Mazhara']
|
9
11
|
spec.email = ['akmegran@gmail.com']
|
10
12
|
|
11
|
-
spec.summary =
|
12
|
-
spec.description =
|
13
|
+
spec.summary = 'Stockade is a lexer for PII'
|
14
|
+
spec.description = '
|
13
15
|
Stockade is a lexer that reads unstructured text information (from files,
|
14
16
|
logs, databases etc.) and tokenizes pieces that look like personally
|
15
17
|
identifiable information (PII).
|
16
|
-
|
18
|
+
'
|
17
19
|
spec.homepage = 'https://github.com/smazhara/stockade'
|
18
20
|
spec.license = 'MIT'
|
19
21
|
|
20
|
-
spec.files = Dir.chdir(File.expand_path(
|
21
|
-
`git ls-files`.split(/\n/).reject
|
22
|
+
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
23
|
+
`git ls-files`.split(/\n/).reject do |f|
|
24
|
+
f.match(%r{^(test|spec|features)/})
|
25
|
+
end
|
22
26
|
end
|
23
27
|
|
24
28
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
29
|
+
spec.add_development_dependency 'pry-byebug', '~> 3.0'
|
25
30
|
spec.add_development_dependency 'rake', '~> 10.0'
|
26
31
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
32
|
+
spec.add_development_dependency 'rubocop', '~> 0.5'
|
27
33
|
|
28
|
-
spec.add_runtime_dependency 'bloomfilter-rb', '~> 2.0'
|
29
34
|
spec.add_runtime_dependency 'memoist', '~> 0.1'
|
35
|
+
spec.add_runtime_dependency 'rambling-trie', '~> 2.0'
|
30
36
|
end
|