stockade 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +23 -3
- data/README.md +35 -17
- data/bin/load +1 -1
- data/data/firstnames.zip +0 -0
- data/data/lastnames.zip +0 -0
- data/data/phones.txt +13 -0
- data/data/words.zip +0 -0
- data/lib/stockade.rb +32 -8
- data/lib/stockade/lexemes/base.rb +18 -0
- data/lib/stockade/lexemes/dict.rb +5 -1
- data/lib/stockade/lexemes/filler.rb +8 -0
- data/lib/stockade/lexemes/payment_card.rb +28 -0
- data/lib/stockade/lexemes/phone.rb +2 -17
- data/lib/stockade/lexemes/word.rb +4 -0
- data/lib/stockade/lexer.rb +1 -0
- data/lib/stockade/parser.rb +7 -12
- data/lib/stockade/version.rb +1 -1
- data/stockade.gemspec +5 -1
- metadata +38 -10
- data/data/firstnames.dump +0 -0
- data/data/firstnames.txt +0 -5496
- data/data/lastnames.dump +0 -0
- data/data/lastnames.txt +0 -240470
- data/data/words.dump +0 -0
- data/data/words.txt +0 -370099
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2e40900217ae6ce9c9548707578efcfe90d27211
|
4
|
+
data.tar.gz: 8130b7c181f4d8ff0b2e322d4a408b4311505aa8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 39b25eff37a4405e38312d5c6cce3836468f1db7791421f90176060140496c27dfec84cd5e78289e2a316e00787884a09c6d18dbe9681ca7a851693ddd76f1d5
|
7
|
+
data.tar.gz: abf9b314b14437d08dc1fc5116a51d2d129881097ac6491d0ac66c6c73cc16af9f0749af0b8b1886a45126fecdbf790ce27751063e2fcf9eeed0fb2051d6cb9b
|
data/Gemfile.lock
CHANGED
@@ -2,19 +2,35 @@ PATH
|
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
4
|
stockade (0.1.1)
|
5
|
+
credit_card_validations (~> 3.5)
|
5
6
|
memoist (~> 0.1)
|
6
|
-
rambling-trie
|
7
|
+
rambling-trie (~> 2.0)
|
8
|
+
rubyzip (~> 1.0)
|
7
9
|
|
8
10
|
GEM
|
9
11
|
remote: https://rubygems.org/
|
10
12
|
specs:
|
13
|
+
activemodel (5.2.1)
|
14
|
+
activesupport (= 5.2.1)
|
15
|
+
activesupport (5.2.1)
|
16
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
17
|
+
i18n (>= 0.7, < 2)
|
18
|
+
minitest (~> 5.1)
|
19
|
+
tzinfo (~> 1.1)
|
11
20
|
ast (2.4.0)
|
12
21
|
byebug (10.0.2)
|
13
22
|
coderay (1.1.2)
|
23
|
+
concurrent-ruby (1.0.5)
|
24
|
+
credit_card_validations (3.5.0)
|
25
|
+
activemodel (>= 3, <= 6)
|
26
|
+
activesupport (>= 3, <= 6)
|
14
27
|
diff-lcs (1.3)
|
28
|
+
i18n (1.1.0)
|
29
|
+
concurrent-ruby (~> 1.0)
|
15
30
|
jaro_winkler (1.5.1)
|
16
31
|
memoist (0.16.0)
|
17
32
|
method_source (0.9.0)
|
33
|
+
minitest (5.11.3)
|
18
34
|
parallel (1.12.1)
|
19
35
|
parser (2.5.1.2)
|
20
36
|
ast (~> 2.4.0)
|
@@ -50,6 +66,10 @@ GEM
|
|
50
66
|
ruby-progressbar (~> 1.7)
|
51
67
|
unicode-display_width (~> 1.0, >= 1.0.1)
|
52
68
|
ruby-progressbar (1.9.0)
|
69
|
+
rubyzip (1.2.1)
|
70
|
+
thread_safe (0.3.6)
|
71
|
+
tzinfo (1.2.5)
|
72
|
+
thread_safe (~> 0.1)
|
53
73
|
unicode-display_width (1.4.0)
|
54
74
|
|
55
75
|
PLATFORMS
|
@@ -57,10 +77,10 @@ PLATFORMS
|
|
57
77
|
|
58
78
|
DEPENDENCIES
|
59
79
|
bundler (~> 1.16)
|
60
|
-
pry-byebug
|
80
|
+
pry-byebug (~> 3.0)
|
61
81
|
rake (~> 10.0)
|
62
82
|
rspec (~> 3.0)
|
63
|
-
rubocop
|
83
|
+
rubocop (~> 0.49)
|
64
84
|
stockade!
|
65
85
|
|
66
86
|
BUNDLED WITH
|
data/README.md
CHANGED
@@ -1,11 +1,10 @@
|
|
1
|
-
# PII
|
1
|
+
# PII Detector
|
2
2
|
|
3
3
|
_This is proof-of-concept level software._
|
4
4
|
|
5
|
-
Stockade is a
|
6
|
-
unstructured text (from files, logs, databases, web etc.) and
|
7
|
-
|
8
|
-
discard, mask data.
|
5
|
+
Stockade is a Personally Identifiable Information (PII) detector. It scans
|
6
|
+
unstructured text (from files, logs, databases, web etc.) and masks all
|
7
|
+
identified pieces of PII.
|
9
8
|
|
10
9
|
## Installation
|
11
10
|
|
@@ -17,21 +16,40 @@ gem install stockade
|
|
17
16
|
|
18
17
|
```ruby
|
19
18
|
require 'stockade'
|
20
|
-
|
21
|
-
Stockade.mask(
|
22
|
-
|
23
|
-
|
24
|
-
|
19
|
+
|
20
|
+
puts Stockade.mask(<<-EOS
|
21
|
+
|
22
|
+
Dossier on Mr. John Smith born 09/02/1995
|
23
|
+
His email is jsmith@example.com and his phone is 555-123-4567.
|
24
|
+
He is using Visa card 4111 1111 1111 1111
|
25
|
+
|
26
|
+
EOS
|
27
|
+
|
28
|
+
#=>
|
29
|
+
Dossier on Mr. **** ***** born **********
|
30
|
+
His email is ****************** and his phone is ************.
|
31
|
+
** is using Visa card *******************
|
32
|
+
|
25
33
|
```
|
26
|
-
|
34
|
+
Notice how the word 'He' was incorrectly identified as a name.
|
27
35
|
|
28
36
|
## Implementation
|
29
37
|
|
30
|
-
|
38
|
+
This is done in three stages.
|
39
|
+
|
40
|
+
### Scanning
|
41
|
+
|
42
|
+
Using a manually curated list of regexes and
|
31
43
|
[StringScanner](https://ruby-doc.org/stdlib-2.5.1/libdoc/strscan/rdoc/StringScanner.html)
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
44
|
+
it extracts and labels lexeme candidates.
|
45
|
+
|
46
|
+
### Evaluation
|
47
|
+
|
48
|
+
Lexeme candidates are further evaluated (in some cases this is a no-op) to filter
|
49
|
+
out false positives. For example, first and last names are checked against a
|
50
|
+
database of known names. Dates are checked to be in the past.
|
51
|
+
|
52
|
+
### Parsing
|
37
53
|
|
54
|
+
Some rudimentary parsing is done. Lexemes that are fully covered by other lexemes
|
55
|
+
are eliminated. Ambiguous lexemes are disambiguated using rules of precedence.
|
data/bin/load
CHANGED
data/data/firstnames.zip
ADDED
Binary file
|
data/data/lastnames.zip
ADDED
Binary file
|
data/data/phones.txt
ADDED
data/data/words.zip
ADDED
Binary file
|
data/lib/stockade.rb
CHANGED
@@ -13,17 +13,41 @@ require 'stockade/lexemes/dict'
|
|
13
13
|
require 'stockade/lexemes/word'
|
14
14
|
require 'stockade/lexemes/lastname'
|
15
15
|
require 'stockade/lexemes/firstname'
|
16
|
+
require 'stockade/lexemes/payment_card'
|
16
17
|
|
17
18
|
# Stockade module
|
18
19
|
module Stockade
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
20
|
+
class << self
|
21
|
+
# Mask all PII in `text` with `*`
|
22
|
+
#
|
23
|
+
def mask(text)
|
24
|
+
process(text, :mask)
|
25
|
+
end
|
26
|
+
|
27
|
+
def tokenize(text)
|
28
|
+
process(text, :token)
|
29
|
+
end
|
30
|
+
|
31
|
+
def process(text, action)
|
32
|
+
lexemes(text).inject(text) do |mask, lexeme|
|
33
|
+
prefix = lexeme.start_pos.zero? ? '' : mask[0..lexeme.start_pos - 1]
|
34
|
+
postfix = mask[lexeme.end_pos..-1]
|
35
|
+
"#{prefix}#{lexeme.send(action)}#{postfix}"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def extract(text)
|
40
|
+
lexemes(text).map do |lexeme|
|
41
|
+
{
|
42
|
+
lexeme.class.name.to_s.split('::').last.downcase => lexeme.value
|
43
|
+
}
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
def lexemes(text)
|
50
|
+
Parser.call(Lexer.call(text))
|
27
51
|
end
|
28
52
|
end
|
29
53
|
end
|
@@ -42,6 +42,24 @@ module Stockade
|
|
42
42
|
def mask
|
43
43
|
'*' * raw_value.size
|
44
44
|
end
|
45
|
+
|
46
|
+
def token
|
47
|
+
SecureRandom.base64(raw_value.size)[0..raw_value.size - 1]
|
48
|
+
end
|
49
|
+
|
50
|
+
def type
|
51
|
+
self.class.name.split('::').last.downcase.to_sym
|
52
|
+
end
|
53
|
+
|
54
|
+
def self.types
|
55
|
+
%i[date word email firstname lastname phone]
|
56
|
+
end
|
57
|
+
|
58
|
+
types.each do |type_name|
|
59
|
+
define_method :"#{type_name}?" do
|
60
|
+
type == type_name
|
61
|
+
end
|
62
|
+
end
|
45
63
|
end
|
46
64
|
end
|
47
65
|
end
|
@@ -27,13 +27,17 @@ module Stockade
|
|
27
27
|
Word.new(value: value).valid?
|
28
28
|
end
|
29
29
|
|
30
|
+
def capitalized?
|
31
|
+
raw_value[0] == raw_value[0].upcase
|
32
|
+
end
|
33
|
+
|
30
34
|
class << self
|
31
35
|
extend Memoist
|
32
36
|
|
33
37
|
def dict_name; end
|
34
38
|
|
35
39
|
def dict
|
36
|
-
Rambling::Trie.load("data/#{dict_name}.
|
40
|
+
Rambling::Trie.load("data/#{dict_name}.zip")
|
37
41
|
end
|
38
42
|
memoize :dict
|
39
43
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'credit_card_validations'
|
4
|
+
|
5
|
+
module Stockade
|
6
|
+
module Lexemes
|
7
|
+
# Lexeme for anything that resembles payment card numbers
|
8
|
+
# https://en.wikipedia.org/wiki/Payment_card_number
|
9
|
+
#
|
10
|
+
# Any 10-19 character long sequences of digits optionally grouped using
|
11
|
+
# ' ' or '-' delimiters are suspects
|
12
|
+
class PaymentCard < Base
|
13
|
+
def self.regex
|
14
|
+
/
|
15
|
+
(?<!\d) # NaN
|
16
|
+
\d
|
17
|
+
([\s\-]*\d[\s\-]*){10,17}
|
18
|
+
\d
|
19
|
+
(?!\d) # NaN
|
20
|
+
/x
|
21
|
+
end
|
22
|
+
|
23
|
+
def valid?
|
24
|
+
CreditCardValidations::Detector.new(value).valid?
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -4,22 +4,7 @@ module Stockade
|
|
4
4
|
module Lexemes
|
5
5
|
# Phone lexeme
|
6
6
|
class Phone < Base
|
7
|
-
|
8
|
-
MASKS = [
|
9
|
-
'#-###-###-####',
|
10
|
-
'+#-###-###-####',
|
11
|
-
'+##-###-###-####',
|
12
|
-
'+###-###-###-####',
|
13
|
-
'###-###-####',
|
14
|
-
'### ### ####',
|
15
|
-
'(## ##) #### ####',
|
16
|
-
'##########',
|
17
|
-
'(##) #### ####',
|
18
|
-
'(##) ## #### ####',
|
19
|
-
'###-###-###-####',
|
20
|
-
'###-####',
|
21
|
-
'(###) ###-####'
|
22
|
-
].freeze
|
7
|
+
MASKS = File.readlines('data/phones.txt').freeze
|
23
8
|
|
24
9
|
class << self
|
25
10
|
def regex
|
@@ -33,7 +18,7 @@ module Stockade
|
|
33
18
|
|
34
19
|
private
|
35
20
|
|
36
|
-
# Convert phone
|
21
|
+
# Convert less noisy phone mask syntax to regexes
|
37
22
|
# ### ### #### => (?:\d{3}\s\d{3}\s\d{4})
|
38
23
|
def to_re(mask)
|
39
24
|
'(?:' +
|
data/lib/stockade/lexer.rb
CHANGED
data/lib/stockade/parser.rb
CHANGED
@@ -75,18 +75,13 @@ module Stockade
|
|
75
75
|
end
|
76
76
|
|
77
77
|
def priority(lexeme)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
Lexemes::Lastname,
|
86
|
-
Lexemes::Phone,
|
87
|
-
Lexemes::Date,
|
88
|
-
Lexemes::Email
|
89
|
-
]
|
78
|
+
if [Lexemes::Firstname, Lexemes::Lastname].include?(lexeme.class)
|
79
|
+
lexeme.capitalized? ? 3 : 1
|
80
|
+
elsif lexeme.is_a?(Lexemes::Word)
|
81
|
+
2
|
82
|
+
else
|
83
|
+
3
|
84
|
+
end
|
90
85
|
end
|
91
86
|
end
|
92
87
|
end
|
data/lib/stockade/version.rb
CHANGED
data/stockade.gemspec
CHANGED
@@ -4,6 +4,7 @@ lib = File.expand_path('lib', __dir__)
|
|
4
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
5
|
require 'stockade/version'
|
6
6
|
|
7
|
+
# rubocop:disable Metrics/BlockLength
|
7
8
|
Gem::Specification.new do |spec|
|
8
9
|
spec.name = 'stockade'
|
9
10
|
spec.version = Stockade::VERSION
|
@@ -29,8 +30,11 @@ Gem::Specification.new do |spec|
|
|
29
30
|
spec.add_development_dependency 'pry-byebug', '~> 3.0'
|
30
31
|
spec.add_development_dependency 'rake', '~> 10.0'
|
31
32
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
32
|
-
spec.add_development_dependency 'rubocop', '~> 0.
|
33
|
+
spec.add_development_dependency 'rubocop', '~> 0.49'
|
33
34
|
|
35
|
+
spec.add_runtime_dependency 'credit_card_validations', '~> 3.5'
|
34
36
|
spec.add_runtime_dependency 'memoist', '~> 0.1'
|
35
37
|
spec.add_runtime_dependency 'rambling-trie', '~> 2.0'
|
38
|
+
spec.add_runtime_dependency 'rubyzip', '~> 1.0'
|
36
39
|
end
|
40
|
+
# rubocop:enable Metrics/BlockLength
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stockade
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stan Mazhara
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-08-
|
11
|
+
date: 2018-08-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -72,14 +72,28 @@ dependencies:
|
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0.
|
75
|
+
version: '0.49'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0.
|
82
|
+
version: '0.49'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: credit_card_validations
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '3.5'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '3.5'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: memoist
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,6 +122,20 @@ dependencies:
|
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
124
|
version: '2.0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: rubyzip
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '1.0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '1.0'
|
111
139
|
description: "\n Stockade is a lexer that reads unstructured text information (from
|
112
140
|
files,\n logs, databases etc.) and tokenizes pieces that look like personally\n
|
113
141
|
\ identifiable information (PII).\n "
|
@@ -124,19 +152,19 @@ files:
|
|
124
152
|
- README.md
|
125
153
|
- Rakefile
|
126
154
|
- bin/load
|
127
|
-
- data/firstnames.
|
128
|
-
- data/
|
129
|
-
- data/
|
130
|
-
- data/
|
131
|
-
- data/words.dump
|
132
|
-
- data/words.txt
|
155
|
+
- data/firstnames.zip
|
156
|
+
- data/lastnames.zip
|
157
|
+
- data/phones.txt
|
158
|
+
- data/words.zip
|
133
159
|
- lib/stockade.rb
|
134
160
|
- lib/stockade/lexemes/base.rb
|
135
161
|
- lib/stockade/lexemes/date.rb
|
136
162
|
- lib/stockade/lexemes/dict.rb
|
137
163
|
- lib/stockade/lexemes/email.rb
|
164
|
+
- lib/stockade/lexemes/filler.rb
|
138
165
|
- lib/stockade/lexemes/firstname.rb
|
139
166
|
- lib/stockade/lexemes/lastname.rb
|
167
|
+
- lib/stockade/lexemes/payment_card.rb
|
140
168
|
- lib/stockade/lexemes/phone.rb
|
141
169
|
- lib/stockade/lexemes/word.rb
|
142
170
|
- lib/stockade/lexer.rb
|