stockade 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +23 -3
- data/README.md +35 -17
- data/bin/load +1 -1
- data/data/firstnames.zip +0 -0
- data/data/lastnames.zip +0 -0
- data/data/phones.txt +13 -0
- data/data/words.zip +0 -0
- data/lib/stockade.rb +32 -8
- data/lib/stockade/lexemes/base.rb +18 -0
- data/lib/stockade/lexemes/dict.rb +5 -1
- data/lib/stockade/lexemes/filler.rb +8 -0
- data/lib/stockade/lexemes/payment_card.rb +28 -0
- data/lib/stockade/lexemes/phone.rb +2 -17
- data/lib/stockade/lexemes/word.rb +4 -0
- data/lib/stockade/lexer.rb +1 -0
- data/lib/stockade/parser.rb +7 -12
- data/lib/stockade/version.rb +1 -1
- data/stockade.gemspec +5 -1
- metadata +38 -10
- data/data/firstnames.dump +0 -0
- data/data/firstnames.txt +0 -5496
- data/data/lastnames.dump +0 -0
- data/data/lastnames.txt +0 -240470
- data/data/words.dump +0 -0
- data/data/words.txt +0 -370099
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2e40900217ae6ce9c9548707578efcfe90d27211
|
4
|
+
data.tar.gz: 8130b7c181f4d8ff0b2e322d4a408b4311505aa8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 39b25eff37a4405e38312d5c6cce3836468f1db7791421f90176060140496c27dfec84cd5e78289e2a316e00787884a09c6d18dbe9681ca7a851693ddd76f1d5
|
7
|
+
data.tar.gz: abf9b314b14437d08dc1fc5116a51d2d129881097ac6491d0ac66c6c73cc16af9f0749af0b8b1886a45126fecdbf790ce27751063e2fcf9eeed0fb2051d6cb9b
|
data/Gemfile.lock
CHANGED
@@ -2,19 +2,35 @@ PATH
|
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
4
|
stockade (0.1.1)
|
5
|
+
credit_card_validations (~> 3.5)
|
5
6
|
memoist (~> 0.1)
|
6
|
-
rambling-trie
|
7
|
+
rambling-trie (~> 2.0)
|
8
|
+
rubyzip (~> 1.0)
|
7
9
|
|
8
10
|
GEM
|
9
11
|
remote: https://rubygems.org/
|
10
12
|
specs:
|
13
|
+
activemodel (5.2.1)
|
14
|
+
activesupport (= 5.2.1)
|
15
|
+
activesupport (5.2.1)
|
16
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
17
|
+
i18n (>= 0.7, < 2)
|
18
|
+
minitest (~> 5.1)
|
19
|
+
tzinfo (~> 1.1)
|
11
20
|
ast (2.4.0)
|
12
21
|
byebug (10.0.2)
|
13
22
|
coderay (1.1.2)
|
23
|
+
concurrent-ruby (1.0.5)
|
24
|
+
credit_card_validations (3.5.0)
|
25
|
+
activemodel (>= 3, <= 6)
|
26
|
+
activesupport (>= 3, <= 6)
|
14
27
|
diff-lcs (1.3)
|
28
|
+
i18n (1.1.0)
|
29
|
+
concurrent-ruby (~> 1.0)
|
15
30
|
jaro_winkler (1.5.1)
|
16
31
|
memoist (0.16.0)
|
17
32
|
method_source (0.9.0)
|
33
|
+
minitest (5.11.3)
|
18
34
|
parallel (1.12.1)
|
19
35
|
parser (2.5.1.2)
|
20
36
|
ast (~> 2.4.0)
|
@@ -50,6 +66,10 @@ GEM
|
|
50
66
|
ruby-progressbar (~> 1.7)
|
51
67
|
unicode-display_width (~> 1.0, >= 1.0.1)
|
52
68
|
ruby-progressbar (1.9.0)
|
69
|
+
rubyzip (1.2.1)
|
70
|
+
thread_safe (0.3.6)
|
71
|
+
tzinfo (1.2.5)
|
72
|
+
thread_safe (~> 0.1)
|
53
73
|
unicode-display_width (1.4.0)
|
54
74
|
|
55
75
|
PLATFORMS
|
@@ -57,10 +77,10 @@ PLATFORMS
|
|
57
77
|
|
58
78
|
DEPENDENCIES
|
59
79
|
bundler (~> 1.16)
|
60
|
-
pry-byebug
|
80
|
+
pry-byebug (~> 3.0)
|
61
81
|
rake (~> 10.0)
|
62
82
|
rspec (~> 3.0)
|
63
|
-
rubocop
|
83
|
+
rubocop (~> 0.49)
|
64
84
|
stockade!
|
65
85
|
|
66
86
|
BUNDLED WITH
|
data/README.md
CHANGED
@@ -1,11 +1,10 @@
|
|
1
|
-
# PII
|
1
|
+
# PII Detector
|
2
2
|
|
3
3
|
_This is proof-of-concept level software._
|
4
4
|
|
5
|
-
Stockade is a
|
6
|
-
unstructured text (from files, logs, databases, web etc.) and
|
7
|
-
|
8
|
-
discard, mask data.
|
5
|
+
Stockade is a Personally Identifiable Information (PII) detector. It scans
|
6
|
+
unstructured text (from files, logs, databases, web etc.) and masks all
|
7
|
+
identified pieces of PII.
|
9
8
|
|
10
9
|
## Installation
|
11
10
|
|
@@ -17,21 +16,40 @@ gem install stockade
|
|
17
16
|
|
18
17
|
```ruby
|
19
18
|
require 'stockade'
|
20
|
-
|
21
|
-
Stockade.mask(
|
22
|
-
|
23
|
-
|
24
|
-
|
19
|
+
|
20
|
+
puts Stockade.mask(<<-EOS
|
21
|
+
|
22
|
+
Dossier on Mr. John Smith born 09/02/1995
|
23
|
+
His email is jsmith@example.com and his phone is 555-123-4567.
|
24
|
+
He is using Visa card 4111 1111 1111 1111
|
25
|
+
|
26
|
+
EOS
|
27
|
+
|
28
|
+
#=>
|
29
|
+
Dossier on Mr. **** ***** born **********
|
30
|
+
His email is ****************** and his phone is ************.
|
31
|
+
** is using Visa card *******************
|
32
|
+
|
25
33
|
```
|
26
|
-
|
34
|
+
Notice how the word 'He' was incorrectly identified as a name.
|
27
35
|
|
28
36
|
## Implementation
|
29
37
|
|
30
|
-
|
38
|
+
This is done in three stages.
|
39
|
+
|
40
|
+
### Scanning
|
41
|
+
|
42
|
+
Using a manually curated list of regexes and
|
31
43
|
[StringScanner](https://ruby-doc.org/stdlib-2.5.1/libdoc/strscan/rdoc/StringScanner.html)
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
44
|
+
it extracts and labels lexeme candidates.
|
45
|
+
|
46
|
+
### Evaluation
|
47
|
+
|
48
|
+
Lexeme candidates are further evaluated (in some cases this is a no-op) to filter
|
49
|
+
out false positives. For example, first and last names are checked against a
|
50
|
+
database of known names. Dates are checked to be in the past.
|
51
|
+
|
52
|
+
### Parsing
|
37
53
|
|
54
|
+
Some rudimentary parsing is done. Lexemes that are fully covered by other lexemes
|
55
|
+
are eliminated. Ambiguous lexemes are disambiguated using rules of precedence.
|
data/bin/load
CHANGED
data/data/firstnames.zip
ADDED
Binary file
|
data/data/lastnames.zip
ADDED
Binary file
|
data/data/phones.txt
ADDED
data/data/words.zip
ADDED
Binary file
|
data/lib/stockade.rb
CHANGED
@@ -13,17 +13,41 @@ require 'stockade/lexemes/dict'
|
|
13
13
|
require 'stockade/lexemes/word'
|
14
14
|
require 'stockade/lexemes/lastname'
|
15
15
|
require 'stockade/lexemes/firstname'
|
16
|
+
require 'stockade/lexemes/payment_card'
|
16
17
|
|
17
18
|
# Stockade module
|
18
19
|
module Stockade
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
20
|
+
class << self
|
21
|
+
# Mask all PII in `text` with `*`
|
22
|
+
#
|
23
|
+
def mask(text)
|
24
|
+
process(text, :mask)
|
25
|
+
end
|
26
|
+
|
27
|
+
def tokenize(text)
|
28
|
+
process(text, :token)
|
29
|
+
end
|
30
|
+
|
31
|
+
def process(text, action)
|
32
|
+
lexemes(text).inject(text) do |mask, lexeme|
|
33
|
+
prefix = lexeme.start_pos.zero? ? '' : mask[0..lexeme.start_pos - 1]
|
34
|
+
postfix = mask[lexeme.end_pos..-1]
|
35
|
+
"#{prefix}#{lexeme.send(action)}#{postfix}"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def extract(text)
|
40
|
+
lexemes(text).map do |lexeme|
|
41
|
+
{
|
42
|
+
lexeme.class.name.to_s.split('::').last.downcase => lexeme.value
|
43
|
+
}
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
def lexemes(text)
|
50
|
+
Parser.call(Lexer.call(text))
|
27
51
|
end
|
28
52
|
end
|
29
53
|
end
|
@@ -42,6 +42,24 @@ module Stockade
|
|
42
42
|
def mask
|
43
43
|
'*' * raw_value.size
|
44
44
|
end
|
45
|
+
|
46
|
+
def token
|
47
|
+
SecureRandom.base64(raw_value.size)[0..raw_value.size - 1]
|
48
|
+
end
|
49
|
+
|
50
|
+
def type
|
51
|
+
self.class.name.split('::').last.downcase.to_sym
|
52
|
+
end
|
53
|
+
|
54
|
+
def self.types
|
55
|
+
%i[date word email firstname lastname phone]
|
56
|
+
end
|
57
|
+
|
58
|
+
types.each do |type_name|
|
59
|
+
define_method :"#{type_name}?" do
|
60
|
+
type == type_name
|
61
|
+
end
|
62
|
+
end
|
45
63
|
end
|
46
64
|
end
|
47
65
|
end
|
@@ -27,13 +27,17 @@ module Stockade
|
|
27
27
|
Word.new(value: value).valid?
|
28
28
|
end
|
29
29
|
|
30
|
+
def capitalized?
|
31
|
+
raw_value[0] == raw_value[0].upcase
|
32
|
+
end
|
33
|
+
|
30
34
|
class << self
|
31
35
|
extend Memoist
|
32
36
|
|
33
37
|
def dict_name; end
|
34
38
|
|
35
39
|
def dict
|
36
|
-
Rambling::Trie.load("data/#{dict_name}.
|
40
|
+
Rambling::Trie.load("data/#{dict_name}.zip")
|
37
41
|
end
|
38
42
|
memoize :dict
|
39
43
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'credit_card_validations'
|
4
|
+
|
5
|
+
module Stockade
|
6
|
+
module Lexemes
|
7
|
+
# Lexeme for anything that resembles payment card numbers
|
8
|
+
# https://en.wikipedia.org/wiki/Payment_card_number
|
9
|
+
#
|
10
|
+
# Any sequences of 12-19 digits optionally grouped using
|
11
|
+
# ' ' or '-' delimiters are suspects
|
12
|
+
class PaymentCard < Base
|
13
|
+
def self.regex
|
14
|
+
/
|
15
|
+
(?<!\d) # NaN
|
16
|
+
\d
|
17
|
+
([\s\-]*\d[\s\-]*){10,17}
|
18
|
+
\d
|
19
|
+
(?!\d) # NaN
|
20
|
+
/x
|
21
|
+
end
|
22
|
+
|
23
|
+
def valid?
|
24
|
+
CreditCardValidations::Detector.new(value).valid?
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -4,22 +4,7 @@ module Stockade
|
|
4
4
|
module Lexemes
|
5
5
|
# Phone lexeme
|
6
6
|
class Phone < Base
|
7
|
-
|
8
|
-
MASKS = [
|
9
|
-
'#-###-###-####',
|
10
|
-
'+#-###-###-####',
|
11
|
-
'+##-###-###-####',
|
12
|
-
'+###-###-###-####',
|
13
|
-
'###-###-####',
|
14
|
-
'### ### ####',
|
15
|
-
'(## ##) #### ####',
|
16
|
-
'##########',
|
17
|
-
'(##) #### ####',
|
18
|
-
'(##) ## #### ####',
|
19
|
-
'###-###-###-####',
|
20
|
-
'###-####',
|
21
|
-
'(###) ###-####'
|
22
|
-
].freeze
|
7
|
+
MASKS = File.readlines('data/phones.txt').freeze
|
23
8
|
|
24
9
|
class << self
|
25
10
|
def regex
|
@@ -33,7 +18,7 @@ module Stockade
|
|
33
18
|
|
34
19
|
private
|
35
20
|
|
36
|
-
# Convert phone
|
21
|
+
# Convert less noisy phone mask syntax to regexes
|
37
22
|
# ### ### #### => (?:\d{3}\s\d{3}\s\d{4})
|
38
23
|
def to_re(mask)
|
39
24
|
'(?:' +
|
data/lib/stockade/lexer.rb
CHANGED
data/lib/stockade/parser.rb
CHANGED
@@ -75,18 +75,13 @@ module Stockade
|
|
75
75
|
end
|
76
76
|
|
77
77
|
def priority(lexeme)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
Lexemes::Lastname,
|
86
|
-
Lexemes::Phone,
|
87
|
-
Lexemes::Date,
|
88
|
-
Lexemes::Email
|
89
|
-
]
|
78
|
+
if [Lexemes::Firstname, Lexemes::Lastname].include?(lexeme.class)
|
79
|
+
lexeme.capitalized? ? 3 : 1
|
80
|
+
elsif lexeme.is_a?(Lexemes::Word)
|
81
|
+
2
|
82
|
+
else
|
83
|
+
3
|
84
|
+
end
|
90
85
|
end
|
91
86
|
end
|
92
87
|
end
|
data/lib/stockade/version.rb
CHANGED
data/stockade.gemspec
CHANGED
@@ -4,6 +4,7 @@ lib = File.expand_path('lib', __dir__)
|
|
4
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
5
|
require 'stockade/version'
|
6
6
|
|
7
|
+
# rubocop:disable Metrics/BlockLength
|
7
8
|
Gem::Specification.new do |spec|
|
8
9
|
spec.name = 'stockade'
|
9
10
|
spec.version = Stockade::VERSION
|
@@ -29,8 +30,11 @@ Gem::Specification.new do |spec|
|
|
29
30
|
spec.add_development_dependency 'pry-byebug', '~> 3.0'
|
30
31
|
spec.add_development_dependency 'rake', '~> 10.0'
|
31
32
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
32
|
-
spec.add_development_dependency 'rubocop', '~> 0.
|
33
|
+
spec.add_development_dependency 'rubocop', '~> 0.49'
|
33
34
|
|
35
|
+
spec.add_runtime_dependency 'credit_card_validations', '~> 3.5'
|
34
36
|
spec.add_runtime_dependency 'memoist', '~> 0.1'
|
35
37
|
spec.add_runtime_dependency 'rambling-trie', '~> 2.0'
|
38
|
+
spec.add_runtime_dependency 'rubyzip', '~> 1.0'
|
36
39
|
end
|
40
|
+
# rubocop:enable Metrics/BlockLength
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stockade
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stan Mazhara
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-08-
|
11
|
+
date: 2018-08-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -72,14 +72,28 @@ dependencies:
|
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0.
|
75
|
+
version: '0.49'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0.
|
82
|
+
version: '0.49'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: credit_card_validations
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '3.5'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '3.5'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: memoist
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,6 +122,20 @@ dependencies:
|
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
124
|
version: '2.0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: rubyzip
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '1.0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '1.0'
|
111
139
|
description: "\n Stockade is a lexer that reads unstructured text information (from
|
112
140
|
files,\n logs, databases etc.) and tokenizes pieces that look like personally\n
|
113
141
|
\ identifiable information (PII).\n "
|
@@ -124,19 +152,19 @@ files:
|
|
124
152
|
- README.md
|
125
153
|
- Rakefile
|
126
154
|
- bin/load
|
127
|
-
- data/firstnames.
|
128
|
-
- data/
|
129
|
-
- data/
|
130
|
-
- data/
|
131
|
-
- data/words.dump
|
132
|
-
- data/words.txt
|
155
|
+
- data/firstnames.zip
|
156
|
+
- data/lastnames.zip
|
157
|
+
- data/phones.txt
|
158
|
+
- data/words.zip
|
133
159
|
- lib/stockade.rb
|
134
160
|
- lib/stockade/lexemes/base.rb
|
135
161
|
- lib/stockade/lexemes/date.rb
|
136
162
|
- lib/stockade/lexemes/dict.rb
|
137
163
|
- lib/stockade/lexemes/email.rb
|
164
|
+
- lib/stockade/lexemes/filler.rb
|
138
165
|
- lib/stockade/lexemes/firstname.rb
|
139
166
|
- lib/stockade/lexemes/lastname.rb
|
167
|
+
- lib/stockade/lexemes/payment_card.rb
|
140
168
|
- lib/stockade/lexemes/phone.rb
|
141
169
|
- lib/stockade/lexemes/word.rb
|
142
170
|
- lib/stockade/lexer.rb
|