stockade 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -1
- data/Gemfile.lock +22 -6
- data/README.md +37 -0
- data/Rakefile +5 -3
- data/bin/load +6 -32
- data/data/firstnames.dump +0 -0
- data/data/firstnames.txt +5496 -0
- data/data/lastnames.dump +0 -0
- data/data/lastnames.txt +240470 -0
- data/data/words.dump +0 -0
- data/data/words.txt +370099 -0
- data/lib/stockade/lexemes/base.rb +47 -0
- data/lib/stockade/lexemes/date.rb +49 -0
- data/lib/stockade/lexemes/dict.rb +42 -0
- data/lib/stockade/lexemes/email.rb +18 -0
- data/lib/stockade/lexemes/firstname.rb +14 -0
- data/lib/stockade/lexemes/lastname.rb +14 -0
- data/lib/stockade/lexemes/phone.rb +51 -0
- data/lib/stockade/lexemes/word.rb +17 -0
- data/lib/stockade/lexer.rb +61 -0
- data/lib/stockade/parser.rb +92 -0
- data/lib/stockade/version.rb +3 -1
- data/lib/stockade.rb +22 -107
- data/stockade.gemspec +13 -7
- metadata +50 -10
- data/data/firstnames/1.csv +0 -5496
- data/data/surnames/1.csv +0 -151671
- data/data/surnames/2.csv +0 -88799
- data/data/surnames.dump +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84a7317f812734960f8ed6f56c0194d783aad1c7
|
4
|
+
data.tar.gz: 6d361b573054fb0b0b2c5d19a02cebe7cb705173
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 431ecb92de2cd3e67596af6347d23c09e86bc4efbd8436617bdd599d25140fa67872385b848bd989635b24f8477da44d880047c2ac112fa8614f9b291b844727
|
7
|
+
data.tar.gz: 0b9040960eb148c06f4a664d6f074c075200afd36f10046c8c50eaa914d5550ebb1c91ea8ca5f9d53d0add1f0ddfb464c7b0636beb5ec6e361eaeefce129f417
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,28 +1,33 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
stockade (0.1.0)
|
5
|
-
|
6
|
-
|
4
|
+
stockade (0.1.1)
|
5
|
+
memoist (~> 0.1)
|
6
|
+
rambling-trie
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
|
12
|
-
redis
|
11
|
+
ast (2.4.0)
|
13
12
|
byebug (10.0.2)
|
14
13
|
coderay (1.1.2)
|
15
14
|
diff-lcs (1.3)
|
15
|
+
jaro_winkler (1.5.1)
|
16
16
|
memoist (0.16.0)
|
17
17
|
method_source (0.9.0)
|
18
|
+
parallel (1.12.1)
|
19
|
+
parser (2.5.1.2)
|
20
|
+
ast (~> 2.4.0)
|
21
|
+
powerpack (0.1.2)
|
18
22
|
pry (0.11.3)
|
19
23
|
coderay (~> 1.1.0)
|
20
24
|
method_source (~> 0.9.0)
|
21
25
|
pry-byebug (3.6.0)
|
22
26
|
byebug (~> 10.0)
|
23
27
|
pry (~> 0.10)
|
28
|
+
rainbow (3.0.0)
|
24
29
|
rake (10.5.0)
|
25
|
-
|
30
|
+
rambling-trie (2.0.0)
|
26
31
|
rspec (3.7.0)
|
27
32
|
rspec-core (~> 3.7.0)
|
28
33
|
rspec-expectations (~> 3.7.0)
|
@@ -36,6 +41,16 @@ GEM
|
|
36
41
|
diff-lcs (>= 1.2.0, < 2.0)
|
37
42
|
rspec-support (~> 3.7.0)
|
38
43
|
rspec-support (3.7.1)
|
44
|
+
rubocop (0.58.2)
|
45
|
+
jaro_winkler (~> 1.5.1)
|
46
|
+
parallel (~> 1.10)
|
47
|
+
parser (>= 2.5, != 2.5.1.1)
|
48
|
+
powerpack (~> 0.1)
|
49
|
+
rainbow (>= 2.2.2, < 4.0)
|
50
|
+
ruby-progressbar (~> 1.7)
|
51
|
+
unicode-display_width (~> 1.0, >= 1.0.1)
|
52
|
+
ruby-progressbar (1.9.0)
|
53
|
+
unicode-display_width (1.4.0)
|
39
54
|
|
40
55
|
PLATFORMS
|
41
56
|
ruby
|
@@ -45,6 +60,7 @@ DEPENDENCIES
|
|
45
60
|
pry-byebug
|
46
61
|
rake (~> 10.0)
|
47
62
|
rspec (~> 3.0)
|
63
|
+
rubocop
|
48
64
|
stockade!
|
49
65
|
|
50
66
|
BUNDLED WITH
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# PII Lexer
|
2
|
+
|
3
|
+
_This is proof-of-concept-level software._
|
4
|
+
|
5
|
+
Stockade is a lexer for Personally Identifiable Information (PII). It scans
|
6
|
+
unstructured text (from files, logs, databases, the web, etc.) and tokenizes
|
7
|
+
recognized pieces of PII. This information can be used to raise errors,
|
8
|
+
discard, or mask data.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
```
|
13
|
+
gem install stockade
|
14
|
+
```
|
15
|
+
|
16
|
+
## Usage
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
require 'stockade'
|
20
|
+
#=> true
|
21
|
+
Stockade.mask('Mr. John Smith email is jsmith@example.com')
|
22
|
+
#=> "Mr. **** ***** email is ******************"
|
23
|
+
Stockade.mask('and his phone is 555-123-4567.')
|
24
|
+
#=> "*** his phone is ************."
|
25
|
+
```
|
26
|
+
Yes, 'and' is masked as PII because it also appears in the list of last names.
|
27
|
+
|
28
|
+
## Implementation
|
29
|
+
|
30
|
+
It uses
|
31
|
+
[StringScanner](https://ruby-doc.org/stdlib-2.5.1/libdoc/strscan/rdoc/StringScanner.html)
|
32
|
+
and a manually curated list of regular expressions to match strings that _look_
|
33
|
+
like PII. This works for things like emails, phone numbers, dates, national
|
34
|
+
IDs, credit card numbers and IP addresses. But it does not work for names.
|
35
|
+
Names are verified against the list of known first and last names that are
|
36
|
+
stored as a trie.
|
37
|
+
|
data/Rakefile
CHANGED
data/bin/load
CHANGED
@@ -1,38 +1,12 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
#
|
4
|
+
# Convert plain text dictionaries to trie
|
4
5
|
#
|
5
6
|
require 'bundler/setup'
|
7
|
+
require 'rambling-trie'
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
def load(type)
|
11
|
-
bf = BloomFilter::Native.new(
|
12
|
-
:size => 10_000_000,
|
13
|
-
:hashes => 2,
|
14
|
-
:seed => 1,
|
15
|
-
:bucket => 3,
|
16
|
-
:raise => false
|
17
|
-
)
|
18
|
-
|
19
|
-
Dir.glob("data/#{type}/*.csv").each do |file|
|
20
|
-
CSV.foreach(file) do |line|
|
21
|
-
name = line.first
|
22
|
-
next if name == 'name'
|
23
|
-
|
24
|
-
name.strip!
|
25
|
-
name.downcase!
|
26
|
-
bf.insert(name) unless bf.include?(name)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
dump = Marshal.dump(bf)
|
31
|
-
|
32
|
-
File.write("data/#{type}.dump", dump)
|
33
|
-
|
34
|
-
df = Marshal.load(File.read("data/#{type}.dump"))
|
9
|
+
%w[lastnames firstnames words].each do |type|
|
10
|
+
trie = Rambling::Trie.create("data/#{type}.txt")
|
11
|
+
Rambling::Trie.dump(trie, "data/#{type}.dump")
|
35
12
|
end
|
36
|
-
|
37
|
-
load('surnames')
|
38
|
-
load('firstnames')
|
data/data/firstnames.dump
CHANGED
Binary file
|