stockade 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -1
- data/Gemfile.lock +22 -6
- data/README.md +37 -0
- data/Rakefile +5 -3
- data/bin/load +6 -32
- data/data/firstnames.dump +0 -0
- data/data/firstnames.txt +5496 -0
- data/data/lastnames.dump +0 -0
- data/data/lastnames.txt +240470 -0
- data/data/words.dump +0 -0
- data/data/words.txt +370099 -0
- data/lib/stockade/lexemes/base.rb +47 -0
- data/lib/stockade/lexemes/date.rb +49 -0
- data/lib/stockade/lexemes/dict.rb +42 -0
- data/lib/stockade/lexemes/email.rb +18 -0
- data/lib/stockade/lexemes/firstname.rb +14 -0
- data/lib/stockade/lexemes/lastname.rb +14 -0
- data/lib/stockade/lexemes/phone.rb +51 -0
- data/lib/stockade/lexemes/word.rb +17 -0
- data/lib/stockade/lexer.rb +61 -0
- data/lib/stockade/parser.rb +92 -0
- data/lib/stockade/version.rb +3 -1
- data/lib/stockade.rb +22 -107
- data/stockade.gemspec +13 -7
- metadata +50 -10
- data/data/firstnames/1.csv +0 -5496
- data/data/surnames/1.csv +0 -151671
- data/data/surnames/2.csv +0 -88799
- data/data/surnames.dump +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84a7317f812734960f8ed6f56c0194d783aad1c7
|
4
|
+
data.tar.gz: 6d361b573054fb0b0b2c5d19a02cebe7cb705173
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 431ecb92de2cd3e67596af6347d23c09e86bc4efbd8436617bdd599d25140fa67872385b848bd989635b24f8477da44d880047c2ac112fa8614f9b291b844727
|
7
|
+
data.tar.gz: 0b9040960eb148c06f4a664d6f074c075200afd36f10046c8c50eaa914d5550ebb1c91ea8ca5f9d53d0add1f0ddfb464c7b0636beb5ec6e361eaeefce129f417
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,28 +1,33 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
stockade (0.1.
|
5
|
-
|
6
|
-
|
4
|
+
stockade (0.1.1)
|
5
|
+
memoist (~> 0.1)
|
6
|
+
rambling-trie
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
|
12
|
-
redis
|
11
|
+
ast (2.4.0)
|
13
12
|
byebug (10.0.2)
|
14
13
|
coderay (1.1.2)
|
15
14
|
diff-lcs (1.3)
|
15
|
+
jaro_winkler (1.5.1)
|
16
16
|
memoist (0.16.0)
|
17
17
|
method_source (0.9.0)
|
18
|
+
parallel (1.12.1)
|
19
|
+
parser (2.5.1.2)
|
20
|
+
ast (~> 2.4.0)
|
21
|
+
powerpack (0.1.2)
|
18
22
|
pry (0.11.3)
|
19
23
|
coderay (~> 1.1.0)
|
20
24
|
method_source (~> 0.9.0)
|
21
25
|
pry-byebug (3.6.0)
|
22
26
|
byebug (~> 10.0)
|
23
27
|
pry (~> 0.10)
|
28
|
+
rainbow (3.0.0)
|
24
29
|
rake (10.5.0)
|
25
|
-
|
30
|
+
rambling-trie (2.0.0)
|
26
31
|
rspec (3.7.0)
|
27
32
|
rspec-core (~> 3.7.0)
|
28
33
|
rspec-expectations (~> 3.7.0)
|
@@ -36,6 +41,16 @@ GEM
|
|
36
41
|
diff-lcs (>= 1.2.0, < 2.0)
|
37
42
|
rspec-support (~> 3.7.0)
|
38
43
|
rspec-support (3.7.1)
|
44
|
+
rubocop (0.58.2)
|
45
|
+
jaro_winkler (~> 1.5.1)
|
46
|
+
parallel (~> 1.10)
|
47
|
+
parser (>= 2.5, != 2.5.1.1)
|
48
|
+
powerpack (~> 0.1)
|
49
|
+
rainbow (>= 2.2.2, < 4.0)
|
50
|
+
ruby-progressbar (~> 1.7)
|
51
|
+
unicode-display_width (~> 1.0, >= 1.0.1)
|
52
|
+
ruby-progressbar (1.9.0)
|
53
|
+
unicode-display_width (1.4.0)
|
39
54
|
|
40
55
|
PLATFORMS
|
41
56
|
ruby
|
@@ -45,6 +60,7 @@ DEPENDENCIES
|
|
45
60
|
pry-byebug
|
46
61
|
rake (~> 10.0)
|
47
62
|
rspec (~> 3.0)
|
63
|
+
rubocop
|
48
64
|
stockade!
|
49
65
|
|
50
66
|
BUNDLED WITH
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# PII Lexer
|
2
|
+
|
3
|
+
_This is proof-of-concept level software._
|
4
|
+
|
5
|
+
Stockade is a lexer for Personally Identifiable Information (PII). It scans
|
6
|
+
unstructured text (from files, logs, databases, web, etc.) and tokenizes
|
7
|
+
recognized pieces of PII. This information can be used to raise errors,
|
8
|
+
discard, or mask data.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
```
|
13
|
+
gem install stockade
|
14
|
+
```
|
15
|
+
|
16
|
+
## Usage
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
require 'stockade'
|
20
|
+
#=> true
|
21
|
+
Stockade.mask('Mr. John Smith email is jsmith@example.com')
|
22
|
+
#=> "Mr. **** ***** email is ******************"
|
23
|
+
Stockade.mask('and his phone is 555-123-4567.')
|
24
|
+
#=> "*** his phone is ************."
|
25
|
+
```
|
26
|
+
Yes, 'and' looks like PII because 'And' is also a known last name.
|
27
|
+
|
28
|
+
## Implementation
|
29
|
+
|
30
|
+
It uses
|
31
|
+
[StringScanner](https://ruby-doc.org/stdlib-2.5.1/libdoc/strscan/rdoc/StringScanner.html)
|
32
|
+
and a manually curated list of regular expressions to match strings that _look_
|
33
|
+
like PII. This works for things like emails, phone numbers, dates, national
|
34
|
+
IDs, credit card numbers and IP addresses. But it does not work for names.
|
35
|
+
Names are verified against the list of known first and last names that are
|
36
|
+
stored as a trie.
|
37
|
+
|
data/Rakefile
CHANGED
data/bin/load
CHANGED
@@ -1,38 +1,12 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
#
|
4
|
+
# Convert plain text dictionaries to trie
|
4
5
|
#
|
5
6
|
require 'bundler/setup'
|
7
|
+
require 'rambling-trie'
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
def load(type)
|
11
|
-
bf = BloomFilter::Native.new(
|
12
|
-
:size => 10_000_000,
|
13
|
-
:hashes => 2,
|
14
|
-
:seed => 1,
|
15
|
-
:bucket => 3,
|
16
|
-
:raise => false
|
17
|
-
)
|
18
|
-
|
19
|
-
Dir.glob("data/#{type}/*.csv").each do |file|
|
20
|
-
CSV.foreach(file) do |line|
|
21
|
-
name = line.first
|
22
|
-
next if name == 'name'
|
23
|
-
|
24
|
-
name.strip!
|
25
|
-
name.downcase!
|
26
|
-
bf.insert(name) unless bf.include?(name)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
dump = Marshal.dump(bf)
|
31
|
-
|
32
|
-
File.write("data/#{type}.dump", dump)
|
33
|
-
|
34
|
-
df = Marshal.load(File.read("data/#{type}.dump"))
|
9
|
+
%w[lastnames firstnames words].each do |type|
|
10
|
+
trie = Rambling::Trie.create("data/#{type}.txt")
|
11
|
+
Rambling::Trie.dump(trie, "data/#{type}.dump")
|
35
12
|
end
|
36
|
-
|
37
|
-
load('surnames')
|
38
|
-
load('firstnames')
|
data/data/firstnames.dump
CHANGED
Binary file
|