stockade 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +51 -0
- data/LICENSE +21 -0
- data/Rakefile +6 -0
- data/bin/load +38 -0
- data/data/firstnames/1.csv +5496 -0
- data/data/firstnames.dump +0 -0
- data/data/surnames/1.csv +151671 -0
- data/data/surnames/2.csv +88799 -0
- data/data/surnames.dump +0 -0
- data/lib/stockade/version.rb +3 -0
- data/lib/stockade.rb +114 -0
- data/stockade.gemspec +30 -0
- metadata +130 -0
data/data/surnames.dump
ADDED
Binary file
|
data/lib/stockade.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'stockade/version'
|
2
|
+
|
3
|
+
require 'bloomfilter-rb'
|
4
|
+
require 'memoist'
|
5
|
+
require 'strscan'
|
6
|
+
|
7
|
+
module Stockade
  # Scans free-form text for pieces that look like personally identifiable
  # information (PII): email addresses, phone numbers and known first names
  # and surnames.
  #
  # Usage:
  #
  #   Stockade::Lexer.call('John Smith <john@example.com>')
  #   # => [{ lexeme: :email, value: 'john@example.com' }, ...]
  #
  # NOTE: #call masks every reported match inside #datum with '*' characters,
  # so #datum changes as a side effect of lexing.
  class Lexer
    # Directory with the bundled name dictionaries, resolved relative to this
    # source file so lookups do not depend on the current working directory.
    DATA_DIR = File.expand_path('../../data', __FILE__).freeze

    # Character used to blank out text that has already been reported.
    MASK = '*'.freeze

    # @return [String] the text being lexed; reported matches are replaced by
    #   runs of '*' once #call has processed them
    attr_reader :datum

    # Convenience wrapper: builds a lexer for +datum+ and runs it.
    #
    # @param datum [#to_s] text to scan
    # @return [Array<Hash>] see #call
    def self.call(datum)
      new(datum).call
    end

    # Loads (once per process) the dictionary stored in
    # DATA_DIR/<name>.dump. The cache lives on the class, so the dump is not
    # re-read for every Lexer instance.
    #
    # Marshal.load is only ever pointed at the files shipped in DATA_DIR;
    # it must never be used on untrusted data.
    #
    # @param name [String] dictionary base name, e.g. 'surnames'
    # @return [#include?] the unmarshalled dictionary object
    # @raise [Errno::ENOENT] if the dump file does not exist
    def self.dictionary(name)
      @dictionaries ||= {}
      @dictionaries[name] ||= begin
        path = File.join(DATA_DIR, "#{name}.dump")
        Marshal.load(File.binread(path))
      end
    end

    # @param datum [#to_s] text to scan; nil is treated as an empty string and
    #   surrounding whitespace is stripped
    def initialize(datum)
      @datum = datum.to_s.strip
    end

    # Patterns to try, keyed by lexeme kind.
    #
    # Order is important - from most specific to least. Text matched by an
    # earlier pattern is masked, so later patterns cannot match it again.
    #
    # @return [Hash{Symbol => Regexp}]
    def patterns
      {
        email: email_regex,
        phone: phone_regex,
        name: name_regex
      }
    end

    # @return [StringScanner] a scanner over #datum, created on first use.
    #   #call does not use it; it builds a fresh scanner for every pattern.
    def scanner
      @scanner ||= StringScanner.new(datum)
    end

    # Runs every pattern over the text and collects the lexemes found.
    #
    # Words matched by the :name pattern are reported as :firstname or
    # :surname when the dictionaries know them (first name wins when a word
    # is in both) and are silently skipped otherwise.
    #
    # @return [Array<Hash>] hashes of the form
    #   { lexeme: Symbol, value: String }, grouped by pattern in #patterns
    #   order
    def call
      lexemes = []

      patterns.each do |kind, regex|
        # A fresh scanner per pattern sees the text as masked by the
        # patterns that ran before it.
        pattern_scanner = StringScanner.new(datum)

        while pattern_scanner.scan_until(regex)
          value = pattern_scanner.matched
          lexeme = classify(kind, value)
          next unless lexeme

          lexemes << { lexeme: lexeme, value: value }

          # charpos (not pos, which counts bytes) keeps the offsets correct
          # for multibyte text.
          mask(pattern_scanner.charpos - value.size, value.size)
        end
      end

      lexemes
    end

    # @return [Regexp] matches a run of word characters (a name candidate)
    def name_regex
      /\w+/
    end

    # @return [Regexp] matches a run of NON-word characters
    def word_regex
      /\W+/
    end

    # Maps a raw pattern kind to the lexeme to report.
    #
    # @return [Symbol, nil] the lexeme, or nil when a :name candidate is in
    #   neither dictionary
    private def classify(kind, value)
      return kind unless kind == :name
      return :firstname if firstname?(value)

      :surname if surname?(value)
    end

    # Replaces +length+ characters of #datum, starting at character index
    # +start+, with MASK. A new string is assigned (rather than mutating in
    # place) so a scanner that is still walking the previous string is not
    # disturbed; the length stays the same, so its offsets remain valid.
    private def mask(start, length)
      @datum = @datum[0...start] + MASK * length + @datum[(start + length)..-1]
    end

    # Case-insensitive, since domain names are not case sensitive.
    private def email_regex
      /
        [\w+\-.]+
        @
        [a-z\d\-]+
        (\.[a-z]+)*
        \.[a-z]+ # TLD
      /xi
    end

    # @return [Integer, nil] index of the first email address in #datum
    private def email_address?
      datum =~ email_regex
    end

    # @return [Integer, nil] index of the first phone number in #datum
    private def phone_number?
      datum =~ phone_regex
    end

    private def phone_regex
      /\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\b/
    end

    private def surname?(value)
      found?('surnames', value)
    end

    private def firstname?(value)
      found?('firstnames', value)
    end

    # Dictionary lookups are done on the downcased value.
    private def found?(dictionary_name, value)
      db(dictionary_name).include?(value.downcase)
    end

    # @return [#include?] the dictionary called +name+ (cached per class)
    private def db(name)
      self.class.dictionary(name)
    end
  end
end
|
data/stockade.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# Gem specification for stockade.
#
# Puts the gem's own lib/ directory on the load path so the version constant
# can be read before the gem is installed.
gem_root = File.expand_path('..', __FILE__)
lib = File.join(gem_root, 'lib')
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'stockade/version'

Gem::Specification.new do |spec|
  spec.name    = 'stockade'
  spec.version = Stockade::VERSION
  spec.authors = ['Stan Mazhara']
  spec.email   = ['akmegran@gmail.com']

  spec.summary = 'Stockade is a lexer for PII'
  # NOTE(review): the indentation inside this literal is part of the
  # description string; it was reconstructed from the gem metadata - confirm
  # against the real file.
  spec.description = %q{
    Stockade is a lexer that reads unstructured text information (from files,
    logs, databases etc.) and tokenizes pieces that look like personally
    identifiable information (PII).
  }
  spec.homepage = 'https://github.com/smazhara/stockade'
  spec.license  = 'MIT'

  # Package every file tracked by git except tests, specs and features.
  tracked_files = Dir.chdir(gem_root) { `git ls-files`.split(/\n/) }
  spec.files = tracked_files.reject do |path|
    path.match(%r{^(test|spec|features)/})
  end

  {
    'bundler' => '~> 1.16',
    'rake'    => '~> 10.0',
    'rspec'   => '~> 3.0'
  }.each do |gem_name, constraint|
    spec.add_development_dependency gem_name, constraint
  end

  {
    'bloomfilter-rb' => '~> 2.0',
    'memoist'        => '~> 0.1'
  }.each do |gem_name, constraint|
    spec.add_runtime_dependency gem_name, constraint
  end
end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: stockade
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Stan Mazhara
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-07-29 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.16'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.16'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: bloomfilter-rb
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '2.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: memoist
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.1'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.1'
|
83
|
+
description: "\n Stockade is a lexer that reads unstructured text information (from
|
84
|
+
files,\n logs, databases etc.) and tokenizes pieces that look like personally\n
|
85
|
+
\ identifiable information (PII).\n "
|
86
|
+
email:
|
87
|
+
- akmegran@gmail.com
|
88
|
+
executables: []
|
89
|
+
extensions: []
|
90
|
+
extra_rdoc_files: []
|
91
|
+
files:
|
92
|
+
- ".ruby-version"
|
93
|
+
- Gemfile
|
94
|
+
- Gemfile.lock
|
95
|
+
- LICENSE
|
96
|
+
- Rakefile
|
97
|
+
- bin/load
|
98
|
+
- data/firstnames.dump
|
99
|
+
- data/firstnames/1.csv
|
100
|
+
- data/surnames.dump
|
101
|
+
- data/surnames/1.csv
|
102
|
+
- data/surnames/2.csv
|
103
|
+
- lib/stockade.rb
|
104
|
+
- lib/stockade/version.rb
|
105
|
+
- stockade.gemspec
|
106
|
+
homepage: https://github.com/smazhara/stockade
|
107
|
+
licenses:
|
108
|
+
- MIT
|
109
|
+
metadata: {}
|
110
|
+
post_install_message:
|
111
|
+
rdoc_options: []
|
112
|
+
require_paths:
|
113
|
+
- lib
|
114
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
115
|
+
requirements:
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: '0'
|
119
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - ">="
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '0'
|
124
|
+
requirements: []
|
125
|
+
rubyforge_project:
|
126
|
+
rubygems_version: 2.6.11
|
127
|
+
signing_key:
|
128
|
+
specification_version: 4
|
129
|
+
summary: Stockade is a lexer for PII
|
130
|
+
test_files: []
|