stockade 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +51 -0
- data/LICENSE +21 -0
- data/Rakefile +6 -0
- data/bin/load +38 -0
- data/data/firstnames/1.csv +5496 -0
- data/data/firstnames.dump +0 -0
- data/data/surnames/1.csv +151671 -0
- data/data/surnames/2.csv +88799 -0
- data/data/surnames.dump +0 -0
- data/lib/stockade/version.rb +3 -0
- data/lib/stockade.rb +114 -0
- data/stockade.gemspec +30 -0
- metadata +130 -0
data/data/surnames.dump
ADDED
Binary file
|
data/lib/stockade.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'stockade/version'
|
2
|
+
|
3
|
+
require 'bloomfilter-rb'
|
4
|
+
require 'memoist'
|
5
|
+
require 'strscan'
|
6
|
+
|
7
|
+
module Stockade
|
8
|
+
class Lexer
|
9
|
+
extend Memoist
|
10
|
+
|
11
|
+
attr_reader :datum
|
12
|
+
|
13
|
+
# Creates a lexer over +datum+.
#
# The text is stripped of leading/trailing whitespace and stored as a
# separate copy, so the caller's string is never touched.
def initialize(datum)
  trimmed = datum.strip
  @datum = trimmed.dup
end
|
16
|
+
|
17
|
+
# One-shot entry point: builds a lexer for +datum+ and returns the result
# of running it.
def self.call(datum)
  lexer = new(datum)
  lexer.call
end
|
20
|
+
|
21
|
+
# Lexeme name => regex, ordered from most specific to least specific.
# Order matters: #call applies the patterns in this order and masks what
# each one recognises before the next pattern is tried.
def patterns
  ordered = {}
  ordered[:email] = email_regex
  ordered[:phone] = phone_regex
  ordered[:name] = name_regex
  ordered
end
|
30
|
+
|
31
|
+
# Returns a StringScanner positioned at the start of the datum.
# The result is memoized, so repeated calls return the same scanner.
# NOTE(review): #call builds its own scanners and does not appear to use
# this one in the visible code.
def scanner
  source = datum
  StringScanner.new(source)
end
memoize :scanner
|
35
|
+
|
36
|
+
# Tokenizes the datum and masks every recognised token.
#
# Patterns from #patterns are applied in order. For every match a
# { lexeme: Symbol, value: String } hash is collected and the matched text
# in @datum is overwritten with '*' characters, so that less specific
# patterns applied later cannot match it again. Matches of the :name
# pattern are kept only when the word is found in the surname or firstname
# database (firstname wins when both match).
#
# Returns the Array of collected token hashes, grouped by pattern order.
def call
  res = []

  patterns.each do |name, regex|
    # Scan a snapshot of the current (already partially masked) datum.
    # @datum is replaced, never mutated in place, and each replacement
    # keeps the length unchanged, so the scanner's offsets remain valid
    # for the newest @datum as well.
    scanner = StringScanner.new(datum)

    while scanner.scan_until(regex)
      value = scanner.matched

      lexeme = name
      if name == :name
        lexeme = :surname if surname?(value)
        lexeme = :firstname if firstname?(value)
        # A plain word found in neither database is not a token.
        next if lexeme == :name
      end

      res << { lexeme: lexeme, value: value }

      # Use character offsets (charpos), not byte offsets (pos), because
      # String#[] indexes characters. The prefix range is exclusive so the
      # first character of the match is masked too and the length of the
      # datum does not change.
      finish = scanner.charpos
      start = finish - value.size
      @datum = @datum[0...start] + ('*' * value.size) + @datum[finish..-1]
    end
  end

  res
end
|
66
|
+
|
67
|
+
# Candidate-name pattern: one or more consecutive word characters.
# Whether a candidate really is a name is decided elsewhere.
def name_regex
  %r{\w+}
end
|
70
|
+
|
71
|
+
# Matches a run of one or more NON-word characters (note the capital \W),
# despite what the method name suggests.
def word_regex
  %r{\W+}
end
|
74
|
+
|
75
|
+
# Pattern for an email address embedded in free text.
#
# Matching is case-insensitive, so addresses such as John@Example.COM are
# recognised, and every domain label before the TLD may contain digits
# and hyphens (e.g. a@mail.my-host.com).
private def email_regex
  /
    [\w\-.+]+          # local part
    @
    [a-z\d\-]+         # first domain label
    (\.[a-z\d\-]+)*    # further domain labels
    \.[a-z]+           # TLD
  /xi
end
|
84
|
+
|
85
|
+
# Truthy (the index of the first match) when the datum contains something
# that looks like an email address, nil otherwise.
private def email_address?
  email_regex =~ datum
end
|
88
|
+
|
89
|
+
# Truthy (the index of the first match) when the datum contains something
# that looks like a phone number, nil otherwise.
#
# Uses #phone_regex; the previously referenced phone_number_regex is not
# defined, so calling this method raised NameError.
private def phone_number?
  datum =~ phone_regex
end
|
92
|
+
|
93
|
+
private def phone_regex
|
94
|
+
/\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\b/
|
95
|
+
end
|
96
|
+
|
97
|
+
# Whether +value+ is reported as present in the 'surnames' database.
private def surname?(value)
  db_name = 'surnames'
  found?(db_name, value)
end
|
100
|
+
|
101
|
+
# Whether +value+ is reported as present in the 'firstnames' database.
private def firstname?(value)
  db_name = 'firstnames'
  found?(db_name, value)
end
|
104
|
+
|
105
|
+
# Looks +value+ up, lowercased, in the database named +db+ and returns
# whatever that database's include? reports.
private def found?(db, value)
  # The parameter shadows the #db method; the call with parentheses and an
  # argument still dispatches to the method.
  store = db(db)
  needle = value.downcase
  store.include?(needle)
end
|
108
|
+
|
109
|
+
# Returns the name database called +name+ ('surnames' or 'firstnames'),
# deserialised from the gem's data/<name>.dump file.
#
# * The path is resolved relative to this file rather than the current
#   working directory, so lookups work wherever the process was started.
# * The dump is read in binary mode, as Marshal data is not text.
# * Loaded databases are cached on the class, so the (large) dumps are read
#   once per process instead of once per lexer instance.
#
# Marshal.load is only safe here because the dump ships with the gem; it
# must never be pointed at untrusted files.
#
# Raises Errno::ENOENT when no dump exists for +name+.
private def db(name)
  cache = self.class.instance_variable_get(:@db_cache) ||
          self.class.instance_variable_set(:@db_cache, {})

  cache[name] ||= begin
    path = File.expand_path("../data/#{name}.dump", __dir__)
    Marshal.load(File.binread(path))
  end
end
|
113
|
+
end
|
114
|
+
end
|
data/stockade.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'stockade/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name     = 'stockade'
  spec.version  = Stockade::VERSION
  spec.authors  = ['Stan Mazhara']
  spec.email    = ['akmegran@gmail.com']

  # Descriptive metadata
  spec.summary     = 'Stockade is a lexer for PII'
  spec.description = %q{
Stockade is a lexer that reads unstructured text information (from files,
logs, databases etc.) and tokenizes pieces that look like personally
identifiable information (PII).
}
  spec.homepage = 'https://github.com/smazhara/stockade'
  spec.license  = 'MIT'

  # Package every git-tracked file except test/spec/features directories.
  gem_root = File.expand_path('..', __FILE__)
  spec.files = Dir.chdir(gem_root) do
    tracked = `git ls-files`.split(/\n/)
    tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }
  end

  # Tooling needed only for development.
  spec.add_development_dependency 'bundler', '~> 1.16'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'

  # Libraries required at runtime by lib/stockade.rb.
  spec.add_runtime_dependency 'bloomfilter-rb', '~> 2.0'
  spec.add_runtime_dependency 'memoist', '~> 0.1'
end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: stockade
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Stan Mazhara
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-07-29 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.16'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.16'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: bloomfilter-rb
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '2.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: memoist
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.1'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.1'
|
83
|
+
description: "\n Stockade is a lexer that reads unstructured text information (from
|
84
|
+
files,\n logs, databases etc.) and tokenizes pieces that look like personally\n
|
85
|
+
\ identifiable information (PII).\n "
|
86
|
+
email:
|
87
|
+
- akmegran@gmail.com
|
88
|
+
executables: []
|
89
|
+
extensions: []
|
90
|
+
extra_rdoc_files: []
|
91
|
+
files:
|
92
|
+
- ".ruby-version"
|
93
|
+
- Gemfile
|
94
|
+
- Gemfile.lock
|
95
|
+
- LICENSE
|
96
|
+
- Rakefile
|
97
|
+
- bin/load
|
98
|
+
- data/firstnames.dump
|
99
|
+
- data/firstnames/1.csv
|
100
|
+
- data/surnames.dump
|
101
|
+
- data/surnames/1.csv
|
102
|
+
- data/surnames/2.csv
|
103
|
+
- lib/stockade.rb
|
104
|
+
- lib/stockade/version.rb
|
105
|
+
- stockade.gemspec
|
106
|
+
homepage: https://github.com/smazhara/stockade
|
107
|
+
licenses:
|
108
|
+
- MIT
|
109
|
+
metadata: {}
|
110
|
+
post_install_message:
|
111
|
+
rdoc_options: []
|
112
|
+
require_paths:
|
113
|
+
- lib
|
114
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
115
|
+
requirements:
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: '0'
|
119
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - ">="
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '0'
|
124
|
+
requirements: []
|
125
|
+
rubyforge_project:
|
126
|
+
rubygems_version: 2.6.11
|
127
|
+
signing_key:
|
128
|
+
specification_version: 4
|
129
|
+
summary: Stockade is a lexer for PII
|
130
|
+
test_files: []
|