stockade 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Binary file
@@ -0,0 +1,3 @@
1
module Stockade
  # Version of the stockade gem; the gemspec reads it as `Stockade::VERSION`.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = '0.1.0'.freeze
end
data/lib/stockade.rb ADDED
@@ -0,0 +1,114 @@
1
+ require 'stockade/version'
2
+
3
+ require 'bloomfilter-rb'
4
+ require 'memoist'
5
+ require 'strscan'
6
+
7
module Stockade
  # Scans a piece of free-form text and reports the fragments that look like
  # personally identifiable information: e-mail addresses, phone numbers,
  # first names and surnames.
  #
  # Every reported fragment is replaced by asterisks in {#datum}, so a later
  # (less specific) pattern never re-reports text that an earlier pattern
  # already claimed.
  class Lexer
    # Character used to overwrite text that has already been tokenized.
    MASK_CHAR = '*'.freeze

    # Directory holding the name databases (`firstnames.dump`,
    # `surnames.dump`). Resolved relative to this file so lookups do not
    # depend on the process's current working directory.
    DATA_DIR = File.expand_path('../data', __dir__).freeze

    # Process-wide cache of loaded name databases, keyed by database name.
    # Intentionally mutable: each dump is read and unmarshalled only once
    # instead of once per Lexer instance.
    DB_CACHE = {}

    # @return [String] the text being scanned; matched fragments are
    #   replaced by asterisks as {#call} progresses
    attr_reader :datum

    # @param datum [#to_s] text to scan; nil is treated as an empty string
    def initialize(datum)
      @datum = datum.to_s.strip
    end

    # Convenience wrapper: builds a lexer for +datum+ and runs it.
    #
    # @param datum [#to_s] text to scan
    # @return [Array<Hash>] see {#call}
    def self.call(datum)
      new(datum).call
    end

    # Patterns in the order they are applied - from most specific to least.
    # Text matched by an earlier pattern is masked before the next pattern
    # runs, so the order decides which lexeme a fragment is reported as.
    #
    # @return [Hash{Symbol => Regexp}]
    def patterns
      {
        email: email_regex,
        phone: phone_regex,
        name: name_regex
      }
    end

    # A scanner over {#datum}, created on first use and then reused.
    # {#call} does not use it; it builds a fresh scanner per pattern.
    #
    # @return [StringScanner]
    def scanner
      @scanner ||= StringScanner.new(datum)
    end

    # Runs every pattern over the text and collects the recognised lexemes.
    #
    # Words matched by the generic name pattern are reported only when they
    # are found in the first-name or surname database; other words are
    # skipped and left unmasked.
    #
    # @return [Array<Hash>] hashes of the form
    #   `{ lexeme: Symbol, value: String }`, where lexeme is one of
    #   :email, :phone, :firstname or :surname
    def call
      res = []

      patterns.each do |name, regex|
        # Each pattern scans a snapshot of the text as masked so far.
        # Masking keeps the character count unchanged, so offsets taken from
        # the snapshot remain valid for the current @datum.
        cursor = StringScanner.new(datum)

        while cursor.scan_until(regex)
          value = cursor.matched
          lexeme = classify(name, value)
          next unless lexeme

          res << { lexeme: lexeme, value: value }

          # pre_match is everything before the match, so its length is the
          # match start as a character index (StringScanner#pos is a byte
          # offset and would be wrong for multibyte text).
          mask(cursor.pre_match.length, value.length)
        end
      end

      res
    end

    # @return [Regexp] matches a run of word characters (a candidate name)
    def name_regex
      /\w+/
    end

    # @return [Regexp] matches a run of non-word characters; not used
    #   within this class
    def word_regex
      /\W+/
    end

    # Resolves the final lexeme for a match. Non-name patterns keep their
    # own name; name candidates become :firstname or :surname (first name
    # wins when a word is in both databases) or nil when in neither.
    private def classify(name, value)
      return name unless name == :name
      return :firstname if firstname?(value)

      :surname if surname?(value)
    end

    # Overwrites +length+ characters of the text, starting at character
    # index +start+, with MASK_CHAR. The text keeps its length.
    private def mask(start, length)
      @datum = @datum[0...start] +
               MASK_CHAR * length +
               @datum[(start + length)..-1]
    end

    # Case-insensitive, because domain names are not case-sensitive.
    private def email_regex
      /
        [\w+\-.]+     # local part
        @
        [a-z\d\-]+    # first domain label
        (\.[a-z]+)*   # any further labels
        \.[a-z]+      # TLD
      /xi
    end

    # @return [Integer, nil] index of the first e-mail address in the
    #   current (possibly masked) text, or nil
    private def email_address?
      datum =~ email_regex
    end

    # @return [Integer, nil] index of the first phone number in the
    #   current (possibly masked) text, or nil
    private def phone_number?
      datum =~ phone_regex
    end

    # Optional country code, then 3-3-4 digit groups with optional
    # separators and an optional "x123" extension.
    # NOTE(review): the separator class right after the leading \b can pull
    # separators that precede the number into the match when the number
    # directly follows a word character (e.g. "call 555-123-4567").
    private def phone_regex
      /\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\b/
    end

    private def surname?(value)
      found?('surnames', value)
    end

    private def firstname?(value)
      found?('firstnames', value)
    end

    # Lookups are done on the lower-cased value.
    private def found?(db_name, value)
      db(db_name).include?(value.downcase)
    end

    # Loads (once per process) the database stored in
    # `data/<name>.dump` inside the gem.
    #
    # Marshal.load must only ever be given trusted data; these dumps ship
    # with the gem itself. Presumably they are serialized bloomfilter-rb
    # filters, which is why that library is required - confirm against the
    # script that generates them.
    #
    # @raise [Errno::ENOENT] if the dump file does not exist
    private def db(name)
      DB_CACHE[name] ||= Marshal.load(
        File.binread(File.join(DATA_DIR, "#{name}.dump"))
      )
    end
  end
end
data/stockade.gemspec ADDED
@@ -0,0 +1,30 @@
1
# Gem specification for stockade.

# Make lib/ loadable so the version constant can be read at build time.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'stockade/version'

Gem::Specification.new do |spec|
  spec.name = 'stockade'
  spec.version = Stockade::VERSION
  spec.authors = ['Stan Mazhara']
  spec.email = ['akmegran@gmail.com']

  spec.summary = 'Stockade is a lexer for PII'
  # Single-line string: a multi-line %q{} literal carries its newlines and
  # indentation into the built gem's metadata.
  spec.description = 'Stockade is a lexer that reads unstructured text ' \
                     'information (from files, logs, databases etc.) and ' \
                     'tokenizes pieces that look like personally ' \
                     'identifiable information (PII).'
  spec.homepage = 'https://github.com/smazhara/stockade'
  spec.license = 'MIT'

  # lib/stockade.rb uses `private def ...`, which relies on `def` returning
  # the method name as a symbol (Ruby 2.1+).
  spec.required_ruby_version = '>= 2.1'

  # Package every file tracked by git except tests. NUL-separated output
  # (-z) keeps file names with unusual characters intact.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end

  spec.add_development_dependency 'bundler', '~> 1.16'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'

  spec.add_runtime_dependency 'bloomfilter-rb', '~> 2.0'
  spec.add_runtime_dependency 'memoist', '~> 0.1'
end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: stockade
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Stan Mazhara
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-07-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bloomfilter-rb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: memoist
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.1'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.1'
83
+ description: "\n Stockade is a lexer that reads unstructured text information (from
84
+ files,\n logs, databases etc.) and tokenizes pieces that look like personally\n
85
+ \ identifiable information (PII).\n "
86
+ email:
87
+ - akmegran@gmail.com
88
+ executables: []
89
+ extensions: []
90
+ extra_rdoc_files: []
91
+ files:
92
+ - ".ruby-version"
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - LICENSE
96
+ - Rakefile
97
+ - bin/load
98
+ - data/firstnames.dump
99
+ - data/firstnames/1.csv
100
+ - data/surnames.dump
101
+ - data/surnames/1.csv
102
+ - data/surnames/2.csv
103
+ - lib/stockade.rb
104
+ - lib/stockade/version.rb
105
+ - stockade.gemspec
106
+ homepage: https://github.com/smazhara/stockade
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubyforge_project:
126
+ rubygems_version: 2.6.11
127
+ signing_key:
128
+ specification_version: 4
129
+ summary: Stockade is a lexer for PII
130
+ test_files: []