stockade 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
@@ -0,0 +1,3 @@
1
module Stockade
  # Release version of the gem. Frozen so that code holding a reference to the
  # string cannot mutate the constant in place.
  VERSION = '0.1.0'.freeze
end
data/lib/stockade.rb ADDED
@@ -0,0 +1,114 @@
1
+ require 'stockade/version'
2
+
3
+ require 'bloomfilter-rb'
4
+ require 'memoist'
5
+ require 'strscan'
6
+
7
module Stockade
  # Scans a piece of free-form text for fragments that look like personally
  # identifiable information and reports them as tokens.
  #
  # Recognised lexemes are +:email+, +:phone+, +:firstname+ and +:surname+.
  # Every reported fragment is also masked with asterisks in {#datum}, so a
  # later (less specific) pattern never re-reports the same characters.
  #
  # @example
  #   Stockade::Lexer.call('john@example.com')
  #   # => [{ lexeme: :email, value: 'john@example.com' }]
  class Lexer
    # Directory holding the name dumps, resolved against this file rather
    # than the current working directory so lookups work from anywhere.
    DATA_DIR = File.expand_path('../data', __dir__)

    # Process-wide cache of the name databases, loaded lazily on first use so
    # that the dumps are read once instead of once per Lexer instance.
    #
    # NOTE(security): Marshal.load can instantiate arbitrary objects. It must
    # only ever be pointed at the dump files shipped alongside this code.
    DATABASES = Hash.new do |cache, name|
      cache[name] = Marshal.load(File.binread(File.join(DATA_DIR, "#{name}.dump")))
    end

    private_constant :DATA_DIR, :DATABASES

    # @return [String] the stripped input text; after {#call} the recognised
    #   fragments are replaced by asterisks
    attr_reader :datum

    # @param datum [String] text to scan; surrounding whitespace is stripped
    def initialize(datum)
      @datum = datum.strip
    end

    # Convenience wrapper around +new(datum).call+.
    #
    # @param datum [String] text to scan
    # @return [Array<Hash>] see {#call}
    def self.call(datum)
      new(datum).call
    end

    # Patterns in the order they are applied, most specific first. Every
    # pattern gets its own pass over the text; because earlier passes mask
    # what they matched, ordering decides which pattern claims a fragment.
    #
    # @return [Hash{Symbol => Regexp}]
    def patterns
      {
        email: email_regex,
        phone: phone_regex,
        name: name_regex,
      }
    end

    # A scanner over {#datum}, created on first access and then reused.
    # {#call} builds its own scanners and does not use this one.
    #
    # @return [StringScanner]
    def scanner
      @scanner ||= StringScanner.new(datum)
    end

    # Runs every pattern over the text and collects the recognised tokens.
    #
    # @return [Array<Hash>] one +{ lexeme: Symbol, value: String }+ per
    #   recognised fragment, grouped by pattern in {#patterns} order
    def call
      tokens = []

      patterns.each do |kind, regex|
        # A fresh scanner per pattern: it sees the text with the matches of
        # all previous passes already masked.
        pass = StringScanner.new(datum)

        while pass.scan_until(regex)
          value = pass.matched
          lexeme = classify(kind, value)
          next unless lexeme

          tokens << { lexeme: lexeme, value: value }
          # charpos (not pos, which counts bytes) keeps the offsets correct
          # when the text contains multibyte characters.
          mask(pass.charpos - value.size, value.size)
        end
      end

      tokens
    end

    # @return [Regexp] matches a run of word characters (a name candidate)
    def name_regex
      /\w+/
    end

    # @return [Regexp] matches a run of NON-word characters (note +\W+); not
    #   referenced by the lexer itself
    def word_regex
      /\W+/
    end

    # Maps a raw pattern hit to the lexeme to report. Name candidates are
    # reported only when found in a name database; a value present in both
    # databases is reported as a first name.
    #
    # @return [Symbol, nil] nil when the hit should not be reported
    private def classify(kind, value)
      return kind unless kind == :name
      return :firstname if firstname?(value)
      return :surname if surname?(value)

      nil
    end

    # Replaces +length+ characters of the text, starting at character index
    # +start+, with asterisks. A new string is assigned (rather than mutating
    # in place) so scanners already running over the old text are unaffected;
    # the replacement has the same length, so character offsets stay valid.
    private def mask(start, length)
      @datum = @datum[0...start] + '*' * length + @datum[(start + length)..-1]
    end

    # Case-insensitive so that upper-case domains (Example.COM) are caught.
    private def email_regex
      /
        [\w+\-.]+
        @
        [a-z\d\-]+
        (\.[a-z]+)*
        \.[a-z]+ # TLD
      /xi
    end

    # @return [Integer, nil] index of the first e-mail address in the text
    private def email_address?
      datum =~ email_regex
    end

    # @return [Integer, nil] index of the first phone number in the text
    private def phone_number?
      datum =~ phone_regex
    end

    # Optional country code, then 3-3-4 digit groups with loose separators,
    # then an optional "x123" extension.
    private def phone_regex
      /\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\b/
    end

    private def surname?(value)
      found?('surnames', value)
    end

    private def firstname?(value)
      found?('firstnames', value)
    end

    # Databases are queried with the lower-cased value.
    private def found?(db, value)
      db(db).include?(value.downcase)
    end

    # Returns the named database, loading it on first use.
    # The loaded object only needs to respond to +include?+; presumably it is
    # a bloom filter, given the file requires bloomfilter-rb - TODO confirm.
    private def db(name)
      DATABASES[name]
    end
  end
end
data/stockade.gemspec ADDED
@@ -0,0 +1,30 @@
1
# Gem specification for stockade.
#
# Puts the gem's own lib/ directory on the load path so the version constant
# can be read without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'stockade/version'

Gem::Specification.new do |spec|
  spec.name          = 'stockade'
  spec.version       = Stockade::VERSION
  spec.authors       = ['Stan Mazhara']
  spec.email         = ['akmegran@gmail.com']

  spec.summary       = 'Stockade is a lexer for PII'
  # Written as adjacent literals so the published description carries no
  # leading newline or source indentation.
  spec.description   = 'Stockade is a lexer that reads unstructured text information (from ' \
                       'files, logs, databases etc.) and tokenizes pieces that look like ' \
                       'personally identifiable information (PII).'
  spec.homepage      = 'https://github.com/smazhara/stockade'
  spec.license       = 'MIT'

  # The library defines methods with `private def ...`, which relies on `def`
  # returning the method name (Ruby 2.1+).
  spec.required_ruby_version = '>= 2.1'

  # Package every git-tracked file except the test suites. `-z` makes git
  # emit NUL-separated, unquoted paths, so unusual file names survive intact.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end

  spec.add_development_dependency 'bundler', '~> 1.16'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'

  spec.add_runtime_dependency 'bloomfilter-rb', '~> 2.0'
  spec.add_runtime_dependency 'memoist', '~> 0.1'
end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: stockade
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Stan Mazhara
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-07-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bloomfilter-rb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: memoist
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.1'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.1'
83
+ description: "\n Stockade is a lexer that reads unstructured text information (from
84
+ files,\n logs, databases etc.) and tokenizes pieces that look like personally\n
85
+ \ identifiable information (PII).\n "
86
+ email:
87
+ - akmegran@gmail.com
88
+ executables: []
89
+ extensions: []
90
+ extra_rdoc_files: []
91
+ files:
92
+ - ".ruby-version"
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - LICENSE
96
+ - Rakefile
97
+ - bin/load
98
+ - data/firstnames.dump
99
+ - data/firstnames/1.csv
100
+ - data/surnames.dump
101
+ - data/surnames/1.csv
102
+ - data/surnames/2.csv
103
+ - lib/stockade.rb
104
+ - lib/stockade/version.rb
105
+ - stockade.gemspec
106
+ homepage: https://github.com/smazhara/stockade
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubyforge_project:
126
+ rubygems_version: 2.6.11
127
+ signing_key:
128
+ specification_version: 4
129
+ summary: Stockade is a lexer for PII
130
+ test_files: []