stockade 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Stockade
4
+ module Lexemes
5
+ # Base class for all lexemes
6
+ #
7
+ # Lexer extracts lexeme candidates from text using `.regex` of the
8
+ # corresponding subclass, instantiates it and then further calls
9
+ # its `#valid?` to verify that this is indeed a valid lexeme.
10
+ #
11
+ class Base
12
+ attr_reader :raw_value, :start_pos
13
+
14
+ def initialize(value, start_pos = nil)
15
+ @raw_value = value
16
+ @start_pos = start_pos
17
+ end
18
+
19
+ def value
20
+ raw_value.downcase.strip
21
+ end
22
+
23
+ def end_pos
24
+ start_pos + raw_value.size
25
+ end
26
+
27
+ def self.regex; end
28
+
29
+ def valid?
30
+ true
31
+ end
32
+
33
+ def ==(other)
34
+ value == other.value &&
35
+ self.class == other.class
36
+ end
37
+
38
+ def range
39
+ start_pos..end_pos
40
+ end
41
+
42
+ def mask
43
+ '*' * raw_value.size
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Stockade
4
+ module Lexemes
5
+ # Date lexeme
6
+ class Date < Base
7
+ class << self
8
+ def regex
9
+ /
10
+ (?<!\d)
11
+ (\d{1,4})
12
+ #{delim}
13
+ (\d{1,4})
14
+ #{delim}
15
+ (\d{1,4})
16
+ (?!\d)
17
+ /x
18
+ end
19
+
20
+ def delim
21
+ %r{[\s\.\-\/]}
22
+ end
23
+ end
24
+
25
+ def valid?
26
+ possible_dates.any? &&
27
+ possible_dates.all? do |date|
28
+ date <= ::Date.today
29
+ end
30
+ end
31
+
32
+ private
33
+
34
+ def possible_dates
35
+ parts.permutation.map do |permutation|
36
+ begin
37
+ ::Date.new(*permutation)
38
+ rescue ArgumentError
39
+ nil
40
+ end
41
+ end.compact
42
+ end
43
+
44
+ def parts
45
+ self.class.regex.match(value).captures.map(&:to_i)
46
+ end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Stockade
4
+ module Lexemes
5
+ # Abstract Dictionary lexeme
6
+ #
7
+ # Dictionary lexemes are those that can only be verified by
8
+ # checking the corresponding dictionary
9
+ #
10
+ class Dict < Base
11
+ def self.regex
12
+ /
13
+ [a-zA-Z]+
14
+ /x
15
+ end
16
+
17
+ def valid?
18
+ return false unless self.class.dict
19
+ self.class.dict.include?(value)
20
+ end
21
+
22
+ def name
23
+ raise 'Abstract'
24
+ end
25
+
26
+ def common_word?
27
+ Word.new(value: value).valid?
28
+ end
29
+
30
+ class << self
31
+ extend Memoist
32
+
33
+ def dict_name; end
34
+
35
+ def dict
36
+ Rambling::Trie.load("data/#{dict_name}.dump")
37
+ end
38
+ memoize :dict
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Stockade
4
+ module Lexemes
5
+ # Email lexeme
6
+ class Email < Base
7
+ def self.regex
8
+ /
9
+ [\w+\-\.\+]+
10
+ @
11
+ [a-z\d\-]+
12
+ (\.[a-z]+)*
13
+ \.[a-z]+ # TLD
14
+ /x
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Stockade
4
+ module Lexemes
5
+ # Firstname lexeme
6
+ class Firstname < Dict
7
+ class << self
8
+ def dict_name
9
+ 'firstnames'
10
+ end
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Stockade
4
+ module Lexemes
5
+ # Lastname lexeme
6
+ class Lastname < Dict
7
+ class << self
8
+ def dict_name
9
+ 'lastnames'
10
+ end
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Stockade
4
+ module Lexemes
5
+ # Phone lexeme
6
+ class Phone < Base
7
+ # Less noisy phone mask syntax compared to regexes
8
+ MASKS = [
9
+ '#-###-###-####',
10
+ '+#-###-###-####',
11
+ '+##-###-###-####',
12
+ '+###-###-###-####',
13
+ '###-###-####',
14
+ '### ### ####',
15
+ '(## ##) #### ####',
16
+ '##########',
17
+ '(##) #### ####',
18
+ '(##) ## #### ####',
19
+ '###-###-###-####',
20
+ '###-####',
21
+ '(###) ###-####'
22
+ ].freeze
23
+
24
+ class << self
25
+ def regex
26
+ /
27
+ #{MASKS
28
+ .map { |mask| to_re(mask) }
29
+ .join(" |\n")
30
+ }
31
+ /x
32
+ end
33
+
34
+ private
35
+
36
+ # Convert phone number mask to its regex
37
+ # ### ### #### => (?:\d{3}\s\d{3}\s\d{4})
38
+ def to_re(mask)
39
+ '(?:' +
40
+ mask
41
+ .gsub('+', '\\\+')
42
+ .gsub(/(#+)/) { |m| "\\d{#{m.size}}" }
43
+ .gsub(' ', '\s')
44
+ .gsub('(', '\(\s*')
45
+ .gsub(')', '\\s*\)') +
46
+ ')'
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Stockade
4
+ module Lexemes
5
+ # A word found in the common words dictionary
6
+ class Word < Dict
7
+ def self.dict_name
8
+ 'words'
9
+ end
10
+
11
+ # common dictionary words are safe
12
+ def mask
13
+ raw_value
14
+ end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rambling-trie'
4
+ require 'memoist'
5
+ require 'strscan'
6
+ require 'pry-byebug'
7
+
8
+ module Stockade
9
+ # Class Lexer
10
+ #
11
+ # Usage `Stockade::Lexer.call(context)`
12
+ #
13
+ # Returns list of found lexemes.
14
+ #
15
+ class Lexer
16
+ extend Memoist
17
+
18
+ attr_reader :context
19
+
20
+ def initialize(context)
21
+ @context = context.dup
22
+ end
23
+
24
+ def self.call(context)
25
+ new(context).call
26
+ end
27
+
28
+ def lexeme_classes
29
+ [
30
+ Stockade::Lexemes::Date,
31
+ Stockade::Lexemes::Email,
32
+ Stockade::Lexemes::Phone,
33
+ Stockade::Lexemes::Word,
34
+ Stockade::Lexemes::Firstname,
35
+ Stockade::Lexemes::Lastname
36
+ ]
37
+ end
38
+
39
+ def call
40
+ lexeme_classes.map do |lexeme_class|
41
+ tokenize(lexeme_class)
42
+ end.flatten
43
+ end
44
+
45
+ private
46
+
47
+ def tokenize(lexeme_class)
48
+ lexemes = []
49
+ scanner = StringScanner.new(context)
50
+
51
+ while scanner.scan_until(lexeme_class.regex)
52
+ lexemes << lexeme_class.new(
53
+ scanner.matched,
54
+ scanner.pos - scanner.matched.size
55
+ )
56
+ end
57
+
58
+ lexemes.select(&:valid?)
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Stockade
4
+ # Parser
5
+ #
6
+ # Takes a raw list (a set really) of lexemes returned by Lexer
7
+ # and further processes them. Currently this means:
8
+ # 1. Deduping - >1 ambiguous lexemes matching exactly the same fragment
9
+ # are deduped according to their priority. Only the highest priority
10
+ # lexeme is kept.
11
+ # 2. Removing covered lexemes - a lexeme that matches a string that is a
12
+ # substring of another matched string is removed.
13
+ #
14
+ class Parser
15
+ extend Memoist
16
+
17
+ attr_reader :lexemes
18
+
19
+ def initialize(lexemes)
20
+ @lexemes = lexemes
21
+ end
22
+
23
+ def self.call(lexemes)
24
+ new(lexemes).call
25
+ end
26
+
27
+ def call
28
+ non_covered_lexemes
29
+ end
30
+
31
+ private
32
+
33
+ def grouped
34
+ lexemes.group_by(&:range).values
35
+ end
36
+ memoize :grouped
37
+
38
+ def deduped
39
+ grouped.map do |group|
40
+ if group.size == 1
41
+ group.first
42
+ else
43
+ group.max_by do |lexeme|
44
+ priority(lexeme)
45
+ end
46
+ end
47
+ end
48
+ end
49
+ memoize :deduped
50
+
51
+ def ordered
52
+ lexemes.sort_by(&:start_pos)
53
+ end
54
+ memoize :ordered
55
+
56
+ def non_covered_lexemes
57
+ res = deduped.dup
58
+
59
+ res.each_index do |index|
60
+ head, *rest = *res[index..-1]
61
+
62
+ rest.each do |lex|
63
+ next unless covers?(head, lex)
64
+ res.delete(lex)
65
+ end
66
+ end.to_a
67
+
68
+ res
69
+ end
70
+
71
+ def covers?(head, lex)
72
+ head != lex &&
73
+ head.start_pos <= lex.start_pos &&
74
+ head.end_pos >= lex.end_pos
75
+ end
76
+
77
+ def priority(lexeme)
78
+ lexeme_priorities.index(lexeme.class)
79
+ end
80
+
81
+ def lexeme_priorities
82
+ [
83
+ Lexemes::Word,
84
+ Lexemes::Firstname,
85
+ Lexemes::Lastname,
86
+ Lexemes::Phone,
87
+ Lexemes::Date,
88
+ Lexemes::Email
89
+ ]
90
+ end
91
+ end
92
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Stockade
2
- VERSION = "0.1.0"
4
+ VERSION = '0.1.1'
3
5
  end
data/lib/stockade.rb CHANGED
@@ -1,114 +1,29 @@
1
- require 'stockade/version'
1
+ # frozen_string_literal: true
2
2
 
3
- require 'bloomfilter-rb'
4
3
  require 'memoist'
5
- require 'strscan'
6
4
 
5
+ require 'stockade/version'
6
+ require 'stockade/lexer'
7
+ require 'stockade/parser'
8
+ require 'stockade/lexemes/base'
9
+ require 'stockade/lexemes/date'
10
+ require 'stockade/lexemes/email'
11
+ require 'stockade/lexemes/phone'
12
+ require 'stockade/lexemes/dict'
13
+ require 'stockade/lexemes/word'
14
+ require 'stockade/lexemes/lastname'
15
+ require 'stockade/lexemes/firstname'
16
+
17
+ # Stockade module
7
18
  module Stockade
8
- class Lexer
9
- extend Memoist
10
-
11
- attr_reader :datum
12
-
13
- def initialize(datum)
14
- @datum = datum.strip.dup
15
- end
16
-
17
- def self.call(datum)
18
- new(datum).call
19
- end
20
-
21
- # order is important - from most specific to least
22
- # the first one that matches stops the scan
23
- def patterns
24
- {
25
- email: email_regex,
26
- phone: phone_regex,
27
- name: name_regex,
28
- }
29
- end
30
-
31
- def scanner
32
- StringScanner.new(datum)
33
- end
34
- memoize :scanner
35
-
36
- def call
37
- res = []
38
-
39
- patterns.each do |name, regex|
40
- scanner = StringScanner.new(datum)
41
-
42
- loop do
43
- break unless scanner.scan_until(regex)
44
- value = scanner.matched
45
-
46
- lexeme = name
47
- if lexeme == :name
48
- lexeme = :surname if surname?(value)
49
- lexeme = :firstname if firstname?(value)
50
- end
51
- next if lexeme == :name
52
-
53
- res << {
54
- lexeme: lexeme,
55
- value: scanner.matched
56
- }
57
-
58
- @datum = @datum[0..scanner.pos-scanner.matched.size] +
59
- '*' * scanner.matched.size +
60
- @datum[scanner.pos..-1]
61
- end
62
- end
63
-
64
- res
65
- end
66
-
67
- def name_regex
68
- /\w+/
69
- end
70
-
71
- def word_regex
72
- /\W+/
73
- end
74
-
75
- private def email_regex
76
- /
77
- [\w+\-\.\+]+
78
- @
79
- [a-z\d\-]+
80
- (\.[a-z]+)*
81
- \.[a-z]+ # TLD
82
- /x
83
- end
84
-
85
- private def email_address?
86
- datum =~ email_regex
87
- end
88
-
89
- private def phone_number?
90
- datum =~ phone_number_regex
91
- end
92
-
93
- private def phone_regex
94
- /\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\b/
95
- end
96
-
97
- private def surname?(value)
98
- found?('surnames', value)
99
- end
100
-
101
- private def firstname?(value)
102
- found?('firstnames', value)
103
- end
104
-
105
- private def found?(db, value)
106
- db(db).include?(value.downcase)
107
- end
108
-
109
- private def db(name)
110
- Marshal.load(File.read("data/#{name}.dump"))
19
+ # Mask all PII in `text` with `*`
20
+ #
21
+ def self.mask(text)
22
+ lexemes = Parser.call(Lexer.call(text))
23
+ lexemes.inject(text) do |mask, lexeme|
24
+ prefix = lexeme.start_pos.zero? ? '' : mask[0..lexeme.start_pos - 1]
25
+ postfix = mask[lexeme.end_pos..-1]
26
+ "#{prefix}#{lexeme.mask}#{postfix}"
111
27
  end
112
- memoize :db
113
28
  end
114
29
  end
data/stockade.gemspec CHANGED
@@ -1,4 +1,6 @@
1
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path('lib', __dir__)
2
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
5
  require 'stockade/version'
4
6
 
@@ -8,23 +10,27 @@ Gem::Specification.new do |spec|
8
10
  spec.authors = ['Stan Mazhara']
9
11
  spec.email = ['akmegran@gmail.com']
10
12
 
11
- spec.summary = %q{Stockade is a lexer for PII}
12
- spec.description = %q{
13
+ spec.summary = 'Stockade is a lexer for PII'
14
+ spec.description = '
13
15
  Stockade is a lexer that reads unstructured text information (from files,
14
16
  logs, databases etc.) and tokenizes pieces that look like personally
15
17
  identifiable information (PII).
16
- }
18
+ '
17
19
  spec.homepage = 'https://github.com/smazhara/stockade'
18
20
  spec.license = 'MIT'
19
21
 
20
- spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
21
- `git ls-files`.split(/\n/).reject { |f| f.match(%r{^(test|spec|features)/}) }
22
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
23
+ `git ls-files`.split(/\n/).reject do |f|
24
+ f.match(%r{^(test|spec|features)/})
25
+ end
22
26
  end
23
27
 
24
28
  spec.add_development_dependency 'bundler', '~> 1.16'
29
+ spec.add_development_dependency 'pry-byebug', '~> 3.0'
25
30
  spec.add_development_dependency 'rake', '~> 10.0'
26
31
  spec.add_development_dependency 'rspec', '~> 3.0'
32
+ spec.add_development_dependency 'rubocop', '~> 0.5'
27
33
 
28
- spec.add_runtime_dependency 'bloomfilter-rb', '~> 2.0'
29
34
  spec.add_runtime_dependency 'memoist', '~> 0.1'
35
+ spec.add_runtime_dependency 'rambling-trie', '~> 2.0'
30
36
  end