stockade 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
module Stockade
  module Lexemes
    # Base class for all lexemes.
    #
    # The lexer extracts lexeme candidates from text using the `.regex` of
    # the corresponding subclass, instantiates that subclass and then calls
    # its `#valid?` to verify that the candidate is indeed a valid lexeme.
    #
    class Base
      attr_reader :raw_value, :start_pos

      # @param value [String] matched text exactly as found in the input
      # @param start_pos [Integer, nil] offset of the match in the input
      def initialize(value, start_pos = nil)
        @raw_value = value
        @start_pos = start_pos
      end

      # @return [String] lower-cased, stripped form of the matched text
      def value
        raw_value.downcase.strip
      end

      # Offset just past the last character of the match.
      # Only meaningful when `start_pos` was given.
      def end_pos
        start_pos + raw_value.size
      end

      # Pattern used to find candidates. Returns nil here; subclasses
      # override it.
      def self.regex; end

      # The base lexeme accepts every candidate.
      def valid?
        true
      end

      # Two lexemes are equal when they are of the same class and have the
      # same normalized value; positions are not compared.
      #
      # The class is checked first so that comparing with `nil` or any
      # other non-lexeme object returns false instead of raising
      # NoMethodError on `other.value`.
      def ==(other)
        self.class == other.class && value == other.value
      end

      # @return [Range] `start_pos..end_pos`
      def range
        start_pos..end_pos
      end

      # @return [String] one `*` per character of the raw match
      def mask
        '*' * raw_value.size
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
require 'date'

module Stockade
  module Lexemes
    # Date lexeme
    #
    # A candidate is three groups of 1-4 digits separated by whitespace,
    # `.`, `-` or `/`, not adjacent to further digits.
    class Date < Base
      class << self
        # The pattern is built once per class and reused, because the lexer
        # asks for it on every scan iteration.
        def regex
          @regex ||= /
            (?<!\d)
            (\d{1,4})
            #{delim}
            (\d{1,4})
            #{delim}
            (\d{1,4})
            (?!\d)
          /x
        end

        # Single-character separator allowed between the date parts.
        def delim
          %r{[\s\.\-\/]}
        end
      end

      # Valid when at least one ordering of the three numbers forms a real
      # calendar date and none of the resulting dates lies in the future.
      def valid?
        dates = possible_dates
        return false if dates.empty?

        today = ::Date.today
        dates.all? { |date| date <= today }
      end

      private

      # Every permutation of the numeric parts that forms a valid
      # (year, month, day) calendar date.
      def possible_dates
        parts
          .permutation
          .select { |year, month, day| ::Date.valid_date?(year, month, day) }
          .map { |year, month, day| ::Date.new(year, month, day) }
      end

      # The three captured numbers, as integers.
      def parts
        self.class.regex.match(value).captures.map(&:to_i)
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
module Stockade
  module Lexemes
    # Abstract dictionary lexeme
    #
    # Dictionary lexemes are those that can only be verified by
    # checking the corresponding dictionary.
    #
    class Dict < Base
      # Candidates are runs of ASCII letters.
      def self.regex
        /
          [a-zA-Z]+
        /x
      end

      # Valid when the normalized value is present in the class dictionary.
      def valid?
        return false unless self.class.dict
        self.class.dict.include?(value)
      end

      def name
        raise 'Abstract'
      end

      # Whether the value is also found in the common words dictionary.
      #
      # `Base#initialize` takes the value positionally; passing it as a
      # `value:` keyword would store a Hash as `raw_value` and make
      # `#value` raise.
      def common_word?
        Word.new(value).valid?
      end

      class << self
        extend Memoist

        # Name of the dictionary dump (without extension); nil for the
        # abstract class, subclasses override it.
        def dict_name; end

        # Loads the dictionary trie from `data/<dict_name>.dump`.
        # NOTE(review): the path is relative, so it presumably resolves
        # against the current working directory - confirm this works when
        # the gem is used outside of its own checkout.
        def dict
          Rambling::Trie.load("data/#{dict_name}.dump")
        end
        memoize :dict
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Stockade
  module Lexemes
    # Email lexeme
    class Email < Base
      # Pattern for email address candidates.
      #
      # Matching is case-insensitive: the lexer scans the text as written,
      # so addresses such as `John@Example.COM` must be recognised too.
      # The local part allows word characters, `+`, `-` and `.`.
      def self.regex
        /
          [\w+\-.]+
          @
          [a-z\d\-]+
          (\.[a-z]+)*
          \.[a-z]+ # TLD
        /xi
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Stockade
  module Lexemes
    # Lexeme for a first name, verified against the `firstnames` dictionary.
    class Firstname < Dict
      # Name of the dictionary this lexeme is checked against.
      def self.dict_name
        'firstnames'
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Stockade
  module Lexemes
    # Lexeme for a last name, verified against the `lastnames` dictionary.
    class Lastname < Dict
      # Name of the dictionary this lexeme is checked against.
      def self.dict_name
        'lastnames'
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
module Stockade
  module Lexemes
    # Phone lexeme
    class Phone < Base
      # Less noisy phone mask syntax compared to regexes.
      # `#` stands for one digit; every other character is matched as
      # written (see `to_re` for how spaces and parentheses are treated).
      MASKS = [
        '#-###-###-####',
        '+#-###-###-####',
        '+##-###-###-####',
        '+###-###-###-####',
        '###-###-####',
        '### ### ####',
        '(## ##) #### ####',
        '##########',
        '(##) #### ####',
        '(##) ## #### ####',
        '###-###-###-####',
        '###-####',
        '(###) ###-####'
      ].freeze

      class << self
        # Alternation of all MASKS, tried in the order they are listed.
        #
        # Built once and reused: the lexer asks for the pattern on every
        # scan iteration, and the result never changes.
        def regex
          @regex ||= /
            #{MASKS.map { |mask| to_re(mask) }.join(" |\n")}
          /x
        end

        private

        # Convert phone number mask to its regex
        #   ### ### #### => (?:\d{3}\s\d{3}\s\d{4})
        #
        # `+` is escaped, each run of `#` becomes `\d{n}`, a space becomes
        # `\s`, and parentheses additionally tolerate whitespace on their
        # inner side.
        def to_re(mask)
          body = mask
                 .gsub('+', '\\\+')
                 .gsub(/(#+)/) { |m| "\\d{#{m.size}}" }
                 .gsub(' ', '\s')
                 .gsub('(', '\(\s*')
                 .gsub(')', '\\s*\)')

          "(?:#{body})"
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Stockade
  module Lexemes
    # A word found in the common words dictionary
    class Word < Dict
      class << self
        # Name of the dictionary this lexeme is checked against.
        def dict_name
          'words'
        end
      end

      # Common dictionary words are safe, so they are returned unmasked.
      def mask
        raw_value
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
require 'rambling-trie'
require 'memoist'
require 'strscan'

# pry-byebug is declared as a development dependency only, so it may be
# missing when the gem is installed for regular use. Loading the lexer must
# not fail because a debugging aid is absent.
begin
  require 'pry-byebug'
rescue LoadError
  nil
end

module Stockade
  # Class Lexer
  #
  # Usage `Stockade::Lexer.call(context)`
  #
  # Returns list of found lexemes.
  #
  class Lexer
    extend Memoist

    attr_reader :context

    # @param context [String] text to scan; a copy of it is kept
    def initialize(context)
      @context = context.dup
    end

    def self.call(context)
      new(context).call
    end

    # Lexeme classes the text is scanned for, in scan order.
    def lexeme_classes
      [
        Stockade::Lexemes::Date,
        Stockade::Lexemes::Email,
        Stockade::Lexemes::Phone,
        Stockade::Lexemes::Word,
        Stockade::Lexemes::Firstname,
        Stockade::Lexemes::Lastname
      ]
    end

    # Scans the whole text once per lexeme class and returns all valid
    # lexemes as a single flat list.
    def call
      lexeme_classes.flat_map { |lexeme_class| tokenize(lexeme_class) }
    end

    private

    # Collects every match of `lexeme_class.regex` in the text and keeps
    # those that the lexeme itself considers valid.
    def tokenize(lexeme_class)
      pattern = lexeme_class.regex
      scanner = StringScanner.new(context)
      candidates = []

      while scanner.scan_until(pattern)
        matched = scanner.matched
        # `charpos` counts characters like `String#size` does; `pos` is a
        # byte offset and would give wrong positions for multibyte text.
        candidates << lexeme_class.new(matched, scanner.charpos - matched.size)
      end

      candidates.select(&:valid?)
    end
  end
end
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
module Stockade
  # Parser
  #
  # Takes a raw list (a set really) of lexemes returned by Lexer
  # and further processes them. Currently this means:
  # 1. Deduping - >1 ambiguous lexemes matching exactly the same fragment
  #    are deduped according to their priority. Only the highest priority
  #    lexeme is kept.
  # 2. Removing covered lexemes - a lexeme that matches a string that is a
  #    substring of another matched string is removed.
  #
  class Parser
    extend Memoist

    attr_reader :lexemes

    def initialize(lexemes)
      @lexemes = lexemes
    end

    def self.call(lexemes)
      new(lexemes).call
    end

    # @return [Array] deduped lexemes that are not covered by another one
    def call
      non_covered_lexemes
    end

    private

    # Lexemes grouped by the exact range they occupy.
    def grouped
      lexemes.group_by(&:range).values
    end
    memoize :grouped

    # One lexeme per range: the one with the highest priority.
    def deduped
      grouped.map do |group|
        group.max_by { |lexeme| priority(lexeme) }
      end
    end
    memoize :deduped

    def ordered
      lexemes.sort_by(&:start_pos)
    end
    memoize :ordered

    # Drops every lexeme whose range lies within the range of another
    # lexeme.
    #
    # Each lexeme is checked against all the others, so the result does not
    # depend on the order of the input. Nothing is removed by `==` either:
    # lexeme equality ignores positions, so deleting by equality would also
    # drop identical lexemes found elsewhere in the text.
    def non_covered_lexemes
      candidates = deduped

      candidates.reject do |lex|
        candidates.any? { |other| covers?(other, lex) }
      end
    end

    # True when `head` is a different object than `lex` and its range
    # contains the range of `lex`.
    def covers?(head, lex)
      !head.equal?(lex) &&
        head.start_pos <= lex.start_pos &&
        head.end_pos >= lex.end_pos
    end

    # Position of the lexeme class in the priority list. Classes that are
    # not listed get the lowest priority, so they can still be compared.
    def priority(lexeme)
      lexeme_priorities.index(lexeme.class) || -1
    end

    # Lexeme classes from the lowest to the highest priority.
    def lexeme_priorities
      [
        Lexemes::Word,
        Lexemes::Firstname,
        Lexemes::Lastname,
        Lexemes::Phone,
        Lexemes::Date,
        Lexemes::Email
      ]
    end
  end
end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Stockade
2
- VERSION = "0.1.0"
4
+ VERSION = '0.1.1'
3
5
  end
data/lib/stockade.rb CHANGED
@@ -1,114 +1,29 @@
1
- require 'stockade/version'
1
+ # frozen_string_literal: true
2
2
 
3
- require 'bloomfilter-rb'
4
3
  require 'memoist'
5
- require 'strscan'
6
4
 
5
+ require 'stockade/version'
6
+ require 'stockade/lexer'
7
+ require 'stockade/parser'
8
+ require 'stockade/lexemes/base'
9
+ require 'stockade/lexemes/date'
10
+ require 'stockade/lexemes/email'
11
+ require 'stockade/lexemes/phone'
12
+ require 'stockade/lexemes/dict'
13
+ require 'stockade/lexemes/word'
14
+ require 'stockade/lexemes/lastname'
15
+ require 'stockade/lexemes/firstname'
16
+
17
+ # Stockade module
7
18
  module Stockade
8
- class Lexer
9
- extend Memoist
10
-
11
- attr_reader :datum
12
-
13
- def initialize(datum)
14
- @datum = datum.strip.dup
15
- end
16
-
17
- def self.call(datum)
18
- new(datum).call
19
- end
20
-
21
- # order is important - from most specific to least
22
- # the first one that matches stops the scan
23
- def patterns
24
- {
25
- email: email_regex,
26
- phone: phone_regex,
27
- name: name_regex,
28
- }
29
- end
30
-
31
- def scanner
32
- StringScanner.new(datum)
33
- end
34
- memoize :scanner
35
-
36
- def call
37
- res = []
38
-
39
- patterns.each do |name, regex|
40
- scanner = StringScanner.new(datum)
41
-
42
- loop do
43
- break unless scanner.scan_until(regex)
44
- value = scanner.matched
45
-
46
- lexeme = name
47
- if lexeme == :name
48
- lexeme = :surname if surname?(value)
49
- lexeme = :firstname if firstname?(value)
50
- end
51
- next if lexeme == :name
52
-
53
- res << {
54
- lexeme: lexeme,
55
- value: scanner.matched
56
- }
57
-
58
- @datum = @datum[0..scanner.pos-scanner.matched.size] +
59
- '*' * scanner.matched.size +
60
- @datum[scanner.pos..-1]
61
- end
62
- end
63
-
64
- res
65
- end
66
-
67
- def name_regex
68
- /\w+/
69
- end
70
-
71
- def word_regex
72
- /\W+/
73
- end
74
-
75
- private def email_regex
76
- /
77
- [\w+\-\.\+]+
78
- @
79
- [a-z\d\-]+
80
- (\.[a-z]+)*
81
- \.[a-z]+ # TLD
82
- /x
83
- end
84
-
85
- private def email_address?
86
- datum =~ email_regex
87
- end
88
-
89
- private def phone_number?
90
- datum =~ phone_number_regex
91
- end
92
-
93
- private def phone_regex
94
- /\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\b/
95
- end
96
-
97
- private def surname?(value)
98
- found?('surnames', value)
99
- end
100
-
101
- private def firstname?(value)
102
- found?('firstnames', value)
103
- end
104
-
105
- private def found?(db, value)
106
- db(db).include?(value.downcase)
107
- end
108
-
109
- private def db(name)
110
- Marshal.load(File.read("data/#{name}.dump"))
19
+ # Mask all PII in `text` with `*`
20
+ #
21
+ def self.mask(text)
22
+ lexemes = Parser.call(Lexer.call(text))
23
+ lexemes.inject(text) do |mask, lexeme|
24
+ prefix = lexeme.start_pos.zero? ? '' : mask[0..lexeme.start_pos - 1]
25
+ postfix = mask[lexeme.end_pos..-1]
26
+ "#{prefix}#{lexeme.mask}#{postfix}"
111
27
  end
112
- memoize :db
113
28
  end
114
29
  end
data/stockade.gemspec CHANGED
@@ -1,4 +1,6 @@
1
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path('lib', __dir__)
2
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
5
  require 'stockade/version'
4
6
 
@@ -8,23 +10,27 @@ Gem::Specification.new do |spec|
8
10
  spec.authors = ['Stan Mazhara']
9
11
  spec.email = ['akmegran@gmail.com']
10
12
 
11
- spec.summary = %q{Stockade is a lexer for PII}
12
- spec.description = %q{
13
+ spec.summary = 'Stockade is a lexer for PII'
14
+ spec.description = '
13
15
  Stockade is a lexer that reads unstructured text information (from files,
14
16
  logs, databases etc.) and tokenizes pieces that look like personally
15
17
  identifiable information (PII).
16
- }
18
+ '
17
19
  spec.homepage = 'https://github.com/smazhara/stockade'
18
20
  spec.license = 'MIT'
19
21
 
20
- spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
21
- `git ls-files`.split(/\n/).reject { |f| f.match(%r{^(test|spec|features)/}) }
22
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
23
+ `git ls-files`.split(/\n/).reject do |f|
24
+ f.match(%r{^(test|spec|features)/})
25
+ end
22
26
  end
23
27
 
24
28
  spec.add_development_dependency 'bundler', '~> 1.16'
29
+ spec.add_development_dependency 'pry-byebug', '~> 3.0'
25
30
  spec.add_development_dependency 'rake', '~> 10.0'
26
31
  spec.add_development_dependency 'rspec', '~> 3.0'
32
+ spec.add_development_dependency 'rubocop', '~> 0.5'
27
33
 
28
- spec.add_runtime_dependency 'bloomfilter-rb', '~> 2.0'
29
34
  spec.add_runtime_dependency 'memoist', '~> 0.1'
35
+ spec.add_runtime_dependency 'rambling-trie', '~> 2.0'
30
36
  end