ramparts 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ # Roadmap
2
+ - [x] Add error messages, especially for malformed input
3
+ - [ ] Implement options for `find_phone_numbers` and `replace_phone_numbers`
4
+ - [x] Implement [Stale](https://github.com/probot/stale) for issues
5
+ - [ ] Implement Automatic Deployments
6
+ - [x] Look into using hashes instead of array for testing infrastructure
7
+ - [ ] Look into using custom rspec matchers
8
+ - [ ] Travis badge for marketing
9
+ - [x] Use Ruby that's already installed on TravisCI
10
+ - [ ] Use threads for true Map/Reduce. Look into [Celluloid](https://github.com/celluloid/celluloid)
11
+ - [ ] Have check domain (for email matches) as an option (not only on :aggressive option) to reduce over matching
12
+ - [ ] Check TLD to make sure it's a valid TLD as an option to reduce over matching
13
+ - [x] Refactor tests to use hashes instead of arrays
14
+ - [ ] Allow use of function based replacement instead of forcing yield functionality
15
+ - [ ] Implement URL functionality
16
+ - [ ] Change URL to keywords list functionality
17
+ - [ ] Robust-ify the overlapping interval scenario
18
+ - [ ] Get set up on Code Climate once the repo goes open source
19
+ - [ ] Think about using InchCI (inline docs) instead of README
20
+ - [x] Create a `spec_helper.rb` and use tools like SimpleCov
21
+ - [ ] Look into using Shoulda matchers, and test randomization.
File without changes
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative './ramparts/base'
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative './parsers/email_parser'
4
+ require_relative './parsers/phone_parser'
5
+ require_relative './parsers/url_parser'
6
+ require_relative './helpers'
7
+
8
# Public facade of the library. Every method builds fresh parser objects,
# forwards +text+ and +options+ to them and returns what the parsers produce.
class Ramparts
  # Returns the number of phone numbers PhoneParser counts in +text+.
  def self.count_phone_numbers(text, options = {})
    PhoneParser.new.count_phone_number_instances(text, options)
  end

  # Returns the phone number instances PhoneParser finds in +text+.
  def self.find_phone_numbers(text, options = {})
    PhoneParser.new.find_phone_number_instances(text, options)
  end

  # Replaces each phone number in +text+ with the value the block returns for it.
  def self.replace_phone_numbers(text, options = {}, &block)
    PhoneParser.new.replace_phone_number_instances(text, options, &block)
  end

  # Returns the number of emails EmailParser counts in +text+.
  def self.count_emails(text, options = {})
    EmailParser.new.count_email_instances(text, options)
  end

  # Returns the email instances EmailParser finds in +text+.
  def self.find_emails(text, options = {})
    EmailParser.new.find_email_instances(text, options)
  end

  # Replaces each email in +text+ with the value the block returns for it.
  def self.replace_emails(text, options = {}, &block)
    EmailParser.new.replace_email_instances(text, options, &block)
  end

  # Returns whatever UrlParser#count_url_instances returns for +text+.
  def self.count_urls(text, options = {})
    UrlParser.new.count_url_instances(text, options)
  end

  # Returns the phone number count plus the email count for +text+.
  def self.count_phone_numbers_and_emails(text, options = {})
    phone_count = PhoneParser.new.count_phone_number_instances(text, options)
    email_count = EmailParser.new.count_email_instances(text, options)
    phone_count + email_count
  end

  # Returns the phone instances followed by the email instances found in +text+.
  # With +compare: true+ in +options+, emails whose offsets overlap a phone
  # number are left out.
  def self.find_phone_numbers_and_emails(text, options = {})
    phone_instances, email_instances = phone_and_email_instances(text, options)
    phone_instances + email_instances
  end

  # Replaces every phone number and email in +text+ with the value the block
  # returns for it. With +compare: true+ in +options+, emails whose offsets
  # overlap a phone number are not replaced.
  def self.replace_phone_numbers_and_emails(text, options = {}, &block)
    phone_instances, email_instances = phone_and_email_instances(text, options)

    # The combined list is built only after the :compare filtering, otherwise
    # the filtered-out emails would still be replaced.
    # Phones and emails come from separate scans, so their relative order is
    # unknown. Sorting by start offset and reversing lets the replacement run
    # from the end of the string to the start, keeping earlier offsets valid.
    ordered = (phone_instances + email_instances).sort_by { |instance| instance[:start_offset] }.reverse!

    replace(text, ordered, &block)
  end

  # Finds phone and email instances and returns them as
  # [phone_instances, email_instances], applying the :compare option.
  def self.phone_and_email_instances(text, options)
    phone_instances = PhoneParser.new.find_phone_number_instances(text, options)
    email_instances = EmailParser.new.find_email_instances(text, options)
    return [phone_instances, email_instances] unless options.fetch(:compare, false)

    phone_ranges = phone_instances.map { |phone| (phone[:start_offset]...phone[:end_offset]) }
    distinct_emails = email_instances.reject do |email|
      email_range = (email[:start_offset]...email[:end_offset])
      phone_ranges.any? { |phone_range| ranges_overlap?(phone_range, email_range) }
    end

    [phone_instances, distinct_emails]
  end
  private_class_method :phone_and_email_instances
end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
# Domain labels used to validate candidate emails in aggressive mode, where
# a match needs no '.com'-style suffix. The list mixes real provider names
# with what look like common misspellings of them (e.g. 'gmai', 'gamil').
EMAIL_DOMAINS = %w[
  gmail yahoo hotmail aol icloud live outlook ymail comcast shaw
  rogers msn mail me att careguide sbcglobal rocketmail telus sympatico
  cox gmai email aim yandex gamil gmx student students earthlink
  gnail juno gmsil netzero ail gmil gmal hmail yaho alumni
  gmial googlemail tampabay mtroyal usa cfl yshoo protonmail rediffmail liberty
  maine inbox optimum example yhaoo yorku mchsi yahoi zoho hushmail
  libero hotmal ukr wowway post lycos yaboo contractor yahool
].freeze
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: false
2
+
3
# Error message used when the text handed to a parser is not a String.
ARGUMENT_ERROR_TEXT = 'Parameter 1, the block of text to parse, is not a string'.freeze

# Identifier of the map reduce (MR) algorithm, roughly 2x faster than GR.
# It first maps fragments such as 'at' or 'FOUR' down to '@' and '4' and
# strips spaces and the like, then runs a simple regex over what is left.
# The mapping loses information, so MR cannot report match indices.
MR_ALGO = 'MR'.freeze

# Identifier of the glorified regex (GR) algorithm.
# A single pass over the text with one large, robust regex. The complexity
# of that regex makes it slower than MR, but nothing is lost along the way,
# so the indices of phone numbers and other matches can be returned.
GR_ALGO = 'GR'.freeze
17
+
18
# Returns a copy of +text+ in which the span of every instance
# (+:start_offset+ up to, but excluding, +:end_offset+) has been replaced
# with the value the block returns for that instance. +text+ itself is left
# untouched.
#
# Instances are applied in the order given. A replacement can change the
# length of the string, so pass the instances from the last offset to the
# first to keep the remaining offsets valid.
#
# Raises ArgumentError when no block is given.
def replace(text, instances, &block)
  raise ArgumentError, 'A block returning the replacement text is required' unless block

  altered_text = String.new(text)

  instances.each do |instance|
    altered_text[instance[:start_offset]...instance[:end_offset]] = block.call(instance)
  end

  altered_text
end
28
+
29
# Collects every match of +regex+ in +text+. Each match becomes a hash with
# the character offset where it starts, the offset just past its end, the
# matched string and the +type+ passed in.
def scan(text, regex, type)
  text.to_enum(:scan, regex).map do
    match = Regexp.last_match
    {
      start_offset: match.begin(0),
      end_offset: match.end(0),
      value: match.to_s,
      type: type
    }
  end
end
42
+
43
# Two ranges are treated as overlapping when either one covers the first
# element of the other.
def ranges_overlap?(r1, r2)
  [[r1, r2], [r2, r1]].any? { |outer, inner| outer.cover?(inner.first) }
end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../data/list_of_email_domains'
4
+ require_relative '../helpers'
5
+
6
# Parses text and attempts to locate emails, including obfuscated spellings
# such as 'john at gmail dot com'.
class EmailParser
  # Characters allowed in the local part of an email, as a character class body
  TEXT_MATCH = 'a-z0-9._%+-'

  # rubocop:disable LineLength

  # Regex to find the emails, must have .com or something similar to match
  GR_REGEX = /(([#{TEXT_MATCH}]{1}[^\w]{1})+|([#{TEXT_MATCH}])+)([^\w]*@[^\w]*){1}[a-z0-9.-]+((\.|[^\w]+(dot){1}[^\w]+){1}[a-z]{2,})+/
  # Regex to find the emails, must have .com or something similar to match and also checks for the word 'at' as '@'
  GR_REGEX_WITH_AT = /(([#{TEXT_MATCH}]{1}[^\w]{1})+|([#{TEXT_MATCH}])+)([^\w]+(at){1}[^\w]+|[^\w]*@[^\w]*){1}[a-z0-9.-]+((\.|[^\w]+(dot){1}[^\w]+){1}[a-z]{2,})+/
  # Regex to find the emails, does not need .com or something similar to match
  GR_REGEX_WITHOUT_DOT = /(([#{TEXT_MATCH}]{1}[^\w]{1})+|([#{TEXT_MATCH}])+)([^\w]+(at){1}[^\w]+|[^\w]*@[^\w]*){1}[a-z0-9.-]+([^\w]*\.[^\w]*|[^\w]+(dot){1}[^\w]+)?([a-z]{2,})?/

  # rubocop:enable LineLength

  # Regex to find emails for MapReduce, must have .com or something similar to match
  MR_REGEX = /[a-z0-9._%+-]+\${,2}@{1}\${,2}[a-z0-9.-]+\${,2}(\.){1}[a-z]{2,}/
  # Regex to find emails for MapReduce, does not have to have .com or something similar to match
  MR_REGEX_WITHOUT_DOT = /[a-z0-9._%+-]+\${,2}@{1}\${,2}[a-z0-9.-]+/

  # Map these occurrences down to their constituent parts
  REPLACEMENTS = {
    ' at ' => '@',
    '(at)' => '@',
    ' dot ' => '.'
  }.freeze

  # Counts email occurrences within a block of text.
  # Uses the map reduce algorithm: the text is normalised with parse_email
  # before it is scanned.
  # Raises ArgumentError when +text+ is not a String.
  def count_email_instances(text, options)
    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String

    email_instances(MR_ALGO, parse_email(text), options).length
  end

  # Replaces each email found in +text+ with the value the block returns for it.
  # Raises ArgumentError when +text+ is not a String.
  def replace_email_instances(text, options, &block)
    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String

    # Matches are handed over last-to-first so earlier offsets stay valid
    # while the text is being rewritten.
    replace(text, find_email_instances(text, options).reverse, &block)
  end

  # Finds the occurrences of emails within a block of text and returns their
  # positions. The text is downcased before scanning, so each reported
  # +:value+ is lower case.
  # Raises ArgumentError when +text+ is not a String.
  def find_email_instances(text, options)
    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String

    email_instances(GR_ALGO, text.downcase, options)
  end

  private

  # Parses the email and maps down certain occurrences: ' at ' and '(at)'
  # become '@', ' dot ' becomes '.', and every character outside
  # word characters and '@._%+-' becomes '$'.
  def parse_email(text)
    text.downcase.gsub(/\ at\ |\(at\)|\ dot\ /, REPLACEMENTS).gsub(/[^\w\@\.\_\%\+\-]/, '$')
  end

  # Scans +text+ with the regex selected by +algo+ and +options+:
  # - :aggressive   no '.com'-style suffix needed; matches are kept only when
  #                 their domain part contains an entry of EMAIL_DOMAINS
  # - :check_for_at also accepts the word 'at' in place of '@'
  #                 (always uses GR_REGEX_WITH_AT, whatever +algo+ is)
  # - otherwise     the default regex of the algorithm
  def email_instances(algo, text, options)
    map_reduce = algo == MR_ALGO

    if options.fetch(:aggressive, false)
      candidates = scan(text, map_reduce ? MR_REGEX_WITHOUT_DOT : GR_REGEX_WITHOUT_DOT, :email)
      candidates.select { |candidate| known_domain?(candidate[:value]) }
    elsif options.fetch(:check_for_at, false)
      scan(text, GR_REGEX_WITH_AT, :email)
    else
      scan(text, map_reduce ? MR_REGEX : GR_REGEX, :email)
    end
  end

  # True when the text after the first '@' of +value+ contains one of the
  # EMAIL_DOMAINS entries as a substring. Values without text after an '@'
  # (e.g. matches written with the word 'at') never qualify.
  def known_domain?(value)
    domain_part = value.split('@')[1]
    return false unless domain_part

    EMAIL_DOMAINS.any? { |domain| domain_part.include?(domain) }
  end
end
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../helpers'
4
+
5
# Parses text and attempts to locate phone numbers, including digits spelled
# out phonetically ('four') or in l33t speak ('f0r').
class PhoneParser
  # Replacements used for phonetics for MR
  REPLACEMENTS = {
    'one' => '1',
    'two' => '2',
    'three' => '3',
    'four' => '4',
    'five' => '5',
    'six' => '6',
    'seven' => '7',
    'eight' => '8',
    'nine' => '9',
    'oh' => '0',
    'zero' => '0'
  }.freeze

  # Replacements used for l33t for MR
  LEET_REPLACEMENTS = {
    'w0n' => '1',
    'too' => '2',
    'thr33' => '3',
    'f0r' => '4',
    'f1v3' => '5',
    's3x' => '6',
    'sex' => '6',
    's3v3n' => '7',
    'at3' => '8',
    'nin3' => '9'
  }.freeze

  # Phonetic versions of numbers, taken from the replacement table so the
  # two cannot drift apart
  PHONETICS = REPLACEMENTS.keys.freeze

  # L33t speak versions of numbers, taken from the replacement table
  LEET_SPEAK = LEET_REPLACEMENTS.keys.freeze

  # Handles multiple spaces
  MULTI_SPACE = '( )*'

  # Regex source for phonetics, both with spaces and otherwise
  REGEX_PHONETICS = PHONETICS.join('|')
  REGEX_PHONETICS_SPACED = PHONETICS.map { |word| word.chars.join(MULTI_SPACE) }.join('|')

  # Regex source for l33t, both with spaces and otherwise
  REGEX_LEET_SPEAK = LEET_SPEAK.join('|')
  REGEX_LEET_SPEAK_SPACED = LEET_SPEAK.map { |word| word.chars.join(MULTI_SPACE) }.join('|')

  # Base matching for a possible phone number digit
  BASE_MATCHING = "#{REGEX_PHONETICS}|#{REGEX_LEET_SPEAK}|#{REGEX_PHONETICS_SPACED}|#{REGEX_LEET_SPEAK_SPACED}"

  # The final regex used to match phone numbers for GR
  GR_REGEX =
    /(\()?(\d|#{BASE_MATCHING}){1}([^\w]*(\d|#{BASE_MATCHING}){1}[^\w]*){5,}(\d|#{BASE_MATCHING}){1}/

  # The final regex used to match phone numbers for MR
  MR_REGEX = /(\-*\.?\d{1}\.?\-*){7,}/

  # Compiled once here instead of on every parse_phone_number call
  PHONETICS_PATTERN = /#{REGEX_PHONETICS}/
  LEET_SPEAK_PATTERN = /#{REGEX_LEET_SPEAK}/

  # Counts the number of phone number instances that occur within the block
  # of text. Uses the map reduce algorithm: the text is normalised with
  # parse_phone_number before it is scanned.
  # Raises ArgumentError when +text+ is not a String.
  def count_phone_number_instances(text, options)
    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String

    phone_number_instances(MR_ALGO, parse_phone_number(text, options), options).length
  end

  # Replaces each phone number found in +text+ with the value the block
  # returns for it.
  # Raises ArgumentError when +text+ is not a String.
  def replace_phone_number_instances(text, options, &block)
    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String

    # Matches are handed over last-to-first so earlier offsets stay valid
    # while the text is being rewritten.
    replace(text, find_phone_number_instances(text, options).reverse, &block)
  end

  # Finds phone number instances within the block of text using the
  # glorified regex algorithm. The text is downcased before scanning, so each
  # reported +:value+ is lower case.
  # Raises ArgumentError when +text+ is not a String.
  def find_phone_number_instances(text, options)
    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String

    phone_number_instances(GR_ALGO, text.downcase, options)
  end

  private

  # Normalises +text+ for MR. Options:
  # - :remove_spaces (default true) deletes every space first
  # - :parse_leet    (default true) also maps l33t words to digits
  # Phonetic words always become digits; afterwards every non-word character
  # becomes '-' and every remaining lower-case letter becomes '.'.
  def parse_phone_number(text, options)
    text = text.delete(' ') if options.fetch(:remove_spaces, true)
    text = text.downcase.gsub(PHONETICS_PATTERN, REPLACEMENTS)
    text = text.gsub(LEET_SPEAK_PATTERN, LEET_REPLACEMENTS) if options.fetch(:parse_leet, true)

    text.gsub(/[^\w]/, '-').gsub(/[a-z]/, '.')
  end

  # Returns the phone number instances using the specified algorithm
  def phone_number_instances(algo, text, _options)
    scan(text, algo == MR_ALGO ? MR_REGEX : GR_REGEX, :phone)
  end
end