RubyGems - ramparts - Versions diffs - 0.3.0 - Mend

ramparts 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +7 -0
data/.github/stale.yml +24 -0
data/.gitignore +1 -0
data/.rspec +1 -0
data/.rubocop.yml +54 -0
data/.travis.yml +8 -0
data/CHANGELOG.md +18 -0
data/CONTRIBUTING.md +46 -0
data/Gemfile +9 -0
data/Gemfile.lock +52 -0
data/LICENSE.md +21 -0
data/README.md +226 -0
data/ROADMAP.md +21 -0
data/Rakefile +0 -0
data/lib/ramparts.rb +3 -0
data/lib/ramparts/base.rb +99 -0
data/lib/ramparts/data/list_of_email_domains.rb +73 -0
data/lib/ramparts/helpers.rb +46 -0
data/lib/ramparts/parsers/email_parser.rb +88 -0
data/lib/ramparts/parsers/phone_parser.rb +137 -0
data/lib/ramparts/parsers/url_parser.rb +30 -0
data/lib/ramparts/version.rb +5 -0
data/ramparts.gemspec +24 -0
data/spec/data/email_and_phone_data/falsy_email_and_phone_data.rb +6 -0
data/spec/data/email_and_phone_data/truthy_email_and_phone_data.rb +33 -0
data/spec/data/email_data/falsy_email_data.rb +6 -0
data/spec/data/email_data/truthy_email_data.rb +87 -0
data/spec/data/phone_data/falsy_phone_data.rb +6 -0
data/spec/data/phone_data/truthy_phone_data.rb +109 -0
data/spec/data/url_data/falsy_url_data.rb +6 -0
data/spec/data/url_data/truthy_url_data.rb +12 -0
data/spec/parsers/email_and_phone_parser_spec.rb +44 -0
data/spec/parsers/email_parser_spec.rb +60 -0
data/spec/parsers/phone_parser_spec.rb +56 -0
data/spec/parsers/url_parser_spec.rb +15 -0
data/spec/spec_constants.rb +3 -0
data/spec/spec_helper.rb +87 -0
metadata +147 -0

data/ROADMAP.md ADDED

@@ -0,0 +1,21 @@
+# Roadmap
+- [x] Add error messages, especially for malformed input
+- [ ] Implement options for `find_phone_numbers` and `replace_phone_numbers`
+- [x] Implement [Stale](https://github.com/probot/stale) for issues
+- [ ] Implement Automatic Deployments
+- [x] Look into using hashes instead of array for testing infrastructure
+- [ ] Look into using custom rspec matchers
+- [ ] Travis badge for marketing
+- [x] Use Ruby that's already installed on TravisCI
+- [ ] Use threads for true Map/Reduce. Look into [Celluloid](https://github.com/celluloid/celluloid)
+- [ ] Have check domain (for email matches) as an option (not only on :aggressive option) to reduce over matching
+- [ ] Check TLD to make sure it's a valid TLD as an option to reduce over matching
+- [x] Refactor tests to use hashes instead of arrays
+- [ ] Allow use of function based replacement instead of forcing yield functionality
+- [ ] Implement URL functionality
+- [ ] Change URL to keywords list functionality
+- [ ] Robust-ify the overlapping interval scenario
+- [ ] Get setup on Code Climate once the repo goes open source
+- [ ] Think about using InchCI (inline docs) instead of README
+- [x] Create a `spec_helper.rb` and use tools like SimpleCov
+- [ ] Look into using Shoulda matchers, and test randomization.

data/Rakefile ADDED

File without changes

data/lib/ramparts.rb ADDED

@@ -0,0 +1,3 @@
+# frozen_string_literal: true
+require_relative './ramparts/base'

data/lib/ramparts/base.rb ADDED

@@ -0,0 +1,99 @@
+# frozen_string_literal: true
+require_relative './parsers/email_parser'
+require_relative './parsers/phone_parser'
+require_relative './parsers/url_parser'
+require_relative './helpers'
+# Public entry point of the library: class-level helpers that count, find and
+# replace phone numbers, emails and URLs inside a block of text.
+# Every call builds a fresh parser and forwards +text+ and +options+ to it.
+# The phone and email parsers raise ArgumentError when +text+ is not a String.
+class Ramparts
+  # Returns the number of phone numbers detected in +text+.
+  def self.count_phone_numbers(text, options = {})
+    PhoneParser.new.count_phone_number_instances(text, options)
+  end
+  # Returns the phone number matches found in +text+ as an array of hashes
+  # (:start_offset, :end_offset, :value, :type).
+  def self.find_phone_numbers(text, options = {})
+    PhoneParser.new.find_phone_number_instances(text, options)
+  end
+  # Returns a copy of +text+ in which every phone number match is replaced by
+  # the value the block returns for that match.
+  def self.replace_phone_numbers(text, options = {}, &block)
+    PhoneParser.new.replace_phone_number_instances(text, options, &block)
+  end
+  # Returns the number of emails detected in +text+.
+  def self.count_emails(text, options = {})
+    EmailParser.new.count_email_instances(text, options)
+  end
+  # Returns the email matches found in +text+ as an array of hashes
+  # (:start_offset, :end_offset, :value, :type).
+  def self.find_emails(text, options = {})
+    EmailParser.new.find_email_instances(text, options)
+  end
+  # Returns a copy of +text+ in which every email match is replaced by the
+  # value the block returns for that match.
+  def self.replace_emails(text, options = {}, &block)
+    EmailParser.new.replace_email_instances(text, options, &block)
+  end
+  # Returns the number of URLs detected in +text+ (delegates to UrlParser).
+  def self.count_urls(text, options = {})
+    UrlParser.new.count_url_instances(text, options)
+  end
+  # Returns the phone number count plus the email count for +text+.
+  def self.count_phone_numbers_and_emails(text, options = {})
+    phone_count = PhoneParser.new.count_phone_number_instances(text, options)
+    email_count = EmailParser.new.count_email_instances(text, options)
+    phone_count + email_count
+  end
+  # Returns the phone number matches followed by the email matches.
+  # With the :compare option, emails that overlap a phone number are dropped.
+  def self.find_phone_numbers_and_emails(text, options = {})
+    combined_instances(text, options)
+  end
+  # Returns a copy of +text+ in which every phone number and email match is
+  # replaced by the value the block returns for that match.
+  # With the :compare option, emails that overlap a phone number are left out
+  # of the replacement (previously the filter ran after the combined list had
+  # already been built, so it had no effect).
+  def self.replace_phone_numbers_and_emails(text, options = {}, &block)
+    # The two parsers run independently, so the relative order of their matches
+    # is unknown. Sort by start offset and reverse so replacements run from the
+    # end of the string to the start and earlier offsets stay valid.
+    # https://stackoverflow.com/questions/2642182/sorting-an-array-in-descending-order-in-ruby#answer-2651028
+    ordered = combined_instances(text, options).sort_by { |instance| instance[:start_offset] }.reverse!
+    replace(text, ordered, &block)
+  end
+  # Finds phone numbers and emails and merges them (phones first). When the
+  # :compare option is truthy, emails overlapping any phone match are removed.
+  def self.combined_instances(text, options)
+    phone_instances = PhoneParser.new.find_phone_number_instances(text, options)
+    email_instances = EmailParser.new.find_email_instances(text, options)
+    if options.fetch(:compare, false)
+      email_instances = email_instances.reject do |email|
+        email_range = (email[:start_offset]...email[:end_offset])
+        phone_instances.any? do |phone|
+          ranges_overlap?((phone[:start_offset]...phone[:end_offset]), email_range)
+        end
+      end
+    end
+    phone_instances + email_instances
+  end
+  private_class_method :combined_instances
+end

data/lib/ramparts/data/list_of_email_domains.rb ADDED

@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+# Domain-name fragments (without the TLD) that mark a match as a plausible email.
+# Besides provider names, the list deliberately contains misspelled variants
+# (e.g. gmai, gamil, gnail, yaho, yshoo, hotmal).
+# EmailParser's :aggressive mode keeps a match only when the text after '@'
+# includes one of these entries.
+EMAIL_DOMAINS = %w[
+  gmail
+  yahoo
+  hotmail
+  aol
+  icloud
+  live
+  outlook
+  ymail
+  comcast
+  shaw
+  rogers
+  msn
+  mail
+  me
+  att
+  careguide
+  sbcglobal
+  rocketmail
+  telus
+  sympatico
+  cox
+  gmai
+  email
+  aim
+  yandex
+  gamil
+  gmx
+  student
+  students
+  earthlink
+  gnail
+  juno
+  gmsil
+  netzero
+  ail
+  gmil
+  gmal
+  hmail
+  yaho
+  alumni
+  gmial
+  googlemail
+  tampabay
+  mtroyal
+  usa
+  cfl
+  yshoo
+  protonmail
+  rediffmail
+  liberty
+  maine
+  inbox
+  optimum
+  example
+  yhaoo
+  yorku
+  mchsi
+  yahoi
+  zoho
+  hushmail
+  libero
+  hotmal
+  ukr
+  wowway
+  post
+  lycos
+  yaboo
+  contractor
+  yahool
+].freeze

data/lib/ramparts/helpers.rb ADDED

@@ -0,0 +1,46 @@
+# frozen_string_literal: false
+# Message used by the parsers when they raise ArgumentError because the text
+# argument is not a String
+ARGUMENT_ERROR_TEXT = 'Parameter 1, the block of text to parse, is not a string'.freeze
+# Identifier of the map reduce (MR) algorithm. Faster by ~2x than the GR algorithm.
+# Maps parts of the text such as 'at' or 'FOUR' down to '@' and '4',
+# removes spaces etc., and then runs a simple regex over the remainder.
+# Information loss occurs, hence it cannot return indices into the original text
+MR_ALGO = 'MR'.freeze
+# Identifier of the glorified regex (GR) algorithm.
+# A single, complicated but robust regex that does one pass over the text.
+# Because of that complexity it is slower than the map reduce algorithm.
+# No information loss occurs, so the indices of where the phone numbers,
+# emails, etc. are located can be returned
+GR_ALGO = 'GR'.freeze
+# Builds a copy of +text+ in which each instance's span
+# (:start_offset...:end_offset) is overwritten with the value the block
+# returns for that instance. +text+ itself is left untouched.
+# Replacements are applied in the order given, so callers pass the instances
+# last-to-first to keep the remaining offsets valid.
+def replace(text, instances, &block) # rubocop:disable Lint/UnusedMethodArgument
+  result = String.new(text)
+  instances.each do |match|
+    substitute = yield match
+    span = match[:start_offset]...match[:end_offset]
+    result[span] = substitute
+  end
+  result
+end
+# Runs +regex+ over +text+ and returns one hash per match containing
+# :start_offset, :end_offset (exclusive), :value (the matched string) and
+# :type (the +type+ argument, passed through unchanged)
+def scan(text, regex, type)
+  matches = text.enum_for(:scan, regex)
+  matches.map do
+    # String#scan updates the last-match data before each yield
+    match = Regexp.last_match
+    matched = match.to_s
+    first = match.begin(0)
+    { start_offset: first, end_offset: first + matched.length, value: matched, type: type }
+  end
+end
+# True when either range covers the first element of the other one
+def ranges_overlap?(r1, r2)
+  return true if r1.cover?(r2.first)
+  r2.cover?(r1.first)
+end

data/lib/ramparts/parsers/email_parser.rb ADDED

@@ -0,0 +1,88 @@
+# frozen_string_literal: true
+require_relative '../data/list_of_email_domains'
+require_relative '../helpers'
+# Locates email addresses inside a block of text
+class EmailParser
+  # Counts the emails within a block of text
+  # Note: uses the map reduce algorithm on a normalised copy of the text
+  # Raises ArgumentError when text is not a String
+  def count_email_instances(text, options)
+    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String
+    email_instances(MR_ALGO, parse_email(text), options).length
+  end
+  # Replaces each email within the block of text with the value the block returns for it
+  # Raises ArgumentError when text is not a String
+  def replace_email_instances(text, options, &block)
+    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String
+    matches = find_email_instances(text, options)
+    # Reversed so that replacements run from the end of the text to the start
+    replace(text, matches.reverse!, &block)
+  end
+  # Finds the emails within a block of text and returns their positions
+  # Note: uses the glorified regex algorithm on the downcased text
+  # Raises ArgumentError when text is not a String
+  def find_email_instances(text, options)
+    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String
+    email_instances(GR_ALGO, text.downcase, options)
+  end
+  private
+  # Characters allowed in the local part of an email
+  TEXT_MATCH = 'a-z0-9._%+-'
+  # rubocop:disable LineLength
+  # Regex to find the emails, must have .com or something similar to match
+  GR_REGEX = Regexp.new(/(([#{TEXT_MATCH}]{1}[^\w]{1})+|([#{TEXT_MATCH}])+)([^\w]*@[^\w]*){1}[a-z0-9.-]+((\.|[^\w]+(dot){1}[^\w]+){1}[a-z]{2,})+/)
+  # Regex to find the emails, must have .com or something similar to match and also accepts the word 'at' as '@'
+  GR_REGEX_WITH_AT = Regexp.new(/(([#{TEXT_MATCH}]{1}[^\w]{1})+|([#{TEXT_MATCH}])+)([^\w]+(at){1}[^\w]+|[^\w]*@[^\w]*){1}[a-z0-9.-]+((\.|[^\w]+(dot){1}[^\w]+){1}[a-z]{2,})+/)
+  # Regex to find the emails, does not need .com or something similar to match
+  GR_REGEX_WITHOUT_DOT = Regexp.new(/(([#{TEXT_MATCH}]{1}[^\w]{1})+|([#{TEXT_MATCH}])+)([^\w]+(at){1}[^\w]+|[^\w]*@[^\w]*){1}[a-z0-9.-]+([^\w]*\.[^\w]*|[^\w]+(dot){1}[^\w]+)?([a-z]{2,})?/)
+  # rubocop:enable LineLength
+  # Regex to find emails for MapReduce, must have .com or something similar to match
+  MR_REGEX = Regexp.new(/[a-z0-9._%+-]+\${,2}@{1}\${,2}[a-z0-9.-]+\${,2}(\.){1}[a-z]{2,}/)
+  # Regex to find emails for MapReduce, does not need .com or something similar to match
+  MR_REGEX_WITHOUT_DOT = Regexp.new(/[a-z0-9._%+-]+\${,2}@{1}\${,2}[a-z0-9.-]+/)
+  # Maps these occurrences down to their constituent parts
+  REPLACEMENTS = {
+    ' at ' => '@',
+    '(at)' => '@',
+    ' dot ' => '.'
+  }.freeze
+  # Normalises the text for MapReduce: downcases it, maps ' at ', '(at)' and
+  # ' dot ' to '@' / '.', then turns every character that cannot be part of an
+  # email into '$'
+  def parse_email(text)
+    mapped = text.downcase.gsub(/\ at\ |\(at\)|\ dot\ /, REPLACEMENTS)
+    mapped.gsub(/[^\w\@\.\_\%\+\-]/, '$')
+  end
+  # Scans the text with the regex selected by the algorithm and the options
+  def email_instances(algo, text, options)
+    map_reduce = algo == MR_ALGO
+    if options.fetch(:aggressive, false)
+      pattern = map_reduce ? MR_REGEX_WITHOUT_DOT : GR_REGEX_WITHOUT_DOT
+      # The aggressive option does not require '.com' or similar, so only keep
+      # matches whose part after '@' contains a known domain
+      scan(text, pattern, :email).select do |instance|
+        host = instance[:value].split('@')[1]
+        next false unless host
+        EMAIL_DOMAINS.any? { |domain| host.include? domain }
+      end
+    elsif options.fetch(:check_for_at, false)
+      # The 'at' aware regex only exists for the glorified regex algorithm
+      scan(text, GR_REGEX_WITH_AT, :email)
+    else
+      scan(text, map_reduce ? MR_REGEX : GR_REGEX, :email)
+    end
+  end
+end

data/lib/ramparts/parsers/phone_parser.rb ADDED

@@ -0,0 +1,137 @@
+# frozen_string_literal: true
+require_relative '../helpers'
+# Locates phone numbers inside a block of text
+class PhoneParser
+  # Counts the phone numbers within the block of text
+  # Note: uses the map reduce algorithm on a normalised copy of the text
+  # Raises ArgumentError when text is not a String
+  def count_phone_number_instances(text, options)
+    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String
+    normalised = parse_phone_number(text, options)
+    phone_number_instances(MR_ALGO, normalised, options).length
+  end
+  # Replaces each phone number within the block of text with the value the block returns for it
+  # Raises ArgumentError when text is not a String
+  def replace_phone_number_instances(text, options, &block)
+    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String
+    matches = find_phone_number_instances(text, options)
+    # Reversed so that replacements run from the end of the text to the start
+    replace(text, matches.reverse!, &block)
+  end
+  # Finds the phone numbers within the block of text and returns their positions
+  # Note: uses the glorified regex algorithm on the downcased text
+  # Raises ArgumentError when text is not a String
+  def find_phone_number_instances(text, options)
+    raise ArgumentError, ARGUMENT_ERROR_TEXT unless text.is_a? String
+    phone_number_instances(GR_ALGO, text.downcase, options)
+  end
+  private
+  # Phonetic versions of numbers
+  PHONETICS = %w[
+    one
+    two
+    three
+    four
+    five
+    six
+    seven
+    eight
+    nine
+    oh
+    zero
+  ].freeze
+  # L33t speak versions of numbers
+  LEET_SPEAK = %w[
+    w0n
+    too
+    thr33
+    f0r
+    f1v3
+    s3x
+    sex
+    s3v3n
+    at3
+    nin3
+  ].freeze
+  # Matches any run of spaces, used between the letters of a spelled-out digit
+  MULTI_SPACE = '( )*'
+  # Regex sources for phonetics: the plain words, and the words with optional spaces between letters
+  REGEX_PHONETICS = PHONETICS.join('|')
+  REGEX_PHONETICS_SPACED = PHONETICS.map { |word| word.chars.join(MULTI_SPACE) }.join('|')
+  # Regex sources for l33t: the plain words, and the words with optional spaces between letters
+  REGEX_LEET_SPEAK = LEET_SPEAK.join('|')
+  REGEX_LEET_SPEAK_SPACED = LEET_SPEAK.map { |word| word.chars.join(MULTI_SPACE) }.join('|')
+  # Every non-numeric way a single phone number digit may be written
+  BASE_MATCHING = "#{REGEX_PHONETICS}|#{REGEX_LEET_SPEAK}|#{REGEX_PHONETICS_SPACED}|#{REGEX_LEET_SPEAK_SPACED}"
+  # The final regex used to match phone numbers for GR
+  GR_REGEX =
+    Regexp.new(/(\()?(\d|#{BASE_MATCHING}){1}([^\w]*(\d|#{BASE_MATCHING}){1}[^\w]*){5,}(\d|#{BASE_MATCHING}){1}/)
+  # The final regex used to match phone numbers for MR
+  MR_REGEX = Regexp.new(/(\-*\.?\d{1}\.?\-*){7,}/)
+  # Phonetic word to digit replacements used for MR
+  REPLACEMENTS = {
+    'one' => '1',
+    'two' => '2',
+    'three' => '3',
+    'four' => '4',
+    'five' => '5',
+    'six' => '6',
+    'seven' => '7',
+    'eight' => '8',
+    'nine' => '9',
+    'oh' => '0',
+    'zero' => '0'
+  }.freeze
+  # L33t word to digit replacements used for MR
+  LEET_REPLACEMENTS = {
+    'w0n' => '1',
+    'too' => '2',
+    'thr33' => '3',
+    'f0r' => '4',
+    'f1v3' => '5',
+    's3x' => '6',
+    'sex' => '6',
+    's3v3n' => '7',
+    'at3' => '8',
+    'nin3' => '9'
+  }.freeze
+  # Normalises the text for MR. Spaces are removed unless :remove_spaces is
+  # false, phonetic words become digits, l33t words become digits unless
+  # :parse_leet is false, then non-word characters become '-' and the
+  # remaining lowercase letters become '.'
+  def parse_phone_number(text, options)
+    compact = options.fetch(:remove_spaces, true) ? text.delete(' ') : text
+    digits = compact.downcase.gsub(/#{REGEX_PHONETICS}/, REPLACEMENTS)
+    digits = digits.gsub(/#{REGEX_LEET_SPEAK}/, LEET_REPLACEMENTS) if options.fetch(:parse_leet, true)
+    digits.gsub(/[^\w]/, '-').gsub(/[a-z]/, '.')
+  end
+  # Scans the text with the regex that belongs to the specified algorithm
+  def phone_number_instances(algo, text, _options)
+    pattern = algo == MR_ALGO ? MR_REGEX : GR_REGEX
+    scan(text, pattern, :phone)
+  end
+end