incollege-text 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,328 @@
1
+ # encoding: UTF-8
2
+
3
+ class String
4
+ # Helper function to count the character length by first converting to an
5
+ # array. This is needed because with unicode strings, the return value
6
+ # of length may be incorrect
7
+ def char_length
8
+ if respond_to? :codepoints
9
+ length
10
+ else
11
+ chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
12
+ end
13
+ end
14
+
15
+ # Helper function to convert this string into an array of unicode characters.
16
+ def to_char_a
17
+ @to_char_a ||= if chars.kind_of?(Enumerable)
18
+ chars.to_a
19
+ else
20
+ char_array = []
21
+ 0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') }
22
+ char_array
23
+ end
24
+ end
25
+ end
26
+
27
+ # Helper functions to return character offsets instead of byte offsets.
28
+ class MatchData
29
+ def char_begin(n)
30
+ if string.respond_to? :codepoints
31
+ self.begin(n)
32
+ else
33
+ string[0, self.begin(n)].char_length
34
+ end
35
+ end
36
+
37
+ def char_end(n)
38
+ if string.respond_to? :codepoints
39
+ self.end(n)
40
+ else
41
+ string[0, self.end(n)].char_length
42
+ end
43
+ end
44
+ end
45
+
46
+ module Incollege
47
+ # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
48
+ # of usernames, lists, URLs and hashtags.
49
+ module Extractor extend self
50
+ # Remove overlapping entities.
51
+ # This returns a new array with no overlapping entities.
52
+ def remove_overlapping_entities(entities)
53
+ # sort by start index
54
+ entities = entities.sort_by{|entity| entity[:indices].first}
55
+
56
+ # remove duplicates
57
+ prev = nil
58
+ entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
59
+ entities
60
+ end
61
+
62
+ # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
63
+    # along with the indices for where the entity occurred
64
+ # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
65
+ # will be returned.
66
+ #
67
+ # If a block is given then it will be called for each entity.
68
+ def extract_entities_with_indices(text, options = {}, &block)
69
+ # extract all entities
70
+ entities = extract_urls_with_indices(text, options) +
71
+ extract_hashtags_with_indices(text, :check_url_overlap => false) +
72
+ extract_mentions_or_lists_with_indices(text) +
73
+ extract_cashtags_with_indices(text)
74
+
75
+ return [] if entities.empty?
76
+
77
+ entities = remove_overlapping_entities(entities)
78
+
79
+ entities.each(&block) if block_given?
80
+ entities
81
+ end
82
+
83
+ # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
84
+ # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
85
+ # will be returned.
86
+ #
87
+ # If a block is given then it will be called for each username.
88
+ def extract_mentioned_screen_names(text, &block) # :yields: username
89
+ screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
90
+ screen_names.each(&block) if block_given?
91
+ screen_names
92
+ end
93
+
94
+ # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
95
+    # along with the indices for where the mention occurred. If the
96
+ # <tt>text</tt> is nil or contains no username mentions, an empty array
97
+ # will be returned.
98
+ #
99
+ # If a block is given, then it will be called with each username, the start
100
+ # index, and the end index in the <tt>text</tt>.
101
+ def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
102
+ return [] unless text
103
+
104
+ possible_screen_names = []
105
+ extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
106
+ next unless list_slug.empty?
107
+ possible_screen_names << {
108
+ :screen_name => screen_name,
109
+ :indices => [start_position, end_position]
110
+ }
111
+ end
112
+
113
+ if block_given?
114
+ possible_screen_names.each do |mention|
115
+ yield mention[:screen_name], mention[:indices].first, mention[:indices].last
116
+ end
117
+ end
118
+
119
+ possible_screen_names
120
+ end
121
+
122
+ # Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
123
+    # along with the indices for where the mention occurred. If the
124
+ # <tt>text</tt> is nil or contains no username or list mentions, an empty array
125
+ # will be returned.
126
+ #
127
+ # If a block is given, then it will be called with each username, list slug, the start
128
+    # index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
129
+ # if this is a username mention.
130
+ def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
131
+ return [] unless text =~ /[@@]/
132
+
133
+ possible_entries = []
134
+ text.to_s.scan(Incollege::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
135
+ match_data = $~
136
+ after = $'
137
+ unless after =~ Incollege::Regex[:end_mention_match]
138
+ start_position = match_data.char_begin(3) - 1
139
+ end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
140
+ possible_entries << {
141
+ :screen_name => screen_name,
142
+ :list_slug => list_slug || "",
143
+ :indices => [start_position, end_position]
144
+ }
145
+ end
146
+ end
147
+
148
+ if block_given?
149
+ possible_entries.each do |mention|
150
+ yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
151
+ end
152
+ end
153
+
154
+ possible_entries
155
+ end
156
+
157
+    # Extracts the username replied to in the Tweet <tt>text</tt>. If the
158
+ # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
159
+ #
160
+ # If a block is given then it will be called with the username replied to (if any)
161
+ def extract_reply_screen_name(text) # :yields: username
162
+ return nil unless text
163
+
164
+ possible_screen_name = text.match(Incollege::Regex[:valid_reply])
165
+ return unless possible_screen_name.respond_to?(:captures)
166
+ return if $' =~ Incollege::Regex[:end_mention_match]
167
+ screen_name = possible_screen_name.captures.first
168
+ yield screen_name if block_given?
169
+ screen_name
170
+ end
171
+
172
+ # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
173
+ # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
174
+ # will be returned.
175
+ #
176
+ # If a block is given then it will be called for each URL.
177
+ def extract_urls(text, &block) # :yields: url
178
+ urls = extract_urls_with_indices(text).map{|u| u[:url]}
179
+ urls.each(&block) if block_given?
180
+ urls
181
+ end
182
+
183
+ # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
184
+ # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
185
+ # URLs an empty array will be returned.
186
+ #
187
+ # If a block is given then it will be called for each URL.
188
+ def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
189
+ return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
190
+ urls = []
191
+
192
+ text.to_s.scan(Incollege::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
193
+ valid_url_match_data = $~
194
+
195
+ start_position = valid_url_match_data.char_begin(3)
196
+ end_position = valid_url_match_data.char_end(3)
197
+
198
+ # If protocol is missing and domain contains non-ASCII characters,
199
+ # extract ASCII-only domains.
200
+ if !protocol
201
+ next if !options[:extract_url_without_protocol] || before =~ Incollege::Regex[:invalid_url_without_protocol_preceding_chars]
202
+ last_url = nil
203
+ domain.scan(Incollege::Regex[:valid_ascii_domain]) do |ascii_domain|
204
+ last_url = {
205
+ :url => ascii_domain,
206
+ :indices => [start_position + $~.char_begin(0),
207
+ start_position + $~.char_end(0)]
208
+ }
209
+ if path ||
210
+ ascii_domain =~ Incollege::Regex[:valid_special_short_domain] ||
211
+ ascii_domain !~ Incollege::Regex[:invalid_short_domain]
212
+ urls << last_url
213
+ end
214
+ end
215
+
216
+ # no ASCII-only domain found. Skip the entire URL
217
+ next unless last_url
218
+
219
+ # last_url only contains domain. Need to add path and query if they exist.
220
+ if path
221
+ # last_url was not added. Add it to urls here.
222
+ last_url[:url] = url.sub(domain, last_url[:url])
223
+ last_url[:indices][1] = end_position
224
+ end
225
+ else
226
+ # In the case of t.co URLs, don't allow additional path characters
227
+ if url =~ Incollege::Regex[:valid_tco_url]
228
+ url = $&
229
+ end_position = start_position + url.char_length
230
+ end
231
+ urls << {
232
+ :url => url,
233
+ :indices => [start_position, end_position]
234
+ }
235
+ end
236
+ end
237
+ urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
238
+ urls
239
+ end
240
+
241
+ # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
242
+ # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
243
+ # will be returned. The array returned will not include the leading <tt>#</tt>
244
+ # character.
245
+ #
246
+ # If a block is given then it will be called for each hashtag.
247
+ def extract_hashtags(text, &block) # :yields: hashtag_text
248
+ hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
249
+ hashtags.each(&block) if block_given?
250
+ hashtags
251
+ end
252
+
253
+ # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
254
+ # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
255
+ # will be returned. The array returned will not include the leading <tt>#</tt>
256
+ # character.
257
+ #
258
+ # If a block is given then it will be called for each hashtag.
259
+ def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
260
+ return [] unless text =~ /[##]/
261
+
262
+ tags = []
263
+ text.scan(Incollege::Regex[:valid_hashtag]) do |before, hash, hash_text|
264
+ match_data = $~
265
+ start_position = match_data.char_begin(2)
266
+ end_position = match_data.char_end(3)
267
+ after = $'
268
+ unless after =~ Incollege::Regex[:end_hashtag_match]
269
+ tags << {
270
+ :hashtag => hash_text,
271
+ :indices => [start_position, end_position]
272
+ }
273
+ end
274
+ end
275
+
276
+ if options[:check_url_overlap]
277
+ # extract URLs
278
+ urls = extract_urls_with_indices(text)
279
+ unless urls.empty?
280
+ tags.concat(urls)
281
+ # remove duplicates
282
+ tags = remove_overlapping_entities(tags)
283
+ # remove URL entities
284
+ tags.reject!{|entity| !entity[:hashtag] }
285
+ end
286
+ end
287
+
288
+ tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
289
+ tags
290
+ end
291
+
292
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
293
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
294
+ # will be returned. The array returned will not include the leading <tt>$</tt>
295
+ # character.
296
+ #
297
+ # If a block is given then it will be called for each cashtag.
298
+ def extract_cashtags(text, &block) # :yields: cashtag_text
299
+ cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
300
+ cashtags.each(&block) if block_given?
301
+ cashtags
302
+ end
303
+
304
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
305
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
306
+ # will be returned. The array returned will not include the leading <tt>$</tt>
307
+ # character.
308
+ #
309
+ # If a block is given then it will be called for each cashtag.
310
+ def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
311
+ return [] unless text =~ /\$/
312
+
313
+ tags = []
314
+ text.scan(Incollege::Regex[:valid_cashtag]) do |before, dollar, cash_text|
315
+ match_data = $~
316
+ start_position = match_data.char_begin(2)
317
+ end_position = match_data.char_end(3)
318
+ tags << {
319
+ :cashtag => cash_text,
320
+ :indices => [start_position, end_position]
321
+ }
322
+ end
323
+
324
+ tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
325
+ tags
326
+ end
327
+ end
328
+ end
@@ -0,0 +1,21 @@
1
+ module Incollege
2
+ module HashHelper
3
+ # Return a new hash with all keys converted to symbols, as long as
4
+ # they respond to +to_sym+.
5
+ #
6
+ # { 'name' => 'Rob', 'years' => '28' }.symbolize_keys
7
+ # #=> { :name => "Rob", :years => "28" }
8
+ def self.symbolize_keys(hash)
9
+ symbolize_keys!(hash.dup)
10
+ end
11
+
12
+ # Destructively convert all keys to symbols, as long as they respond
13
+ # to +to_sym+. Same as +symbolize_keys+, but modifies +self+.
14
+ def self.symbolize_keys!(hash)
15
+ hash.keys.each do |key|
16
+ hash[(key.to_sym rescue key) || key] = hash.delete(key)
17
+ end
18
+ hash
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,86 @@
1
+ module Incollege
2
+ # Module for doing "hit highlighting" on tweets that have been auto-linked already.
3
+ # Useful with the results returned from the Search API.
4
+ module HitHighlighter extend self
5
+ # Default Tag used for hit highlighting
6
+ DEFAULT_HIGHLIGHT_TAG = "em"
7
+
8
+ # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
9
+ # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
10
+ # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired)
11
+ #
12
+ # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
13
+ #
14
+ # irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
15
+ # => "test <strong>hit</strong> here"
16
+ def hit_highlight(text, hits = [], options = {})
17
+ if hits.empty?
18
+ return text
19
+ end
20
+
21
+ tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
22
+ tags = ["<" + tag_name + ">", "</" + tag_name + ">"]
23
+
24
+ chunks = text.split(/[<>]/)
25
+
26
+ result = []
27
+ chunk_index, chunk = 0, chunks[0]
28
+ chunk_chars = chunk.to_s.to_char_a
29
+ prev_chunks_len = 0
30
+ chunk_cursor = 0
31
+ start_in_chunk = false
32
+ for hit, index in hits.flatten.each_with_index do
33
+ tag = tags[index % 2]
34
+
35
+ placed = false
36
+ until chunk.nil? || hit < prev_chunks_len + chunk.length do
37
+ result << chunk_chars[chunk_cursor..-1]
38
+ if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
39
+ result << tag
40
+ placed = true
41
+ end
42
+
43
+ # correctly handle highlights that end on the final character.
44
+ if tag_text = chunks[chunk_index+1]
45
+ result << "<#{tag_text}>"
46
+ end
47
+
48
+ prev_chunks_len += chunk_chars.length
49
+ chunk_cursor = 0
50
+ chunk_index += 2
51
+ chunk = chunks[chunk_index]
52
+ chunk_chars = chunk.to_s.to_char_a
53
+ start_in_chunk = false
54
+ end
55
+
56
+ if !placed && !chunk.nil?
57
+ hit_spot = hit - prev_chunks_len
58
+ result << chunk_chars[chunk_cursor...hit_spot] << tag
59
+ chunk_cursor = hit_spot
60
+ if index % 2 == 0
61
+ start_in_chunk = true
62
+ else
63
+ start_in_chunk = false
64
+ end
65
+ placed = true
66
+ end
67
+
68
+ # ultimate fallback, hits that run off the end get a closing tag
69
+ if !placed
70
+ result << tag
71
+ end
72
+ end
73
+
74
+ if chunk
75
+ if chunk_cursor < chunk_chars.length
76
+ result << chunk_chars[chunk_cursor..-1]
77
+ end
78
+ (chunk_index+1).upto(chunks.length-1).each do |i|
79
+ result << (i.even? ? chunks[i] : "<#{chunks[i]}>")
80
+ end
81
+ end
82
+
83
+ result.flatten.join
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,333 @@
1
+ # encoding: UTF-8
2
+
3
+ module Incollege
4
+ # A collection of regular expressions for parsing Tweet text. The regular expression
5
+ # list is frozen at load time to ensure immutability. These regular expressions are
6
+ # used throughout the <tt>Incollege</tt> classes. Special care has been taken to make
7
+   # sure these regular expressions work with Tweets in all languages.
8
+ class Regex
9
+ require 'yaml'
10
+
11
+ REGEXEN = {} # :nodoc:
12
+
13
+ def self.regex_range(from, to = nil) # :nodoc:
14
+ if $RUBY_1_9
15
+ if to
16
+ "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
17
+ else
18
+ "\\u{#{from.to_s(16).rjust(4, '0')}}"
19
+ end
20
+ else
21
+ if to
22
+ [from].pack('U') + '-' + [to].pack('U')
23
+ else
24
+ [from].pack('U')
25
+ end
26
+ end
27
+ end
28
+
29
+ TLDS = YAML.load_file(
30
+ File.join(
31
+ File.expand_path('../../..', __FILE__), # project root
32
+ 'lib', 'assets', 'tld_lib.yml'
33
+ )
34
+ )
35
+
36
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
37
+   # to access both the list of characters and a pattern suitable for use with String#split
38
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
39
+ UNICODE_SPACES = [
40
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
41
+ 0x0020, # White_Space # Zs SPACE
42
+ 0x0085, # White_Space # Cc <control-0085>
43
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
44
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
45
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
46
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
47
+ 0x2028, # White_Space # Zl LINE SEPARATOR
48
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
49
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
50
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
51
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
52
+ ].flatten.map{|c| [c].pack('U*')}.freeze
53
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
54
+
55
+ # Character not allowed in Tweets
56
+ INVALID_CHARACTERS = [
57
+ 0xFFFE, 0xFEFF, # BOM
58
+ 0xFFFF, # Special
59
+ 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
60
+ ].map{|cp| [cp].pack('U') }.freeze
61
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
62
+
63
+ major, minor, _patch = RUBY_VERSION.split('.')
64
+ if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
65
+ REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
66
+ else
67
+ # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
68
+ REGEXEN[:list_name] = eval("/[a-zA-Z][a-zA-Z0-9_\\-\x80-\xff]{0,24}/")
69
+ end
70
+
71
+ # Latin accented characters
72
+ # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
73
+ # Also excludes 0xf7, the division sign
74
+ LATIN_ACCENTS = [
75
+ regex_range(0xc0, 0xd6),
76
+ regex_range(0xd8, 0xf6),
77
+ regex_range(0xf8, 0xff),
78
+ regex_range(0x0100, 0x024f),
79
+ regex_range(0x0253, 0x0254),
80
+ regex_range(0x0256, 0x0257),
81
+ regex_range(0x0259),
82
+ regex_range(0x025b),
83
+ regex_range(0x0263),
84
+ regex_range(0x0268),
85
+ regex_range(0x026f),
86
+ regex_range(0x0272),
87
+ regex_range(0x0289),
88
+ regex_range(0x028b),
89
+ regex_range(0x02bb),
90
+ regex_range(0x0300, 0x036f),
91
+ regex_range(0x1e00, 0x1eff)
92
+ ].join('').freeze
93
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
94
+
95
+ RTL_CHARACTERS = [
96
+ regex_range(0x0600,0x06FF),
97
+ regex_range(0x0750,0x077F),
98
+ regex_range(0x0590,0x05FF),
99
+ regex_range(0xFE70,0xFEFF)
100
+ ].join('').freeze
101
+
102
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
103
+ SPACE_CHARS = " \t\n\x0B\f\r"
104
+ CTRL_CHARS = "\x00-\x1F\x7F"
105
+
106
+ # A hashtag must contain at least one unicode letter or mark, as well as numbers, underscores, and select special characters.
107
+ HASHTAG_ALPHA = /[\p{L}\p{M}]/
108
+ HASHTAG_ALPHANUMERIC = /[\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]/
109
+ HASHTAG_BOUNDARY = /\A|\z|[^&\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]/
110
+
111
+ HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
112
+
113
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
114
+ # Used in Extractor for final filtering
115
+ REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
116
+
117
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|(?:^|[^a-zA-Z0-9_+~.-])[rR][tT]:?)/o
118
+ REGEXEN[:at_signs] = /[@@]/
119
+ REGEXEN[:valid_mention_or_list] = /
120
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
121
+ (#{REGEXEN[:at_signs]}) # $2: At mark
122
+ ([a-zA-Z0-9_]{1,20}) # $3: Screen name
123
+ (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
124
+ /ox
125
+ REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
126
+ # Used in Extractor for final filtering
127
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
128
+
129
+ # URL related hash regex collection
130
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
131
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
132
+ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
133
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
134
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
135
+
136
+ REGEXEN[:valid_gTLD] = %r{
137
+ (?:
138
+ (?:#{TLDS['generic'].join('|')})
139
+ (?=[^0-9a-z@]|$)
140
+ )
141
+ }ix
142
+
143
+ REGEXEN[:valid_ccTLD] = %r{
144
+ (?:
145
+ (?:#{TLDS['country'].join('|')})
146
+ (?=[^0-9a-z@]|$)
147
+ )
148
+ }ix
149
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
150
+
151
+ REGEXEN[:valid_special_cctld] = %r{
152
+ (?:
153
+ (?:co|tv)
154
+ (?=[^0-9a-z@]|$)
155
+ )
156
+ }ix
157
+
158
+ REGEXEN[:valid_domain] = /(?:
159
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
160
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
161
+ )/iox
162
+
163
+ # This is used in Extractor
164
+ REGEXEN[:valid_ascii_domain] = /
165
+ (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
166
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
167
+ /iox
168
+
169
+ # This is used in Extractor for stricter t.co URL extraction
170
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
171
+
172
+ # This is used in Extractor to filter out unwanted URLs.
173
+ REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
174
+ REGEXEN[:valid_special_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_special_cctld]}\Z/io
175
+
176
+ REGEXEN[:valid_port_number] = /[0-9]+/
177
+
178
+ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|@#{LATIN_ACCENTS}]/io
179
+ # Allow URL paths to contain up to two nested levels of balanced parens
180
+ # 1. Used in Wikipedia URLs like /Primer_(film)
181
+ # 2. Used in IIS sessions like /S(dfd346)/
182
+ # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
183
+ REGEXEN[:valid_url_balanced_parens] = /
184
+ \(
185
+ (?:
186
+ #{REGEXEN[:valid_general_url_path_chars]}+
187
+ |
188
+ # allow one nested level of balanced parentheses
189
+ (?:
190
+ #{REGEXEN[:valid_general_url_path_chars]}*
191
+ \(
192
+ #{REGEXEN[:valid_general_url_path_chars]}+
193
+ \)
194
+ #{REGEXEN[:valid_general_url_path_chars]}*
195
+ )
196
+ )
197
+ \)
198
+ /iox
199
+     # Valid end-of-path characters (so /foo. does not gobble the period).
200
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
201
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
202
+ REGEXEN[:valid_url_path] = /(?:
203
+ (?:
204
+ #{REGEXEN[:valid_general_url_path_chars]}*
205
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
206
+ #{REGEXEN[:valid_url_path_ending_chars]}
207
+ )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
208
+ )/iox
209
+
210
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
211
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
212
+ REGEXEN[:valid_url] = %r{
213
+ ( # $1 total match
214
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
215
+ ( # $3 URL
216
+ (https?:\/\/)? # $4 Protocol (optional)
217
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
218
+ (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
219
+ (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
220
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
221
+ )
222
+ )
223
+ }iox
224
+
225
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
226
+ REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
227
+
228
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
229
+ REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
230
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
231
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
232
+ REGEXEN[:validate_url_pchar] = /(?:
233
+ #{REGEXEN[:validate_url_unreserved]}|
234
+ #{REGEXEN[:validate_url_pct_encoded]}|
235
+ #{REGEXEN[:validate_url_sub_delims]}|
236
+ [:\|@]
237
+ )/iox
238
+
239
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
240
+ REGEXEN[:validate_url_userinfo] = /(?:
241
+ #{REGEXEN[:validate_url_unreserved]}|
242
+ #{REGEXEN[:validate_url_pct_encoded]}|
243
+ #{REGEXEN[:validate_url_sub_delims]}|
244
+ :
245
+ )*/iox
246
+
247
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
248
+ REGEXEN[:validate_url_ipv4] =
249
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
250
+
251
+ # Punting on real IPv6 validation for now
252
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
253
+
254
+ # Also punting on IPvFuture for now
255
+ REGEXEN[:validate_url_ip] = /(?:
256
+ #{REGEXEN[:validate_url_ipv4]}|
257
+ #{REGEXEN[:validate_url_ipv6]}
258
+ )/iox
259
+
260
+ # This is more strict than the rfc specifies
261
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
262
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
263
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
264
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
265
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
266
+ #{REGEXEN[:validate_url_domain_tld]})/iox
267
+
268
+ REGEXEN[:validate_url_host] = /(?:
269
+ #{REGEXEN[:validate_url_ip]}|
270
+ #{REGEXEN[:validate_url_domain]}
271
+ )/iox
272
+
273
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
274
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
275
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
276
+ REGEXEN[:validate_url_unicode_domain_segment] =
277
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
278
+ REGEXEN[:validate_url_unicode_domain_tld] =
279
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
280
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
281
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
282
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
283
+
284
+ REGEXEN[:validate_url_unicode_host] = /(?:
285
+ #{REGEXEN[:validate_url_ip]}|
286
+ #{REGEXEN[:validate_url_unicode_domain]}
287
+ )/iox
288
+
289
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
290
+
291
+ REGEXEN[:validate_url_unicode_authority] = %r{
292
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
293
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
294
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
295
+ }iox
296
+
297
+ REGEXEN[:validate_url_authority] = %r{
298
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
299
+ (#{REGEXEN[:validate_url_host]}) # $2 host
300
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
301
+ }iox
302
+
303
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
304
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
305
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
306
+
307
+ # Modified version of RFC 3986 Appendix B
308
+ REGEXEN[:validate_url_unencoded] = %r{
309
+ \A # Full URL
310
+ (?:
311
+ ([^:/?#]+):// # $1 Scheme
312
+ )?
313
+ ([^/?#]*) # $2 Authority
314
+ ([^?#]*) # $3 Path
315
+ (?:
316
+ \?([^#]*) # $4 Query
317
+ )?
318
+ (?:
319
+ \#(.*) # $5 Fragment
320
+ )?\Z
321
+ }ix
322
+
323
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
324
+
325
+ REGEXEN.each_pair{|k,v| v.freeze }
326
+
327
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
328
+ # is not a known symbol a <tt>nil</tt> will be returned.
329
+ def self.[](key)
330
+ REGEXEN[key]
331
+ end
332
+ end
333
+ end