RubyGems - sterile - Versions diffs - 1.0.0 - Mend

sterile 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

data/.autotest +5 -0
data/.gitignore +8 -0
data/.rvmrc +1 -0
data/Gemfile +9 -0
data/Gemfile.lock +14 -0
data/README.markdown +0 -0
data/Rakefile +7 -0
data/lib/sterile/codepoints.rb +46522 -0
data/lib/sterile/html_entities.rb +259 -0
data/lib/sterile/smart_format_rules.rb +40 -0
data/lib/sterile/version.rb +5 -0
data/lib/sterile.rb +339 -0
data/sterilize.gemspec +24 -0
metadata +69 -0

data/lib/sterile/html_entities.rb ADDED Viewed

@@ -0,0 +1,259 @@
+# encoding: UTF-8
+module Sterile
+  HTML_ENTITIES = {
+    "quot" => 34,
+    "amp" => 38,
+    "apos" => 39,
+    "lt" => 60,
+    "gt" => 62,
+    "nbsp" => 160,
+    "iexcl" => 161,
+    "cent" => 162,
+    "pound" => 163,
+    "curren" => 164,
+    "yen" => 165,
+    "brvbar" => 166,
+    "sect" => 167,
+    "uml" => 168,
+    "copy" => 169,
+    "ordf" => 170,
+    "laquo" => 171,
+    "not" => 172,
+    "shy" => 173,
+    "reg" => 174,
+    "macr" => 175,
+    "deg" => 176,
+    "plusmn" => 177,
+    "sup2" => 178,
+    "sup3" => 179,
+    "acute" => 180,
+    "micro" => 181,
+    "para" => 182,
+    "middot" => 183,
+    "cedil" => 184,
+    "sup1" => 185,
+    "ordm" => 186,
+    "raquo" => 187,
+    "frac14" => 188,
+    "frac12" => 189,
+    "frac34" => 190,
+    "iquest" => 191,
+    "Agrave" => 192,
+    "Aacute" => 193,
+    "Acirc" => 194,
+    "Atilde" => 195,
+    "Auml" => 196,
+    "Aring" => 197,
+    "AElig" => 198,
+    "Ccedil" => 199,
+    "Egrave" => 200,
+    "Eacute" => 201,
+    "Ecirc" => 202,
+    "Euml" => 203,
+    "Igrave" => 204,
+    "Iacute" => 205,
+    "Icirc" => 206,
+    "Iuml" => 207,
+    "ETH" => 208,
+    "Ntilde" => 209,
+    "Ograve" => 210,
+    "Oacute" => 211,
+    "Ocirc" => 212,
+    "Otilde" => 213,
+    "Ouml" => 214,
+    "times" => 215,
+    "Oslash" => 216,
+    "Ugrave" => 217,
+    "Uacute" => 218,
+    "Ucirc" => 219,
+    "Uuml" => 220,
+    "Yacute" => 221,
+    "THORN" => 222,
+    "szlig" => 223,
+    "agrave" => 224,
+    "aacute" => 225,
+    "acirc" => 226,
+    "atilde" => 227,
+    "auml" => 228,
+    "aring" => 229,
+    "aelig" => 230,
+    "ccedil" => 231,
+    "egrave" => 232,
+    "eacute" => 233,
+    "ecirc" => 234,
+    "euml" => 235,
+    "igrave" => 236,
+    "iacute" => 237,
+    "icirc" => 238,
+    "iuml" => 239,
+    "eth" => 240,
+    "ntilde" => 241,
+    "ograve" => 242,
+    "oacute" => 243,
+    "ocirc" => 244,
+    "otilde" => 245,
+    "ouml" => 246,
+    "divide" => 247,
+    "oslash" => 248,
+    "ugrave" => 249,
+    "uacute" => 250,
+    "ucirc" => 251,
+    "uuml" => 252,
+    "yacute" => 253,
+    "thorn" => 254,
+    "yuml" => 255,
+    "OElig" => 338,
+    "oelig" => 339,
+    "Scaron" => 352,
+    "scaron" => 353,
+    "Yuml" => 376,
+    "fnof" => 402,
+    "circ" => 710,
+    "tilde" => 732,
+    "Alpha" => 913,
+    "Beta" => 914,
+    "Gamma" => 915,
+    "Delta" => 916,
+    "Epsilon" => 917,
+    "Zeta" => 918,
+    "Eta" => 919,
+    "Theta" => 920,
+    "Iota" => 921,
+    "Kappa" => 922,
+    "Lambda" => 923,
+    "Mu" => 924,
+    "Nu" => 925,
+    "Xi" => 926,
+    "Omicron" => 927,
+    "Pi" => 928,
+    "Rho" => 929,
+    "Sigma" => 931,
+    "Tau" => 932,
+    "Upsilon" => 933,
+    "Phi" => 934,
+    "Chi" => 935,
+    "Psi" => 936,
+    "Omega" => 937,
+    "alpha" => 945,
+    "beta" => 946,
+    "gamma" => 947,
+    "delta" => 948,
+    "epsilon" => 949,
+    "zeta" => 950,
+    "eta" => 951,
+    "theta" => 952,
+    "iota" => 953,
+    "kappa" => 954,
+    "lambda" => 955,
+    "mu" => 956,
+    "nu" => 957,
+    "xi" => 958,
+    "omicron" => 959,
+    "pi" => 960,
+    "rho" => 961,
+    "sigmaf" => 962,
+    "sigma" => 963,
+    "tau" => 964,
+    "upsilon" => 965,
+    "phi" => 966,
+    "chi" => 967,
+    "psi" => 968,
+    "omega" => 969,
+    "thetasym" => 977,
+    "upsih" => 978,
+    "piv" => 982,
+    "ensp" => 8194,
+    "emsp" => 8195,
+    "thinsp" => 8201,
+    "zwnj" => 8204,
+    "zwj" => 8205,
+    "lrm" => 8206,
+    "rlm" => 8207,
+    "ndash" => 8211,
+    "mdash" => 8212,
+    "lsquo" => 8216,
+    "rsquo" => 8217,
+    "sbquo" => 8218,
+    "ldquo" => 8220,
+    "rdquo" => 8221,
+    "bdquo" => 8222,
+    "dagger" => 8224,
+    "Dagger" => 8225,
+    "bull" => 8226,
+    "hellip" => 8230,
+    "permil" => 8240,
+    "prime" => 8242,
+    "Prime" => 8243,
+    "lsaquo" => 8249,
+    "rsaquo" => 8250,
+    "oline" => 8254,
+    "frasl" => 8260,
+    "euro" => 8364,
+    "image" => 8465,
+    "weierp" => 8472,
+    "real" => 8476,
+    "trade" => 8482,
+    "alefsym" => 8501,
+    "larr" => 8592,
+    "uarr" => 8593,
+    "rarr" => 8594,
+    "darr" => 8595,
+    "harr" => 8596,
+    "crarr" => 8629,
+    "lArr" => 8656,
+    "uArr" => 8657,
+    "rArr" => 8658,
+    "dArr" => 8659,
+    "hArr" => 8660,
+    "forall" => 8704,
+    "part" => 8706,
+    "exist" => 8707,
+    "empty" => 8709,
+    "nabla" => 8711,
+    "isin" => 8712,
+    "notin" => 8713,
+    "ni" => 8715,
+    "prod" => 8719,
+    "sum" => 8721,
+    "minus" => 8722,
+    "lowast" => 8727,
+    "radic" => 8730,
+    "prop" => 8733,
+    "infin" => 8734,
+    "ang" => 8736,
+    "and" => 8743,
+    "or" => 8744,
+    "cap" => 8745,
+    "cup" => 8746,
+    "int" => 8747,
+    "there4" => 8756,
+    "sim" => 8764,
+    "cong" => 8773,
+    "asymp" => 8776,
+    "ne" => 8800,
+    "equiv" => 8801,
+    "le" => 8804,
+    "ge" => 8805,
+    "sub" => 8834,
+    "sup" => 8835,
+    "nsub" => 8836,
+    "sube" => 8838,
+    "supe" => 8839,
+    "oplus" => 8853,
+    "otimes" => 8855,
+    "perp" => 8869,
+    "sdot" => 8901,
+    "lceil" => 8968,
+    "rceil" => 8969,
+    "lfloor" => 8970,
+    "rfloor" => 8971,
+    "lang" => 10216,
+    "rang" => 10217,
+    "loz" => 9674,
+    "spades" => 9824,
+    "clubs" => 9827,
+    "hearts" => 9829,
+    "diams" => 9830
+  }
+end

data/lib/sterile/smart_format_rules.rb ADDED Viewed

@@ -0,0 +1,40 @@
+# encoding: UTF-8
+module Sterile
+  SMART_FORMAT_RULES = [
+    ["'tain't", "’tain’t"],
+    ["'twere", "’twere"],
+    ["'twas", "’twas"],
+    ["'tis", "’tis"],
+    ["'twill", "’twill"],
+    ["'til", "’til"],
+    ["'bout", "’bout"],
+    ["'nuff", "’nuff"],
+    ["'round", "’round"],
+    ["'cause", "’cause"],
+    ["'cos", "’cos"],
+    ["i'm", "i’m"],
+    ['--"', "—”"],
+    ["--'", "—’"],
+    ["--", "—"],
+    ["...", "…"],
+    ["(tm)", "™"],
+    ["(TM)", "™"],
+    ["(c)", "©"],
+    ["(r)", "®"],
+    ["(R)", "®"],
+    [/s\'([^a-zA-Z0-9])/, "s’\\1"],
+    [/"([:;])/, "”\\1"],
+    [/\'s$/, "’s"],
+    [/\'(\d\d(?:’|\')?s)/, "’\\1"],
+    [/(\s|\A|"|\(|\[)\'/, "\\1‘"],
+    [/(\d+)"/, "\\1′"],
+    [/(\d+)\'/, "\\1″"],
+    [/(\S)\'([^\'\s])/, "\\1’\\2"],
+    [/(\s|\A|\(|\[)"(?!\s)/, "\\1“\\2"],
+    [/"(\s|\S|\Z)/, "”\\1"],
+    [/\'([\s.]|\Z)/, "’\\1"],
+    [/(\d+)x(\d+)/, "\\1×\\2"],
+    [/([a-z])'(t|d|s|ll|re|ve)(\b)/i, "\\1’\\2\\3"]
+  ]
+end

data/lib/sterile/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# encoding: UTF-8
+module Sterile
+  VERSION = "1.0.0" # release version string; the gemspec assigns this constant to s.version
+end

data/lib/sterile.rb ADDED Viewed

@@ -0,0 +1,339 @@
+# encoding: UTF-8
+# Copyright (c) 2011 Patrick Hogan
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+require "sterile/codepoints"
+require "sterile/html_entities"
+require "sterile/smart_format_rules"
+module Sterile
+  class << self
+    def transmogrify(string, &block)
+      raise "No block given" unless block_given?
+      result = ""
+      string.unpack("U*").each do |codepoint|
+        cg = codepoint >> 8
+        cp = codepoint & 0xFF
+        begin
+          mapping = CODEPOINTS[cg][cp]
+          result << yield(mapping, codepoint)
+        rescue
+        end
+      end
+      result
+    end
+    # Transliterate Unicode [and accented ASCII] characters to their plain-text
+    # ASCII equivalents. This is based on data from the stringex gem (https://github.com/rsl/stringex)
+    # which is in turn a port of Perl's Unidecode and ostensibly provides
+    # superior results to iconv. The optical conversion data is based on work
+    # by Eric Boehs at https://github.com/ericboehs/to_slug
+    # Passing an option of :optical => true will prefer optical mapping instead
+    # of more pedantic matches.
+    #
+    #   "ýůçký".transliterate # => "yucky"
+    #
+    def transliterate(string, options = {})
+      options = {
+        :optical => false
+      }.merge!(options)
+      if options[:optical]
+        transmogrify(string) do |mapping, codepoint|
+          mapping[1] || mapping[0] || ""
+        end
+      else
+        transmogrify(string) do |mapping, codepoint|
+          mapping[0] || mapping[1] || ""
+        end
+      end
+    end
+    alias_method :to_ascii, :transliterate
+    # Trim whitespace from start and end of string and remove any redundant
+    # whitespace in between.
+    #
+    #   " Hello  world! ".transliterate # => "Hello world!"
+    #
+    def trim_whitespace(string)
+      string.gsub(/\s+/, " ").strip
+    end
+    # Transliterate to ASCII and strip out any HTML/XML tags.
+    #
+    #   "<b>nåsty</b>".sterilize # => "nasty"
+    #
+    def sterilize(string)
+      strip_tags(transliterate(string))
+    end
+    # Transliterate to ASCII, downcase and format for URL permalink/slug
+    # by stripping out all non-alphanumeric characters and replacing spaces
+    # with a delimiter (defaults to '-').
+    #
+    #   "Hello World!".sluggerize # => "hello-world"
+    #
+    def sluggerize(string, options = {})
+      options = {
+        :delimiter => "-"
+      }.merge!(options)
+      sterilize(string).strip.gsub(/\s+/, "-").gsub(/[^a-zA-Z0-9\-]/, "").gsub(/-+/, options[:delimiter]).downcase
+    end
+    alias_method :to_slug, :sluggerize
+    # Format text with proper "curly" quotes, m-dashes, copyright, trademark, etc.
+    #
+    #   q{"He said, 'Away with you, Drake!'"}.smart_format # => “He said, ‘Away with you, Drake!’”
+    #
+    def smart_format(string)
+      SMART_FORMAT_RULES.each do |rule|
+        string.gsub!(rule[0], rule[1])
+      end
+      string
+    end
+    # Turn Unicode characters into their HTML equivilents.
+    # If a valid HTML entity is not possible, it will create a numeric entity.
+    #
+    #   q{“Economy Hits Bottom,” ran the headline}.smart_format # => &ldquo;Economy Hits Bottom,&rdquo; ran the headline
+    #
+    def encode_entities(string)
+      transmogrify(string) do |mapping, codepoint|
+        if (32..126).include?(codepoint)
+          mapping[0]
+        else
+          "&" + (mapping[2] || "#" + codepoint.to_s) + ";"
+        end
+      end
+    end
+    # The reverse of +encode_entities+. Turns HTML or numeric entities into
+    # their Unicode counterparts.
+    #
+    def decode_entities(string)
+      string.gsub!(/&#(\d{1,4});/) { [$1.to_i].pack("U") }
+      string.gsub(/&([a-zA-Z0-9]+);/) do
+        codepoint = HTML_ENTITIES[$1]
+        codepoint ? [codepoint].pack("U") : $&
+      end
+    end
+    # Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags.
+    # CDATA is considered text unless :keep_cdata => false is specified.
+    # Redundant whitespace will be removed unless :keep_whitespace => true is specified.
+    #
+    def strip_tags(string, options = {})
+      options = {
+        :keep_whitespace => false,
+        :keep_cdata      => true
+      }.merge!(options)
+      string.gsub!(/<[%?](php)?[^>]*>/, '') # strip php, erb et al
+      string.gsub!(/<!--[^-]*-->/, '')      # strip comments
+      string.gsub!(
+        /
+          <!\[CDATA\[
+          ([^\]]*)
+          \]\]>
+        /xi,
+        options[:keep_cdata] ? '\\1' : ''
+      )
+      html_name = /[\w:-]+/
+      html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
+      html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/
+      string.gsub!(
+        /
+          <
+          [\/]?
+          #{html_name}
+          (\s+(#{html_attr}(\s+#{html_attr})*))?
+          \s*
+          [\/]?
+          >
+        /xi,
+        ''
+      )
+      options[:keep_whitespace] ? string : trim_whitespace(string)
+    end
+    # Similar to +gsub+, except it works in between HTML/XML tags and
+    # yields text to a block. Text will be replaced by what the block
+    # returns.
+    # Warning: does not work in some degenerate cases.
+    #
+    def gsub_tags(string, &block)
+      raise "No block given" unless block_given?
+      string.gsub!(/(<[^>]*>)|([^<]+)/) do |match|
+        $2 ? yield($2) : $1
+      end
+    end
+    # Iterates over all text in between HTML/XML tags and yields
+    # it to a block.
+    # Warning: does not work in some degenerate cases.
+    #
+    def scan_tags(string, &block)
+      raise "No block given" unless block_given?
+      string.scan(/(<[^>]*>)|([^<]+)/) do |match|
+        yield($2) unless $2.nil?
+      end
+    end
+    # Like +smart_format+, but works with HTML/XML (somewhat).
+    #
+    def smart_format_tags(string)
+      string.gsub_tags do |text|
+        text.smart_format.encode_entities
+      end
+    end
+    # Format text appropriately for titles. This method is much smarter
+    # than ActiveSupport's +titlecase+. The algorithm is based on work done
+    # by John Gruber et al (http://daringfireball.net/2008/08/title_case_update)
+    #
+    def titlecase(string)
+      string.strip!
+      string.gsub!(/\s+/, " ")
+      string.downcase! unless string =~ /[[:lower:]]/
+      small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
+      apos = / (?: ['’] [[:lower:]]* )? /xu
+      string.gsub!(
+        /
+          \b
+          ([_\*]*)
+          (?:
+            ( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} )      # URL, domain, or email
+            |
+            ( (?i: #{small_words} ) #{apos} )               # or small word, case-insensitive
+            |
+            ( [[:alpha:]] [[:lower:]'’()\[\]{}]* #{apos} )  # or word without internal caps
+            |
+            ( [[:alpha:]] [[:alpha:]'’()\[\]{}]* #{apos} )  # or some other word
+          )
+          ([_\*]*)
+          \b
+        /xu
+      ) do
+        ($1 ? $1 : "") +
+        ($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
+        ($6 ? $6 : "")
+      end
+      if RUBY_VERSION < "1.9.0"
+        string.gsub!(
+          /
+            \b
+            ([:alpha:]+)
+            (‑)
+            ([:alpha:]+)
+            \b
+          /xu
+        ) do
+          $1.downcase.capitalize + $2 + $1.downcase.capitalize
+        end
+      end
+      string.gsub!(
+        /
+          (
+            \A [[:punct:]]*     # start of title
+            | [:.;?!][ ]+       # or of subsentence
+            | [ ]['"“‘(\[][ ]*  # or of inserted subphrase
+          )
+          ( #{small_words} )    # followed by a small-word
+          \b
+        /xiu
+      ) do
+        $1 + $2.downcase.capitalize
+      end
+      string.gsub!(
+        /
+          \b
+          ( #{small_words} )    # small-word
+          (?=
+            [[:punct:]]* \Z     # at the end of the title
+            |
+            ['"’”)\]] [ ]       # or of an inserted subphrase
+          )
+        /xu
+      ) do
+        $1.downcase.capitalize
+      end
+      string.gsub!(
+        /
+          (
+            \b
+            [[:alpha:]]         # single first letter
+            [\-‑]               # followed by a dash
+          )
+          ( [[:alpha:]] )       # followed by a letter
+        /xu
+      ) do
+        $1 + $2.downcase
+      end
+      string.gsub!(/q&a/i, 'Q&A')
+      string
+    end
+  end
+end
+# Add extensions to String
+#
+class String
+  Sterile.methods(false).each do |method|
+    eval("def #{method}(*args, &block); Sterile.#{method}(self, *args, &block); end")
+    eval("def #{method}!(*args, &block); replace Sterile.#{method}(self, *args, &block); end")
+  end
+end

data/sterilize.gemspec ADDED Viewed

@@ -0,0 +1,24 @@
+# encoding: UTF-8
+$:.push File.expand_path("../lib", __FILE__) # put the sibling lib/ directory on the load path for the require below
+require "sterile/version"
+Gem::Specification.new do |s|
+  s.name        = "sterile"
+  s.version     = Sterile::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Patrick Hogan"]
+  s.email       = ["pbhogan@gmail.com"]
+  s.homepage    = "https://github.com/pbhogan/sterile"
+  s.summary     = %q{Sterilize your strings! Transliterate, generate slugs, smart format, strip tags, encode/decode entities and more.}
+  s.description = s.summary # description reuses the summary text
+  s.rubyforge_project = "sterile"
+  # s.add_dependency("nokogiri")
+  s.files         = `git ls-files`.split("\n") # every file listed by git ls-files
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n") # listed files under test/, spec/ or features/
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) } # basenames of listed files under bin/
+  s.require_paths  = ["lib"]
+end