RubyGems - sterile - Versions diffs - 1.0.1 → 1.0.2 - Mend

sterile 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/.autotest +1 -1
data/.yaropts +1 -0
data/Gemfile +1 -1
data/Gemfile.lock +1 -1
data/lib/sterile/data/codepoints_data.rb +46527 -0
data/lib/sterile/data/html_entities_data.rb +264 -0
data/lib/sterile/data/smart_format_rules.rb +45 -0
data/lib/sterile/entities.rb +48 -0
data/lib/sterile/smart_format.rb +41 -0
data/lib/sterile/string_extensions.rb +19 -0
data/lib/sterile/tags.rb +78 -0
data/lib/sterile/titlecase.rb +123 -0
data/lib/sterile/transliterate.rb +65 -0
data/lib/sterile/utilities.rb +43 -0
data/lib/sterile/version.rb +1 -1
data/lib/sterile.rb +7 -314
metadata +13 -5
data/lib/sterile/codepoints.rb +0 -46523
data/lib/sterile/html_entities.rb +0 -260
data/lib/sterile/smart_format_rules.rb +0 -41

data/lib/sterile/data/html_entities_data.rb ADDED Viewed

@@ -0,0 +1,264 @@
# encoding: UTF-8
module Sterile
  # @private
  class Data
    class << self
      # Build the lookup table used to resolve named HTML character references.
      #
      # Keys are entity names without the surrounding "&" and ";"; values are
      # the decimal Unicode codepoints they stand for. The Hash literal is
      # evaluated on every call, so each call returns a fresh object.
      #
      # @return [Hash{String => Integer}] entity name => codepoint
      def html_entities_data
        {
          # Markup-significant ASCII characters
          "quot" => 34, "amp" => 38, "apos" => 39, "lt" => 60, "gt" => 62,
          # Latin-1 punctuation and symbols (160..191)
          "nbsp" => 160, "iexcl" => 161, "cent" => 162, "pound" => 163,
          "curren" => 164, "yen" => 165, "brvbar" => 166, "sect" => 167,
          "uml" => 168, "copy" => 169, "ordf" => 170, "laquo" => 171,
          "not" => 172, "shy" => 173, "reg" => 174, "macr" => 175,
          "deg" => 176, "plusmn" => 177, "sup2" => 178, "sup3" => 179,
          "acute" => 180, "micro" => 181, "para" => 182, "middot" => 183,
          "cedil" => 184, "sup1" => 185, "ordm" => 186, "raquo" => 187,
          "frac14" => 188, "frac12" => 189, "frac34" => 190, "iquest" => 191,
          # Latin-1 uppercase letters (192..223)
          "Agrave" => 192, "Aacute" => 193, "Acirc" => 194, "Atilde" => 195,
          "Auml" => 196, "Aring" => 197, "AElig" => 198, "Ccedil" => 199,
          "Egrave" => 200, "Eacute" => 201, "Ecirc" => 202, "Euml" => 203,
          "Igrave" => 204, "Iacute" => 205, "Icirc" => 206, "Iuml" => 207,
          "ETH" => 208, "Ntilde" => 209, "Ograve" => 210, "Oacute" => 211,
          "Ocirc" => 212, "Otilde" => 213, "Ouml" => 214, "times" => 215,
          "Oslash" => 216, "Ugrave" => 217, "Uacute" => 218, "Ucirc" => 219,
          "Uuml" => 220, "Yacute" => 221, "THORN" => 222, "szlig" => 223,
          # Latin-1 lowercase letters (224..255)
          "agrave" => 224, "aacute" => 225, "acirc" => 226, "atilde" => 227,
          "auml" => 228, "aring" => 229, "aelig" => 230, "ccedil" => 231,
          "egrave" => 232, "eacute" => 233, "ecirc" => 234, "euml" => 235,
          "igrave" => 236, "iacute" => 237, "icirc" => 238, "iuml" => 239,
          "eth" => 240, "ntilde" => 241, "ograve" => 242, "oacute" => 243,
          "ocirc" => 244, "otilde" => 245, "ouml" => 246, "divide" => 247,
          "oslash" => 248, "ugrave" => 249, "uacute" => 250, "ucirc" => 251,
          "uuml" => 252, "yacute" => 253, "thorn" => 254, "yuml" => 255,
          # Latin Extended letters and spacing modifiers
          "OElig" => 338, "oelig" => 339, "Scaron" => 352, "scaron" => 353,
          "Yuml" => 376, "fnof" => 402, "circ" => 710, "tilde" => 732,
          # Greek uppercase
          "Alpha" => 913, "Beta" => 914, "Gamma" => 915, "Delta" => 916,
          "Epsilon" => 917, "Zeta" => 918, "Eta" => 919, "Theta" => 920,
          "Iota" => 921, "Kappa" => 922, "Lambda" => 923, "Mu" => 924,
          "Nu" => 925, "Xi" => 926, "Omicron" => 927, "Pi" => 928,
          "Rho" => 929, "Sigma" => 931, "Tau" => 932, "Upsilon" => 933,
          "Phi" => 934, "Chi" => 935, "Psi" => 936, "Omega" => 937,
          # Greek lowercase and symbol variants
          "alpha" => 945, "beta" => 946, "gamma" => 947, "delta" => 948,
          "epsilon" => 949, "zeta" => 950, "eta" => 951, "theta" => 952,
          "iota" => 953, "kappa" => 954, "lambda" => 955, "mu" => 956,
          "nu" => 957, "xi" => 958, "omicron" => 959, "pi" => 960,
          "rho" => 961, "sigmaf" => 962, "sigma" => 963, "tau" => 964,
          "upsilon" => 965, "phi" => 966, "chi" => 967, "psi" => 968,
          "omega" => 969, "thetasym" => 977, "upsih" => 978, "piv" => 982,
          # General punctuation, spaces and directional marks
          "ensp" => 8194, "emsp" => 8195, "thinsp" => 8201, "zwnj" => 8204,
          "zwj" => 8205, "lrm" => 8206, "rlm" => 8207, "ndash" => 8211,
          "mdash" => 8212, "lsquo" => 8216, "rsquo" => 8217, "sbquo" => 8218,
          "ldquo" => 8220, "rdquo" => 8221, "bdquo" => 8222, "dagger" => 8224,
          "Dagger" => 8225, "bull" => 8226, "hellip" => 8230, "permil" => 8240,
          "prime" => 8242, "Prime" => 8243, "lsaquo" => 8249, "rsaquo" => 8250,
          "oline" => 8254, "frasl" => 8260, "euro" => 8364,
          # Letterlike symbols
          "image" => 8465, "weierp" => 8472, "real" => 8476, "trade" => 8482,
          "alefsym" => 8501,
          # Arrows
          "larr" => 8592, "uarr" => 8593, "rarr" => 8594, "darr" => 8595,
          "harr" => 8596, "crarr" => 8629, "lArr" => 8656, "uArr" => 8657,
          "rArr" => 8658, "dArr" => 8659, "hArr" => 8660,
          # Mathematical operators
          "forall" => 8704, "part" => 8706, "exist" => 8707, "empty" => 8709,
          "nabla" => 8711, "isin" => 8712, "notin" => 8713, "ni" => 8715,
          "prod" => 8719, "sum" => 8721, "minus" => 8722, "lowast" => 8727,
          "radic" => 8730, "prop" => 8733, "infin" => 8734, "ang" => 8736,
          "and" => 8743, "or" => 8744, "cap" => 8745, "cup" => 8746,
          "int" => 8747, "there4" => 8756, "sim" => 8764, "cong" => 8773,
          "asymp" => 8776, "ne" => 8800, "equiv" => 8801, "le" => 8804,
          "ge" => 8805, "sub" => 8834, "sup" => 8835, "nsub" => 8836,
          "sube" => 8838, "supe" => 8839, "oplus" => 8853, "otimes" => 8855,
          "perp" => 8869, "sdot" => 8901,
          # Technical and geometric symbols. lang/rang sit out of numeric
          # order here; 10216/10217 are U+27E8/U+27E9.
          "lceil" => 8968, "rceil" => 8969, "lfloor" => 8970, "rfloor" => 8971,
          "lang" => 10216, "rang" => 10217, "loz" => 9674, "spades" => 9824,
          "clubs" => 9827, "hearts" => 9829, "diams" => 9830
        }
      end
    end
  end
end

data/lib/sterile/data/smart_format_rules.rb ADDED Viewed

@@ -0,0 +1,45 @@
# encoding: UTF-8
module Sterile
  # @private
  class Data
    # Ordered list of typographic substitution rules.
    #
    # Each rule is a two-element array: a pattern (String or Regexp) and its
    # replacement, in the form accepted by String#gsub. The rules are meant to
    # be applied one after another, so order matters: specific contractions
    # and digit/prime rules come before the generic quote rules that would
    # otherwise consume the same characters.
    #
    # @return [Array<Array(String, String), Array(Regexp, String)>]
    def self.smart_format_rules
      [
        # Contractions with a leading apostrophe (must not become opening quotes)
        ["'tain't", "’tain’t"],
        ["'twere", "’twere"],
        ["'twas", "’twas"],
        ["'tis", "’tis"],
        ["'twill", "’twill"],
        ["'til", "’til"],
        ["'bout", "’bout"],
        ["'nuff", "’nuff"],
        ["'round", "’round"],
        ["'cause", "’cause"],
        ["'cos", "’cos"],
        ["i'm", "i’m"],
        # Dashes, ellipsis and legal marks
        ['--"', "—”"],
        ["--'", "—’"],
        ["--", "—"],
        ["...", "…"],
        ["(tm)", "™"],
        ["(TM)", "™"],
        ["(c)", "©"],
        ["(r)", "®"],
        ["(R)", "®"],
        # Possessives, decades and opening single quotes
        [/s\'([^a-zA-Z0-9])/, "s’\\1"],
        [/"([:;])/, "”\\1"],
        [/\'s$/, "’s"],
        [/\'(\d\d(?:’|\')?s)/, "’\\1"],
        [/(\s|\A|"|\(|\[)\'/, "\\1‘"],
        # Measurements: a double quote after digits is a double prime
        # (inches/seconds), a single quote is a prime (feet/minutes).
        [/(\d+)"/, "\\1″"],
        [/(\d+)\'/, "\\1′"],
        # Remaining apostrophes and double quotes
        [/(\S)\'([^\'\s])/, "\\1’\\2"],
        [/(\s|\A|\(|\[)"(?!\s)/, "\\1“"],
        [/"(\s|\S|\Z)/, "”\\1"],
        [/\'([\s.]|\Z)/, "’\\1"],
        # Dimensions (4x6) and any contraction still carrying a straight quote
        [/(\d+)x(\d+)/, "\\1×\\2"],
        [/([a-z])'(t|d|s|ll|re|ve)(\b)/i, "\\1’\\2\\3"]
      ]
    end
  end
end

data/lib/sterile/entities.rb ADDED Viewed

@@ -0,0 +1,48 @@
# encoding: UTF-8
module Sterile
  class << self
    # Turn Unicode characters into their HTML equivalents.
    # If a valid HTML entity is not possible, it will create a numeric entity.
    #
    #   q{“Economy Hits Bottom,” ran the headline}.encode_entities # => &ldquo;Economy Hits Bottom,&rdquo; ran the headline
    #
    # Codepoints 32..126 are emitted as the first element of their mapping;
    # every other codepoint becomes "&name;" when the mapping carries a third
    # element (presumably the entity name - confirm against the codepoints
    # data) and "&#NNN;" otherwise.
    #
    # @param string [String] text to encode
    # @return [String] the encoded text
    def encode_entities(string)
      transmogrify(string) do |mapping, codepoint|
        if (32..126).include?(codepoint)
          mapping[0]
        else
          "&" + (mapping[2] || "#" + codepoint.to_s) + ";"
        end
      end
    end

    # The reverse of +encode_entities+. Turns HTML or numeric entities into
    # their Unicode counterparts.
    #
    # Decimal (&#233;) and hexadecimal (&#xE9;) references are decoded first,
    # then named references. References that are unknown, or that name a value
    # outside the Unicode range (or a surrogate), are left untouched. The
    # argument is never modified; a new string is returned.
    #
    # @param string [String] text containing character references
    # @return [String] the decoded text
    def decode_entities(string)
      numeric_decoded = string.gsub(/&#(?:[xX]([0-9a-fA-F]{1,6})|(\d{1,7}));/) do
        reference = $&
        codepoint = $1 ? $1.to_i(16) : $2.to_i
        # pack("U") raises above U+10FFFF and yields invalid UTF-8 for surrogates
        usable = codepoint <= 0x10FFFF && !(0xD800..0xDFFF).include?(codepoint)
        usable ? [codepoint].pack("U") : reference
      end
      numeric_decoded.gsub(/&([a-zA-Z0-9]+);/) do
        codepoint = html_entities_data[$1]
        codepoint ? [codepoint].pack("U") : $&
      end
    end

    private

    # Lazy load html entities
    #
    def html_entities_data
      @html_entities_data ||= begin
        require "sterile/data/html_entities_data"
        Data.html_entities_data
      end
    end
  end # class << self
end # module Sterile

data/lib/sterile/smart_format.rb ADDED Viewed

@@ -0,0 +1,41 @@
# encoding: UTF-8
module Sterile
  class << self
    # Format text with proper "curly" quotes, em dashes, copyright, trademark, etc.
    #
    #   q{"He said, 'Away with you, Drake!'"}.smart_format # => “He said, ‘Away with you, Drake!’”
    #
    # The rules are applied in order to a copy of the input, so the argument
    # itself is left untouched (frozen strings are accepted).
    #
    # @param string [String] plain text to format
    # @return [String] a new, formatted string
    def smart_format(string)
      smart_format_rules.inject(string.dup) do |text, (pattern, replacement)|
        text.gsub(pattern, replacement)
      end
    end

    # Like +smart_format+, but works with HTML/XML (somewhat).
    #
    # Only the text between tags is formatted and entity-encoded; the tags
    # themselves are passed through. Calls the module-level helpers directly
    # rather than going through the String extensions.
    #
    # @param string [String] markup to format
    # @return [String] a new, formatted string
    def smart_format_tags(string)
      # gsub_tags gets a private copy in case it edits its argument in place
      formatted = gsub_tags(string.dup) do |text|
        encode_entities(smart_format(text))
      end
      # Fall back to a plain copy if gsub_tags reports "nothing replaced" as nil
      formatted || string.dup
    end

    private

    # Lazy load smart formatting rules
    #
    def smart_format_rules
      @smart_format_rules ||= begin
        require "sterile/data/smart_format_rules"
        Data.smart_format_rules
      end
    end
  end # class << self
end # module Sterile

data/lib/sterile/string_extensions.rb ADDED Viewed

@@ -0,0 +1,19 @@
# encoding: UTF-8
module Sterile
  # Mixin exposing every public singleton method of Sterile as a pair of
  # instance methods: +name+, which calls Sterile.name with the receiver as
  # first argument, and +name!+, which replaces the receiver with that result.
  module StringExtensions
    # Hook run on inclusion. Generates the delegating methods on this module
    # for whatever Sterile.methods(false) reports at that moment.
    def self.included(base)
      Sterile.methods(false).each do |name|
        module_eval <<-RUBY
          def #{name}(*args, &block); Sterile.#{name}(self, *args, &block); end
          def #{name}!(*args, &block); replace Sterile.#{name}(self, *args, &block); end
        RUBY
      end
    end
  end
end

String.send(:include, Sterile::StringExtensions)

data/lib/sterile/tags.rb ADDED Viewed

@@ -0,0 +1,78 @@
# encoding: UTF-8
module Sterile
  class << self
    # Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags.
    # CDATA is considered text unless :keep_cdata => false is specified.
    # Redundant whitespace will be removed unless :keep_whitespace => true is specified.
    #
    # The argument is not modified; a new string is returned.
    #
    # @param string [String] markup to clean
    # @param options [Hash] :keep_whitespace (default false), :keep_cdata (default true)
    # @return [String] the text with markup removed
    def strip_tags(string, options = {})
      options = {
        :keep_whitespace => false,
        :keep_cdata      => true
      }.merge(options)

      # Non-bang gsub gives us a private copy that the later gsub! calls may edit.
      # NOTE: the PHP/ERB pattern stops at the first ">", so code containing ">" is
      # only partially removed.
      text = string.gsub(/<[%?](php)?[^>]*>/, '') # strip php, erb et al

      # Lazy, multi-line match so comments containing "-" or newlines are removed too
      text.gsub!(/<!--.*?-->/m, '')

      # Lazy match up to the first "]]>" so CDATA content may contain "]"
      text.gsub!(/<!\[CDATA\[(.*?)\]\]>/mi, options[:keep_cdata] ? '\\1' : '')

      html_name = /[\w:-]+/
      html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
      html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/

      text.gsub!(
        /
          <
          [\/]?
          #{html_name}
          (\s+(#{html_attr}(\s+#{html_attr})*))?
          \s*
          [\/]?
          >
        /xi,
        ''
      )

      # trim_whitespace is defined outside this file
      options[:keep_whitespace] ? text : trim_whitespace(text)
    end

    # Similar to +gsub+, except it works in between HTML/XML tags and
    # yields text to a block. Text will be replaced by what the block
    # returns.
    # Warning: does not work in some degenerate cases.
    #
    # Always returns a new string (never nil) and leaves the argument untouched.
    #
    # @param string [String] markup to process
    # @yieldparam text [String] a run of text between tags
    # @yieldreturn [String] replacement for that run
    # @return [String]
    # @raise [RuntimeError] if no block is given
    def gsub_tags(string, &block)
      raise "No block given" unless block_given?

      string.gsub(/(<[^>]*>)|([^<]+)/) do
        tag, text = $1, $2
        text ? yield(text) : tag
      end
    end

    # Iterates over all text in between HTML/XML tags and yields
    # it to a block.
    # Warning: does not work in some degenerate cases.
    #
    # @param string [String] markup to scan
    # @yieldparam text [String] a run of text between tags
    # @raise [RuntimeError] if no block is given
    def scan_tags(string, &block)
      raise "No block given" unless block_given?

      string.scan(/(<[^>]*>)|([^<]+)/) do |_tag, text|
        yield(text) unless text.nil?
      end
    end
  end # class << self
end # module Sterile

data/lib/sterile/titlecase.rb ADDED Viewed

@@ -0,0 +1,123 @@
# encoding: UTF-8
module Sterile
  class << self
    # Format text appropriately for titles. This method is much smarter
    # than ActiveSupport's +titlecase+. The algorithm is based on work done
    # by John Gruber et al (http://daringfireball.net/2008/08/title_case_update)
    #
    # Leading/trailing whitespace is removed and internal runs of whitespace
    # are collapsed to a single space. Input containing no lowercase letters
    # is downcased first. The argument is not modified; a new string is
    # returned.
    #
    # @param string [String] the title to format
    # @return [String] the title-cased text
    def titlecase(string)
      lsquo = [8216].pack("U")
      rsquo = [8217].pack("U")
      ldquo = [8220].pack("U")
      rdquo = [8221].pack("U")
      ndash = [8211].pack("U")

      # strip/gsub return a fresh string, so the in-place edits below only
      # ever touch this local copy.
      string = string.strip.gsub(/\s+/, " ")
      string.downcase! unless string =~ /[[:lower:]]/

      # Regexp fragments, not plain words: note the lookahead and optional dots.
      small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
      apos = / (?: ['#{rsquo}] [[:lower:]]* )? /xu

      # Pass 1: classify every word and case it accordingly.
      string.gsub!(
        /
          \b
          ([_\*]*)
          (?:
            ( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} )      # URL, domain, or email
            |
            ( (?i: #{small_words} ) #{apos} )               # or small word, case-insensitive
            |
            ( [[:alpha:]] [[:lower:]'#{rsquo}()\[\]{}]* #{apos} )  # or word without internal caps
            |
            ( [[:alpha:]] [[:alpha:]'#{rsquo}()\[\]{}]* #{apos} )  # or some other word
          )
          ([_\*]*)
          \b
        /xu
      ) do
        ($1 ? $1 : "") +
        ($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
        ($6 ? $6 : "")
      end

      # Ruby 1.8 only: capitalize both halves of an en-dashed compound.
      if RUBY_VERSION < "1.9.0"
        string.gsub!(
          /
            \b
            ([[:alpha:]]+)
            (#{ndash})
            ([[:alpha:]]+)
            \b
          /xu
        ) do
          $1.downcase.capitalize + $2 + $3.downcase.capitalize
        end
      end

      # Pass 2: a small word that opens the title, a subsentence or a
      # quoted/bracketed subphrase is capitalized after all.
      string.gsub!(
        /
          (
            \A [[:punct:]]*     # start of title
            | [:.;?!][ ]+       # or of subsentence
            | [ ]['"#{ldquo}#{lsquo}(\[][ ]*  # or of inserted subphrase
          )
          ( #{small_words} )    # followed by a small-word
          \b
        /xiu
      ) do
        $1 + $2.downcase.capitalize
      end

      # Pass 3: likewise for a small word that closes the title or a subphrase.
      string.gsub!(
        /
          \b
          ( #{small_words} )    # small-word
          (?=
            [[:punct:]]* \Z     # at the end of the title
            |
            ['"#{rsquo}#{rdquo})\]] [ ]       # or of an inserted subphrase
          )
        /xu
      ) do
        $1.downcase.capitalize
      end

      # Pass 4: lowercase the letter following a one-letter prefix and a dash.
      string.gsub!(
        /
          (
            \b
            [[:alpha:]]         # single first letter
            [\-#{ndash}]               # followed by a dash
          )
          ( [[:alpha:]] )       # followed by a letter
        /xu
      ) do
        $1 + $2.downcase
      end

      string.gsub!(/q&a/i, 'Q&A')
      string
    end
    alias_method :titleize, :titlecase

    private

    # Lazy load smart formatting rules
    #
    # NOTE(review): nothing in this file calls this method; it looks like a
    # copy of the loader that smart_format needs. Kept so the set of defined
    # methods is unchanged.
    def smart_format_rules
      @smart_format_rules ||= begin
        require "sterile/data/smart_format_rules"
        Data.smart_format_rules
      end
    end
  end # class << self
end # module Sterile

data/lib/sterile/transliterate.rb ADDED Viewed

@@ -0,0 +1,65 @@
# encoding: UTF-8
module Sterile
  class << self
    # Walk +string+ one Unicode codepoint at a time, look the codepoint up in
    # the codepoints table (indexed by high bits, then low byte) and append
    # whatever the block returns for it to the result.
    #
    # Any StandardError raised while handling a codepoint - a missing table
    # entry, or an error inside the block - causes that codepoint to be
    # skipped silently.
    #
    # @yieldparam mapping [Object] the table entry for the codepoint
    # @yieldparam codepoint [Integer]
    # @return [String]
    # @raise [RuntimeError] if no block is given
    def transmogrify(string, &block)
      raise "No block given" unless block_given?

      string.unpack("U*").inject("") do |output, codepoint|
        begin
          entry = codepoints_data[codepoint >> 8][codepoint & 0xFF]
          output << yield(entry, codepoint)
        rescue
          output # best effort: leave this codepoint out
        end
      end
    end

    # Transliterate Unicode [and accented ASCII] characters to their plain-text
    # ASCII equivalents. This is based on data from the stringex gem (https://github.com/rsl/stringex)
    # which is in turn a port of Perl's Unidecode and ostensibly provides
    # superior results to iconv. The optical conversion data is based on work
    # by Eric Boehs at https://github.com/ericboehs/to_slug
    # Passing an option of :optical => true will prefer optical mapping instead
    # of more pedantic matches.
    #
    #   "ýůçký".transliterate # => "yucky"
    #
    # For each codepoint the preferred slot of its mapping is used, then the
    # other slot, then the empty string.
    def transliterate(string, options = {})
      settings = { :optical => false }.merge!(options)
      preferred, fallback = settings[:optical] ? [1, 0] : [0, 1]

      transmogrify(string) do |mapping, _codepoint|
        mapping[preferred] || mapping[fallback] || ""
      end
    end
    alias_method :to_ascii, :transliterate

    private

    # Lazy load codepoints data
    #
    def codepoints_data
      @codepoints_data ||= begin
        require "sterile/data/codepoints_data"
        Data.codepoints_data
      end
    end
  end # class << self
end # module Sterile