RubyGems - imw - Versions diffs - 0.1.0 - Mend

imw 0.1.0

Files changed (111) hide show

data/.gitignore +15 -0
data/CHANGELOG +0 -0
data/LICENSE +674 -0
data/README.rdoc +101 -0
data/Rakefile +20 -0
data/VERSION +1 -0
data/etc/imwrc.rb +76 -0
data/lib/imw.rb +42 -0
data/lib/imw/boot.rb +58 -0
data/lib/imw/dataset.rb +233 -0
data/lib/imw/dataset/datamapper.rb +66 -0
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
data/lib/imw/dataset/loaddump.rb +50 -0
data/lib/imw/dataset/old/file_collection.rb +88 -0
data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
data/lib/imw/dataset/scaffold.rb +132 -0
data/lib/imw/dataset/scraped_uri.rb +305 -0
data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
data/lib/imw/dataset/scrub/scrub.rb +147 -0
data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
data/lib/imw/dataset/scrub/slug.rb +101 -0
data/lib/imw/dataset/stats.rb +73 -0
data/lib/imw/dataset/stats/counter.rb +23 -0
data/lib/imw/dataset/task.rb +38 -0
data/lib/imw/dataset/workflow.rb +81 -0
data/lib/imw/files.rb +110 -0
data/lib/imw/files/archive.rb +113 -0
data/lib/imw/files/basicfile.rb +122 -0
data/lib/imw/files/binary.rb +28 -0
data/lib/imw/files/compressed_file.rb +93 -0
data/lib/imw/files/compressed_files_and_archives.rb +348 -0
data/lib/imw/files/compressible.rb +103 -0
data/lib/imw/files/csv.rb +112 -0
data/lib/imw/files/json.rb +41 -0
data/lib/imw/files/sgml.rb +65 -0
data/lib/imw/files/text.rb +68 -0
data/lib/imw/files/yaml.rb +46 -0
data/lib/imw/packagers.rb +8 -0
data/lib/imw/packagers/archiver.rb +108 -0
data/lib/imw/packagers/s3_mover.rb +28 -0
data/lib/imw/parsers.rb +7 -0
data/lib/imw/parsers/html_parser.rb +382 -0
data/lib/imw/parsers/html_parser/matchers.rb +306 -0
data/lib/imw/parsers/line_parser.rb +87 -0
data/lib/imw/parsers/regexp_parser.rb +72 -0
data/lib/imw/utils.rb +24 -0
data/lib/imw/utils/components.rb +61 -0
data/lib/imw/utils/config.rb +46 -0
data/lib/imw/utils/error.rb +54 -0
data/lib/imw/utils/extensions/array.rb +125 -0
data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
data/lib/imw/utils/extensions/core.rb +43 -0
data/lib/imw/utils/extensions/dir.rb +24 -0
data/lib/imw/utils/extensions/file_core.rb +64 -0
data/lib/imw/utils/extensions/hash.rb +218 -0
data/lib/imw/utils/extensions/hpricot.rb +48 -0
data/lib/imw/utils/extensions/string.rb +49 -0
data/lib/imw/utils/extensions/struct.rb +42 -0
data/lib/imw/utils/extensions/symbol.rb +28 -0
data/lib/imw/utils/extensions/typed_struct.rb +22 -0
data/lib/imw/utils/extensions/uri.rb +59 -0
data/lib/imw/utils/log.rb +67 -0
data/lib/imw/utils/misc.rb +63 -0
data/lib/imw/utils/paths.rb +115 -0
data/lib/imw/utils/uri.rb +59 -0
data/lib/imw/utils/uuid.rb +33 -0
data/lib/imw/utils/validate.rb +38 -0
data/lib/imw/utils/version.rb +12 -0
data/lib/imw/utils/view.rb +113 -0
data/lib/imw/utils/view/dump_csv.rb +112 -0
data/lib/imw/utils/view/dump_csv_older.rb +117 -0
data/spec/data/sample.csv +131 -0
data/spec/data/sample.tsv +131 -0
data/spec/data/sample.txt +131 -0
data/spec/data/sample.xml +653 -0
data/spec/data/sample.yaml +652 -0
data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
data/spec/imw/files/archive_spec.rb +118 -0
data/spec/imw/files/basicfile_spec.rb +121 -0
data/spec/imw/files/bz2_spec.rb +32 -0
data/spec/imw/files/compressed_file_spec.rb +96 -0
data/spec/imw/files/compressible_spec.rb +100 -0
data/spec/imw/files/file_spec.rb +144 -0
data/spec/imw/files/gz_spec.rb +32 -0
data/spec/imw/files/rar_spec.rb +33 -0
data/spec/imw/files/tar_spec.rb +31 -0
data/spec/imw/files/text_spec.rb +23 -0
data/spec/imw/files/zip_spec.rb +31 -0
data/spec/imw/files_spec.rb +38 -0
data/spec/imw/packagers/archiver_spec.rb +125 -0
data/spec/imw/packagers/s3_mover_spec.rb +7 -0
data/spec/imw/parsers/line_parser_spec.rb +96 -0
data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
data/spec/imw/utils/extensions/find_spec.rb +113 -0
data/spec/imw/utils/paths_spec.rb +38 -0
data/spec/imw/workflow/rip/local_spec.rb +89 -0
data/spec/imw/workflow/rip_spec.rb +27 -0
data/spec/rcov.opts +1 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +32 -0
data/spec/support/archive_contents_matcher.rb +94 -0
data/spec/support/custom_matchers.rb +21 -0
data/spec/support/directory_contents_matcher.rb +61 -0
data/spec/support/extensions.rb +18 -0
data/spec/support/file_contents_matcher.rb +50 -0
data/spec/support/random.rb +210 -0
data/spec/support/without_regard_to_order_matcher.rb +58 -0
metadata +196 -0

data/lib/imw/dataset/scraped_uri.rb ADDED

@@ -0,0 +1,305 @@
+# -*- coding: utf-8 -*-
+module Linkish
+  def self.included base
+    base.class_eval do
+      include DataMapper::Resource
+      include Infochimps::Resource
+      property      :id,              Integer,        :serial      => true
+      property      :full_url,        String,         :length      => 255,    :nullable => false,                     :unique_index => true
+      has_handle
+      alias_method  :handle_generator, :full_url
+      has_time_and_user_stamps
+      #
+      property      :name,            String,         :length      => 255,    :nullable => false, :default => ''
+      #
+      property      :file_path,       String,    :length => 1024
+      property      :file_time,       DateTime
+      property      :file_size,       Integer
+      property      :file_sha1,       String,    :length => 40
+      property      :tried_fetch,     DataMapper::Resource::Boolean
+      property      :fetched,         DataMapper::Resource::Boolean
+      #
+      before :create, :make_uuid_and_handle
+      before :create, :update_from_file!
+    end
+    base.extend ClassMethods
+  end
+  # ===========================================================================
+  #
+  # Delegate methods to uri
+  #
+  def uri
+    @uri ||= Addressable::URI.parse(self.full_url)
+  end
+  # Dispatch anything else to the aggregated uri object
+  def method_missing method, *args
+    if self.uri.respond_to?(method)
+      self.uri.send(method, *args)
+    else
+      super method, *args
+    end
+  end
+  def to_s
+    "<a href='#{self.uri.to_s}'>#{self.name}</a>" # <-- !! not escaped !!
+  end
+  # ===========================================================================
+  #
+  # ID, naming, etc
+  #
+  def normalize_url!
+    u = Addressable::URI.parse(self.full_url).normalize
+    self.full_url = u.to_s
+  end
+  # ===========================================================================
+  #
+  # Properly belongs in FileStore module
+  #
+  #
+  # Refresh cached properties from our copy of the asset.
+  #
+  def update_from_file!
+    self.make_uuid_and_handle # make sure this happened
+    # Set the file path
+    self.file_path = self.to_file_path if self.file_path.blank?
+    # FIXME -- kludge to ripd_root
+    if ! File.exist?(actual_path)
+      self.fetched   = false
+    else
+      self.fetched   = self.tried_fetch = true
+      self.file_size = File.size( actual_path)
+      self.file_time = File.mtime(actual_path)
+    end
+    self.fetched
+  end
+  def actual_path
+    path_to(:ripd_root, self.file_path)
+  end
+  # ===========================================================================
+  #
+  # Properly belongs in own module
+  #
+  IMW_WGET_OPTIONS = {
+    :root       => :ripd_root,
+    :wait       => 2,
+    :noretry    => true,
+    :log_level  => Logger::DEBUG,
+    :clobber    => false,
+  }
+  #
+  # Fetch from the web
+  #
+  def wget options={}
+    options.reverse_merge! IMW_WGET_OPTIONS
+    cd path_to(options[:root]) do
+      if (not options[:clobber]) && File.file?(file_path) then
+        IMW.log.add options[:log_level], "Skipping #{file_path}"; return
+      end
+      # Do the fetch
+      mkdir_p File.dirname(actual_path)
+      # defaults are --connect-timeout=infinity --read-timeout=900 --tries=20 acc. to man page
+      cmd = %Q{wget -nv "#{full_url}" -O"#{actual_path}" --connect-timeout=5 --read-timeout=10 --tries=1 &}
+      IMW.log.add(options[:log_level], cmd)
+      IMW.log.add(options[:log_level], `#{cmd}`)
+      self.tried_fetch = true
+      sleep options[:wait] # please hammer don't hurt em
+      update_from_file!
+      self.save
+      return self.fetched
+    end
+  end
+  #
+  #
+  #
+  def contents options={}
+    wget options
+    if fetched
+      File.open actual_path
+    end
+  end
+  # ===========================================================================
+  #
+  # Properly belongs in FileStore
+  #
+  protected
+  #
+  # The standard file path for this url's ripped cache
+  #
+  # * leading directory from reverse.dotted.host_scheme:port:user@password
+  # * normalized path/file?query#fragment
+  # * uuid formed from the
+  #
+  def to_file_path
+    file_path_str = ""
+    file_path_str << to_file_path_root_part
+    file_path_str << to_file_path_path_part
+    file_path_str << to_file_path_file_part
+    file_path_str = self.class.path_str_encode(file_path_str)
+    self.class.validate_roundtrip(file_path_str)
+    file_path_str
+  end
+  def file_timestamp
+    file_time.strftime("%Y%m%d-%H%M%S")
+  end
+  def to_file_path_with_timestamp
+    to_file_path + file_timestamp
+  end
+  #
+  # revhost_scheme:port:user@password -- omitting _scheme if it's http, and
+  # omitting :port:user@password if all three are blank.
+  #
+  def to_file_path_root_part
+    root_part_str = ""
+    tld_host_frag = self.class.tier_path_segment(revhost, /^([^\.]+)\.([^\.]{1,2})/)
+    root_part_str << revhost
+    root_part_str << "_#{uri.scheme}"                           unless uri.scheme == 'http'
+    root_part_str << ":#{uri.port}:#{uri.user}@#{uri.password}" unless uri.simple?
+    root_part_str
+  end
+  def to_file_path_path_part
+    uri.path.to_s
+  end
+  def to_file_path_file_part
+    file_path_str = ""
+    file_path_str << "?#{uri.query}"        unless uri.query.nil?
+    file_path_str << "##{uri.fragment}"     unless uri.fragment.nil?
+    file_path_str << "-#{self.uuid}"
+  end
+  public
+  module ClassMethods
+    #
+    # find_or_creates from url
+    #
+    # url is heuristic_parse'd and normalized by Addressable before lookup:
+    #   "Converts an input to a URI. The input does not have to be a valid URI —
+    #   the method will use heuristics to guess what URI was intended. This is not
+    #   standards compliant, merely user-friendly.
+    #
+    def find_or_create_from_url url_str
+      link = self.find_or_new_from_url url_str
+      link.save
+      link
+    end
+    def find_or_new_from_url url_str # :nodoc:
+      url_str = Addressable::URI.heuristic_parse(url_str).normalize.to_s
+      link = self.first( :full_url => url_str ) || self.new( :full_url => url_str )
+      link.make_uuid_and_handle
+      link.update_from_file!
+      link
+    end
+    def find_or_create_from_file_path ripd_file
+      url_str = Link.url_from_file_path(ripd_file)
+      link = self.first( :full_url => url_str.to_s ) || self.new( :full_url => url_str.to_s )
+      link.file_path = ripd_file
+      link.make_uuid_and_handle
+      link.update_from_file!
+      link.save
+      link
+    end
+    #
+    # Decode url from its file_path
+    #
+    def url_from_file_path fp
+      fp = path_str_decode(fp)
+      m = (%r{\A
+            (#{Addressable::URI::HOST_TLD})  # tld tier
+           /(..?)                            # revhost tier
+           /([^/\:_]+)                       # revhost
+        (?:_([^/\:]+))?                      # _scheme
+        (?::(\d*):([^/]*)@([^@/]*?))?        # :port:user@password
+           /(?:(.*?)/)?                      # /dirs/
+            ([^/]*)                          #  file
+           -([a-f0-9]{32})                   # -uuid
+                                \z}x.match(fp))
+      raise "Can't extract url from file path #{fp}" if !m
+      fp_host, fp_scheme, fp_port, fp_user, fp_pass, fp_path, fp_file, fp_uuid = m.captures
+      fp_host     = fp_host.split('.').reverse.join('.')
+      fp_scheme ||= 'http'
+      fp_pass     = ":#{fp_pass}"             unless fp_pass.blank?
+      fp_userpass = "#{fp_user}#{fp_user}@"   unless fp_user.blank?
+      fp_port     = ":#{fp_port}"             unless fp_port.blank?
+      fp_path     = File.join(*[fp_path, fp_file].compact)
+      "#{fp_scheme}://#{fp_userpass}#{fp_host}#{fp_port}/#{fp_path}"
+    end
+    #
+    # to control files-per-directory madness, take a path segment like "foobar" in
+    #   blah.com/top/foobar/directory
+    # and transform into
+    #   blah.com/top/fo/foobar/directory
+    #
+    # Ex.
+    #   self.class.tier_path_segment('a_username')
+    #   # => 'a_/a_username'
+    #   self.class.tier_path_segment('1')
+    #   # => '1/1'
+    #   self.class.tier_path_segment('com.twitter', /^([^\.]+)\.([^\.]{1,2})/)
+    #   # => 'com/tw/com.twitter'
+    #
+    def self.tier_path_segment(path_seg, re=/(..?)/)
+      frag_seg = re.match(path_seg).captures
+      raise "Can't tier path_seg #{path_seg} using #{re}" if frag_seg.blank?
+      File.join(* [frag_seg, path_seg].flatten )
+    end
+    #
+    #
+    # It's really bad if you can't roundtrip --
+    # since saving is the rare case (only done once!) we insist on checking.
+    #
+    def self.validate_roundtrip file_path_str
+      # uu = self.class.url_from_file_path(file_path_str)
+      # puts "*"*75, uri.to_hash.inspect, ['path str', file_path_str, 'uri', uri.to_s, 'rt', uu.to_s].inspect
+      return_trip_url = Addressable::URI.parse(self.class.url_from_file_path(file_path_str))
+      raise "crapsticks: uri doesn't roundtrip #{file_path_str} to #{uri.to_s}: #{return_trip_url}" if return_trip_url != uri
+    end
+    #
+    # Uses a similar scheme as the 'Quoted Printable' encoding, but more strict
+    # and without linebreaking or anything. The intent is to reversibly and
+    # recognizably store URLs to disk with names that (apart from path) do not
+    # need to be further escaped in filesystem, URL, database or HTML.
+    #
+    # The only characters in a path_encoded string are alpha-numeric /_-.=
+    #
+    # Rules:
+    # * Any character that is not alphanumeric, and is not /_-.  is encoded as an
+    #   equals sign = followed by its upper-case hex encoding.
+    #
+    # * Furthermore, in any sequence of repeated '.' characters, all after the
+    #   first are hex encoded; same with '/'.
+    #
+    # Ex.
+    #   path_encode("www.measuringworth.com/datasets/consumer/result.php?use[]=VCB&use[]=CU&use[]=SZ&year_source=1900&year_result=2007"
+    #   # => www.measuringworth.com/datasets/consumer/result.php=3Fuse=5B=5D=3DVCB=26use=5B=5D=3DCU=26use=5B=5D=3DSZ=26year_source=3D1900=26year_result=3D2007
+    #
+    # Code inspired by "Glenn Parker's response to ruby quiz #23"http://www.rubyquiz.com/quiz23.html
+    #
+    def path_str_encode(str)
+      str.gsub(%r{\.(\.+)}){|chars| '.'+path_encode_chars(chars) }
+      str.gsub(%r{\/(\/+)}){|chars| '/'+path_encode_chars(chars) }
+      str.gsub(%r{[^A-Za-z0-9/_\-\.]+}){|chars| path_encode_chars(chars) }
+    end
+    #
+    # See the notes in path_encode
+    #
+    def path_str_decode(str)
+      str.gsub(/\+([\dA-F]{2})/){ $1.hex.chr }
+    end
+    protected
+    def path_encode_chars(chars) # :nodoc:
+      # send each character to an equals sign followed by its uppercase hex encoding
+      encoded = "";
+      chars.each_byte{|c| encoded << "+%02X" % c }
+      encoded
+    end
+    public
+  end
+end

data/lib/imw/dataset/scrub/old_working_scrubber.rb ADDED

@@ -0,0 +1,87 @@
+  def self.url_from_file_path fp
+    # FIXME -- doesn't work with extension preservation
+    unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{30,32})\z}.match(fp)) then
+      # m1 = %r{\A([^/_]+)(_[^/]+)?/(?:(.*?))-([a-f0-9]{28,})}i.match(fp);
+      raise "Bad match to #{fp}"
+    end
+    fp_host, fp_scheme, fp_path, fp_file, fp_uuid, fp_ext = m.captures
+    fp_host     = fp_host.split('.').reverse.join('.')
+    fp_scheme ||= 'http'
+    fp_path     = File.join(*[fp_path, fp_file].compact) # FIXME -- no ext
+    url = Addressable::URI.new(fp_scheme, nil, nil, fp_host, nil, fp_path, nil, nil)
+    unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{32})\z}.match(fp)) then
+      # warn "Bad luck!!! #{url.path} hash is #{fp_uuid} vs #{UUID.sha1_create(UUID_INFOCHIMPS_LINKS_NAMESPACE, url.to_s).hexdigest}"
+    end
+    url
+  end
+  #
+  # returns [dirname, basename, ext] for the file_path
+  # ext is determined by basename_ext_splitter
+  #
+  def path_split
+    path_split_str path
+  end
+  # lowercase; only a-z, num, . -
+  def scrubbed_revhost
+    return unless revhost
+    revhost.downcase.gsub(/[^a-z0-9\.\-]+/i, '')  # note: no _
+  end
+  cattr_accessor  :basename_ext_splitter
+  BASENAME_EXT_SPLIT_SMART = /(.+?)\.(tar\.gz|tar\.bz2|[^\.]+)/
+  BASENAME_EXT_NO_SPLIT    = /(.+?)()/
+  self.basename_ext_splitter = BASENAME_EXT_NO_SPLIT
+  #
+  # Like File.split but heuristically handles things like .tar.bz2:
+  #
+  #   foo.        => ['foo.', '']
+  #   foo.tar.gz  => ['foo.', '']
+  #   foo.tar.bz2 => ['foo.', '']
+  #   foo.yaml    => ['foo', '']
+  #
+  def path_split_str str
+    if str =~ %r{/.+\z}
+      dirname, basename = %r{\A(.*)/([^/]+)\z}.match(str).captures
+    else
+      dirname, basename = ['', str]
+    end
+    # Get basename, extension (as given by capture groups in basename_ext_splitter)
+    if basename_ext_splitter && (m = /\A#{basename_ext_splitter}\z/i.match(basename))
+      basename, ext = m.captures
+    else
+      basename, ext = [basename, '']
+    end
+    [dirname, basename, ext]
+  end
+  # remove all blank components, join the rest with separator
+  def join_non_blank separator, *strs
+    strs.reject(&:blank?).join(separator)
+  end
+  # only a-z A-Z, num, .-_/
+  def scrubbed_path
+    path_part = path
+    # colons into /
+    path_part = path_part.gsub(%r{\:+}, '/')
+    # Kill weird chars
+    path_part = path_part.gsub(%r{[^a-zA-Z0-9\.\-_/]+}, '_')
+    # Compact (killing foo/../bar, etc)
+    path_part = path_part.gsub(%r{/[^a-zA-Z0-9]+/}, '/').gsub(%r{/\.\.+/}, '.')
+    # Kill leading & trailing non-alnum
+    path_part = path_part.gsub(%r{^[^a-zA-Z0-9]+}, '').gsub(%r{[^a-zA-Z0-9]+$}, '')
+  end
+  #
+  # name for this URL regarded as a file (instance)
+  #
+  def to_file_path
+    dirname, basename, ext = path_split_str(scrubbed_path)
+    basename = join_non_blank '-', basename, uuid
+    basename = join_non_blank '.', basename, ext
+    join_non_blank '/', root_path, dirname, basename
+  end

data/lib/imw/dataset/scrub/scrub.rb ADDED

@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+require 'rubygems'
+require 'active_support'
+require 'uuidtools'
+#
+# Whitelist-based string scrubbers: each class names the characters it allows
+# (validator) and can either test a string against them (valid?) or replace
+# everything else (sanitize).
+#
+module Scrub
+  class Generic
+    # A regular expression character group
+    # (a bunch of characters ready to drop into /[#{validator}]*/)
+    # whitelisting allowed characters
+    #
+    # Must be overridden in child class
+    class_inheritable_accessor :validator
+    # Sentence fragment for error message on failed validation.
+    class_inheritable_accessor :complaint
+    self.complaint = "has characters I can't understand"
+    # Proc or string or anything that can be 2nd arg to gsub
+    # to sanitize
+    class_inheritable_accessor :replacer
+    self.replacer  = '-'
+    # A regular expression to sanitize objects
+    # if unset or nil, the validator char group
+    class_inheritable_accessor :sanitizer
+    # unless overridden or set expressly, just use the
+    # validator
+    # NOTE(review): reads @sanitizer, which nothing in this file assigns, and
+    # #sanitize does not consult this method -- confirm intended use.
+    def sanitizer
+      @sanitizer || self.validator
+    end
+    # Returns str (via to_s) with each run of non-whitelisted characters
+    # replaced by replacer.
+    def sanitize str
+      str = str.to_s
+      str.gsub(%r{([^#{validator_chars}]+)}u, replacer)
+    end
+    # MatchData (truthy) if str holds only whitelisted characters, else nil.
+    def valid? str
+      %r{\A([#{validator_chars}]*)\z}u.match(str)
+    end
+    protected
+    # The bare character group to drop inside [...]. A Regexp validator must
+    # contribute its source: Regexp#to_s wraps it as "(?-mix:...)", whose
+    # characters would otherwise all land in the whitelist.
+    def validator_chars
+      validator.respond_to?(:source) ? validator.source : validator.to_s
+    end
+  end
+  #
+  # A permissive, ASCII-only name string - no control chars, newlines, backslash
+  # or <> angle brackets
+  #
+  class Title < Scrub::Generic
+    self.complaint = "should only contain basic keyboard characters (and should not use \\ &lt; or &gt;)."
+    self.validator = %r{a-zA-Z0-9_ ~\!@#\$%\^&\*\(\)\-\+=;\:'"`\[\]\{\}\|,\?\.\/}u
+  end
+  #
+  # Like Title, but additionally allows anything in the [:alpha:] and
+  # [:digit:] character classes - still no control chars, newlines, backslash
+  # or <> angle brackets
+  #
+  class UnicodeTitle < Scrub::Title
+    self.complaint  = "should only contain keyboard characters (and should not use \\ &lt; or &gt;)."
+    # build on Title's character group itself, not its "(?-mix:...)" wrapper
+    self.validator  = %r{[:alpha:][:digit:]#{Scrub::Title.validator.source}}u
+  end
+  #
+  # Visible characters and spaces (i.e. anything except control characters, etc.)
+  #
+  class FreeText < Scrub::Generic
+    self.complaint  = "should not contain control characters or that kind of junk."
+    self.validator  = %r{[:print:]\n\t}u
+  end
+  #
+  # insist that a string start with an ASCII letter.
+  #
+  module BeginsWithAlpha
+    mattr_accessor :slug
+    self.slug = 'x'
+    # prepend #{slug}#{replacer} to the string if it starts with non-alpha.
+    # so, for instance '23jumpstreet' => 'x_23jumpstreet'
+    def sanitize_with_begins_with_alpha str
+      str = sanitize_without_begins_with_alpha str
+      str = Scrub::BeginsWithAlpha.slug + replacer + str if (str !~ /\A[a-z]/i)  # call at end of chain!
+      str
+    end
+    def valid_with_begins_with_alpha? str
+      # \A, not ^: only the very first character of the string counts
+      (str =~ /\A[a-z]/i) && valid_without_begins_with_alpha?(str)
+    end
+    def self.included base
+      base.alias_method_chain :sanitize, :begins_with_alpha # unless defined?(base.sanitize_without_begins_with_alpha)
+      base.alias_method_chain :valid?,   :begins_with_alpha # unless defined?(base.valid_without_begins_with_alpha?)
+    end
+  end
+  #
+  # insist that a string be lowercased.
+  #
+  module Lowercased
+    def sanitize_with_lowercased str
+      # Downcase before scrubbing so uppercase letters survive a lowercase-only
+      # whitelist, and again afterwards in case slug or replacer add any.
+      str = sanitize_without_lowercased str.to_s.downcase
+      str.downcase # call at end of chain!
+    end
+    def valid_with_lowercase? str
+      (str !~ /[[:upper:]]/u) && valid_without_lowercase?(str)
+    end
+    def self.included base
+      base.alias_method_chain :sanitize, :lowercased # unless defined?(base.sanitize_without_lowercased)
+      base.alias_method_chain :valid?,   :lowercase  # unless defined?(base.valid_without_lowercase?)
+    end
+  end
+  #
+  # start with a letter, and contain only a-z0-9_
+  #
+  class Identifier < Scrub::Generic
+    self.complaint  = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
+    self.validator  = %r{a-z0-9_}u
+    self.replacer   = '_'
+    include Scrub::BeginsWithAlpha
+    include Scrub::Lowercased
+  end
+  #
+  # start with a letter, and contain only a-z0-9_
+  #
+  class Handle < Scrub::Generic
+    self.complaint  = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
+    self.validator  = %r{a-z0-9_}u
+    self.replacer   = '_'
+    include Scrub::BeginsWithAlpha
+    include Scrub::Lowercased
+  end
+  # HANDLE_RE  = %r{\A[a-z][]*\z}i # ascii, not :alpha: etc.
+  # HANDLE_MSG = "should start with a letter, and contain only characters like a-z0-9_-."
+  #
+  # # "Domain names are restricted to the ASCII letters a through z
+  # # (case-insensitive), the digits 0 through 9, and the hyphen, with some other
+  # # restrictions in terms of name length and position of hyphens."
+  # # (http://en.wikipedia.org/wiki/Domain_name#Overview)
+  # # http://tools.ietf.org/html/rfc1034
+  # DOMAIN_RE    = %r{\A[a-z][a-z0-9\-][a-z0-9]\z}i # case insensitive
+  # DOMAIN_MSG   = "should look like a domain name."
+  # DOMAIN_MORE  = "only letters, digits or hyphens (-), start with a letter and end with a letter or number."
+  MSG_EMAIL_BAD      = "should look like an email address (you@somethingsomething.com) and include only letters, numbers and .&nbsp;+&nbsp;-&nbsp;&#37; please."
+  RE_EMAIL_NAME      = '[\w\.%\+\-]+'                          # what you actually see in practice
+  # NB: a bare character group (no brackets) -- wrap it in [...] to use it
+  RE_EMAIL_N_RFC2822 = '0-9A-Z!#\$%\&\'\*\+_/=\?^\-`\{|\}~\.' # technically allowed by RFC-2822
+  RE_DOMAIN_HEAD     = '(?:[A-Z0-9\-]+\.)+'
+  RE_DOMAIN_TLD      = '(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'
+  RE_EMAIL_OK        = /\A#{RE_EMAIL_NAME}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
+  RE_EMAIL_RFC2822   = /\A[#{RE_EMAIL_N_RFC2822}]+@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
+end