RubyGems - imw - Versions diffs - 0.1.0 - Mend

imw 0.1.0

Files changed (111) hide show

data/.gitignore +15 -0
data/CHANGELOG +0 -0
data/LICENSE +674 -0
data/README.rdoc +101 -0
data/Rakefile +20 -0
data/VERSION +1 -0
data/etc/imwrc.rb +76 -0
data/lib/imw.rb +42 -0
data/lib/imw/boot.rb +58 -0
data/lib/imw/dataset.rb +233 -0
data/lib/imw/dataset/datamapper.rb +66 -0
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
data/lib/imw/dataset/loaddump.rb +50 -0
data/lib/imw/dataset/old/file_collection.rb +88 -0
data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
data/lib/imw/dataset/scaffold.rb +132 -0
data/lib/imw/dataset/scraped_uri.rb +305 -0
data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
data/lib/imw/dataset/scrub/scrub.rb +147 -0
data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
data/lib/imw/dataset/scrub/slug.rb +101 -0
data/lib/imw/dataset/stats.rb +73 -0
data/lib/imw/dataset/stats/counter.rb +23 -0
data/lib/imw/dataset/task.rb +38 -0
data/lib/imw/dataset/workflow.rb +81 -0
data/lib/imw/files.rb +110 -0
data/lib/imw/files/archive.rb +113 -0
data/lib/imw/files/basicfile.rb +122 -0
data/lib/imw/files/binary.rb +28 -0
data/lib/imw/files/compressed_file.rb +93 -0
data/lib/imw/files/compressed_files_and_archives.rb +348 -0
data/lib/imw/files/compressible.rb +103 -0
data/lib/imw/files/csv.rb +112 -0
data/lib/imw/files/json.rb +41 -0
data/lib/imw/files/sgml.rb +65 -0
data/lib/imw/files/text.rb +68 -0
data/lib/imw/files/yaml.rb +46 -0
data/lib/imw/packagers.rb +8 -0
data/lib/imw/packagers/archiver.rb +108 -0
data/lib/imw/packagers/s3_mover.rb +28 -0
data/lib/imw/parsers.rb +7 -0
data/lib/imw/parsers/html_parser.rb +382 -0
data/lib/imw/parsers/html_parser/matchers.rb +306 -0
data/lib/imw/parsers/line_parser.rb +87 -0
data/lib/imw/parsers/regexp_parser.rb +72 -0
data/lib/imw/utils.rb +24 -0
data/lib/imw/utils/components.rb +61 -0
data/lib/imw/utils/config.rb +46 -0
data/lib/imw/utils/error.rb +54 -0
data/lib/imw/utils/extensions/array.rb +125 -0
data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
data/lib/imw/utils/extensions/core.rb +43 -0
data/lib/imw/utils/extensions/dir.rb +24 -0
data/lib/imw/utils/extensions/file_core.rb +64 -0
data/lib/imw/utils/extensions/hash.rb +218 -0
data/lib/imw/utils/extensions/hpricot.rb +48 -0
data/lib/imw/utils/extensions/string.rb +49 -0
data/lib/imw/utils/extensions/struct.rb +42 -0
data/lib/imw/utils/extensions/symbol.rb +28 -0
data/lib/imw/utils/extensions/typed_struct.rb +22 -0
data/lib/imw/utils/extensions/uri.rb +59 -0
data/lib/imw/utils/log.rb +67 -0
data/lib/imw/utils/misc.rb +63 -0
data/lib/imw/utils/paths.rb +115 -0
data/lib/imw/utils/uri.rb +59 -0
data/lib/imw/utils/uuid.rb +33 -0
data/lib/imw/utils/validate.rb +38 -0
data/lib/imw/utils/version.rb +12 -0
data/lib/imw/utils/view.rb +113 -0
data/lib/imw/utils/view/dump_csv.rb +112 -0
data/lib/imw/utils/view/dump_csv_older.rb +117 -0
data/spec/data/sample.csv +131 -0
data/spec/data/sample.tsv +131 -0
data/spec/data/sample.txt +131 -0
data/spec/data/sample.xml +653 -0
data/spec/data/sample.yaml +652 -0
data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
data/spec/imw/files/archive_spec.rb +118 -0
data/spec/imw/files/basicfile_spec.rb +121 -0
data/spec/imw/files/bz2_spec.rb +32 -0
data/spec/imw/files/compressed_file_spec.rb +96 -0
data/spec/imw/files/compressible_spec.rb +100 -0
data/spec/imw/files/file_spec.rb +144 -0
data/spec/imw/files/gz_spec.rb +32 -0
data/spec/imw/files/rar_spec.rb +33 -0
data/spec/imw/files/tar_spec.rb +31 -0
data/spec/imw/files/text_spec.rb +23 -0
data/spec/imw/files/zip_spec.rb +31 -0
data/spec/imw/files_spec.rb +38 -0
data/spec/imw/packagers/archiver_spec.rb +125 -0
data/spec/imw/packagers/s3_mover_spec.rb +7 -0
data/spec/imw/parsers/line_parser_spec.rb +96 -0
data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
data/spec/imw/utils/extensions/find_spec.rb +113 -0
data/spec/imw/utils/paths_spec.rb +38 -0
data/spec/imw/workflow/rip/local_spec.rb +89 -0
data/spec/imw/workflow/rip_spec.rb +27 -0
data/spec/rcov.opts +1 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +32 -0
data/spec/support/archive_contents_matcher.rb +94 -0
data/spec/support/custom_matchers.rb +21 -0
data/spec/support/directory_contents_matcher.rb +61 -0
data/spec/support/extensions.rb +18 -0
data/spec/support/file_contents_matcher.rb +50 -0
data/spec/support/random.rb +210 -0
data/spec/support/without_regard_to_order_matcher.rb +58 -0
metadata +196 -0

@@ -0,0 +1,67 @@
+require 'logger'
+module IMW
+  LOG_FILE_DESTINATION = STDERR             unless defined?(LOG_FILE_DESTINATION)
+  LOG_TIMEFORMAT       = "%Y%m%d-%H:%M:%S " unless defined?(LOG_TIMEFORMAT)
+  class << self; attr_accessor :log end
+  #
+  # Create a Logger and point it at LOG_FILE_DESTINATION
+  #
+  # LOG_FILE_DESTINATION is STDOUT by default; redefine it in your
+  # ~/.imwrc, or set IMW.log yourself, if that's not cool.
+  #
+  def self.instantiate_logger!
+    IMW.log ||= Logger.new(LOG_FILE_DESTINATION)
+    IMW.log.datetime_format = "%Y%m%d-%H:%M:%S "
+    IMW.log.level           = Logger::INFO
+  end
+  def announce *events
+    options = events.extract_options!
+    options.reverse_merge! :level => Logger::INFO
+    # puts [options, events ].inspect, "*"*76
+    IMW.log.add options[:level], events.join("\n")
+  end
+  def banner *events
+    options = events.extract_options!
+    options.reverse_merge! :level => Logger::INFO
+    ["*"*75, events, "*"*75].flatten.each{|ev| announce(ev, options) }
+  end
+  PROGRESS_TRACKERS = {}
+  #
+  # When the slowly-changing tracked variable +var+ changes value,
+  # announce its new value.  Always announces on first call.
+  #
+  # Ex:
+  #   track_progress :indexing_names, name[0..0] # announce at each initial letter
+  #   track_progress :files, (i % 1000)          # announce at each 1,000 iterations
+  #
+  def track_progress tracker, val
+    unless (IMW::PROGRESS_TRACKERS.include?(tracker)) &&
+           (IMW::PROGRESS_TRACKERS[tracker] == val)
+      announce "#{tracker.to_s.gsub(/_/,' ')}: #{val}"
+      IMW::PROGRESS_TRACKERS[tracker] = val
+    end
+  end
+  PROGRESS_COUNTERS = {}
+  #
+  # Log repetitions in a given context
+  #
+  # At every n'th (default 1000) call,
+  # announce progress in the IMW.log
+  #
+  def track_count tracker, every=1000
+    PROGRESS_COUNTERS[tracker] ||= 0
+    PROGRESS_COUNTERS[tracker]  += 1
+    chunk = every * (PROGRESS_COUNTERS[tracker]/every).to_i
+    track_progress "count_of_#{tracker}", chunk
+  end
+end
+#
+# Set up the default logger as soon as this file is loaded.
+#
+IMW.instantiate_logger!

data/lib/imw/utils/misc.rb ADDED

@@ -0,0 +1,63 @@
+module IMW
+  # Return a string representing the current UTC time in the IMW
+  # format.
+  def self.current_utc_time_string
+    Time.now.utc.strftime(IMW::STRFTIME_FORMAT)
+  end
+  # A simple counter.  The +value+ and +add+ methods read and
+  # increment the counter's value.
+  #
+  #   counter = IMW::Counter.new
+  #   counter.value  #=> 0
+  #   counter.add 1
+  #   counter.value  #=> 1
+  #
+  # The +next!+ method acts as like C's <tt>value++</tt>, incrementing
+  # +value+ _after_ it is referenced.
+  #
+  #   counter = IMW::Counter.new
+  #   counter.value  #=> 0
+  #   counter.next!  #=> 0
+  #   counter.value  #=> 1
+  #
+  # Counters can also be reset
+  #
+  #   counter.reset!
+  #   counter.value  #=> 0
+  class Counter
+    attr_accessor :value, :starting_value, :increment
+    # Return a new Counter.  The first argument is the starting value
+    # (defaults to 0) and the second is the increment (defaults to 1).
+    def initialize starting_value=0,increment=1
+      @starting_value = starting_value
+      @value          = starting_value
+      @increment      = increment
+    end
+    # Add +amount+ (defaults to the value of <tt>@increment</tt>).
+    def add amount=nil
+      @value += amount || @increment
+    end
+    alias_method :add!, :add
+    # Increment the counter by <tt>@increment</tt> but return its
+    # value _before_ being incremented.
+    def next!
+      old_value = @value
+      @value += @increment
+      old_value
+    end
+    # Reset the counter to +value+ (defaults to the value of
+    # <tt>@starting_value</tt>).
+    def reset! value=nil
+      @value = value || @starting_value
+    end
+  end
+end
+# puts "#{File.basename(__FILE__)}: Your Monkeywrench seems suddenly more utilisable." # at bottom

data/lib/imw/utils/paths.rb ADDED

@@ -0,0 +1,115 @@
+#
+# h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
+#
+# == About
+#
+# IMW uses lots of different directories to keep information on data
+# and datasets separate.  This module interfaces with the
+# configuration files to establish the paths to these IMW directories
+# and provides functions and mixins for IMW objects to use to access
+# these paths.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+module IMW
+  # Implements methods designed to work with an object's
+  # <tt>@paths</tt> attributes, adding and deleting symbolic
+  # references to paths and expanding calls to +path_to+ from that
+  # attribute or (when a miss) from <tt>IMW::PATHS</tt>.
+  #
+  # An including class should therefore define an array attribute
+  # <tt>@paths</tt>.
+  module Paths
+    # Expands a shorthand workflow path specification to an
+    # actual file path.
+    #
+    #   add_path :mlb_08, 'gd2.mlb.com/components/game/mlb/year_2008'
+    #   path_to :ripd, :mlb_08, 'month_06', 'day_08', 'miniscoreboard.xml'
+    #   => (...)/data/ripd/gd2.mlb.com/components/game/mlb/year_2008/month_06/day_08/miniscoreboard.xml
+    def path_to *pathsegs
+      begin
+        path = Pathname.new path_to_helper(*pathsegs)
+        path.absolute? ? File.expand_path(path) : path.to_s
+      rescue Exception => e
+        raise("Can't find path to '#{pathsegs}': #{e}");
+      end
+    end
+    private
+    def path_to_helper *pathsegs # :nodoc:
+      # +path_to_helper+ handles the recursive calls for +path_to+.
+      expanded = pathsegs.flatten.compact.map do |pathseg|
+        case
+        when pathseg.is_a?(Symbol) && @paths.include?(pathseg)     then path_to(@paths[pathseg])
+        when pathseg.is_a?(Symbol) && IMW::PATHS.include?(pathseg) then path_to(IMW::PATHS[pathseg])
+        when pathseg.is_a?(Symbol)                                 then raise IMW::PathError.new("No path expansion set for #{pathseg.inspect}")
+        else pathseg
+        end
+      end
+      File.join(*expanded)
+    end
+    public
+    # Adds a symbolic path for expansion by +path_to+.
+    def add_path sym, *pathsegs
+      @paths[sym] = pathsegs.flatten
+    end
+    # Removes a symbolic path for expansion by +path_to+.
+    def remove_path sym
+      @paths.delete sym if @paths.include? sym
+    end
+  end
+  class Dataset
+    attr_reader :paths
+    include IMW::Paths
+    private
+    def set_paths
+      @paths = {}
+      add_path :self, File.dirname(eval('__FILE__'))
+    end
+  end
+  def self.path_to *pathsegs
+    begin
+      path = Pathname.new IMW.path_to_helper(*pathsegs)
+      path.absolute? ? File.expand_path(path) : path.to_s
+    rescue Exception => e
+      raise("Can't find path to '#{pathsegs}': #{e}");
+    end
+  end
+  private
+  def self.path_to_helper *pathsegs # :nodoc:
+    # +path_to_helper+ handles the recursive calls for +path_to+.
+    expanded = pathsegs.flatten.compact.map do |pathseg|
+      case
+      when pathseg.is_a?(Symbol) && IMW::PATHS.include?(pathseg) then path_to(IMW::PATHS[pathseg])
+      when pathseg.is_a?(Symbol)                                 then raise IMW::PathError.new("No path expansion set for #{pathseg.inspect}")
+      else pathseg
+      end
+    end
+    File.join(*expanded)
+  end
+  public
+  # Adds a symbolic path for expansion by +path_to+.
+  def self.add_path sym, *pathsegs
+    IMW::PATHS[sym] = pathsegs.flatten
+  end
+  # Removes a symbolic path for expansion by +path_to+.
+  def self.remove_path sym
+    IMW::PATHS.delete sym if IMW::PATHS.include? sym
+  end
+end
+# puts "#{File.basename(__FILE__)}: Your monkeywrench glows alternately dim then bright as you wander, suggesting to you which paths to take."

data/lib/imw/utils/uri.rb ADDED

@@ -0,0 +1,59 @@
+require 'imw/utils'
+require 'imw/utils/uuid'
+require 'addressable/uri'
+module Addressable
+  #
+  # Add the #scrubbed and #revhost calls
+  #
+  class URI
+    SAFE_CHARS      = %r{a-zA-Z0-9\-\._!\(\)\*\'}
+    PATH_CHARS      = %r{#{SAFE_CHARS}\$&\+,:=@\/;}
+    RESERVED_CHARS  = %r{\$&\+,:=@\/;\?\%}
+    UNSAFE_CHARS    = %r{\\ \"\#<>\[\]\^\`\|\~\{\}}
+    HOST_HEAD     = '(?:[a-z0-9\-]+\.)+'
+    HOST_TLD      = '(?:[a-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'
+    def host_valid?
+      !!(host =~ %r{\A#{HOST_HEAD}#{HOST_TLD}\z}i)
+    end
+    def path_valid?
+      !!(path =~ %r{\A[#{PATH_CHARS}%]*\z})
+    end
+    def simple_connection_part?
+      ( ['http', nil].include?(scheme) &&
+        [80,     nil].include?(port) &&
+        (self.to_hash.values_at(:password, :user).join.blank?) )
+    end
+    #
+    # Does this look like a
+    #
+    def simple?
+      host_valid? && path_valid? && simple_connection_part?
+    end
+    #
+    # +revhost+
+    # the dot-reversed host:
+    #   foo.company.com => com.company.foo
+    #
+    def revhost
+      return host unless host =~ /\./
+      host.split('.').reverse.join('.')
+    end
+    #
+    # +uuid+  -- RFC-4122 ver.5 uuid; guaranteed to be universally unique
+    #
+    # See
+    #   http://www.faqs.org/rfcs/rfc4122.html
+    #
+    def url_uuid
+      UUID.sha1_create(UUID_URL_NAMESPACE, self.normalize.to_s)
+    end
+  end
+end
+class << Addressable::URI
+  alias_method :encode_segment,   :encode_component    if ! defined?(encode_segment)
+  alias_method :unencode_segment, :unencode_component  if ! defined?(unencode_segment)
+end

data/lib/imw/utils/uuid.rb ADDED

@@ -0,0 +1,33 @@
+require 'uuidtools'
+class UUID
+  #
+  # A string suitable for using as a path name --
+  #
+  # Ex.
+  #   urn:uuid:3c0dce44-80a8-11dd-a897-001ff35a0a8b =>
+  #   urn_uuid/3c0dce44/80a8/11dd/a897/001ff35a0a8b
+  #
+  # It's well possible there are more perspicacious choices for points to split
+  # the string, but until we hit that limit this'll do.
+  #
+  def to_path
+    'urn_uuid/' + to_s.gsub(/[\:\-]/,'/')
+  end
+  def self.hex_to_str str
+    /([\da-f]{8})([\da-f]{4})([\da-f]{4})([\da-f]{4})([\da-f]{12})/.match(str).captures.join '-'
+  end
+  def self.parse_hex str
+    parse(UUID.hex_to_str(str))
+  end
+  # Overrides UUIDTools -- force 32 hex digits (leading zeros)
+  def hexdigest
+    "%032x" % self.to_i
+  end
+end

data/lib/imw/utils/validate.rb ADDED

@@ -0,0 +1,38 @@
+# Return true if <tt>email</tt> is a valid email address
+def is_email?(email)
+  raise ArgumentError, "'email' must be a string" if email.class != String
+  return false if email.empty?
+  parts = email.split('@')
+  return false if parts.size != 2
+  local = parts.first
+  return false if not local =~ /[a-zA-Z0-9_~=+-.]*/ # allowed characters
+  return false if local[0,1] == '.' # starts with .
+  return false if local[-1,1] == '.' # end with .
+  return false if local.include?('..') # can't repeat .
+  domain = parts.last
+  return false if not is_domain?(domain)
+  return true
+end
+# Return true if <tt>domain</tt> is a valid domain name
+def is_domain?(domain)
+  raise ArgumentError, "'domain' must be a string" if domain.class != String
+  return false if domain.empty?
+  return false if domain.size > 255 # max number of characters in a domain
+  return false if not domain =~ /^[a-zA-Z0-9.\-]+$/ # allowed characters
+  parts = domain.split('.')
+  return false if parts.size > 127 # max number of subdomains
+  parts.all? {|part| return false if part.size > 63} # max number of characters in a subdomain
+  return true
+end
+# puts "#{File.basename(__FILE__)}: As you shape your body to the confines of your container you feel a tremendous sense of validation." # at bottom

data/lib/imw/utils/version.rb ADDED

@@ -0,0 +1,12 @@
+# copied from activewarehouse-etl gem
+module IMWVersion #:nodoc:
+  unless defined?(VERSION)
+  module VERSION #:nodoc:
+    MAJOR = 0
+    MINOR = 0
+    TINY  = 0
+    STRING = [MAJOR, MINOR, TINY].join('.')
+  end
+  end
+end

data/lib/imw/utils/view.rb ADDED

@@ -0,0 +1,113 @@
+class ActiveRecord::Base
+  class << self
+  end
+  # def merge!(hsh)
+  #   hsh = hsh.dup
+  #   # puts hsh.to_yaml
+  #   # has_many datasets, notes, fields, contributors
+  #   self.class.reflect_on_all_associations.each do |ass|
+  #     # ["@macro", "@class_name", "@name", "@primary_key_name", "@options",
+  #     #  "@klass",
+  #     #  "@through_reflection",
+  #     #  "@active_record",
+  #     puts [ass.name, ass.macro, ass.primary_key_name].to_yaml
+  #     if ass.macro == :has_many
+  #       els = hsh.delete(ass.name.to_s) || []
+  #       puts "!!!!!!!!!!!!!!!!!!!!!!!!!!", els, '!!'
+  #       els.each do |el|
+  #         puts el
+  #         self[ass.name] = ass.klass.new().merge!(el)
+  #       end
+  #     end
+  #     hsh.each do |key,val|
+  #       self[key] = val
+  #     end
+  #     p self
+  #     p self.datasets if self.respond_to? 'datasets'
+  #   end
+  # end
+  def undump(hsh)
+    puts "unumping from #{hsh.to_json}"
+    hsh.each{ |k,v| self[k] = v }
+    self.save!
+    self
+  end
+end
+class Pool < ActiveRecord::Base
+  def undump(hsh)
+    { :datasets => Dataset, :fields => Field,
+      :contributors => Contributor, :pool_notes => PoolNote }.each do |field, klass|
+      vals = hsh.delete(field.to_s) || []
+      puts "Undumping #{vals} info #{field}"
+      self[field.to_s] = vals.map{|val| f = klass.new().undump(val); f.save!; f}
+    end
+    super
+    self
+  end
+end
+class Dataset < ActiveRecord::Base
+  def undump(hsh)
+    { :datasets => Dataset, :fields => Field,
+      :contributors => Contributor, :dataset_notes => DatasetNote }.each do |field, klass|
+      vals = hsh.delete(field.to_s) || []
+      puts "Undumping #{vals} info #{field}"
+      self[field.to_s] = vals.map{|val| f = klass.new().undump(val); f.save!; f}
+    end
+    super
+    puts "Got Dataset #{self.to_yaml}"
+    self
+  end
+end
+class IMW < OpenStruct
+  # NOTE(review): the other files in this gem declare IMW as a module; reopening it as
+  # a class raises TypeError if both are loaded -- confirm this file is used standalone.
+  # Intended to take an Infochimps Stupid Schema stream and construct the
+  # corresponding objects (the method body below is still an empty stub).
+  # Here are the rules:
+  # * the schema has the structure
+  #   # this has to be first.
+  #   - infochimps_schema:
+  #       schema_version:     0.2  # in case stuff changes
+  #   # then any number of imw objects:
+  #   - pool:         (...)
+  #       fields:         [era, innings_pitched, ...]
+  #   - dataset:      (...)
+  #       fields:
+  #         - name:       Earned Run Average
+  #           handle:   era
+  #           concept:    baseball-era
+  #           units:      earned_runs / (9*innings_pitched)
+  #   - contributor:  (...)
+  #   - field:        (...)
+  #
+  # * Objects are referred to by __handle__, *NOT* __id__. If an ID is
+  #   included, and an object exists with a non-matching ID or handle,
+  #   an error will be raised.
+  #
+  # * We want to make the schema files maintainable by hand, which means that
+  #   the loader tries to be smart about inline-defined objects.  That is, you
+  #   can either refer to (via handle) a field defined elsewhere, or you can
+  #   define the field in whole, and trust that the Right Thing will
+  #   happen. This presents the problem of collisions, though. If a bulk object
+  #   update arrives, we need to know whom to believe -- bulk loader or
+  #   database.  In the absence of versioning: we look up the object by its
+  #   handle.  If there's an existing object, any new information (fields with
+  #   values in new that are blank in old) is added to it.  If the object is
+  #   defined at the top level, it wins; if the object is defined as a sub field
+  #   it loses.
+  #
+  # * Every interesting object (Pool, Dataset, Contributor, Field) has a desc:
+  #   attribute (for Pool and Dataset it's virtual but never mind) to describe
+  #   __itself__.  Additionally, every interesting relationship has its own desc: field.
+  #
+  def self.undump(schema)
+    # TODO: not implemented yet.  Original plan: compact then merge -- kill off blank
+  end
+end