RubyGems - imw - Versions diffs - 0.1.0 - Mend

imw 0.1.0

Files changed (111) hide show

data/.gitignore +15 -0
data/CHANGELOG +0 -0
data/LICENSE +674 -0
data/README.rdoc +101 -0
data/Rakefile +20 -0
data/VERSION +1 -0
data/etc/imwrc.rb +76 -0
data/lib/imw.rb +42 -0
data/lib/imw/boot.rb +58 -0
data/lib/imw/dataset.rb +233 -0
data/lib/imw/dataset/datamapper.rb +66 -0
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
data/lib/imw/dataset/loaddump.rb +50 -0
data/lib/imw/dataset/old/file_collection.rb +88 -0
data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
data/lib/imw/dataset/scaffold.rb +132 -0
data/lib/imw/dataset/scraped_uri.rb +305 -0
data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
data/lib/imw/dataset/scrub/scrub.rb +147 -0
data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
data/lib/imw/dataset/scrub/slug.rb +101 -0
data/lib/imw/dataset/stats.rb +73 -0
data/lib/imw/dataset/stats/counter.rb +23 -0
data/lib/imw/dataset/task.rb +38 -0
data/lib/imw/dataset/workflow.rb +81 -0
data/lib/imw/files.rb +110 -0
data/lib/imw/files/archive.rb +113 -0
data/lib/imw/files/basicfile.rb +122 -0
data/lib/imw/files/binary.rb +28 -0
data/lib/imw/files/compressed_file.rb +93 -0
data/lib/imw/files/compressed_files_and_archives.rb +348 -0
data/lib/imw/files/compressible.rb +103 -0
data/lib/imw/files/csv.rb +112 -0
data/lib/imw/files/json.rb +41 -0
data/lib/imw/files/sgml.rb +65 -0
data/lib/imw/files/text.rb +68 -0
data/lib/imw/files/yaml.rb +46 -0
data/lib/imw/packagers.rb +8 -0
data/lib/imw/packagers/archiver.rb +108 -0
data/lib/imw/packagers/s3_mover.rb +28 -0
data/lib/imw/parsers.rb +7 -0
data/lib/imw/parsers/html_parser.rb +382 -0
data/lib/imw/parsers/html_parser/matchers.rb +306 -0
data/lib/imw/parsers/line_parser.rb +87 -0
data/lib/imw/parsers/regexp_parser.rb +72 -0
data/lib/imw/utils.rb +24 -0
data/lib/imw/utils/components.rb +61 -0
data/lib/imw/utils/config.rb +46 -0
data/lib/imw/utils/error.rb +54 -0
data/lib/imw/utils/extensions/array.rb +125 -0
data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
data/lib/imw/utils/extensions/core.rb +43 -0
data/lib/imw/utils/extensions/dir.rb +24 -0
data/lib/imw/utils/extensions/file_core.rb +64 -0
data/lib/imw/utils/extensions/hash.rb +218 -0
data/lib/imw/utils/extensions/hpricot.rb +48 -0
data/lib/imw/utils/extensions/string.rb +49 -0
data/lib/imw/utils/extensions/struct.rb +42 -0
data/lib/imw/utils/extensions/symbol.rb +28 -0
data/lib/imw/utils/extensions/typed_struct.rb +22 -0
data/lib/imw/utils/extensions/uri.rb +59 -0
data/lib/imw/utils/log.rb +67 -0
data/lib/imw/utils/misc.rb +63 -0
data/lib/imw/utils/paths.rb +115 -0
data/lib/imw/utils/uri.rb +59 -0
data/lib/imw/utils/uuid.rb +33 -0
data/lib/imw/utils/validate.rb +38 -0
data/lib/imw/utils/version.rb +12 -0
data/lib/imw/utils/view.rb +113 -0
data/lib/imw/utils/view/dump_csv.rb +112 -0
data/lib/imw/utils/view/dump_csv_older.rb +117 -0
data/spec/data/sample.csv +131 -0
data/spec/data/sample.tsv +131 -0
data/spec/data/sample.txt +131 -0
data/spec/data/sample.xml +653 -0
data/spec/data/sample.yaml +652 -0
data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
data/spec/imw/files/archive_spec.rb +118 -0
data/spec/imw/files/basicfile_spec.rb +121 -0
data/spec/imw/files/bz2_spec.rb +32 -0
data/spec/imw/files/compressed_file_spec.rb +96 -0
data/spec/imw/files/compressible_spec.rb +100 -0
data/spec/imw/files/file_spec.rb +144 -0
data/spec/imw/files/gz_spec.rb +32 -0
data/spec/imw/files/rar_spec.rb +33 -0
data/spec/imw/files/tar_spec.rb +31 -0
data/spec/imw/files/text_spec.rb +23 -0
data/spec/imw/files/zip_spec.rb +31 -0
data/spec/imw/files_spec.rb +38 -0
data/spec/imw/packagers/archiver_spec.rb +125 -0
data/spec/imw/packagers/s3_mover_spec.rb +7 -0
data/spec/imw/parsers/line_parser_spec.rb +96 -0
data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
data/spec/imw/utils/extensions/find_spec.rb +113 -0
data/spec/imw/utils/paths_spec.rb +38 -0
data/spec/imw/workflow/rip/local_spec.rb +89 -0
data/spec/imw/workflow/rip_spec.rb +27 -0
data/spec/rcov.opts +1 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +32 -0
data/spec/support/archive_contents_matcher.rb +94 -0
data/spec/support/custom_matchers.rb +21 -0
data/spec/support/directory_contents_matcher.rb +61 -0
data/spec/support/extensions.rb +18 -0
data/spec/support/file_contents_matcher.rb +50 -0
data/spec/support/random.rb +210 -0
data/spec/support/without_regard_to_order_matcher.rb +58 -0
metadata +196 -0

data/lib/imw/dataset/datamapper.rb ADDED

@@ -0,0 +1,66 @@
+#
+# h2. lib/imw/dataset/datamapper.rb -- extensions to datamapper for datasets
+#
+# == About
+#
+# The DataMapper[http://datamapper.org/] library is an ORM for Ruby
+# which is lighter than ActiveRecord[http://ar.rubyonrails.com/] and
+# the like.  It is the ORM that IMW is designed to work natively with.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
+require 'imw/utils'
+require 'dm-core'
+require 'dm-ar-finders'
+require 'dm-aggregates'
+require 'dm-serializer'
+module DataMapper
+  # Connect to a remote database
+  def self.setup_remote_connection options
+    options = { :handle => :default }.merge options
+    params = options.values_at(:protocol, :username, :password, :hostname, :dbname)
+    DataMapper.setup(options[:handle], "%s://%s:%s@%s/%s" % params)
+  end
+  # Connect to a local database
+  def self.setup_local_connection options
+    options = { :handle => :default }.merge options
+    params = options.values_at(:protocol, :dbpath, :dbname)
+    DataMapper.setup(options[:handle], "%s://%s/%s" % params)
+  end
+  # KLUDGE
+  def self.open_repositories repository_dbnames, params
+    repository_dbnames.each do |handle, dbname|
+      repo_params = params.merge({ :handle => handle, :dbname => dbname })
+      DataMapper.setup_remote_connection repo_params
+    end
+  end
+  module Model
+    # Find or create the resource matching search attributes and in
+    # either case set the update-able attributes.
+    def update_or_create(search_attributes, updateable_attributes = {})
+      if (resource = first(search_attributes))
+        resource.update_attributes updateable_attributes
+      else
+        resource = create(search_attributes.merge(updateable_attributes))
+      end
+      resource
+    end
+  end
+  # watch SQL log -- must be BEFORE call to db setup
+  def self.logging=(verbosity)
+    verbosity = :debug if (verbosity == true)
+    DataMapper::Logger.new(STDERR, verbosity) if verbosity
+  end
+end

data/lib/imw/dataset/datamapper/time_and_user_stamps.rb ADDED

@@ -0,0 +1,37 @@
+require 'rubygems'
+# gem 'dm-core', '=0.9.6'
+require 'dm-core'
+#
+# Stolen from http://github.com/sam/dm-more/tree/master/dm-timestamps/lib/dm-timestamps.rb
+#
+module DataMapper
+  # Stamps time and user properties on a resource before it is saved.
+  module Timestamp
+    # Maps a property name to a lambda that stamps that property on the
+    # resource +r+.  The <tt>updated_*</tt> entries assign
+    # unconditionally; the <tt>created_*</tt> entries assign only when
+    # the resource is a new record and the property is still unset.
+    # The <tt>*_by</tt> entries read the user id from
+    # <tt>IMW::USER_INFO[:id]</tt>.
+    TIMESTAMP_PROPERTIES = {
+      :updated_at => lambda { |r| r.updated_at = DateTime.now },
+      :updated_on => lambda { |r| r.updated_on = Date.today   },
+      :updated_by => lambda { |r| r.updated_by = IMW::USER_INFO[:id] },
+      :created_at => lambda { |r| r.created_at = DateTime.now            if r.new_record? && r.created_at.nil? },
+      :created_on => lambda { |r| r.created_on = Date.today              if r.new_record? && r.created_on.nil?},
+      :created_by => lambda { |r| r.created_by = IMW::USER_INFO[:id]     if r.new_record? && r.created_by.blank?},
+    }
+    # Registers +set_timestamp_properties+ to run before <tt>:save</tt>
+    # on every model this module is included into.
+    def self.included(model)
+      model.before :save, :set_timestamp_properties
+    end
+    private
+    # For a dirty resource, runs the stamping lambda of each property
+    # the model declares from TIMESTAMP_PROPERTIES, skipping any
+    # property that is itself dirty.
+    def set_timestamp_properties
+      if dirty?
+        # NOTE(review): presumably +slice+ yields nil for names the model
+        # does not declare, hence the +compact+ -- confirm against dm-core.
+        self.class.properties.slice(*TIMESTAMP_PROPERTIES.keys).compact.each do |property|
+          TIMESTAMP_PROPERTIES[property.name][self] unless attribute_dirty?(property.name)
+        end
+      end
+    end
+  end # module Timestamp
+  # NOTE(review): presumably this makes every DataMapper resource
+  # include Timestamp -- confirm against dm-core.
+  Resource::append_inclusions Timestamp
+end

data/lib/imw/dataset/loaddump.rb ADDED

@@ -0,0 +1,50 @@
+#
+# h2. lib/imw/dataset/loaddump.rb -- read and write datasets to resources
+#
+# == About
+#
+# Implements methods to load a dataset from a resource and to write a
+# dataset back to a resource.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
+require 'imw/utils'
+module IMW
+  class Dataset
+    # Return the data in +filename+ in an appropriate form.
+    #
+    # FIXME How do I get pass a block from one method to another?
+    def self.load filename, &block
+      filename = path_to(filename)
+      announce "Loading #{filename}"
+      file = IMW.open(filename)
+      data = file.load(filename)
+      if block
+        data.each{|record| yield record}
+        file
+      else
+        data
+      end
+    end
+    # Dump +data+ to +filename+.
+    def self.dump data, filename
+      filename = path_to(filename)
+      announce "Dumping to #{filename}"
+      IMW.open(filename,'w').dump(data)
+    end
+    # Dispatch to <tt>Dataset.dump</tt>.
+    def dump filename
+      self.class.dump self.data, *args
+    end
+  end
+end

data/lib/imw/dataset/old/file_collection.rb ADDED

@@ -0,0 +1,88 @@
+require 'imw/dataset'
+require 'imw/dataset/uri'
+#
+# All the files associated with a given URL
+#
+# A DataMapper resource identified by a serial +id+ and a required
+# +category+ string (indexed uniquely), which has many
+# +ripped_file_collections+.
+#
+class DatasetFileCollection
+  include DataMapper::Resource
+  property      :id,                    Integer,   :serial   => true
+  property      :category,              String,    :nullable => false, :unique_index => :category
+  has n,        :ripped_file_collections
+end
+#
+# Collection of raw files retrieved from a spider based at a given URL
+#
+class RippedFileCollection
+  include DataMapper::Resource
+  property      :id,                    Integer,   :serial    => true
+  belongs_to    :url, :class_name => DM_URI, :child_key => [:url_id]
+  has n,        :ripped_files
+  belongs_to    :dataset_file_collection
+  def self.find_or_create_from_url url, dataset_file_collection
+    url = DM_URI.find_or_create_from_url(url)
+    ripdfiles = self.find_or_create(
+      { :url_id => url.id },
+      { :dataset_file_collection => dataset_file_collection})
+  end
+  def listing_filename()
+    path_to(:rawd, "listing-#{url.as_flat_filename}.txt")
+  end
+  def make_listing_file
+    return if File.exist?(listing_filename)
+    FileUtils.cd path_to(:ripd_root) do
+      `find #{url.as_path} > #{listing_filename}`
+    end
+  end
+  # Mon Aug 11 08:59:00 -0500 2008    files: 0
+  # Mon Aug 11 09:05:34 -0500 2008    files: 100000 => so, 1M files/hr. not good.
+  def index_from_listing
+    make_listing_file
+    self.ripped_files
+    FileUtils.cd path_to(:ripd_root) do
+      File.foreach(listing_filename) do |full_path|
+        track_count :files
+        full_path.chomp!
+        ripd_path = full_path[1+url.as_path.length..-1]
+        next if ripd_path.blank?
+        RippedFile.from_file(self, full_path, ripd_path)
+      end
+    end
+    self.save
+  end
+end
+#
+# Index the raw files retrieved from website
+#
+class RippedFile
+  include DataMapper::Resource
+  property      :id,                    Integer,   :serial => true
+  property      :ripped_file_collection_id, Integer,                 :unique_index => :ripd_path
+  property      :ripd_path,             String,    :length => 255, :nullable => false, :unique_index => :ripd_path
+  property      :retrieval_date,        DateTime
+  property      :compressed_size,       Integer
+  belongs_to    :ripped_file_collection
+  def self.from_file clxn, full_path, ripd_path
+    filedate = File.mtime(full_path)
+    filesize = File.size( full_path)
+    ripped_file = self.find_or_create({ :ripd_path => ripd_path }, {
+      :ripped_file_collection => clxn,
+      :retrieval_date  => filedate,
+      :compressed_size => filesize,
+    })
+    ripped_file
+  end
+end
+# SELECT r.*, u.host, u.path FROM ripped_files r
+# LEFT JOIN ripped_file_collections rfs ON r.ripped_file_collection_id = rfs.id
+# LEFT JOIN dm_uris u ON rfs.url_id = u.id

data/lib/imw/dataset/old/file_collection_utils.rb ADDED

@@ -0,0 +1,71 @@
+#!/usr/bin/env ruby
+require 'imw/utils'; include IMW
+require 'imw/dataset/file_collection'
+require 'tempfile'
+def bulk_listing_filename()     '/tmp/listing_foo.txt'  end
+def table_name()                'ripped_files'          end
+def run_mysql_cmd db_params, cmd
+  username, password, hostname, dbname = db_params.values_at(:username, :password, :hostname, :dbname)
+  query_file = Tempfile.new("qlstg")
+  query_file.puts cmd
+  query_file.close
+  puts `time mysql -E -u#{username} -p#{password} -h#{hostname} #{dbname} < #{query_file.path}`
+end
+def bulk_load_mysql db_params, ripd_base
+  announce "Calling mysql to bulk load #{ripd_base} (expect ~2s per 100k files)"
+  run_mysql_cmd db_params, %Q{
+    LOAD DATA LOCAL INFILE '#{bulk_listing_filename}'
+      REPLACE INTO TABLE `#{table_name}`
+      FIELDS TERMINATED BY ','
+      LINES  TERMINATED BY '\n'
+      (`ripped_file_collection_id`, `ripd_path`, `retrieval_date`, `compressed_size`)
+      ;
+  }
+end
+def clear_table
+  run_mysql_cmd "TRUNCATE #{table_name}"
+end
+class RippedFileCollection
+  def bulk_load_listing db_params, extra_find_args=""
+    announce "Indexing #{url.as_path} (expect ~10s per 100k files)"
+    FileUtils.cd path_to(:ripd_root) do
+      find_fmt = "#{self.id},%P,%TY-%Tm-%Td %TH:%TM:%TS,%s\n"
+      find_cmd = "find #{url.as_path} #{extra_find_args} -printf '#{find_fmt}' > #{bulk_listing_filename}"
+      puts `time #{find_cmd}`
+    end unless File.exist?(bulk_listing_filename)
+    bulk_load_mysql db_params, url.as_path
+  end
+end
+# SELECT rf_yrs.*, dfc.*, url.scheme, url.host, url.path
+#   FROM (
+#     SELECT SUBSTR(ripd_path,1,4) AS yr, COUNT(*), r.*
+#     FROM ripped_files r
+#     GROUP BY ripped_file_collection_id, yr
+#     ORDER BY ripped_file_collection_id, yr
+#   ) rf_yrs
+#   LEFT JOIN ripped_file_collections  rfc ON rfc.id = rf_yrs.ripped_file_collection_id
+#   LEFT JOIN dataset_file_collections dfc ON dfc.id = rfc.dataset_file_collection_id
+#   LEFT JOIN dm_uris url ON url.id = rfc.url_id
+db_params = IMW::DEFAULT_DATABASE_CONNECTION_PARAMS.merge({ :dbname => 'imw_weather_ncdc' })
+IMW::Dataset.setup_remote_connection db_params
+# Daily
+daily_dset_clxn  = DatasetFileCollection.find_or_create({ :category => 'weather/ncdc/daily' })
+rf_clxn          = RippedFileCollection.find_or_create_from_url 'ftp://ftp.ncdc.noaa.gov/pub/data/gsod', daily_dset_clxn
+rf_clxn.bulk_load_listing db_params
+# Hourly
+hourly_dset_clxn = DatasetFileCollection.find_or_create({ :category => 'weather/ncdc/hourly' })
+rf_clxn          = RippedFileCollection.find_or_create_from_url 'ftp://ftp.ncdc.noaa.gov/pub/data/noaa', hourly_dset_clxn
+rf_clxn.bulk_load_listing db_params, '\\! \\( -iname "isd-lite" -prune \\) '
+# Hourly-lite
+hlite_dset_clxn  = DatasetFileCollection.find_or_create({ :category => 'weather/ncdc/hourly_lite' })
+rf_clxn          = RippedFileCollection.find_or_create_from_url 'ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite', hlite_dset_clxn
+rf_clxn.bulk_load_listing db_params

data/lib/imw/dataset/scaffold.rb ADDED

@@ -0,0 +1,132 @@
+#
+# h2. lib/imw/dataset/scaffold.rb -- scaffold the directory structure for a dataset
+#
+# == About
+#
+# Defines workflow tasks for datasets to create directories and
+# symlinks to ease the processing of a dataset.
+#
+# Right now this file contains code written by Flip as well as code
+# written by Dhruv which accomplish basically the same task.  Dhruv's
+# code integrates with <tt>IMW::Dataset</tt> and Rake and should be
+# used preferentially.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: POST NO BILLS.  Is that funny to anyone but me?  No?" # at bottom
+require 'rake'
+require 'fileutils'
+require 'imw/utils'
+require 'imw/dataset/task'
+include FileUtils
+module IMW
+  include FileUtils
+  ################################################################
+  ## FLIP'S CODE
+  ################################################################
+  # Creates the directory that <tt>path_to(:me)</tt> resolves to,
+  # along with any missing parent directories.
+  def scaffold_script_dirs
+    mkdir_p path_to(:me)
+  end
+  #
+  # * creates a directory for the dataset in each of the top-level hierarchies
+  #   (as given in ~/.imwrc)
+  # * links to that directory within the working directory
+  #   in directory pool/foo/bar/baz we'd find
+  #     rawd => /data/rawd/foo/bar/baz
+  #
+  def scaffold_dset_dirs
+    [:rawd, :temp, :fixd, :log].each do |seg|
+      unless File.exist?(path_to(seg))
+        seg_dir = path_to(pathseg_root(seg), :dset)
+        mkdir_p seg_dir
+        ln_s    seg_dir, path_to(seg)
+      end
+    end
+  end
+  #
+  # * creates a symlink within the working directory to the
+  #   ripped directory, named after its url
+  #
+  def scaffold_rip_dir url
+    unless File.exist?(path_to(seg))
+      ripd_dir = path_to(:ripd_root, url)
+      mkdir_p ripd_dir
+      ln_s    ripd_dir, path_to(:ripd)
+    end
+  end
+  # Scaffolds the script directory, then the dataset directories.
+  def scaffold_dset
+    scaffold_script_dirs
+    scaffold_dset_dirs
+  end
+  ################################################################
+  ## DHRUV's CODE -- uses IMW::Dataset and Rake
+  ################################################################
+  module Workflow
+    # Creates a workflow task <tt>:create_directories</tt> to create
+    # the directory structure for this dataset.
+    def create_directories_task
+      @last_description = "Creates directories for this dataset in the peel through package steps."
+      define_task(IMW::Task, {:create_directories => []}) do
+        [:peel, :munge, :fix, :package].each do |step|
+          FileUtils.mkdir_p(path_to(step)) unless File.exist?(path_to(step))
+        end
+      end
+    end
+    # Creates a workflow task <tt>:create_symlinks</tt> to create
+    # the directory structure for this dataset.
+    def create_symlinks_task
+      @last_description = "Creates symlinks pointing from the directory containing scripts for this dataset to the directories for the peel through package steps."
+      define_task(IMW::Task, {:create_symlinks => [:create_directories]}) do
+        [:peel, :munge, :fix, :package].each do |step|
+          symlink = File.join(path_to(:script),IMW::Dataset::WORKFLOW_STEP_DIRS[step].to_s)
+          FileUtils.ln_s(path_to(step), symlink) unless File.exist?(symlink)
+        end
+        symlink = File.join(path_to(:script), "ripd")
+        FileUtils.ln_s(path_to(:ripd_root), symlink) unless File.exist?(symlink)
+      end
+    end
+    # Creates a task <tt>:initialize</tt> which does nothing but
+    # depends upon all the tasks required to initialize the dataset.
+    def create_initialize_task
+      @last_description = "Set everything up to begin processing the dataset."
+      define_task(IMW::Task, :initialize => [:create_directories, :create_symlinks])
+    end
+    # Removes all data for this dataset from the data directories.
+    def create_delete_data_task
+      @last_description = "Deletes all data and directories for this dataset for the peel through package steps."
+      define_task(IMW::Task, {:delete_data => []}) do
+        [:peel, :munge, :fix, :package].each do |step|
+          FileUtils.remove_dir(path_to(step)) if File.exist?(path_to(step))
+        end
+      end
+    end
+    # Creates a task <tt>:destroy</tt> which does nothing but depends
+    # upon all the tasks required to delete the dataset's data and
+    # remove its footprint from IMW.
+    def create_destroy_task
+      @last_description = "Get rid of all traces of this dataset."
+      define_task(IMW::Task, :destroy => [:delete_data])
+    end
+  end
+end