RubyGems - imw - Versions diffs - 0.1.0 → 0.1.1 - Mend

imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

data/README.rdoc +194 -31
data/VERSION +1 -1
data/bin/imw +5 -0
data/lib/imw/boot.rb +0 -15
data/lib/imw/dataset/paths.rb +38 -0
data/lib/imw/dataset/task.rb +21 -18
data/lib/imw/dataset/workflow.rb +126 -65
data/lib/imw/dataset.rb +56 -82
data/lib/imw/files/basicfile.rb +3 -3
data/lib/imw/files/compressed_files_and_archives.rb +23 -37
data/lib/imw/files/csv.rb +2 -1
data/lib/imw/files/directory.rb +62 -0
data/lib/imw/files/excel.rb +84 -0
data/lib/imw/files/sgml.rb +4 -23
data/lib/imw/files.rb +62 -47
data/lib/imw/packagers/archiver.rb +19 -1
data/lib/imw/packagers/s3_mover.rb +8 -0
data/lib/imw/parsers/html_parser/matchers.rb +251 -268
data/lib/imw/parsers/html_parser.rb +181 -176
data/lib/imw/parsers.rb +1 -1
data/lib/imw/repository.rb +35 -0
data/lib/imw/runner.rb +114 -0
data/lib/imw/utils/extensions/core.rb +0 -16
data/lib/imw/utils/paths.rb +0 -28
data/lib/imw.rb +21 -32
metadata +11 -19
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
data/lib/imw/dataset/datamapper.rb +0 -66
data/lib/imw/dataset/loaddump.rb +0 -50
data/lib/imw/dataset/old/file_collection.rb +0 -88
data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
data/lib/imw/dataset/scaffold.rb +0 -132
data/lib/imw/dataset/scraped_uri.rb +0 -305
data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
data/lib/imw/dataset/scrub/scrub.rb +0 -147
data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
data/lib/imw/dataset/scrub/slug.rb +0 -101
data/lib/imw/dataset/stats/counter.rb +0 -23
data/lib/imw/dataset/stats.rb +0 -73

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: imw
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Dhruv Bansal
@@ -10,14 +10,14 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-12-20 00:00:00 -06:00
-default_executable:
+date: 2010-02-02 00:00:00 -06:00
+default_executable: imw
 dependencies: []
 description: The Infinite Monkeywrench (IMW) is a Ruby frameworks to simplify the tasks of acquiring, extracting, transforming, loading, and packaging data. It minimizes programmer time by encapsulating common data workflows and patterns and creating interfaces to many other useful Ruby libraries.
 email: coders@infochimps.org
-executables: []
+executables:
+- imw
 extensions: []
 extra_rdoc_files:
@@ -30,24 +30,12 @@ files:
 - README.rdoc
 - Rakefile
 - VERSION
+- bin/imw
 - etc/imwrc.rb
 - lib/imw.rb
 - lib/imw/boot.rb
 - lib/imw/dataset.rb
-- lib/imw/dataset/datamapper.rb
-- lib/imw/dataset/datamapper/time_and_user_stamps.rb
-- lib/imw/dataset/loaddump.rb
-- lib/imw/dataset/old/file_collection.rb
-- lib/imw/dataset/old/file_collection_utils.rb
-- lib/imw/dataset/scaffold.rb
-- lib/imw/dataset/scraped_uri.rb
-- lib/imw/dataset/scrub/old_working_scrubber.rb
-- lib/imw/dataset/scrub/scrub.rb
-- lib/imw/dataset/scrub/scrub_simple_url.rb
-- lib/imw/dataset/scrub/scrub_test.rb
-- lib/imw/dataset/scrub/slug.rb
-- lib/imw/dataset/stats.rb
-- lib/imw/dataset/stats/counter.rb
+- lib/imw/dataset/paths.rb
 - lib/imw/dataset/task.rb
 - lib/imw/dataset/workflow.rb
 - lib/imw/files.rb
@@ -58,6 +46,8 @@ files:
 - lib/imw/files/compressed_files_and_archives.rb
 - lib/imw/files/compressible.rb
 - lib/imw/files/csv.rb
+- lib/imw/files/directory.rb
+- lib/imw/files/excel.rb
 - lib/imw/files/json.rb
 - lib/imw/files/sgml.rb
 - lib/imw/files/text.rb
@@ -70,6 +60,8 @@ files:
 - lib/imw/parsers/html_parser/matchers.rb
 - lib/imw/parsers/line_parser.rb
 - lib/imw/parsers/regexp_parser.rb
+- lib/imw/repository.rb
+- lib/imw/runner.rb
 - lib/imw/utils.rb
 - lib/imw/utils/components.rb
 - lib/imw/utils/config.rb

data/lib/imw/dataset/datamapper/time_and_user_stamps.rb DELETED Viewed

@@ -1,37 +0,0 @@
-require 'rubygems'
-# gem 'dm-core', '=0.9.6'
-require 'dm-core'
-#
-# Stolen from http://github.com/sam/dm-more/tree/master/dm-timestamps/lib/dm-timestamps.rb
-#
-module DataMapper
-  module Timestamp
-    TIMESTAMP_PROPERTIES = {
-      :updated_at => lambda { |r| r.updated_at = DateTime.now },
-      :updated_on => lambda { |r| r.updated_on = Date.today   },
-      :updated_by => lambda { |r| r.updated_by = IMW::USER_INFO[:id] },
-      :created_at => lambda { |r| r.created_at = DateTime.now            if r.new_record? && r.created_at.nil? },
-      :created_on => lambda { |r| r.created_on = Date.today              if r.new_record? && r.created_on.nil?},
-      :created_by => lambda { |r| r.created_by = IMW::USER_INFO[:id]     if r.new_record? && r.created_by.blank?},
-    }
-    def self.included(model)
-      model.before :save, :set_timestamp_properties
-    end
-    private
-    def set_timestamp_properties
-      if dirty?
-        self.class.properties.slice(*TIMESTAMP_PROPERTIES.keys).compact.each do |property|
-          TIMESTAMP_PROPERTIES[property.name][self] unless attribute_dirty?(property.name)
-        end
-      end
-    end
-  end # module Timestamp
-  Resource::append_inclusions Timestamp
-end

data/lib/imw/dataset/datamapper.rb DELETED Viewed

@@ -1,66 +0,0 @@
-#
-# h2. lib/imw/dataset/datamapper.rb -- extensions to datamapper for datasets
-#
-# == About
-#
-# The DataMapper[http://datamapper.org/] library is an ORM for Ruby
-# which is lighter than ActiveRecord[http://ar.rubyonrails.com/] and
-# the like.  It is the ORM that IMW is designed to work natively with.
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
-require 'imw/utils'
-require 'dm-core'
-require 'dm-ar-finders'
-require 'dm-aggregates'
-require 'dm-serializer'
-module DataMapper
-  # Connect to a remote database
-  def self.setup_remote_connection options
-    options = { :handle => :default }.merge options
-    params = options.values_at(:protocol, :username, :password, :hostname, :dbname)
-    DataMapper.setup(options[:handle], "%s://%s:%s@%s/%s" % params)
-  end
-  # Connect to a local database
-  def self.setup_local_connection options
-    options = { :handle => :default }.merge options
-    params = options.values_at(:protocol, :dbpath, :dbname)
-    DataMapper.setup(options[:handle], "%s://%s/%s" % params)
-  end
-  # KLUDGE
-  def self.open_repositories repository_dbnames, params
-    repository_dbnames.each do |handle, dbname|
-      repo_params = params.merge({ :handle => handle, :dbname => dbname })
-      DataMapper.setup_remote_connection repo_params
-    end
-  end
-  module Model
-    # Find or create the resource matching search attributes and in
-    # either case set the update-able attributes.
-    def update_or_create(search_attributes, updateable_attributes = {})
-      if (resource = first(search_attributes))
-        resource.update_attributes updateable_attributes
-      else
-        resource = create(search_attributes.merge(updateable_attributes))
-      end
-      resource
-    end
-  end
-  # watch SQL log -- must be BEFORE call to db setup
-  def self.logging=(verbosity)
-    verbosity = :debug if (verbosity == true)
-    DataMapper::Logger.new(STDERR, verbosity) if verbosity
-  end
-end

data/lib/imw/dataset/loaddump.rb DELETED Viewed

@@ -1,50 +0,0 @@
-#
-# h2. lib/imw/dataset/loaddump.rb -- read and write datasets to resources
-#
-# == About
-#
-# Implements methods to load a dataset from a resource and to write a
-# dataset back to a resource.
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
-require 'imw/utils'
-module IMW
-  class Dataset
-    # Return the data in +filename+ in an appropriate form.
-    #
-    # FIXME How do I get pass a block from one method to another?
-    def self.load filename, &block
-      filename = path_to(filename)
-      announce "Loading #{filename}"
-      file = IMW.open(filename)
-      data = file.load(filename)
-      if block
-        data.each{|record| yield record}
-        file
-      else
-        data
-      end
-    end
-    # Dump +data+ to +filename+.
-    def self.dump data, filename
-      filename = path_to(filename)
-      announce "Dumping to #{filename}"
-      IMW.open(filename,'w').dump(data)
-    end
-    # Dispatch to <tt>Dataset.dump</tt>.
-    def dump filename
-      self.class.dump self.data, *args
-    end
-  end
-end

data/lib/imw/dataset/old/file_collection.rb DELETED Viewed

@@ -1,88 +0,0 @@
-require 'imw/dataset'
-require 'imw/dataset/uri'
-#
-# All the files associated with a given URL
-#
-class DatasetFileCollection
-  include DataMapper::Resource
-  property      :id,                    Integer,   :serial   => true
-  property      :category,              String,    :nullable => false, :unique_index => :category
-  has n,        :ripped_file_collections
-end
-#
-# Collection of raw files retrieved from a spider based at a given URL
-#
-class RippedFileCollection
-  include DataMapper::Resource
-  property      :id,                    Integer,   :serial    => true
-  belongs_to    :url, :class_name => DM_URI, :child_key => [:url_id]
-  has n,        :ripped_files
-  belongs_to    :dataset_file_collection
-  def self.find_or_create_from_url url, dataset_file_collection
-    url = DM_URI.find_or_create_from_url(url)
-    ripdfiles = self.find_or_create(
-      { :url_id => url.id },
-      { :dataset_file_collection => dataset_file_collection})
-  end
-  def listing_filename()
-    path_to(:rawd, "listing-#{url.as_flat_filename}.txt")
-  end
-  def make_listing_file
-    return if File.exist?(listing_filename)
-    FileUtils.cd path_to(:ripd_root) do
-      `find #{url.as_path} > #{listing_filename}`
-    end
-  end
-  # Mon Aug 11 08:59:00 -0500 2008    files: 0
-  # Mon Aug 11 09:05:34 -0500 2008    files: 100000 => so, 1M files/hr. not good.
-  def index_from_listing
-    make_listing_file
-    self.ripped_files
-    FileUtils.cd path_to(:ripd_root) do
-      File.foreach(listing_filename) do |full_path|
-        track_count :files
-        full_path.chomp!
-        ripd_path = full_path[1+url.as_path.length..-1]
-        next if ripd_path.blank?
-        RippedFile.from_file(self, full_path, ripd_path)
-      end
-    end
-    self.save
-  end
-end
-#
-# Index the raw files retrieved from website
-#
-class RippedFile
-  include DataMapper::Resource
-  property      :id,                    Integer,   :serial => true
-  property      :ripped_file_collection_id, Integer,                 :unique_index => :ripd_path
-  property      :ripd_path,             String,    :length => 255, :nullable => false, :unique_index => :ripd_path
-  property      :retrieval_date,        DateTime
-  property      :compressed_size,       Integer
-  belongs_to    :ripped_file_collection
-  def self.from_file clxn, full_path, ripd_path
-    filedate = File.mtime(full_path)
-    filesize = File.size( full_path)
-    ripped_file = self.find_or_create({ :ripd_path => ripd_path }, {
-      :ripped_file_collection => clxn,
-      :retrieval_date  => filedate,
-      :compressed_size => filesize,
-    })
-    ripped_file
-  end
-end
-# SELECT r.*, u.host, u.path FROM ripped_files r
-# LEFT JOIN ripped_file_collections rfs ON r.ripped_file_collection_id = rfs.id
-# LEFT JOIN dm_uris u ON rfs.url_id = u.id

data/lib/imw/dataset/old/file_collection_utils.rb DELETED Viewed

@@ -1,71 +0,0 @@
-#!/usr/bin/env ruby
-require 'imw/utils'; include IMW
-require 'imw/dataset/file_collection'
-require 'tempfile'
-def bulk_listing_filename()     '/tmp/listing_foo.txt'  end
-def table_name()                'ripped_files'          end
-def run_mysql_cmd db_params, cmd
-  username, password, hostname, dbname = db_params.values_at(:username, :password, :hostname, :dbname)
-  query_file = Tempfile.new("qlstg")
-  query_file.puts cmd
-  query_file.close
-  puts `time mysql -E -u#{username} -p#{password} -h#{hostname} #{dbname} < #{query_file.path}`
-end
-def bulk_load_mysql db_params, ripd_base
-  announce "Calling mysql to bulk load #{ripd_base} (expect ~2s per 100k files)"
-  run_mysql_cmd db_params, %Q{
-    LOAD DATA LOCAL INFILE '#{bulk_listing_filename}'
-      REPLACE INTO TABLE `#{table_name}`
-      FIELDS TERMINATED BY ','
-      LINES  TERMINATED BY '\n'
-      (`ripped_file_collection_id`, `ripd_path`, `retrieval_date`, `compressed_size`)
-      ;
-  }
-end
-def clear_table
-  run_mysql_cmd "TRUNCATE #{table_name}"
-end
-class RippedFileCollection
-  def bulk_load_listing db_params, extra_find_args=""
-    announce "Indexing #{url.as_path} (expect ~10s per 100k files)"
-    FileUtils.cd path_to(:ripd_root) do
-      find_fmt = "#{self.id},%P,%TY-%Tm-%Td %TH:%TM:%TS,%s\n"
-      find_cmd = "find #{url.as_path} #{extra_find_args} -printf '#{find_fmt}' > #{bulk_listing_filename}"
-      puts `time #{find_cmd}`
-    end unless File.exist?(bulk_listing_filename)
-    bulk_load_mysql db_params, url.as_path
-  end
-end
-# SELECT rf_yrs.*, dfc.*, url.scheme, url.host, url.path
-#   FROM (
-#     SELECT SUBSTR(ripd_path,1,4) AS yr, COUNT(*), r.*
-#     FROM ripped_files r
-#     GROUP BY ripped_file_collection_id, yr
-#     ORDER BY ripped_file_collection_id, yr
-#   ) rf_yrs
-#   LEFT JOIN ripped_file_collections  rfc ON rfc.id = rf_yrs.ripped_file_collection_id
-#   LEFT JOIN dataset_file_collections dfc ON dfc.id = rfc.dataset_file_collection_id
-#   LEFT JOIN dm_uris url ON url.id = rfc.url_id
-db_params = IMW::DEFAULT_DATABASE_CONNECTION_PARAMS.merge({ :dbname => 'imw_weather_ncdc' })
-IMW::Dataset.setup_remote_connection db_params
-# Daily
-daily_dset_clxn  = DatasetFileCollection.find_or_create({ :category => 'weather/ncdc/daily' })
-rf_clxn          = RippedFileCollection.find_or_create_from_url 'ftp://ftp.ncdc.noaa.gov/pub/data/gsod', daily_dset_clxn
-rf_clxn.bulk_load_listing db_params
-# Hourly
-hourly_dset_clxn = DatasetFileCollection.find_or_create({ :category => 'weather/ncdc/hourly' })
-rf_clxn          = RippedFileCollection.find_or_create_from_url 'ftp://ftp.ncdc.noaa.gov/pub/data/noaa', hourly_dset_clxn
-rf_clxn.bulk_load_listing db_params, '\\! \\( -iname "isd-lite" -prune \\) '
-# Hourly-lite
-hlite_dset_clxn  = DatasetFileCollection.find_or_create({ :category => 'weather/ncdc/hourly_lite' })
-rf_clxn          = RippedFileCollection.find_or_create_from_url 'ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite', hlite_dset_clxn
-rf_clxn.bulk_load_listing db_params

data/lib/imw/dataset/scaffold.rb DELETED Viewed

@@ -1,132 +0,0 @@
-#
-# h2. lib/imw/workflow/scaffold.rb -- scaffold the directory structure for a dataset
-#
-# == About
-#
-# Defines workflow tasks for datasets to create directories and
-# symlinks to ease the processing of a dataset.
-#
-# Right now this file contains code written by Flip as well as code
-# written by Dhruv which accomplish basically the same task.  Dhruv's
-# code integrates with <tt>IMW::Dataset</tt> and Rake and should be
-# used preferentially.
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: POST NO BILLS.  Is that funny to anyone but me?  No?" # at bottom
-require 'rake'
-require 'fileutils'
-require 'imw/utils'
-require 'imw/dataset/task'
-include FileUtils
-module IMW
-  include FileUtils
-  ################################################################
-  ## FLIP'S CODE
-  ################################################################
-  def scaffold_script_dirs
-    mkdir_p path_to(:me)
-  end
-  #
-  # * creates a directory for the dataset in each of the top-level hierarchies
-  #   (as given in ~/.imwrc)
-  # * links to that directory within the working directory
-  #   in directory pool/foo/bar/baz we'd find
-  #     rawd => /data/rawd/foo/bar/baz
-  #
-  def scaffold_dset_dirs
-    [:rawd, :temp, :fixd, :log].each do |seg|
-      unless File.exist?(path_to(seg))
-        seg_dir = path_to(pathseg_root(seg), :dset)
-        mkdir_p seg_dir
-        ln_s    seg_dir, path_to(seg)
-      end
-    end
-  end
-  #
-  # * creates a symlink within the working directory to the
-  #   ripped directory, named after its url
-  #
-  def scaffold_rip_dir url
-    unless File.exist?(path_to(seg))
-      ripd_dir = path_to(:ripd_root, url)
-      mkdir_p ripd_dir
-      ln_s    ripd_dir, path_to(:ripd)
-    end
-  end
-  def scaffold_dset
-    scaffold_script_dirs
-    scaffold_dset_dirs
-  end
-  ################################################################
-  ## DHRUV's CODE -- uses IMW::Dataset and Rake
-  ################################################################
-  module Workflow
-    # Creates a workflow task <tt>:create_directories</tt> to create
-    # the directory structure for this dataset.
-    def create_directories_task
-      @last_description = "Creates directories for this dataset in the peel through package steps."
-      define_task(IMW::Task, {:create_directories => []}) do
-        [:peel, :munge, :fix, :package].each do |step|
-          FileUtils.mkdir_p(path_to(step)) unless File.exist?(path_to(step))
-        end
-      end
-    end
-    # Creates a workflow task <tt>:create_symlinks</tt> to create
-    # the directory structure for this dataset.
-    def create_symlinks_task
-      @last_description = "Creates symlinks pointing from the directory containing scripts for this dataset to the directories for the peel through package steps."
-      define_task(IMW::Task, {:create_symlinks => [:create_directories]}) do
-        [:peel, :munge, :fix, :package].each do |step|
-          symlink = File.join(path_to(:script),IMW::Dataset::WORKFLOW_STEP_DIRS[step].to_s)
-          FileUtils.ln_s(path_to(step), symlink) unless File.exist?(symlink)
-        end
-        symlink = File.join(path_to(:script), "ripd")
-        FileUtils.ln_s(path_to(:ripd_root), symlink) unless File.exist?(symlink)
-      end
-    end
-    # Creates a task <tt>:initialize</tt> which does nothing but
-    # depends upon all the tasks required to initialize the dataset.
-    def create_initialize_task
-      @last_description = "Set everything up to begin processing the dataset."
-      define_task(IMW::Task, :initialize => [:create_directories, :create_symlinks])
-    end
-    # Removes all data for this dataset from the data directories.
-    def create_delete_data_task
-      @last_description = "Deletes all data and directories for this dataset for the peel through package steps."
-      define_task(IMW::Task, {:delete_data => []}) do
-        [:peel, :munge, :fix, :package].each do |step|
-          FileUtils.remove_dir(path_to(step)) if File.exist?(path_to(step))
-        end
-      end
-    end
-    # Creates a task <tt>:destroy</tt> which does nothing but depends
-    # upon all the tasks required to delete the dataset's data and
-    # remove its footprint from IMW.
-    def create_destroy_task
-      @last_description = "Get rid of all traces of this dataset."
-      define_task(IMW::Task, :destroy => [:delete_data])
-    end
-  end
-end