RubyGems - imw - Versions diffs - 0.1.1 → 0.2.0 - Mend

imw 0.1.1 → 0.2.0

Files changed (143) hide show

data/.gitignore +4 -1
data/Rakefile +10 -0
data/TODO +18 -0
data/VERSION +1 -1
data/bin/imw +1 -1
data/etc/imwrc.rb +0 -50
data/examples/dataset.rb +12 -0
data/lib/imw/boot.rb +55 -9
data/lib/imw/dataset/paths.rb +15 -24
data/lib/imw/dataset/workflow.rb +131 -72
data/lib/imw/dataset.rb +94 -186
data/lib/imw/parsers/html_parser.rb +1 -1
data/lib/imw/parsers.rb +1 -1
data/lib/imw/repository.rb +3 -27
data/lib/imw/resource.rb +190 -0
data/lib/imw/resources/archive.rb +97 -0
data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
data/lib/imw/resources/archives_and_compressed.rb +32 -0
data/lib/imw/resources/compressed_file.rb +89 -0
data/lib/imw/resources/compressible.rb +77 -0
data/lib/imw/resources/formats/delimited.rb +92 -0
data/lib/imw/resources/formats/excel.rb +125 -0
data/lib/imw/resources/formats/json.rb +53 -0
data/lib/imw/resources/formats/sgml.rb +72 -0
data/lib/imw/resources/formats/yaml.rb +53 -0
data/lib/imw/resources/formats.rb +32 -0
data/lib/imw/resources/local.rb +198 -0
data/lib/imw/resources/remote.rb +110 -0
data/lib/imw/resources/schemes/hdfs.rb +242 -0
data/lib/imw/resources/schemes/http.rb +161 -0
data/lib/imw/resources/schemes/s3.rb +137 -0
data/lib/imw/resources/schemes.rb +19 -0
data/lib/imw/resources.rb +118 -0
data/lib/imw/runner.rb +5 -4
data/lib/imw/transforms/archiver.rb +215 -0
data/lib/imw/transforms/transferer.rb +103 -0
data/lib/imw/transforms.rb +8 -0
data/lib/imw/utils/error.rb +26 -30
data/lib/imw/utils/extensions/array.rb +5 -15
data/lib/imw/utils/extensions/hash.rb +6 -16
data/lib/imw/utils/extensions/hpricot.rb +0 -14
data/lib/imw/utils/extensions/string.rb +5 -15
data/lib/imw/utils/extensions/symbol.rb +0 -13
data/lib/imw/utils/extensions.rb +65 -0
data/lib/imw/utils/log.rb +14 -13
data/lib/imw/utils/misc.rb +0 -6
data/lib/imw/utils/paths.rb +101 -42
data/lib/imw/utils/version.rb +8 -9
data/lib/imw/utils.rb +2 -18
data/lib/imw.rb +92 -17
data/spec/data/sample.csv +1 -1
data/spec/data/sample.json +1 -0
data/spec/data/sample.tsv +1 -1
data/spec/data/sample.txt +1 -1
data/spec/data/sample.xml +1 -1
data/spec/data/sample.yaml +1 -1
data/spec/imw/dataset/paths_spec.rb +32 -0
data/spec/imw/dataset/workflow_spec.rb +41 -0
data/spec/imw/resource_spec.rb +79 -0
data/spec/imw/resources/archive_spec.rb +69 -0
data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
data/spec/imw/resources/compressed_file_spec.rb +48 -0
data/spec/imw/resources/compressible_spec.rb +36 -0
data/spec/imw/resources/formats/delimited_spec.rb +33 -0
data/spec/imw/resources/formats/json_spec.rb +32 -0
data/spec/imw/resources/formats/sgml_spec.rb +24 -0
data/spec/imw/resources/formats/yaml_spec.rb +41 -0
data/spec/imw/resources/local_spec.rb +98 -0
data/spec/imw/resources/remote_spec.rb +35 -0
data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
data/spec/imw/resources/schemes/http_spec.rb +19 -0
data/spec/imw/resources/schemes/s3_spec.rb +19 -0
data/spec/imw/transforms/archiver_spec.rb +120 -0
data/spec/imw/transforms/transferer_spec.rb +113 -0
data/spec/imw/utils/paths_spec.rb +5 -33
data/spec/imw/utils/shared_paths_spec.rb +29 -0
data/spec/spec_helper.rb +5 -5
data/spec/support/paths_matcher.rb +67 -0
data/spec/support/random.rb +39 -36
metadata +88 -75
data/lib/imw/dataset/task.rb +0 -41
data/lib/imw/files/archive.rb +0 -113
data/lib/imw/files/basicfile.rb +0 -122
data/lib/imw/files/binary.rb +0 -28
data/lib/imw/files/compressed_file.rb +0 -93
data/lib/imw/files/compressed_files_and_archives.rb +0 -334
data/lib/imw/files/compressible.rb +0 -103
data/lib/imw/files/csv.rb +0 -113
data/lib/imw/files/directory.rb +0 -62
data/lib/imw/files/excel.rb +0 -84
data/lib/imw/files/json.rb +0 -41
data/lib/imw/files/sgml.rb +0 -46
data/lib/imw/files/text.rb +0 -68
data/lib/imw/files/yaml.rb +0 -46
data/lib/imw/files.rb +0 -125
data/lib/imw/packagers/archiver.rb +0 -126
data/lib/imw/packagers/s3_mover.rb +0 -36
data/lib/imw/packagers.rb +0 -8
data/lib/imw/utils/components.rb +0 -61
data/lib/imw/utils/config.rb +0 -46
data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
data/lib/imw/utils/extensions/core.rb +0 -27
data/lib/imw/utils/extensions/dir.rb +0 -24
data/lib/imw/utils/extensions/file_core.rb +0 -64
data/lib/imw/utils/extensions/typed_struct.rb +0 -22
data/lib/imw/utils/extensions/uri.rb +0 -59
data/lib/imw/utils/view/dump_csv.rb +0 -112
data/lib/imw/utils/view/dump_csv_older.rb +0 -117
data/lib/imw/utils/view.rb +0 -113
data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
data/spec/imw/files/archive_spec.rb +0 -118
data/spec/imw/files/basicfile_spec.rb +0 -121
data/spec/imw/files/bz2_spec.rb +0 -32
data/spec/imw/files/compressed_file_spec.rb +0 -96
data/spec/imw/files/compressible_spec.rb +0 -100
data/spec/imw/files/file_spec.rb +0 -144
data/spec/imw/files/gz_spec.rb +0 -32
data/spec/imw/files/rar_spec.rb +0 -33
data/spec/imw/files/tar_spec.rb +0 -31
data/spec/imw/files/text_spec.rb +0 -23
data/spec/imw/files/zip_spec.rb +0 -31
data/spec/imw/files_spec.rb +0 -38
data/spec/imw/packagers/archiver_spec.rb +0 -125
data/spec/imw/packagers/s3_mover_spec.rb +0 -7
data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
data/spec/imw/utils/extensions/find_spec.rb +0 -113
data/spec/imw/workflow/rip/local_spec.rb +0 -89
data/spec/imw/workflow/rip_spec.rb +0 -27
data/spec/support/archive_contents_matcher.rb +0 -94
data/spec/support/directory_contents_matcher.rb +0 -61

data/lib/imw/dataset.rb CHANGED Viewed

@@ -1,206 +1,114 @@
-require 'imw/utils'
 require 'imw/dataset/workflow'
 require 'imw/dataset/paths'
 module IMW
-  # The IMW::Dataset class is useful organizing a complex data
-  # transformation because it is capable of managing a collection of
-  # paths and the interdependencies between subparts of the
-  # transformation.
-  #
-  # == Manipulating Paths
-  #
-  # Storing paths makes code shorter and more readable.  By default
-  # (this assumes the executing script is in a file
-  # /home/imw_user/data/foo.rb):
-  #
-  #   dataset = IMW::Dataset.new
-  #   dataset.path_to(:self)
-  #   #=> '/home/imw_user/data'
-  #   dataset.path_to(:ripd)
-  #   #=> '/home/imw_user/data/ripd'
-  #   dataset.path_to(:pkgd, 'final.tar.gz')
-  #   #=> '/home/imw_user/data/pkgd/final.tar.gz'
-  #
-  # Paths can be added
-  #
-  #   dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt')
-  #   dataset.path_to(:sorted_output)
-  #   #=> '/home/imw_user/data/mungd/sorted-file-3923.txt'
-  #
-  # as well as removed (via +remove_path+).
-  #
-  # == Defining Workflows
-  #
-  # IMW encourages you to think of transforming data as a network of
-  # interdependent steps (see IMW::Workflow).  Each of IMW's five
-  # default steps maps to a named directory remembered by each
-  # dataset.
-  #
-  # The following example shows why this is a useful abstraction as
-  # well as illustrating some of the other functionality in IMW.
-  #
-  # == Example Dataset
-  #
-  # The first step is to import IMW and create the dataset
-  #
-  #   require 'rubygems'
-  #   require 'imw'
-  #   dataset = IMW::Dataset.new
-  #
-  # You can pass in a handle (the name or "slug" for the dataset) as
-  # well as some options.  Now define the steps you intend to take to
-  # complete the transformation:
-  #
-  # rip::
-  #   Data is collected from a source (+http+, +ftp+, database, &c.)
-  #   and deposited in the <tt>:ripd</tt> directory of this dataset.
-  #
-  #     dataset.task :rip do
-  #       IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd))
-  #         #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
-  #
-  #       IMW::Rip.from_database :named  => "weather_records",
-  #                              :at     => "public.astro.chimpu.edu",
-  #                              :select => "* FROM hurricane_frequency"
-  #         #=> [ripd]/sql/_edu/chimpu_astro_public/weather_records/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
-  #     end
-  #
-  #   Where <tt>[ripd]</tt> would be replaced by the IMW
-  #   <tt>:ripd</tt> directory.  The default <tt>:rip</tt> task is
-  #   empty so If there's no need to rip data (perhaps it's already on
-  #   disk?) then nothing needs to be done here.
-  #
-  # raw::
-  #   Managed by the <tt>:raw</tt> task, data is uncompressed and
-  #   extracted (if necessary) and stored in a subdirectory of the
-  #   <tt>:data</tt> directory named by the taxon and handle of this
-  #   dataset.
-  #
-  #     dataset.task :raw do
-  #       IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'),
-  #                                       Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first
-  #       #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml
-  #           [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml
-  #           [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml
-  #           ...
-  #           [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
-  #     end
-  #
-  #   Where <tt>[data]</tt> would be replaced by the IMW
-  #   <tt>:data</tt> directory.
-  #
-  #   If this dataset didn't have a taxon
-  #   (economics/alarming_trends) its files would be stored in a
-  #   directory +recent_history_of_banana_prices+ just below the
-  #   <tt>:data</tt> directory.
-  #
-  # fix::
-  #   Managed by the <tt>:fix</tt> task, transformations on the data
-  #   are performed.  IMW's method is to read data from a source
-  #   format (XML, YAML, CSV, &c.) into Ruby objects with hash
-  #   semantics.  These objects might be based upon structs,
-  #   ActiveRecord, DataMapper::Resource, FasterCSV...anything which
-  #   can be accessed as <tt>thing.property</tt> (FIXME 'and' or 'or'
-  #   ) <tt>thing[:property]</tt>: the Infinite Monkeywrench fits
-  #   neatly into your toobox.
-  #
-  #
-  #     # Open an output file in XML for writing
-  #     output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')
-  #       #=> FasterCSV at [fixd]/economics/alarming_trends/recent_history_of_banana_prices/fixd/data_bananas_hurricanes.csv
-  #
-  #     # A place to store the combined data
-  #     correlations = []
-  #
-  #     dataset.task :fix do
-  #
-  #       # Return the contents of the weather data which has rows like
-  #       #
-  #       #   1    2008-09-01    4
-  #       #   2    2008-09-08    3
-  #       #   3    2008-08-15    3
-  #       #   ...
-  #       #
-  #       weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first,
-  #                               :headers => ["ID","DATE","NUM_HURRICANES"]).entries
-  #         #=> [#<FasterCSV::Row "ID":nil "DATE":Mon Sep 08 04:15:47 -0600 2008,"NUM_HURRICANES":4>, ... ]
-  #
-  #
-  #       # Return the matching data from the produce prices XML file which looks like
-  #       #
-  #       #  <prices>
-  #       #    <price type="apple">
-  #       #      <date>2008/09/01</date>
-  #       #      <amount>0.15</amount>
-  #       #    </price>
-  #       #    <price type="banana">
-  #       #      <date>2008/09/01</date>
-  #       #      <amount>0.20</amount>
-  #       #    </price>
-  #       #    ...
-  #       #  </prices>
-  #       parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]',
-  #                                                 { :week  => 'date',
-  #                                                   :price => 'amount' }]
-  #
-  #       # Loop through the XML produce prices, mixing in the hurricane data,
-  #       # and outputting new rows.
-  #       Dir["#{dataset.path_to :rawd}*.xml"] each do |file|
-  #         IMW.open file do |xml| #=> Hpricot::Doc
-  #           parser.parse(xml).each do |record|
-  #             num_hurricanes = weather_data.(lambda { nil }) {|id,week,num_hurricanes| week == record.week}
-  #             output << [week,record[:price],num_hurricanes]
-  #           end
-  #         end
-  #       end
-  #     end
-  #
-  # package::
-  #   Data is packaged and compressed (if necessary) into a delivery
-  #   format and deposited into the <tt>:pkgd</tt> directory.
-  #
-  #   dataset.task :pkg do
-  #     IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress!
-  #       #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2
+  # The IMW::Dataset represents a common object in which paths, data
+  # resources, and various tasks can be intermingled to define a
+  # complex transformation of data.
+  #
+  # == Organizing Paths
+  #
+  # IMW encourages you to work within the following directory
+  # structure for a dataset +my_dataset+:
+  #
+  #   my_dataset/
+  #   |-- my_dataset.rb
+  #   |-- ripd
+  #   |   `-- ...
+  #   |-- rawd
+  #   |   `-- ...
+  #   |-- fixd
+  #   |   `-- ...
+  #   `-- pkgd
+  #       `-- ...
+  #
+  # Just like IMW itself, a dataset can manage a collection of paths.
+  # If <tt>my_dataset.rb</tt> defines a dataset:
+  #
+  #   # my_dataset/my_dataset.rb
+  #   dataset = IMW::Dataset.new(:my_dataset)
+  #
+  # then the following paths will be defined:
+  #
+  #   dataset.path_to(:root)   #=> my_dataset
+  #   dataset.path_to(:script) #=> my_dataset/my_dataset.rb
+  #   dataset.path_to(:ripd)   #=> my_dataset/ripd
+  #   dataset.path_to(:rawd)   #=> my_dataset/rawd
+  #   dataset.path_to(:fixd)   #=> my_dataset/fixd
+  #   dataset.path_to(:pkgd)   #=> my_dataset/pkgd
+  #
+  # Just like IMW itself, the +dataset+ supports adding path
+  # references
+  #
+  #   dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
+  #   dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
+  #
+  # as well as removing them (via <tt>dataset.remove_path</tt>).
+  #
+  # A subclass of IMW::Dataset can customize these paths by overriding
+  # IMW::Dataset#set_default_paths as well as define new ones by
+  # overriding IMW::Dataset#set_paths.
+  #
+  # Setting paths can be skipped altogether by passing the
+  # <tt>:skip_paths</tt> option when instantiating a dataset:
+  #
+  #   dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
+  #
+  # == Utilizing Tasks
+  #
+  # An IMW::Dataset utilizes Rake to manage tasks needed to transform
+  # data.  See IMW::Workflow for a description of the pre-defined
+  # tasks (+rip+, +parse+, +fix+, +package+).
+  #
+  # New tasks can be defined
+  #
+  #   dataset.task :get_authorization do
+  #     # ... get an authorization token
   #   end
   #
-  # In the above, <tt>dataset.task</tt> behaves like
-  # <tt>Rake.task</tt>, merely defining a task and its dependencies
-  # without executing it via
+  # and hooked into the default tasks in the usual Rake manner
+  #
+  #   dataset.task :rip => [:get_authorization]
+  #
+  # A dataset also has methods for the workflow step tasks to make
+  # this easier
+  #
+  #   dataset.rip [:get_authorization]
+  #
+  # Tasks for a dataset can be accessed and invoked as follows
+  #
+  #   dataset[:rip].invoke
+  #
+  # as well as by using the command line +imw+ tool.
+  #
+  # Defining tasks can be skipped altogether by passing the
+  # <tt>:skip_workflow</tt> option when instantiating a dataset
   #
-  #   dataset.task(:pkg).invoke
+  #   dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
   #
-  # Since the <tt>:rip</tt>, <tt>:raw</tt>, <tt>:fix</tt>, and
-  # <tt>:pkg</tt> tasks depend upon each other, invoking <tt>:pkg</tt>
-  # will first cause <tt>:rip</tt> to run.
+  # == Working with Repositories
   #
-  # By default, the tasks associated with a dataset are blank.  All of
-  # IMW's functionality is available without defining tasks.  Tasks
-  # simply provide a convenient scaffold for building a data
-  # transformation upon.
+  # A dataset can be added to a repository by passing the
+  # <tt>:repository</tt> option
   #
-  # Similarly, there is no requirement to use the directory structure
-  # outlined above.  IMW's methods accept plain filenames and do the
-  # Right Thing where possible.  The combination of tasks with
-  # matching directory structure is a suggested but not mandatory
-  # framework in which to program.
+  #   repo    = IMW::Repository.new
+  #   dataset = IMW::Dataset.new :my_dataset, :repository => repo
   class Dataset
-    # The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
-    # dataset processing.
     include IMW::Workflow
-    attr_accessor :handle, :options, :data
+    attr_accessor :handle, :options
-    def initialize options = {}
+    def initialize handle, options = {}
       @options = options
-      @handle  = options[:handle]
-      initialize_workflow
-      set_root_paths
-      set_paths
-      set_tasks
+      @handle  = handle
+      set_default_paths   unless options[:skip_paths]
+      set_paths           unless options[:skip_paths]
+      initialize_workflow unless options[:skip_workflow]
+      if options[:repository]
+        options[:repository][handle] = self
+      end
     end
   end

data/lib/imw/parsers/html_parser.rb CHANGED Viewed

@@ -18,7 +18,7 @@
 # that goes elsewhere.
 #
 #
-# == Sample HTML (http://twitter.com:
+# == Sample HTML (http://twitter.com):
 #
 #   <ul class="about vcard entry-author">
 #     <li         ><span class="label">Name</span>     <span class="fn" >MarsPhoenix       </span> </li>

data/lib/imw/parsers.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module IMW
   module Parsers
-    autoload :HtmlParser,   'imw/parsers/html_parser'
     autoload :LineParser,   'imw/parsers/line_parser'
     autoload :RegexpParser, 'imw/parsers/regexp_parser'
+    autoload :HtmlParser,   'imw/parsers/html_parser'
   end
 end

data/lib/imw/repository.rb CHANGED Viewed

@@ -1,35 +1,11 @@
-require 'imw/utils'
 module IMW
-  # A Repository is a collection of datasets.
+  # A Repository is a collection of datasets.  It is used by the
+  # command-line +imw+ tool.
   class Repository < Hash
-    # FIXME This should read some configuration settings somewhere and
-    # generate a pool specific to each IMW user.
-    def self.default
-      new
-    end
+    alias_method :datasets, :values
   end
-  # The default repository managed by IMW.
-  REPOSITORY = Repository.default
-  # Add a dataset to the IMW::REPOSITORY.  If the dataset has a
-  # +handle+ then it will be used as the key in this repository;
-  # otherwise the dataset's class will be used.
-  def self.add dataset
-    REPOSITORY[dataset.handle] = dataset
-  end
-  # Remove a dataset from the IMW::REPOSITORY.  Can pass in either a
-  # string handle or an instance of the dataset.
-  def self.delete handle
-    handle = handle.handle if handle.respond_to?(:handle)
-    REPOSITORY.delete(handle)
-  end
 end

data/lib/imw/resource.rb ADDED Viewed

@@ -0,0 +1,190 @@
+require 'addressable/uri'
+require 'imw/resources'
+module IMW
+  # A resource can be anything addressable via a URI.  Examples
+  # include local files, remote files, webpages, &c.
+  #
+  # The IMW::Resource class takes a URI as input and then dynamically
+  # extends itself with appropriate modules from IMW::Resources.  As
+  # an example, calling
+  #
+  #   my_archive = IMW::Resource.new('/path/to/my/archive.tar.bz2')
+  #
+  # would return an IMW::Resource extended by
+  # IMW::Resources::Archives::Tarbz2 (among other modules) which
+  # therefore has methods for extracting, listing, and appending to
+  # the archive.
+  #
+  # Modules are so extended based on handlers defined in the
+  # <tt>imw/resources</tt> directory and accessible via
+  # IMW::Resources#handlers.  You can define your own handlers by
+  # defining the constant IMW::Resources::USER_DEFINED_HANDLERS in
+  # your configuration file.
+  #
+  # The modules extending a particular IMW::Resource instance can be
+  # listed as follows
+  #
+  #   my_archive.resource_modules #=> [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Archives::Tarbz2]
+  #
+  # By default, resources are opened for reading.  Passing in the
+  # appropriate <tt>:mode</tt> option changes this:
+  #
+  #   IMW::Resource.new('/path/to/my_new_file', :mode => 'w')
+  #
+  # If the <tt>:skip_modules</tt> option is passed in then the
+  # resource will not extend itself with any modules and will
+  # essentially only retain the bare functionality of a URI.  This can
+  # be useful when subclassing IMW::Resource or dealing with a very
+  # strange kind of resource.
+  #
+  # Read the documentation for modules in IMW::Resources to learn more
+  # about the various behaviors an IMW::Resource can acquire.
+  class Resource
+    attr_reader :uri, :mode
+    def initialize uri, options={}
+      self.uri = uri
+      @mode    = options[:mode] || 'r'
+      extend_appropriately! unless options[:skip_modules]
+    end
+    # Return the modules this resource has been extended by.
+    #
+    # @return [Array] the modules this resource has been extended by.
+    def resource_modules
+      @resource_modules ||= []
+    end
+    # Works just like Object#extend except it keeps track of the
+    # modules it has extended, see Resource#resource_modules.
+    def extend mod
+      resource_modules << mod
+      super mod
+    end
+    # Extend this resource with modules by passing it through a
+    # collection of handlers defined by IMW::Resources#handlers
+    def extend_appropriately!
+      IMW::Resources.extend_resource!(self)
+    end
+    # Set the URI of this resource by parsing the given +uri+ (if
+    # necessary).
+    #
+    # @param [String, Addressable::URI] uri the uri to parse
+    def uri= uri
+      if uri.is_a?(Addressable::URI)
+        @uri = uri
+      else
+        begin
+          @uri = Addressable::URI.parse(uri.to_s)
+        rescue URI::InvalidURIError
+          @uri = Addressable::URI.parse(URI.encode(uri.to_s))
+          @encoded_uri = true
+        end
+      end
+    end
+    # The scheme of this resource.  Will be +nil+ for local resources.
+    #
+    # @return [String]
+    def scheme
+      @scheme ||= uri.scheme
+    end
+    # The directory name of this resource's path.
+    #
+    # @return [String]
+    def dirname
+      @dirname  ||= File.dirname(path)
+    end
+    # The basename of this resource's path.
+    #
+    # @return [String]
+    def basename
+      @basename ||= File.basename(path)
+    end
+    # Returns the extension (INCLUDING the '.') of this resource's
+    # path.  Redefine this in an including class for which this is
+    # weird ('.tar.gz' I'm talking to you...)
+    #
+    # @return [String]
+    def extname
+      @extname ||= File.extname(path)
+    end
+    # Returns the extension (WITHOUT the '.') of this resource's path.
+    #
+    # @return [String]
+    def extension
+      @extension ||= extname[1..-1] || ''
+    end
+    # Returns the basename of the file with its extension removed
+    #
+    #   IMW.open('/path/to/some_file.tar.gz').name # => some_file
+    #
+    # @return [String]
+    def name
+      @name ||= extname ? basename[0,basename.length - extname.length] : basename
+    end
+    def to_s
+      uri.to_s
+    end
+    # Raise an error unless this resource exists.
+    #
+    # @param [String] message an optional message to include
+    def should_exist!(message=nil)
+      raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', '))          unless respond_to?(:path)
+      raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
+      raise IMW::PathError.new([message, "#{path} does not exist"].compact.join(', '))                                                             unless exist?
+    end
+    # Open a copy of this resource.
+    #
+    # This is useful when wanting to reset file handles.  Though -- be
+    # warned -- it does not close any file handles itself...
+    #
+    # @return [IMW::Resource] the new (old) resource
+    def reopen
+      IMW.open(self.uri.to_s)
+    end
+    # If +method+ begins with the strings +is+, +on+, or +via+ and
+    # ends with a question mark then we interpret it as a question
+    # this resource doesn't know how to answer -- so we have it answer
+    # +false+.
+    #
+    # As an example, consider the following loop:
+    #
+    #   IMW.open('/tmp').all_contents.each do |obj|
+    #     if obj.is_archive?
+    #       # ... do something
+    #     end
+    #   end
+    #
+    # When +obj+ is initialized and it _isn't_ an archive, then it
+    # doesn't know about the <tt>is_archive?</tt> method -- but it
+    # should therefore answer false anyway.
+    #
+    # This lets a basic text file answer questions about whether it's
+    # an archive (or on S3, or accessed via some user-defined scheme,
+    # &c.) without needing to know anything about archives (or S3 or
+    # the user-defined scheme).
+    def method_missing method, *args
+      if args.empty? && method.to_s =~ /(is|on|via)_.*\?$/
+        # querying for a boolean response so answer false
+        return false
+      else
+        raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{resource_modules.join(', ')}"
+      end
+    end
+  end
+end

data/lib/imw/resources/archive.rb ADDED Viewed

@@ -0,0 +1,97 @@
+module IMW
+  module Resources
+    module Archives
+      autoload :Rar,    'imw/resources/archives_and_compressed/rar'
+      autoload :Tar,    'imw/resources/archives_and_compressed/tar'
+      autoload :Tarbz2, 'imw/resources/archives_and_compressed/tarbz2'
+      autoload :Targz,  'imw/resources/archives_and_compressed/targz'
+      autoload :Zip,    'imw/resources/archives_and_compressed/zip'
+    end
+    # Defines methods for creating, appending to, extracting, and
+    # listing an archive file.  This module isn't used to directly
+    # extend an IMW::Resource -- instead, format-specific modules
+    # (e.g. - IMW::Resources::Archives::Tarbz2) include this module
+    # and define the specific settings (command-line flags, &c.)
+    # required to make things work.
+    module Archive
+      attr_accessor :archive_settings
+      # Is this file an archive?
+      #
+      # @return [true, false]
+      def is_archive?
+        true
+      end
+      # Create an archive of the given +input_paths+.
+      #
+      # @param [String, IMW::Resource] input_paths the paths to add to this archive
+      def create *input_paths
+        should_have_archive_setting!("Cannot create archive #{path}", :program, :create)
+        IMW.system archive_settings[:program], archive_settings[:create], path, *input_paths.flatten
+        self
+      end
+      # Append to this archive the given +input_paths+.
+      #
+      # @param [String, IMW::Resource] input_paths the paths to add to this archive
+      def append *input_paths
+        should_have_archive_setting!("Cannot append to archive #{path}", :append)
+        IMW.system archive_settings[:program], archive_settings[:append], path, *input_paths.flatten
+        self
+      end
+      # Extract the files from this archive to the current directory.
+      def extract
+        should_exist!("Cannot extract archive.")
+        should_have_archive_setting!("Cannot extract archive #{path}", :extract, [:unarchving_program, :program])
+        program = archive_settings[:unarchiving_program] || archive_settings[:program]
+        IMW.system program, archive_settings[:extract], path
+      end
+      # Return a (sorted) list of contents in this archive.
+      #
+      # @return [Array] a list of paths in the archive.
+      def contents
+        should_exist!("Cannot list archive contents.")
+        should_have_archive_setting!("Cannot list archive #{path}", :list, [:unarchiving_program, :program])
+        program = archive_settings[:unarchiving_program] || archive_settings[:program]
+        # FIXME this needs to be more robust
+        flags = archive_settings[:list]
+        flags = flags.join(' ') if flags.is_a?(Array)
+        command = [program, flags, path.gsub(' ', '\ ')].join(' ')
+        output  = `#{command}`
+        archive_contents_string_to_array(output)
+      end
+      protected
+      def should_have_archive_setting! message=nil,*settings # :nodoc:
+        settings.each do |setting|
+          if setting.is_a?(Array)
+            raise IMW::Error.new([message, "Must define one of #{setting.join(', ')} in archive_settings"].compact.join(', ')) unless setting.any? { |optional_setting| archive_settings[optional_setting] }
+          else
+            raise IMW::Error.new([message, "Must define #{setting} in archive_setings"].compact.join(', '))                    unless archive_settings[setting]
+          end
+        end
+      end
+      # Parse and format the output from the archive program's "list"
+      # command into an array of filenames.
+      #
+      # An including class can override this method to match the
+      # output from the archiving program of that class.
+      #
+      # @param [String] string the raw output from the archive program's "list" command
+      # @return [Array] a list of paths in the archive
+      def archive_contents_string_to_array string
+        string.split("\n")
+      end
+    end
+  end
+end

data/lib/imw/resources/archives_and_compressed/bz2.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module IMW
+  module Resources
+    module CompressedFiles
+      module Bz2
+        include IMW::Resources::CompressedFile
+        def compression_settings
+          @compression_settings ||= {
+            :decompression_program    => :bzip2,
+            :decompress               => '-fd'
+          }
+        end
+      end
+    end
+  end
+end