RubyGems - imw - Versions diffs - 0.1.0 - Mend

imw 0.1.0

Files changed (111) hide show

data/.gitignore +15 -0
data/CHANGELOG +0 -0
data/LICENSE +674 -0
data/README.rdoc +101 -0
data/Rakefile +20 -0
data/VERSION +1 -0
data/etc/imwrc.rb +76 -0
data/lib/imw.rb +42 -0
data/lib/imw/boot.rb +58 -0
data/lib/imw/dataset.rb +233 -0
data/lib/imw/dataset/datamapper.rb +66 -0
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
data/lib/imw/dataset/loaddump.rb +50 -0
data/lib/imw/dataset/old/file_collection.rb +88 -0
data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
data/lib/imw/dataset/scaffold.rb +132 -0
data/lib/imw/dataset/scraped_uri.rb +305 -0
data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
data/lib/imw/dataset/scrub/scrub.rb +147 -0
data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
data/lib/imw/dataset/scrub/slug.rb +101 -0
data/lib/imw/dataset/stats.rb +73 -0
data/lib/imw/dataset/stats/counter.rb +23 -0
data/lib/imw/dataset/task.rb +38 -0
data/lib/imw/dataset/workflow.rb +81 -0
data/lib/imw/files.rb +110 -0
data/lib/imw/files/archive.rb +113 -0
data/lib/imw/files/basicfile.rb +122 -0
data/lib/imw/files/binary.rb +28 -0
data/lib/imw/files/compressed_file.rb +93 -0
data/lib/imw/files/compressed_files_and_archives.rb +348 -0
data/lib/imw/files/compressible.rb +103 -0
data/lib/imw/files/csv.rb +112 -0
data/lib/imw/files/json.rb +41 -0
data/lib/imw/files/sgml.rb +65 -0
data/lib/imw/files/text.rb +68 -0
data/lib/imw/files/yaml.rb +46 -0
data/lib/imw/packagers.rb +8 -0
data/lib/imw/packagers/archiver.rb +108 -0
data/lib/imw/packagers/s3_mover.rb +28 -0
data/lib/imw/parsers.rb +7 -0
data/lib/imw/parsers/html_parser.rb +382 -0
data/lib/imw/parsers/html_parser/matchers.rb +306 -0
data/lib/imw/parsers/line_parser.rb +87 -0
data/lib/imw/parsers/regexp_parser.rb +72 -0
data/lib/imw/utils.rb +24 -0
data/lib/imw/utils/components.rb +61 -0
data/lib/imw/utils/config.rb +46 -0
data/lib/imw/utils/error.rb +54 -0
data/lib/imw/utils/extensions/array.rb +125 -0
data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
data/lib/imw/utils/extensions/core.rb +43 -0
data/lib/imw/utils/extensions/dir.rb +24 -0
data/lib/imw/utils/extensions/file_core.rb +64 -0
data/lib/imw/utils/extensions/hash.rb +218 -0
data/lib/imw/utils/extensions/hpricot.rb +48 -0
data/lib/imw/utils/extensions/string.rb +49 -0
data/lib/imw/utils/extensions/struct.rb +42 -0
data/lib/imw/utils/extensions/symbol.rb +28 -0
data/lib/imw/utils/extensions/typed_struct.rb +22 -0
data/lib/imw/utils/extensions/uri.rb +59 -0
data/lib/imw/utils/log.rb +67 -0
data/lib/imw/utils/misc.rb +63 -0
data/lib/imw/utils/paths.rb +115 -0
data/lib/imw/utils/uri.rb +59 -0
data/lib/imw/utils/uuid.rb +33 -0
data/lib/imw/utils/validate.rb +38 -0
data/lib/imw/utils/version.rb +12 -0
data/lib/imw/utils/view.rb +113 -0
data/lib/imw/utils/view/dump_csv.rb +112 -0
data/lib/imw/utils/view/dump_csv_older.rb +117 -0
data/spec/data/sample.csv +131 -0
data/spec/data/sample.tsv +131 -0
data/spec/data/sample.txt +131 -0
data/spec/data/sample.xml +653 -0
data/spec/data/sample.yaml +652 -0
data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
data/spec/imw/files/archive_spec.rb +118 -0
data/spec/imw/files/basicfile_spec.rb +121 -0
data/spec/imw/files/bz2_spec.rb +32 -0
data/spec/imw/files/compressed_file_spec.rb +96 -0
data/spec/imw/files/compressible_spec.rb +100 -0
data/spec/imw/files/file_spec.rb +144 -0
data/spec/imw/files/gz_spec.rb +32 -0
data/spec/imw/files/rar_spec.rb +33 -0
data/spec/imw/files/tar_spec.rb +31 -0
data/spec/imw/files/text_spec.rb +23 -0
data/spec/imw/files/zip_spec.rb +31 -0
data/spec/imw/files_spec.rb +38 -0
data/spec/imw/packagers/archiver_spec.rb +125 -0
data/spec/imw/packagers/s3_mover_spec.rb +7 -0
data/spec/imw/parsers/line_parser_spec.rb +96 -0
data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
data/spec/imw/utils/extensions/find_spec.rb +113 -0
data/spec/imw/utils/paths_spec.rb +38 -0
data/spec/imw/workflow/rip/local_spec.rb +89 -0
data/spec/imw/workflow/rip_spec.rb +27 -0
data/spec/rcov.opts +1 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +32 -0
data/spec/support/archive_contents_matcher.rb +94 -0
data/spec/support/custom_matchers.rb +21 -0
data/spec/support/directory_contents_matcher.rb +61 -0
data/spec/support/extensions.rb +18 -0
data/spec/support/file_contents_matcher.rb +50 -0
data/spec/support/random.rb +210 -0
data/spec/support/without_regard_to_order_matcher.rb +58 -0
metadata +196 -0

data/lib/imw/files/compressible.rb ADDED

@@ -0,0 +1,103 @@
+#
+# h2. lib/imw/files/compressible.rb -- compression module
+#
+# == About
+#
+# Module used for compression of files.  An including
+# <tt>IMW::Files::BasicFile</tt> object gains +compress+ and
+# <tt>compress!</tt> methods.
+#
+# By default, bzip2 is used for compression though gzip can also be
+# specified (the full list of known compression programs is in
+# <tt>IMW::Files::Compressible::COMPRESSION_PROGS</tt>).  Zip and Rar
+# compression are handled by the <tt>IMW::Files::Archive</tt> module.
+#
+# Decompression should be handled via the
+# <tt>IMW::Files::CompressedFile</tt> class.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Why is it that when you squeeze a lemon you get lemonade but when you squeeze a banana you just get a mess?" # at bottom
+module IMW
+  module Files
+    module Compressible
+      # Known compression programs.
+      COMPRESSION_PROGS = [:bzip2, :gzip]
+      # Extensions that are appended by each compression program.
+      COMPRESSION_EXTS = {
+        :bzip2 => '.bz2',
+        :gzip => '.gz'
+      }
+      # Compression flags for each program
+      COMPRESSION_FLAGS = {
+        :bzip2 => "-f",
+        :gzip => "-f"
+      }
+      protected
+      # Check that +program+ is a valid compression program.
+      def ensure_valid_compression_program program
+        raise IMW::Error.new("#{program} is not a valid compression program (#{COMPRESSION_PROGS.join(' ,')}).") unless COMPRESSION_PROGS.include? program
+      end
+      # Construct the command passed to the shell to compress this
+      # file using the given +program+.
+      def compression_command program
+        ensure_valid_compression_program program
+        [IMW::EXTERNAL_PROGRAMS[program],COMPRESSION_FLAGS[program],self.path].join ' '
+      end
+      # Return the object representing this file compressed with
+      # +program+.
+      def compressed_file_path program
+        ensure_valid_compression_program program
+        path = File.join(self.dirname,self.basename + COMPRESSION_EXTS[program])
+      end
+      public
+      # Compress this file in its present directory using +program+,
+      # overwriting any existing compressed files and without saving
+      # the original file.  Returns an
+      # <tt>IMW::Files::CompressedFile</tt> object corresponding to
+      # the compressed file.
+      #
+      # Options:
+      #
+      # <tt>:program</tt> (<tt>:bzip2</tt>):: names the compression
+      # program from the choices in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
+      def compress! program = :bzip2
+        raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
+        FileUtils.cd(@dirname) { IMW.system(self.compression_command(program)) }
+        IMW.open(self.compressed_file_path(program))
+      end
+      # Compress this file in its present directory, overwriting any
+      # existing compressed files while keeping the original file.
+      # Returns an <tt>IMW::Files::CompressedFile</tt> object
+      # corresponding to the compressed file.
+      #
+      # Options:
+      #
+      # <tt>:program</tt> (<tt>:bzip2</tt>):: names the compression
+      # program from the choices in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
+      def compress program = :bzip2
+        raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
+        begin
+          FileUtils.cp(self.path,self.path + 'copy')
+          compress! program
+        ensure
+          FileUtils.mv(self.path + 'copy',self.path)
+        end
+        IMW.open(self.compressed_file_path(program))
+      end
+    end
+  end
+end

data/lib/imw/files/csv.rb ADDED

@@ -0,0 +1,112 @@
+#
+# h2. lib/imw/files/csv.rb -- CSV, TSV files
+#
+# == About
+#
+# For "comma-separated value" (CSV) and "tab-separated value" (TSV)
+# files.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
+require 'fastercsv'
+module IMW
+  module Files
+    # A base class from which to subclass various types of tabular
+    # data files (CSV, TSV, &c.)
+    class TabularDataFile < FasterCSV
+      include IMW::Files::BasicFile
+      include IMW::Files::Compressible
+      # Default options to be passed to
+      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
+      # documentation for more information.
+      DEFAULT_OPTIONS = {
+        :col_sep        => ',',
+        :headers        => false,
+        :return_headers => false,
+        :write_headers  => true,
+        :skip_blanks    => false,
+        :force_quotes   => false
+      }
+      def initialize uri, mode='r', options = {}
+        options.reverse_merge!(self.class::DEFAULT_OPTIONS)
+        self.uri= uri
+        super open(uri,mode),options
+      end
+      # Return the contents of this CSV file as an array of arrays.
+      def load
+        entries
+      end
+      # Dump +data+ to this file.
+      #
+      # Options include:
+      # <tt>:flush</tt> (true):: flush the file buffer, writing it to disk
+      # <tt>:close</tt> (true):: close the file after writing +data+
+      def dump data, options = {}
+        options = options.reverse_merge :close => true, :flush => true
+        data.each {|row| self << row}
+        self.flush if options[:flush]
+        self.close if options[:close]
+        self
+      end
+      # Return a random sample of rows.
+      def sample length=10
+        rows, indices = [], Set.new
+        begin
+          each_with_index do |row, index|
+            break if rows.size == length
+            next if index != 0 && rand < 0.75   # skip 3/4 of rows after the 1st
+            rows    << row
+            indices << index
+          end
+          # now fill up to length if not there already
+          while rows.length < length
+            each_with_index do |row, index|
+              break if rows.size == length
+              next if index indices.include?(index)
+              rows << row
+            end
+          end
+          rows
+        rescue FasterCSV::MalformedCSVError
+          rows
+        end
+      end
+    end
+    # Represents a file of comma-separated values (CSV).  This class
+    # is a subclass of <tt>FasterCSV</tt> so the methods of that
+    # library are available for use.
+    #
+    # See <tt>IMW::Files::TabularDataFile</tt> for more complete
+    # documentation.
+    class Csv < TabularDataFile
+    end
+    # Represents a file of tab-separated values (TSV).  This class
+    # is a subclass of <tt>FasterCSV</tt> so the methods of that
+    # library are available for use.
+    #
+    # See <tt>IMW::Files::TabularDataFile</tt> for more complete
+    # documentation.
+    class Tsv < TabularDataFile
+      DEFAULT_OPTIONS = {:col_sep => "\t"}.reverse_merge DEFAULT_OPTIONS
+    end
+    FILE_REGEXPS << [/\.csv$/, IMW::Files::Csv]
+    FILE_REGEXPS << [/\.tsv$/, IMW::Files::Tsv]
+  end
+end

data/lib/imw/files/json.rb ADDED

@@ -0,0 +1,41 @@
+# h2. lib/imw/files/json.rb -- describes json files
+#
+# == About
+#
+# A class for working with JSON files.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottom
+require 'json'
+require 'imw/files/text'
+module IMW
+  module Files
+    class Json < IMW::Files::Text
+      def initialize uri, mode='r', options = {}
+        super uri, mode
+      end
+      # Return the contents of this JSON file.
+      #
+      # FIXME what to do if a block is passed in?
+      def load &block
+        JSON.parse File.new(@path).read
+      end
+      # Dump +data+ to this file as JSON.
+      def dump data
+        super data.to_json
+      end
+    end
+    FILE_REGEXPS << [/\.json$/, IMW::Files::Json]
+  end
+end

data/lib/imw/files/sgml.rb ADDED

@@ -0,0 +1,65 @@
+#
+# h2. lib/imw/files/sgml.rb -- SGML files
+#
+# == About
+#
+# For SGML-derived files, including XML, HTML, &c.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
+require 'hpricot'
+require 'imw/files/text'
+require 'imw/parsers/html_parser'
+module IMW
+  module Files
+    module Sgml
+      attr_accessor :doc
+      def initialize uri, mode='r', options={}
+        super uri, mode, options
+        @doc = Hpricot(open(uri))
+      end
+      # Delegate to Hpricot
+      def method_missing method, *args, &block
+        @doc.send method, *args, &block
+      end
+      # Parse this file using the IMW HTMLParser.  The parser can
+      # either be passed in directly or constructed from a passed hash
+      # of matchers.
+      def parse *args
+        parser = args.first.is_a?(IMW::HTMLParser) ? args.first : IMW::HTMLParser.new(*args)
+        parser.parse(self)
+      end
+    end
+    class Xml < IMW::Files::Text
+      include Sgml
+      def initialize uri, mode='r', options={}
+        super uri, mode, options
+        @doc = Hpricot.XML(open(uri))
+      end
+    end
+    class Html < IMW::Files::Text
+      include Sgml
+      def initialize uri, mode='r', options={}
+        super uri, mode, options
+        @doc = Hpricot(open(uri))
+      end
+    end
+  end
+end

data/lib/imw/files/text.rb ADDED

@@ -0,0 +1,68 @@
+module IMW
+  module Files
+    # Used to process text files when no more specialized class is suitable.
+    #
+    #   f = IMW::Files::Text.new '/path/to/my_file.dat'
+    #   f.load do |line|
+    #     # ...
+    #   end
+    #
+    # Missing methods will be passed to the associated file handle
+    # (either IO or StringIO depending on whether the URI passed in
+    # was local or remote) so the usual stuff like read or each_line
+    # still works.
+    class Text
+      include IMW::Files::BasicFile
+      include IMW::Files::Compressible
+      attr_reader :file, :parser
+      def initialize uri, mode='r', options = {}
+        self.uri= uri
+        raise IMW::PathError.new("Cannot write to remote file #{uri}") if mode == 'w' && remote?
+        @file = open(uri, mode)
+      end
+      # Return the contents of this text file as a string.
+      def load
+        file.read
+      end
+      # Return an array with each line of this file.  If given a
+      # block, pass each line to the block.
+      def entries &block
+        if block_given?
+          file.each do |line|
+            yield line.chomp
+          end
+        else
+          file.map do |line|
+            line.chomp
+          end
+        end
+      end
+      # Dump +data+ to this file as a string.  Close the file handle
+      # if passed in :close.
+      def dump data, options={}
+        file.write(data.inspect)
+        file.close if options[:close]
+      end
+      def method_missing method, *args
+        file.send method, *args
+      end
+      def parse parser_spec, &block
+        lines = parser_spec.delete(:lines)
+        @parser = IMW::Parsers::RegexpParser.new(parser_spec)
+        parser.parse!(file, {:lines => lines}, &block)
+      end
+    end
+  end
+end
+# puts "#{File.basename(__FILE__)}: Don't forget to put a nametag on your Monkeywrench or one of the other chimps might steal it!" # at bottom

data/lib/imw/files/yaml.rb ADDED

@@ -0,0 +1,46 @@
+#
+# h2. lib/imw/files/yaml.rb -- describes yaml files
+#
+# == About
+#
+# A class for working with YAML files.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+require 'yaml'
+require 'imw/files/text'
+module IMW
+  module Files
+    class Yaml < IMW::Files::Text
+      def initialize uri, mode='r', options = {}
+        super uri, mode
+      end
+      # Return the contents of this YAML file.
+      #
+      # FIXME what to do if a block is passed in?
+      def load &block
+        YAML.load_file @path
+      end
+      # Dump +data+ to this file as YAML.
+      def dump data
+        super data.to_yaml
+      end
+    end
+    FILE_REGEXPS << [/\.yaml$/, IMW::Files::Yaml]
+    FILE_REGEXPS << [/\.yml$/,  IMW::Files::Yaml]
+  end
+end
+# puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottom

data/lib/imw/packagers.rb ADDED

@@ -0,0 +1,8 @@
+module IMW
+  module Packagers
+    autoload :Archiver, 'imw/packagers/archiver'
+    autoload :S3Mover,  'imw/packagers/s3_mover'
+  end
+end