RubyGems - imw - Versions diffs - 0.1.0 - Mend

imw 0.1.0

Files changed (111) hide show

data/.gitignore +15 -0
data/CHANGELOG +0 -0
data/LICENSE +674 -0
data/README.rdoc +101 -0
data/Rakefile +20 -0
data/VERSION +1 -0
data/etc/imwrc.rb +76 -0
data/lib/imw.rb +42 -0
data/lib/imw/boot.rb +58 -0
data/lib/imw/dataset.rb +233 -0
data/lib/imw/dataset/datamapper.rb +66 -0
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
data/lib/imw/dataset/loaddump.rb +50 -0
data/lib/imw/dataset/old/file_collection.rb +88 -0
data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
data/lib/imw/dataset/scaffold.rb +132 -0
data/lib/imw/dataset/scraped_uri.rb +305 -0
data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
data/lib/imw/dataset/scrub/scrub.rb +147 -0
data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
data/lib/imw/dataset/scrub/slug.rb +101 -0
data/lib/imw/dataset/stats.rb +73 -0
data/lib/imw/dataset/stats/counter.rb +23 -0
data/lib/imw/dataset/task.rb +38 -0
data/lib/imw/dataset/workflow.rb +81 -0
data/lib/imw/files.rb +110 -0
data/lib/imw/files/archive.rb +113 -0
data/lib/imw/files/basicfile.rb +122 -0
data/lib/imw/files/binary.rb +28 -0
data/lib/imw/files/compressed_file.rb +93 -0
data/lib/imw/files/compressed_files_and_archives.rb +348 -0
data/lib/imw/files/compressible.rb +103 -0
data/lib/imw/files/csv.rb +112 -0
data/lib/imw/files/json.rb +41 -0
data/lib/imw/files/sgml.rb +65 -0
data/lib/imw/files/text.rb +68 -0
data/lib/imw/files/yaml.rb +46 -0
data/lib/imw/packagers.rb +8 -0
data/lib/imw/packagers/archiver.rb +108 -0
data/lib/imw/packagers/s3_mover.rb +28 -0
data/lib/imw/parsers.rb +7 -0
data/lib/imw/parsers/html_parser.rb +382 -0
data/lib/imw/parsers/html_parser/matchers.rb +306 -0
data/lib/imw/parsers/line_parser.rb +87 -0
data/lib/imw/parsers/regexp_parser.rb +72 -0
data/lib/imw/utils.rb +24 -0
data/lib/imw/utils/components.rb +61 -0
data/lib/imw/utils/config.rb +46 -0
data/lib/imw/utils/error.rb +54 -0
data/lib/imw/utils/extensions/array.rb +125 -0
data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
data/lib/imw/utils/extensions/core.rb +43 -0
data/lib/imw/utils/extensions/dir.rb +24 -0
data/lib/imw/utils/extensions/file_core.rb +64 -0
data/lib/imw/utils/extensions/hash.rb +218 -0
data/lib/imw/utils/extensions/hpricot.rb +48 -0
data/lib/imw/utils/extensions/string.rb +49 -0
data/lib/imw/utils/extensions/struct.rb +42 -0
data/lib/imw/utils/extensions/symbol.rb +28 -0
data/lib/imw/utils/extensions/typed_struct.rb +22 -0
data/lib/imw/utils/extensions/uri.rb +59 -0
data/lib/imw/utils/log.rb +67 -0
data/lib/imw/utils/misc.rb +63 -0
data/lib/imw/utils/paths.rb +115 -0
data/lib/imw/utils/uri.rb +59 -0
data/lib/imw/utils/uuid.rb +33 -0
data/lib/imw/utils/validate.rb +38 -0
data/lib/imw/utils/version.rb +12 -0
data/lib/imw/utils/view.rb +113 -0
data/lib/imw/utils/view/dump_csv.rb +112 -0
data/lib/imw/utils/view/dump_csv_older.rb +117 -0
data/spec/data/sample.csv +131 -0
data/spec/data/sample.tsv +131 -0
data/spec/data/sample.txt +131 -0
data/spec/data/sample.xml +653 -0
data/spec/data/sample.yaml +652 -0
data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
data/spec/imw/files/archive_spec.rb +118 -0
data/spec/imw/files/basicfile_spec.rb +121 -0
data/spec/imw/files/bz2_spec.rb +32 -0
data/spec/imw/files/compressed_file_spec.rb +96 -0
data/spec/imw/files/compressible_spec.rb +100 -0
data/spec/imw/files/file_spec.rb +144 -0
data/spec/imw/files/gz_spec.rb +32 -0
data/spec/imw/files/rar_spec.rb +33 -0
data/spec/imw/files/tar_spec.rb +31 -0
data/spec/imw/files/text_spec.rb +23 -0
data/spec/imw/files/zip_spec.rb +31 -0
data/spec/imw/files_spec.rb +38 -0
data/spec/imw/packagers/archiver_spec.rb +125 -0
data/spec/imw/packagers/s3_mover_spec.rb +7 -0
data/spec/imw/parsers/line_parser_spec.rb +96 -0
data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
data/spec/imw/utils/extensions/find_spec.rb +113 -0
data/spec/imw/utils/paths_spec.rb +38 -0
data/spec/imw/workflow/rip/local_spec.rb +89 -0
data/spec/imw/workflow/rip_spec.rb +27 -0
data/spec/rcov.opts +1 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +32 -0
data/spec/support/archive_contents_matcher.rb +94 -0
data/spec/support/custom_matchers.rb +21 -0
data/spec/support/directory_contents_matcher.rb +61 -0
data/spec/support/extensions.rb +18 -0
data/spec/support/file_contents_matcher.rb +50 -0
data/spec/support/random.rb +210 -0
data/spec/support/without_regard_to_order_matcher.rb +58 -0
metadata +196 -0

data/spec/imw/packagers/s3_mover_spec.rb ADDED

@@ -0,0 +1,7 @@
+require File.dirname(__FILE__) + "/../../spec_helper"
+describe IMW::Packagers::S3Mover do
+  it { pending }
+end

data/spec/imw/parsers/line_parser_spec.rb ADDED

@@ -0,0 +1,96 @@
+require File.dirname(__FILE__) + "/../../spec_helper"
+require 'ostruct'
+# Specs for IMW::Parsers::LineParser, driven by the sample.csv fixture
+# (a header row followed by data rows; 131 lines in total, as asserted below).
+describe IMW::Parsers::LineParser do
+  before do
+    @path = File.dirname(__FILE__) + "/../../data/sample.csv"
+    @file = File.new(@path)
+    @fields = [:id, :name, :genus, :species]
+  end
+  # Every example opens its own handle on the fixture; close it again so
+  # open file descriptors do not accumulate over the run.  The guard keeps
+  # this safe even if the parser has already closed the stream.
+  after do
+    @file.close unless @file.closed?
+  end
+  describe "without an implemented parsing method" do
+    before do
+      @parser = IMW::Parsers::LineParser.new
+    end
+    it "should raise an error when attempting to parse a line" do
+      lambda { @parser.parse_line "wahtever" }.should raise_error(IMW::NotImplementedError)
+    end
+  end
+  describe "with an implemented parsing method" do
+    before do
+      # Anonymous subclass supplying the parse_line hook: split a CSV line
+      # into the four expected columns and return them as a hash.
+      @parser_class = Class.new(IMW::Parsers::LineParser)
+      @parser_class.class_eval do
+        def parse_line line
+          id, name, genus, species = line.chomp.split(',')
+          { :id => id, :name => name, :genus => genus, :species => species }
+        end
+      end
+      @parser = @parser_class.new
+    end
+    it "should skip lines as needed" do
+      @parser.skip_first = 1
+      results = @parser.parse!(@file)
+      results.length.should == 130
+    end
+    it "should read as many lines as it's asked" do
+      results = @parser.parse!(@file, :lines => 10)
+      results.length.should == 10
+    end
+    describe "when parsing into hashes" do
+      it "should return an array of hashes when called without a block" do
+        results = @parser.parse!(@file)
+        results.length.should == 131
+        results.first.should == { :id => "ID", :name => "Name", :genus => "Genus", :species => "Species" }
+      end
+      it "should pass each hash to a block when given one" do
+        results = returning([]) do |array|
+          @parser.parse!(@file) do |hsh|
+            hsh.delete(:id)
+            array << hsh
+          end
+        end
+        results.length.should == 131
+        results.first.should == { :name => "Name", :genus => "Genus", :species => "Species" }
+      end
+    end
+    describe "when parsing into objects" do
+      # Applies to every example in this group, so the examples do not need
+      # to set the class again themselves.
+      before { @parser.klass = OpenStruct }
+      it "should return an array of objects when defined with a class" do
+        results = @parser.parse!(@file)
+        results.length.should == 131
+        results.first.class.should == OpenStruct
+      end
+      it "should pass each object to a block when given one and defined with a class" do
+        results = returning([]) do |array|
+          @parser.parse!(@file) do |obj|
+            obj.genus = nil
+            array << obj
+          end
+        end
+        results.length.should == 131
+        results.first.class.should == OpenStruct
+        results.first.genus.should be_blank
+      end
+    end
+  end
+end

data/spec/imw/parsers/regexp_parser_spec.rb ADDED

@@ -0,0 +1,42 @@
+require File.dirname(__FILE__) + "/../../spec_helper"
+require 'ostruct'
+describe IMW::Parsers::RegexpParser do
+  before do
+    @path = "foobar.dat"
+    @text = <<EOF
+151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
+81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
+81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
+EOF
+    File.open(@path, 'w') { |f| f.write(@text) }
+    @file = File.new(@path)
+    @regexp = %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$}
+    @fields = [:ip, :timestamp, :verb, :url, :version]
+    @parser = IMW::Parsers::RegexpParser.new :by_regexp => @regexp, :into_fields => @fields
+  end
+  describe "parsing a line which matches its regexp" do
+    it "should return an appropriate hash" do
+      @parser.parse_line(@file.readline).should == {:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}
+    end
+  end
+  describe "parsing a line which doesn't match its regexp" do
+    before { @parser.regexp = /foobar/ }
+    it "return an empty hash if not parsing strictly" do
+      @parser.parse_line(@file.readline).should == {}
+    end
+    it "should raise an error if parsing strictly" do
+      @parser.strict = true
+      lambda { @parser.parse_line(@file.readline) }.should raise_error IMW::ParseError
+    end
+  end
+end

data/spec/imw/utils/extensions/file_core_spec.rb ADDED

@@ -0,0 +1,72 @@
+#
+# h2. spec/imw/utils/extensions/file_core_spec.rb -- spec for extensions to core file module
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+require File.join(File.dirname(__FILE__),'../../../spec_helper')
+require 'fileutils'
+require 'imw/utils/random'
+# Specs for the IMW extensions to the core File class: name_of_file,
+# handle and uniquify.
+describe File do
+  it "should return the 'name' of a file with 'name_of_file'" do
+    File.name_of_file("/path/to/some_file.txt").should eql("some_file")
+  end
+  describe "when finding the handle corresponding to a path" do
+    it "should correctly identify paths with the processing instruction suffix" do
+      File.handle("/path/to/the_handle#{IMW::PROCESSING_INSTRUCTION_SUFFIX}.yaml").should eql(:the_handle)
+    end
+    it "should correctly identify paths with the metadata instruction suffix" do
+      File.handle("/path/to/the_handle#{IMW::METADATA_SUFFIX}.yaml").should eql(:the_handle)
+    end
+    it "should raise an error if the path does not correspond to a handle" do
+      lambda {File.handle("/path/to/the_handle.txt")}.should raise_error(IMW::PathError)
+    end
+  end
+  describe "when creating unique filenames" do
+    before(:each) do
+      @root_directory = IMW::DIRECTORIES[:dump] + "/file_core_spec"
+      @file0 = @root_directory + "/the_original.txt"
+      @file1 = @root_directory + "/the_original.txt.1"
+      @file2 = @root_directory + "/the_original.txt.2"
+      # mkdir_p rather than mkdir: it creates missing parent directories and
+      # does not fail when the directory was left behind by an aborted run.
+      FileUtils.mkdir_p(@root_directory)
+    end
+    after(:each) do
+      FileUtils.rm_rf @root_directory
+    end
+    it "should return the given path if there is no such file already" do
+      File.uniquify(@file0).should eql(@file0)
+    end
+    it "should return the given path with a numerical suffix of `.1' if the file exists" do
+      IMW::Random.file(@file0)
+      File.uniquify(@file0).should eql(@file1)
+    end
+    it "should return the given path with a numerical suffix o `.2' if the file exists and a file with a suffix of `.1' also exists" do
+      IMW::Random.file(@file0)
+      IMW::Random.file(@file1)
+      File.uniquify(@file0).should eql(@file2)
+    end
+  end
+end
+# puts "#{File.basename(__FILE__)}: You bend the file folder almost in half and watch as it springs back to shape." # at bottom

data/spec/imw/utils/extensions/find_spec.rb ADDED

@@ -0,0 +1,113 @@
+#
+# h2. spec/imw/utils/extensions/find_spec.rb -- spec for find.rb
+#
+# == About
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+require File.join(File.dirname(__FILE__),'../../../spec_helper')
+require IMW_SPEC_DIR + "/imw/matchers/without_regard_to_order_matcher"
+require 'fileutils'
+require 'set'
+require 'imw/utils'
+require 'imw/utils/random'
+require 'imw/utils/extensions/find'
+# Specs for the IMW extensions to Find: files_in_directory,
+# files_relative_to_directory and handles_in_directory.
+describe Find do
+  include Spec::Matchers::IMW
+  # Builds the fixture tree (root/subdir/subsubdir) and fills it with
+  # files generated by IMW::Random.file.
+  def create_sample_files
+    FileUtils.mkdir_p(@subsubdirectory)
+    [@file1,@file2,@file3,@file4,@file5,@file6].each {|path| IMW::Random.file path}
+  end
+  before(:all) do
+    @root_directory = IMW::DIRECTORIES[:dump] + "/find_extension_spec"
+    @subdirectory = @root_directory + "/subdir"
+    @subsubdirectory = @subdirectory + "/subsubdir"
+    @fake_directory = @root_directory + "/notreal"
+    @file1 = @root_directory + "/my_file1.txt"
+    @file2 = @root_directory + "/my_file2.csv"
+    @file3 = @root_directory + "/my_file3.dat"
+    @file4 = @subdirectory + "/your_file4.html"
+    @file5 = @subdirectory + "/your_file5.csv"
+    # Same basename as @file5 minus the extension; the handles example
+    # below expects both to yield the single handle :your_file5.
+    @file6 = @subdirectory + "/your_file5"
+  end
+  before(:each) do
+    create_sample_files
+  end
+  after(:each) do
+    FileUtils.rm_rf @root_directory
+  end
+  describe "when listing files with absolute paths contained in a directory" do
+    it "should raise an error when listing a non-exsiting directory" do
+      lambda {Find.files_in_directory(@fake_directory) }.should raise_error(IMW::PathError)
+    end
+    it "should find every file by default" do
+      Find.files_in_directory(@root_directory).should match_without_regard_to_order([@file1,@file2,@file3,@file4,@file5,@file6])
+    end
+    it "should only find files which match its :include argument" do
+      Find.files_in_directory(@root_directory, :include => /.*\.csv$/).should match_without_regard_to_order([@file2,@file5])
+    end
+    it "should not find files which match its :exclude argument" do
+      Find.files_in_directory(@root_directory, :exclude => /.*\.csv$/).should match_without_regard_to_order([@file1,@file3,@file4,@file6])
+    end
+    it "should only find files which match its :include argument and don't match its :exclude argument" do
+      Find.files_in_directory(@root_directory, :include => /my/, :exclude => /.*\.csv$/).should match_without_regard_to_order([@file1,@file3])
+    end
+  end
+  describe "when listing files with relative paths contained in a directory" do
+    # Drops the root directory (and the following slash) from each path.
+    def strip_root_directory array
+      array.map {|item| item[@root_directory.length + 1,item.size]}
+    end
+    it "should raise an error when listing a non-exsiting directory" do
+      # Exercise the relative-path finder here; the absolute-path variant is
+      # already covered by the group above.
+      # NOTE(review): presumes files_relative_to_directory raises
+      # IMW::PathError for a missing directory, like files_in_directory.
+      lambda {Find.files_relative_to_directory(@fake_directory) }.should raise_error(IMW::PathError)
+    end
+    it "should find every file by default" do
+      Find.files_relative_to_directory(@root_directory).should match_without_regard_to_order(strip_root_directory([@file1,@file2,@file3,@file4,@file5,@file6]))
+    end
+    it "should only find files which match its :include argument" do
+      Find.files_relative_to_directory(@root_directory, :include => /.*\.csv$/).should match_without_regard_to_order(strip_root_directory([@file2,@file5]))
+    end
+    it "should not find files which match its :exclude argument" do
+      Find.files_relative_to_directory(@root_directory, :exclude => /.*\.csv$/).should match_without_regard_to_order(strip_root_directory([@file1,@file3,@file4,@file6]))
+    end
+    it "should only find files which match its :include argument and don't match its :exclude argument" do
+      Find.files_relative_to_directory(@root_directory, :include => /^my/, :exclude => /.*\.csv$/).should match_without_regard_to_order(strip_root_directory([@file1,@file3]))
+    end
+  end
+  describe "when listing handles in a directory" do
+    it "should return a unique set of handles" do
+      Find.handles_in_directory(@root_directory, :include => /your/).should match_without_regard_to_order([:your_file4, :your_file5])
+    end
+  end
+end
+# puts "#{File.basename(__FILE__)}: You throw your Monkeywrench backwards over your shoulder and run like mad to go find it.  Again, and again, and again." # at bottom

data/spec/imw/utils/paths_spec.rb ADDED

@@ -0,0 +1,38 @@
+require File.join(File.dirname(__FILE__),'../../spec_helper')
+require 'imw'
+require 'imw/utils/paths'
+describe IMW do
+  include IMW
+  before(:each) do
+    IMW::PATHS = {
+      :data    => '/data',
+      :weather => 'ftp.ncdc.noaa.gov/pub/data/noaa',
+      :first   => ['1', :second, 'last'],
+      :second  => ['2', :third],
+      :third   => ['3'],
+    }
+  end
+  it 'is idempotent on a string' do
+    path_to('hi').should == 'hi'
+  end
+  it 'has an absolute path to the data dir' do
+    path_to(:data).should =~ %r{^/}
+  end
+  it 'handles mixed array and sym args' do
+    path_to( [:data, 'hi'], [[['there']]]).should == '/data/hi/there'
+  end
+  it 'expands to later generations' do
+    path_to(:first).should == File.join('1/2/3/last')
+  end
+  it 'expands interior symbols' do
+    path_to(['hadoop1:/working', :data, :weather]).should ==
+      File.join('hadoop1:/working/data/ftp.ncdc.noaa.gov/pub/data/noaa')
+  end
+end

data/spec/imw/workflow/rip/local_spec.rb ADDED

@@ -0,0 +1,89 @@
+#
+# h2. spec/imw/workflow/rip/local_spec.rb -- specs for copying files from local disk
+#
+# == About
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+require File.join(File.dirname(__FILE__),'../../../spec_helper')
+require IMW_SPEC_DIR + "/imw/matchers/without_regard_to_order_matcher.rb"
+require 'fileutils'
+require 'imw/utils/random'
+require 'imw/utils/extensions/find'
+require 'imw/workflow/rip/local'
+# Specs for IMW::Rip.from_local_disk: copying files and directories from
+# the local filesystem into a target directory.
+describe "Ripping from local disk" do
+  include Spec::Matchers::IMW
+  # Fixture layout, all beneath the dump directory:
+  #   first.csv
+  #   source1/{second.txt, third.csv}
+  #   source2/{fourth.txt, fifth-shared.yaml, source3-nested/fifth-shared.yaml}
+  #   target/
+  before(:all) do
+    @root_directory = IMW::DIRECTORIES[:dump] + "/local_spec"
+    @source_directory1 = @root_directory + "/source1"
+    @source_directory2 = @root_directory + "/source2"
+    @source_directory3 = @source_directory2 + "/source3-nested"
+    @target_directory = @root_directory + "/target"
+    @file1 = @root_directory + "/first.csv"
+    @file2 = @source_directory1 + "/second.txt"
+    @file3 = @source_directory1 + "/third.csv"
+    @file4 = @source_directory2 + "/fourth.txt"
+    @file5a = @source_directory2 + "/fifth-shared.yaml"
+    @file5b = @source_directory3 + "/fifth-shared.yaml"
+  end
+  before(:each) do
+    directories = [@root_directory,@source_directory1,@source_directory2,@source_directory3,@target_directory]
+    FileUtils.mkdir(directories)
+    [@file1,@file2,@file3,@file4,@file5a,@file5b].each do |sample|
+      IMW::Random.file(sample)
+    end
+  end
+  after(:each) do
+    FileUtils.rm_rf @root_directory
+  end
+  # Maps each path to its final component.
+  def basenames_of files
+    files.collect { |entry| File.basename(entry) }
+  end
+  it "should raise an error when attempting to copy to a non-existent target directory" do
+    FileUtils.rm_rf @target_directory
+    rip = lambda { IMW::Rip.from_local_disk(@target_directory,@source_directory1) }
+    rip.should raise_error(IMW::PathError)
+  end
+  it "should copy all files in all directories and paths recursively to the target directory without any hierarchy" do
+    IMW::Rip.from_local_disk(@target_directory,@file1,@source_directory1,@source_directory2)
+    expected = basenames_of([@file1,@file2,@file3,@file4,@file5a])
+    Find.files_relative_to_directory(@target_directory).should match_without_regard_to_order(expected)
+  end
+  it "should accept a block which establishes a hierarchy to be created in the target directory and which skips copying certain files if it returns nil" do
+    # Route each file into a sub-directory of the target named after its
+    # extension; any other extension falls through the case and yields nil,
+    # which means the file is not copied.
+    IMW::Rip.from_local_disk(@target_directory,@file1,@source_directory1,@source_directory2) do |path|
+      case File.extname(path)
+      when '.txt' then File.join('txt',File.basename(path))
+      when '.csv' then File.join("csv",File.basename(path))
+      end
+    end
+    # The layout that block should produce: text files first, then csv.
+    expected = [[@file2,"txt"],[@file4,"txt"],[@file1,"csv"],[@file3,"csv"]].map do |path, subdir|
+      File.join(subdir,File.basename(path))
+    end
+    Find.files_relative_to_directory(@target_directory).should match_without_regard_to_order(expected)
+  end
+end
+# puts "#{File.basename(__FILE__)}: Having found the platter you were looking for, you stare at it, examining your reflection.  What a handsome chimp you are!" # at bottom