RubyGems - chimps - Versions diffs - 0.1.0 - Mend

chimps 0.1.0

Files changed (44) hide show

data/.gitignore +17 -0
data/LICENSE +674 -0
data/README.rdoc +48 -0
data/VERSION +1 -0
data/bin/chimps +4 -0
data/examples/batch.yaml +69 -0
data/lib/chimps/cli.rb +102 -0
data/lib/chimps/commands/base.rb +107 -0
data/lib/chimps/commands/batch.rb +68 -0
data/lib/chimps/commands/create.rb +33 -0
data/lib/chimps/commands/destroy.rb +28 -0
data/lib/chimps/commands/download.rb +76 -0
data/lib/chimps/commands/help.rb +89 -0
data/lib/chimps/commands/list.rb +54 -0
data/lib/chimps/commands/query.rb +59 -0
data/lib/chimps/commands/search.rb +59 -0
data/lib/chimps/commands/show.rb +32 -0
data/lib/chimps/commands/test.rb +40 -0
data/lib/chimps/commands/update.rb +33 -0
data/lib/chimps/commands/upload.rb +63 -0
data/lib/chimps/commands.rb +46 -0
data/lib/chimps/config.rb +57 -0
data/lib/chimps/request.rb +302 -0
data/lib/chimps/response.rb +146 -0
data/lib/chimps/typewriter.rb +326 -0
data/lib/chimps/utils/error.rb +40 -0
data/lib/chimps/utils/extensions.rb +109 -0
data/lib/chimps/utils/uses_curl.rb +26 -0
data/lib/chimps/utils/uses_model.rb +51 -0
data/lib/chimps/utils/uses_yaml_data.rb +94 -0
data/lib/chimps/utils.rb +11 -0
data/lib/chimps/workflows/batch.rb +127 -0
data/lib/chimps/workflows/downloader.rb +102 -0
data/lib/chimps/workflows/uploader.rb +238 -0
data/lib/chimps/workflows.rb +11 -0
data/lib/chimps.rb +22 -0
data/spec/chimps/cli_spec.rb +22 -0
data/spec/chimps/commands/base_spec.rb +25 -0
data/spec/chimps/commands/list_spec.rb +25 -0
data/spec/chimps/response_spec.rb +8 -0
data/spec/chimps/typewriter_spec.rb +114 -0
data/spec/spec_helper.rb +17 -0
data/spec/support/custom_matchers.rb +6 -0
metadata +133 -0

data/lib/chimps/utils/uses_yaml_data.rb ADDED Viewed

@@ -0,0 +1,94 @@
+module Chimps
+  module Utils
+    module UsesYamlData
+      IGNORE_YAML_FILES_ON_COMMAND_LINE = false
+      attr_reader :data_file
+      def data
+        @data ||= merge_all *(data_from_stdin + data_from_file + data_from_command_line)
+      end
+      protected
+      def merge_all *objs
+        objs.compact!
+        return if objs.blank?   # raising an error here is left to the caller
+        klasses = objs.map(&:class).uniq
+        raise CLIError.new("Mismatched YAML data types -- Hashes can only be combined with Hashes, Arrays with Arrays") if klasses.size > 1
+        data_type = klasses.first.new
+        case data_type
+        when Array
+          # greater precedence at the end so iterate in order
+          returning([]) do |d|
+            objs.each do |obj|
+              d.concat(obj)
+            end
+          end
+        when Hash
+          # greater precedence at the end so iterate in order
+          returning({}) do |d|
+            objs.each do |obj|
+              d.merge!(obj)
+            end
+          end
+        else raise CLIError.new("Unsuitable YAML data type #{data_type} -- can only combine Hashes and Arrays")
+        end
+      end
+      def params_from_command_line
+        returning([]) do |d|
+          argv.each do |arg|
+            next unless arg =~ /^(\w+) *=(.*)$/
+            name, value = $1.downcase.to_sym, $2.strip
+            d << { name => value } # always a hash
+          end
+        end
+      end
+      def yaml_files_from_command_line
+        returning([]) do |d|
+          argv.each do |arg|
+            next if arg =~ /^(\w+) *=(.*)$/
+            path = File.expand_path(arg)
+            raise CLIError.new("No such path #{path}") unless File.exist?(path)
+            d << YAML.load(open(path)) # either a hash or an array
+          end
+        end
+      end
+      def data_from_command_line
+        if self.class::IGNORE_YAML_FILES_ON_COMMAND_LINE
+          params_from_command_line
+        else
+          yaml_files_from_command_line + params_from_command_line
+        end
+      end
+      def data_from_file
+        [data_file ? YAML.load_file(data_file) : nil]
+      end
+      # Read every YAML document available on STDIN.
+      #
+      # Returns <tt>[nil]</tt> when <tt>$stdin.stat.size</tt> is not
+      # positive, otherwise an array holding each document yielded by
+      # <tt>YAML.load_stream</tt>.
+      #
+      # NOTE(review): the size check presumably detects "nothing was
+      # piped in" -- confirm it behaves as intended for pipes on every
+      # supported platform.
+      def data_from_stdin
+        return [nil] unless $stdin.stat.size > 0
+        returning([]) do |d|
+          YAML.load_stream($stdin).each do |document|
+            d << document
+          end
+        end
+      end
+      def define_data_options
+        on_tail("-d", "--data-file PATH", "Path to a file containing key=value data") do |p|
+          @data_file = File.expand_path(p)
+        end
+      end
+      def ensure_data_is_present!
+        raise CLIError.new("Must provide some data to send, either on the command line, from an input file, or by piping to STDIN.  Try `chimps help #{name}'") unless data.present?
+      end
+    end
+  end
+end

data/lib/chimps/utils.rb ADDED Viewed

@@ -0,0 +1,11 @@
+require 'chimps/config'
+require 'chimps/utils/extensions'
+require 'chimps/utils/error'
+module Chimps
+  # Namespace for helper modules; each constant below is registered
+  # with +autoload+ against the file that defines it.
+  module Utils
+    autoload :UsesCurl,     'chimps/utils/uses_curl'
+    autoload :UsesModel,    'chimps/utils/uses_model'
+    autoload :UsesYamlData, 'chimps/utils/uses_yaml_data'
+  end
+end

data/lib/chimps/workflows/batch.rb ADDED Viewed

@@ -0,0 +1,127 @@
+module Chimps
+  module Workflows
+    # A class for performing batch updates/uploads to Infochimps.
+    #
+    # It works by taking YAML data describing many updates and
+    # performing a single batch API request with this data.
+    #
+    # The batch response is then parsed and analyzed and (given
+    # success or fearlessness) any necessary uploads are performed.
+    #
+    # Examples of the input data format can be found in the
+    # <tt>/examples</tt> directory of the Chimps distribution.
+    class BatchUpdater
+      # The data sent as a bulk update.
+      attr_reader :data
+      # The batch update response
+      attr_reader :batch_response
+      # The output file to store the bulk update response.
+      attr_reader :output_path
+      # Whether to upload even if there were errors on update.
+      attr_reader :upload_even_if_errors
+      # The data format to annotate the upload with.
+      #
+      # Chimps will try to guess if this isn't given.
+      attr_reader :fmt
+      # Create a new BatchUpdater with the given +data+ and +options+.
+      #
+      # The intermediate batch response can be saved at a file named
+      # by <tt>:output_path</tt>, though this isn't necessary.
+      #
+      # @param [Array] data an array of resource updates
+      # @param [Hash] options
+      # @option options [String] output_path path to store the batch response
+      # @option options [true, false] upload_even_if_errors whether to continue uploading in the presence of errors on update
+      # @option options [String] fmt the data format to annotate each upload with (see `chimps upload')
+      # @return [Chimps::Workflows::BatchUpdater]
+      def initialize data, options={}
+        @data                  = data
+        @output_path           = options[:output_path]
+        @upload_even_if_errors = options[:upload_even_if_errors]
+        @fmt                   = options[:fmt]
+      end
+      # The path to submit batch update requests.
+      #
+      # @return [String]
+      def batch_path
+        "batch.json"
+      end
+      # Perform this batch update followed by the batch upload.
+      def execute!
+        batch_update!
+        batch_upload!
+      end
+      # Perform the batch update.
+      def batch_update!
+        @batch_response = Request.new(batch_path, :data => { :batch => data }, :authenticate => true).post
+        File.open(output_path, 'w') { |f| f.puts batch_response.body } if output_path
+        batch_response.print
+      end
+      # Were any of the updates performed during the batch update
+      # errors?
+      #
+      # @return [true, false]
+      def error?
+        batch_response['batch'].each do |response|
+          status = response['status']
+          return true unless ['created', 'updated'].include?(status)
+        end
+        false
+      end
+      # Did all of the updates performed in the batch update succeed?
+      #
+      # @return [true, false]
+      def success?
+        ! error?
+      end
+      # Perform the batch upload.
+      #
+      # Will bail if the batch update had an error unless
+      # Chimps::Workflows::BatchUpdater#upload_even_if_errors returns
+      # true.
+      def batch_upload!
+        return unless success? || upload_even_if_errors
+        $stderr.puts("WARNING: continuing with uploads even though there were errors") unless success?
+        dataset_ids_and_local_paths.each do |id, local_paths|
+          Chimps::Workflows::Uploader.new(:dataset => id, :local_paths => local_paths, :fmt => fmt).execute!
+        end
+      end
+      protected
+      # Iterate through the batch response and return tuples
+      # consisting of an ID and an array of local paths to upload.
+      #
+      # Only datasets which were successfully created/updated,
+      # returned an ID, and had local_paths defined in the original
+      # batch update will be output.
+      #
+      # @return [Array<Array>]
+      def dataset_ids_and_local_paths
+        batch_response['batch'].map do |response|
+          status = response['status']
+          next unless (status == 'created' || status == 'updated') # skip errors
+          next unless dataset = response['resource']['dataset']    # skip unless it's a dataset
+          id = dataset['id']
+          next if id.blank?                                        # skip unless it has an ID
+          local_paths = response['local_paths']
+          next if local_paths.blank?                               # skip unless local_paths were defined
+          [id, local_paths]
+        end.compact
+      end
+    end
+  end
+end

data/lib/chimps/workflows/downloader.rb ADDED Viewed

@@ -0,0 +1,102 @@
+module Chimps
+  module Workflows
+    # Downloads data from Infochimps by first making a request for a
+    # download token and, if granted one, proceeding to download the
+    # data.
+    #
+    # Will download the latest package for a given dataset, optionally
+    # constrained to have given data and package formats.
+    class Downloader
+      include Chimps::Utils::UsesCurl
+      # The token received from Infochimps which contains a signed URL
+      # for the download.
+      attr_reader :token
+      # The ID or handle of the dataset to download.
+      attr_reader :dataset
+      # The data format of the data to download.
+      attr_reader :fmt
+      # The package format of the data to download.
+      attr_reader :pkg_fmt
+      # Create a new Downloader with the given parameters.
+      #
+      # @param [Hash] options
+      # @option options [String, Integer] dataset the ID or handle of the dataset to download
+      # @option options [String] fmt the data format to download
+      # @option options [String] pkg_fmt the package format to download
+      # @option options [String] local_path the local path to which the data will be downloaded
+      # @return [Chimps::Workflows::Downloader]
+      def initialize options={}
+        @dataset    = options[:dataset]
+        @fmt        = options[:fmt]
+        @pkg_fmt    = options[:pkg_fmt]
+        @local_path = options[:local_path]
+      end
+      # Params to send for the token.
+      #
+      # @return [Hash]
+      def token_params
+        { :download_token => { :dataset_id => dataset, :fmt =>  fmt, :pkg_fmt => pkg_fmt} }
+      end
+      # Ask for a download token for this dataset/package.  If no or
+      # an invalid token is obtained, raise an error.
+      def ask_for_token!
+        new_token = Request.new(download_tokens_path, :data => token_params, :sign_if_possible => true).post
+        if new_token.error?
+          new_token.print
+          raise AuthenticationError.new("Unauthorized to download dataset #{dataset}")
+        else
+          @token = new_token
+        end
+      end
+      # Path to submit download token requests to.
+      #
+      # @return [String]
+      def download_tokens_path
+        "/download_tokens"
+      end
+      # The signed, remote URL from where the data can be downloaded.
+      #
+      # @return [String]
+      def download_url
+        token['download_token']['package']['url']
+      end
+      # The local path where the downloaded data will be put.
+      #
+      # Defaults to the current directory and the default basename of
+      # the downloaded package.
+      #
+      # @return [String, nil]
+      def local_path
+        @local_path || token["download_token"]["package"]["basename"]
+      end
+      # Issue the download request.
+      #
+      # Uses +curl+ for the data transfer.
+      def download!
+        command = "#{curl} -o '#{local_path}' '#{download_url}'"
+        puts command if Chimps.verbose?
+        system(command)
+      end
+      # Ask for a token and perform the download.
+      def execute!
+        ask_for_token!
+        download!
+      end
+    end
+  end
+end

data/lib/chimps/workflows/uploader.rb ADDED Viewed

@@ -0,0 +1,238 @@
+module Chimps
+  module Workflows
+    # Uploads data to Infochimps by first asking for authorization,
+    # creating an archive, obtaining a token, uploading data, and
+    # notifying Infochimps.
+    class Uploader
+      include Chimps::Utils::UsesCurl
+      # The ID or handle of the dataset to upload data to.
+      attr_reader :dataset
+      # An array of paths to local files and directories to package
+      # into an archive.
+      attr_reader :local_paths
+      # The format to annotate the upload with.
+      attr_reader :fmt
+      # The archive to upload.
+      attr_reader :archive
+      # The token authorizing an upload.
+      attr_reader :token
+      # Upload data to Infochimps by first asking for authorization,
+      # creating an archive, obtaining a token, uploading data, and
+      # notifying Infochimps.
+      def execute!
+        authorize_for_upload!
+        create_archive!
+        ask_for_token!
+        upload!
+        notify_infochimps!
+      end
+      # Create a new Uploader from the given parameters.
+      #
+      # If <tt>:fmt</tt> is provided it will be used as the data
+      # format to annotate the upload with.  If not, Chimps will try
+      # to guess.
+      #
+      # @param [Hash] options
+      # @option options [String, Integer] dataset the ID or handle of the dataset to which data should be uploaded
+      # @option options [Array<String>] local_paths the local paths to bundle into an archive
+      # @option options [String, IMW::Resource] archive the path to the archive to create (defaults to Chimps::Workflows::Uploader#default_archive_path)
+      # @option options [String] fmt the data format to annotate the upload with
+      def initialize options={}
+        require 'imw'
+        IMW.verbose      = Chimps.verbose?
+        @dataset         = options[:dataset] or raise PackagingError.new("Must provide the ID or handle of a dataset to upload data to.")
+        self.local_paths = options[:local_paths]   # must come before self.archive=
+        self.archive     = options[:archive]
+        self.fmt         = options[:fmt]
+      end
+      # Set the local paths to upload for this dataset.
+      #
+      # If only one local path is given and it is already an archive
+      # or a compressed file then no further packaging will be done by
+      # this uploader.
+      #
+      # @param [Array<String, IMW::Resource>] paths
+      def local_paths= paths
+        raise PackagingError.new("Must provide at least one local path to upload.") if paths.blank?
+        paths.each { |path| raise PackagingError.new("Invalid path, #{path}") unless File.exist?(File.expand_path(path)) }
+        @local_paths = paths
+        if @local_paths.size == 1
+          potential_package = IMW.open(paths.first)
+          if potential_package.exist? && (potential_package.is_compressed? || potential_package.is_archive?)
+            self.archive = potential_package
+            @skip_packaging = true
+          end
+        end
+      end
+      # Should the packaging step be skipped?
+      #
+      # This will happen if only one local input path was provided and
+      # it exists and is a compressed file or archive.
+      #
+      # @return [true, false]
+      def skip_packaging?
+        !! @skip_packaging
+      end
+      # Set the path to the archive that will be built.
+      #
+      # The given +path+ must represent a compressed file or archive
+      # (<tt>.tar</tt>, <tt>.tar.gz</tt>, <tt>.tar.bz2</tt>,
+      # <tt>.zip</tt>, <tt>.rar</tt>, <tt>.bz2</tt>, or <tt>.gz</tt>
+      # extension).
+      #
+      # Additionally, if multiple local paths are being packaged, the
+      # given +path+ must be an archive (not simply <tt>.bz2</tt> or
+      # <tt>.gz</tt> extensions).
+      #
+      # @param [String, IMW::Resource] path the archive or path to use
+      def archive= path=nil
+        return @archive if @archive
+        potential_package = IMW.open(path || default_archive_path)
+        raise PackagingError.new("Invalid path #{potential_package}, not an archive or compressed file")        unless potential_package.is_compressed? ||  potential_package.is_archive?
+        raise PackagingError.new("Multiple local paths must be packaged in an archive, not a compressed file.") if     local_paths.size > 1             && !potential_package.is_archive?
+        @archive = potential_package
+      end
+      # Set the data format to annotate the upload with.
+      #
+      # If not provided, Chimps will use the Infinite Monkeywrench
+      # (IMW) to try and guess the data format.  See
+      # IMW::Tools::Summarizer for more information.
+      def fmt= new_fmt=nil
+        @fmt ||= new_fmt || IMW::Tools::Summarizer.new(local_paths).most_common_data_format
+      end
+      # The default path to the archive that will be built.
+      #
+      # Defaults to a ZIP file in the current directory named after
+      # the +dataset+'s ID or handle and the current time.
+      #
+      # @return [String]
+      def default_archive_path
+        # in current working directory...
+        "chimps_#{dataset}-#{Time.now.strftime(Chimps::CONFIG[:timestamp_format])}.zip"
+      end
+      # The URL to the <tt>README-infochimps</tt> file on Infochimps'
+      # servers.
+      #
+      # @return [String]
+      def readme_url
+        File.join(Chimps::CONFIG[:host], "/README-infochimps")
+      end
+      # The URL to the ICSS file for this dataset on Infochimps
+      # servers
+      def icss_url
+        File.join(Chimps::CONFIG[:host], "datasets", "#{dataset}.yaml")
+      end
+      # Both the local paths and remote paths to package.
+      #
+      # @return [Array<String>]
+      def input_paths
+        raise PackaginError.new("Must specify some local paths to package") if local_paths.blank?
+        local_paths + [readme_url, icss_url]
+      end
+      # The path on Infochimps to submit upload token requests to.
+      #
+      # @return [String]
+      def token_path
+        "/datasets/#{dataset}/packages/new.json"
+      end
+      # The path on Infochimps to submit package creation requests to.
+      #
+      # @return [String]
+      def package_creation_path
+        "/datasets/#{dataset}/packages.json"
+      end
+      # Return a hash of params for obtaining a new upload token.
+      #
+      # @return [Hash]
+      def package_params
+        { :package => { :fmt => fmt, :pkg_fmt => archive.extension } }
+      end
+      # Authorize the Chimps user for this upload.
+      def authorize_for_upload!
+        # FIXME we're actually just making a token request here...
+        ask_for_token!
+      end
+      # Obtain an upload token from Infochimps.
+      def ask_for_token!
+        new_token = Request.new(token_path, :params => package_params, :signed => true).get
+        if new_token.error?
+          new_token.print
+          raise AuthenticationError.new("Unauthorized for an upload token for dataset #{dataset}")
+        else
+          @token = new_token
+        end
+      end
+      # Build the local archive if necessary.
+      #
+      # Will not build the local archive if there was only one local
+      # input path and it was already compressed or an archive.
+      def create_archive!
+        return if skip_packaging?
+        archiver = IMW::Tools::Archiver.new(archive.name, input_paths)
+        result   = archiver.package(archive.path)
+        raise PackagingError.new("Unable to package files for upload.  Temporary files left in #{archiver.tmp_dir}") if result.is_a?(RuntimeError) || (!archiver.success?)
+        archiver.clean!
+      end
+      # Return a string built from the granted upload token that can
+      # be fed to +curl+ in order to authenticate with and upload to
+      # Amazon.
+      #
+      # @return [String]
+      def upload_data
+        data = ['AWSAccessKeyId', 'acl', 'key', 'policy', 'success_action_status', 'signature'].map { |param| "-F #{param}='#{token[param]}'" }
+        data << ["-F file=@#{archive.path}"]
+        data.join(' ')
+      end
+      # Upload the data.
+      #
+      # Uses +curl+ for the transfer.
+      def upload!
+        progress_meter = Chimps.verbose? ? '' : '-s -S'
+        command = "#{curl} #{progress_meter} -o /dev/null -X POST #{upload_data} #{token['url']}"
+        raise UploadError.new("Failed to upload #{archive.path} to Infochimps") unless IMW.system(command)
+      end
+      # Return a hash of parameters used to create a new Package at
+      # Infochimps corresponding to the upload.
+      #
+      # @return [Hash]
+      def package_data
+        { :package => {:path => token['key'], :fmt => token['fmt'], :pkg_size => archive.size, :pkg_fmt => archive.extension} }
+      end
+      # Make a final POST request to Infochimps, creating the final
+      # resource.
+      def notify_infochimps!
+        package_creation_response = Request.new(package_creation_path, :signed => true, :data => package_data).post
+        package_creation_response.print
+        raise UploadError.new("Unable to notify Infochimps of newly uploaded data.") if package_creation_response.error?
+      end
+    end
+  end
+end

data/lib/chimps/workflows.rb ADDED Viewed

@@ -0,0 +1,11 @@
+module Chimps
+  # A module defining classes to handle complex workflows between the
+  # local machine and Infochimps' servers.
+  module Workflows
+    autoload :Uploader,     'chimps/workflows/uploader'
+    autoload :Downloader,   'chimps/workflows/downloader'
+    autoload :BatchUpdater, 'chimps/workflows/batch'
+  end
+end

data/lib/chimps.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'rubygems'
+require 'chimps/utils'
+# The Chimps module implements a Ruby-based command-line interface to
+# the Infochimps data repository.
+#
+# Using this tool you can search, download, edit, and upload data and
+# metadata to and from Infochimps.
+module Chimps
+  autoload :Config,       'chimps/config'
+  autoload :CONFIG,       'chimps/config'
+  autoload :CLI,          'chimps/cli'
+  autoload :Command,      'chimps/commands/base'
+  autoload :Commands,     'chimps/commands'
+  autoload :Request,      'chimps/request'
+  autoload :QueryRequest, 'chimps/request'
+  autoload :Response,     'chimps/response'
+  autoload :Typewriter,   'chimps/typewriter'
+  autoload :Workflows,    'chimps/workflows'
+end

data/spec/chimps/cli_spec.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require File.join(File.dirname(__FILE__), '../spec_helper')
+describe Chimps::CLI do
+end
+describe Chimps::CLI::Runner do
+  it "should raise a CLIError when no command is given" do
+    lambda { Chimps::CLI::Runner.new([]).execute! }.should raise_error(Chimps::CLIError)
+  end
+  it "should raise a CLIError when an unrecognized command is given" do
+    lambda { Chimps::CLI::Runner.new(['foobar', 'arg1', 'arg2']).execute! }.should raise_error(Chimps::CLIError)
+  end
+  it "should recognize a command when given" do
+    Chimps::Commands.should_receive(:construct).with('list', ['arg1', 'arg2'])
+    Chimps::CLI::Runner.new(['list', 'arg1', 'arg2']).command # execute requires the command to be initialized and returned...
+  end
+end

data/spec/chimps/commands/base_spec.rb ADDED Viewed

@@ -0,0 +1,25 @@
+require File.join(File.dirname(__FILE__), '../../spec_helper')
+describe Chimps::Command do
+  it "should return its full name" do
+    Chimps::Command.name.should == "chimps::command"
+  end
+  it "should return just its command name" do
+    Chimps::Command.new([]).name.should == "command"
+  end
+  it "should run any methods beginning with `define' and ending with `options?'" do
+    klass = Class.new(Chimps::Command)
+    klass.class_eval <<RUBY
+      attr_accessor :test_property
+      def define_test_options
+        self.test_property=true
+      end
+RUBY
+    klass.new([]).test_property.should == true
+  end
+end

data/spec/chimps/commands/list_spec.rb ADDED Viewed

@@ -0,0 +1,25 @@
+require File.join(File.dirname(__FILE__), '../../spec_helper')
+# NOTE(review): every example below exercises Chimps::Command rather
+# than Chimps::Commands::List, and the examples match those in
+# base_spec.rb -- presumably copied as a placeholder; confirm and
+# replace with List-specific examples.
+describe Chimps::Commands::List do
+  it "should return its full name" do
+    Chimps::Command.name.should == "chimps::command"
+  end
+  it "should return just its command name" do
+    Chimps::Command.new([]).name.should == "command"
+  end
+  it "should run any methods beginning with `define' and ending with `options?'" do
+    # build an anonymous subclass whose define_*_options hook sets a flag
+    klass = Class.new(Chimps::Command)
+    klass.class_eval <<RUBY
+      attr_accessor :test_property
+      def define_test_options
+        self.test_property=true
+      end
+RUBY
+    klass.new([]).test_property.should == true
+  end
+end

data/spec/chimps/response_spec.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require File.join(File.dirname(__FILE__), '../spec_helper')
+require 'restclient'
+describe Chimps::Response do
+end