vayacondios-server 0.0.4
- data/.gitignore +61 -0
- data/.travis.yml +11 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +0 -0
- data/Gemfile +4 -0
- data/Guardfile +41 -0
- data/LICENSE.md +99 -0
- data/Procfile +2 -0
- data/README.md +183 -0
- data/Rakefile +6 -0
- data/app/http_shim.rb +67 -0
- data/bin/vcd.sh +27 -0
- data/config/http_shim.rb +43 -0
- data/config/vayacondios.example.yaml +4 -0
- data/config/vayacondios.yaml +4 -0
- data/lib/tasks/publish.rake +23 -0
- data/lib/tasks/spec.rake +9 -0
- data/lib/tasks/yard.rake +2 -0
- data/lib/vayacondios/client/configliere.rb +38 -0
- data/lib/vayacondios/client/http_client.rb +49 -0
- data/lib/vayacondios/client/notifier.rb +84 -0
- data/lib/vayacondios/server/handlers/config_handler.rb +35 -0
- data/lib/vayacondios/server/handlers/event_handler.rb +30 -0
- data/lib/vayacondios/server/model/config_document.rb +94 -0
- data/lib/vayacondios/server/model/document.rb +25 -0
- data/lib/vayacondios/server/model/event_document.rb +94 -0
- data/lib/vayacondios/version.rb +3 -0
- data/lib/vayacondios-client.rb +20 -0
- data/lib/vayacondios-server.rb +18 -0
- data/scripts/hadoop_monitor/configurable.rb +74 -0
- data/scripts/hadoop_monitor/hadoop_client.rb +249 -0
- data/scripts/hadoop_monitor/hadoop_monitor.rb +91 -0
- data/scripts/hadoop_monitor/hadoopable.rb +65 -0
- data/scripts/hadoop_monitor/machine_monitor.rb +115 -0
- data/scripts/s3_cataloger/buckets +33 -0
- data/scripts/s3_cataloger/foreach_bucket +88 -0
- data/scripts/s3_cataloger/parse_ls.py +391 -0
- data/spec/client/notifier_spec.rb +120 -0
- data/spec/server/config_spec.rb +55 -0
- data/spec/server/event_spec.rb +44 -0
- data/spec/server/server_spec.rb +20 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/support/mongo_cleaner.rb +26 -0
- data/vayacondios-client.gemspec +26 -0
- data/vayacondios-server.gemspec +30 -0
- metadata +216 -0
data/scripts/hadoop_monitor/hadoop_monitor.rb
@@ -0,0 +1,91 @@
#!/usr/bin/env jruby19

require_relative 'hadoop_client'
require_relative 'configurable'
require 'java'
require 'mongo'
require 'scanf'
require 'gorillib/hash/slice'
require 'thread'
require 'open-uri'
require 'json'

module Vayacondios

  class HadoopMonitor
    def initialize
      init_settings

      @hadoop = HadoopClient.new

      @monitored_jobs = []

      logger.debug "Creating mongo collections."
      @conn = Mongo::Connection.new settings.mongo_ip
      @db = @conn[settings.mongo_jobs_db]
      @job_logs = @db.create_collection(settings.mongo_job_logs_collection)

      # After we create the job_events database, one of the machine
      # monitors will create the machine stats database.
      @job_events = @db.create_collection(settings.mongo_job_events_collection,
                                          :capped => true,
                                          :size => settings.job_events_size)

      @cluster_state = CLUSTER_QUIET
    end

    def run
      loop do
        logger.debug "In main event loop."

        cur_running_jobs = @hadoop.jobs_with_state HadoopClient::RUNNING
        cur_cluster_state = (cur_running_jobs.size > 0) ? CLUSTER_BUSY : CLUSTER_QUIET

        @hadoop.subtract(@monitored_jobs, cur_running_jobs).each do |job|
          logger.debug "#{job.get_id.to_s} is complete."
          update_job_stats job, Time.now
        end
        @hadoop.subtract(cur_running_jobs, @monitored_jobs).each do |job|
          logger.debug "#{job.get_id.to_s} started."
          update_job_properties job
        end

        (@monitored_jobs + cur_running_jobs).each{|job| update_job_stats job}

        @monitored_jobs = cur_running_jobs
        update_cluster_state cur_cluster_state

        sleep settings.sleep_seconds
      end
    end

    private

    include Configurable

    def update_cluster_state new_state
      return if new_state == @cluster_state
      @cluster_state = new_state
      logger.info "Cluster state changed to #{@cluster_state}"
      @job_events.insert(EVENT => @cluster_state, TIME => Time.now.to_i)
    end

    def update_job_properties job
      properties = @hadoop.job_properties job
      logger.debug "upserting #{JSON.generate properties}"
      @job_logs.save(properties, upsert: true, safe: true)
    end

    def update_job_stats job, finish_time = nil
      @hadoop.job_stats(job, finish_time || Time.now).each do |job_stat|
        logger.debug "upserting #{JSON.generate job_stat}"
        @job_logs.save(job_stat, upsert: true, safe: true)
      end
    end

  end
end

Vayacondios::HadoopMonitor.new.run
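The monitor reads every connection parameter and collection name from Configliere settings via the Configurable mixin (settings.mongo_ip, settings.mongo_jobs_db, and so on). As a rough illustration only, the hash below sketches the kind of values those keys might hold; the key names come from the calls in hadoop_monitor.rb above, while the values are placeholder assumptions rather than the gem's shipped defaults.

    # Illustrative sketch only: keys taken from hadoop_monitor.rb, values assumed.
    settings = {
      :mongo_ip                    => '127.0.0.1',
      :mongo_jobs_db               => 'job_info',
      :mongo_job_logs_collection   => 'job_logs',
      :mongo_job_events_collection => 'job_events',
      :job_events_size             => 10 * 1024 * 1024, # capped collection size, in bytes
      :sleep_seconds               => 5
    }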
data/scripts/hadoop_monitor/hadoopable.rb
@@ -0,0 +1,65 @@
require 'stringio'

module Vayacondios

  module Hadoopable

    include Configurable

    #--------------------------------------------------------------------------------
    # Initialize jruby and tell it about hadoop.
    #--------------------------------------------------------------------------------

    begin
      require 'java'
    rescue LoadError => e
      raise "\nJava not found. Are you sure you're running with JRuby?\n#{e.message}"
    end

    hadoop_home = ENV['HADOOP_HOME'] || '/usr/lib/hadoop'

    raise "\nHadoop installation not found. Try setting $HADOOP_HOME\n" unless (hadoop_home and (File.exist? hadoop_home))

    $CLASSPATH << File.join(File.join(hadoop_home, 'conf') || ENV['HADOOP_CONF_DIR'],
                            '') # add trailing slash

    Dir["#{hadoop_home}/{hadoop*.jar,lib/*.jar}"].each{|jar| require jar}

    include_class org.apache.hadoop.mapred.JobConf
    include_class org.apache.hadoop.mapred.JobClient
    include_class org.apache.hadoop.mapred.JobStatus
    include_class org.apache.hadoop.mapred.TIPStatus
    include_class org.apache.hadoop.conf.Configuration
    #--------------------------------------------------------------------------------

    def get_hadoop_conf
      logger.debug "Getting hadoop configuration"

      stderr, $stderr = $stderr, StringIO.new

      conf = Configuration.new

      # per-site defaults
      %w[capacity-scheduler.xml core-site.xml hadoop-policy.xml hadoop-site.xml hdfs-site.xml mapred-site.xml].each do |conf_file|
        conf.addResource conf_file
      end

      conf.reload_configuration

      # per-user overrides
      if Swineherd.config[:aws]
        conf.set("fs.s3.awsAccessKeyId",Swineherd.config[:aws][:access_key])
        conf.set("fs.s3.awsSecretAccessKey",Swineherd.config[:aws][:secret_key])

        conf.set("fs.s3n.awsAccessKeyId",Swineherd.config[:aws][:access_key])
        conf.set("fs.s3n.awsSecretAccessKey",Swineherd.config[:aws][:secret_key])
      end

      return conf
    ensure
      stderr_lines = $stderr.string.split("\n")
      $stderr = stderr
      stderr_lines.each{|line| logger.debug line}
    end
  end
end
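Hadoopable does its JRuby and classpath setup when the file is loaded and then exposes get_hadoop_conf to whatever class mixes it in. Below is a minimal usage sketch, assuming a JRuby process and assuming Configurable supplies the logger and settings the module relies on; ExampleClient is a hypothetical name for illustration, not a class in this gem.

    # Hypothetical example, not part of the gem.
    class ExampleClient
      include Vayacondios::Hadoopable

      def job_client
        # JobConf and JobClient are the Hadoop classes imported by Hadoopable above.
        @job_client ||= JobClient.new(JobConf.new(get_hadoop_conf))
      end
    end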
data/scripts/hadoop_monitor/machine_monitor.rb
@@ -0,0 +1,115 @@
#!/usr/bin/env ruby

require_relative 'configure'
require 'thread'
require 'socket'
require 'scanf'
require 'json'
require 'mongo'

module Vayacondios

  class StatServer

    include Configurable

    def initialize
      unless get_conf.mongo_ip
        raise "The IP address of the mongo server must be set!"
      end

      logger.info "Connecting to Mongo server at ip #{get_conf.mongo_ip}"
      conn = Mongo::Connection.new get_conf.mongo_ip
      logger.debug "Getting job database #{get_conf.mongo_jobs_db}"
      @db = conn[get_conf.mongo_jobs_db]
    end

    def run

      # TODO: This entire script should be replaced by calls to zabbix
      # initiated by the main loop of the hadoop_monitor.

      logger.debug "Waiting for hadoop monitor to create the event collection."
      sleep get_conf.sleep_seconds until
        @db.collection_names.index get_conf.mongo_job_events_collection

      job_events = @db[get_conf.mongo_job_events_collection]

      logger.debug "Got the event collection. Creating machine stats collection."
      machine_stats = @db.
        create_collection(get_conf.mongo_machine_stats_collection)

      logger.debug "Querying job_events until we see an insertion."
      # Keep querying the job_events collection until there's an
      # event. Don't just use the cursor from .find without checking,
      # because if hadoop_monitor inserts an event into an empty
      # database, this cursor will no longer work, even if it's
      # tailable. not quite sure why Mongo does it that way.
      events = job_events.find
      events.add_option 0x02 # tailable
      until events.has_next?
        sleep get_conf.sleep_seconds
        events = job_events.find
        events.add_option 0x02 # tailable
      end

      logger.debug "Priming main event loop. Waiting to see if the cluster is busy."

      # Get up-to-date on the state of the cluster. assume quiet to start.
      cluster_busy = self.class.next_state(events, false, get_conf.event)

      # main loop
      loop do

        logger.debug "In main event loop. Waiting to see if the cluster is busy."

        # Get up-to-date on the state of the cluster.
        cluster_busy = self.class.next_state(events, cluster_busy, get_conf.event)

        # Don't grab stats unless the cluster is busy
        unless cluster_busy
          sleep get_conf.sleep_seconds
          next
        end

        logger.debug "Grabbing stats and pushing them into the collection."

        # Grab the stats!
        # ifstat's delay will function as our heartbeat timer.
        is, ignore, rw = `ifstat 1 1`.split("\n").map(&:split)
        headers, *disks = `iostat -x`.split("\n")[5..-1].map(&:split)
        cpu, mem, swap, proc_headers, *procs = `top -b -n 1`.
          split("\n").map(&:strip).select{|x| not x.empty?}[2..-1]

        # Write the stats into the mongo collection.
        machine_stats.insert(
          :net => Hash[is.zip(rw.each_slice(2).map{|r,w| {:r => r, :w => w}})],
          :disk => Hash[disks.map{|d| [d.first, Hash[headers.zip(d)]]}],
          :cpu => self.class.split_top_stats(cpu),
          :mem => self.class.split_top_stats(mem),
          :swap => self.class.split_top_stats(swap))
      end
    end

    private

    def self.split_top_stats line
      Hash[line.split(':', 2).last.split(',').map(&:strip).map do |stat|
        stat.scanf("%f%*c%s").reverse
      end]
    end

    def self.next_state events_cursor, current_state, event_attr_name
      while current_event = events_cursor.next
        current_state = case current_event[event_attr_name]
                        when CLUSTER_BUSY then true
                        when CLUSTER_QUIET then false
                        else current_state
                        end
      end
      current_state
    end
  end
end

Vayacondios::StatServer.new.run
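The documents this monitor inserts are plain hashes keyed by :net, :disk, :cpu, :mem, and :swap. As a small sketch, using the same old mongo gem API these scripts already use (Mongo::Connection), here is one way to read the most recently inserted stats document back out; the host, database, and collection names are assumptions and would need to match the configured settings.

    # Sketch only: host, database, and collection names are assumed values.
    require 'mongo'
    db     = Mongo::Connection.new('127.0.0.1')['job_info']
    stats  = db['machine_stats']
    latest = stats.find.sort([['$natural', -1]]).limit(1).to_a.first
    puts latest['cpu'] if latest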
data/scripts/s3_cataloger/buckets
@@ -0,0 +1,33 @@
export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

bdump_and_bparse() {
    bucket_name=$1
    bdump $1 ; bparse $1
}

bparse_and_bload() {
    bucket_name=$1
    bparse $1 ; bload "$@"
}

bdump_and_bload() {
    bucket_name=$1
    bdump $1; bparse $1 ; bload $1
}

bdump() {
    bucket_name=$1
    s3cmd ls -r s3://$bucket_name/ >$bucket_name.ls
}

bparse() {
    bucket_name=$1
    $dir/parse_ls.py <$bucket_name.ls >$bucket_name.json
}

bload() {
    bucket_name=$1
    db=$2
    collection=$3
    mongoimport -d $db -c $collection $bucket_name.json
}
data/scripts/s3_cataloger/foreach_bucket
@@ -0,0 +1,88 @@
#!/usr/bin/env bash

export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

. $dir/buckets

case $1 in
    -f)
        bucket_file=$2
        shift 2
        ;;
    -h|--help)
        cat <<EOF
foreach_bucket [OPTIONS] COMMAND [ARGUMENTS]

This script is used to do a recursive listing of an s3 bucket using
the s3cmd and then jsonify the output. It runs the COMMAND on the
buckets specified in a file; on standard input; or, by default, on all
buckets that can be seen by s3cmd.

OPTIONS include the following:

-f BUCKET_FILE    file containing a bucket name on each line. If
                  this is set to '-', then buckets are read from
                  standard input.

COMMAND includes anything in the 'buckets' script. The main commands
are the following:

bdump             dumps BUCKET to a file BUCKET.ls in the current
                  working directory

bparse            runs BUCKET.ls through a parser to jsonify it and
                  outputs the result as BUCKET.json

bload             loads BUCKET.json into a mongo database. The first
                  argument passed to this command specifies the
                  mongo database, while the second specifies the
                  collection.
EOF
        exit 0
        ;;
    -*)
        echo "Invalid option: $1"
        exit 1
        ;;
esac

command=$1
shift

buckets=()

## no bucket file specified; read all s3 buckets
if [[ -z $bucket_file ]]
then
    for bucket in `s3cmd ls | cut -d ' ' -f 4 | cut -d / -f 3`
    do
        buckets=("${buckets[@]}" "$bucket")
    done

## read buckets from standard input
elif [[ $bucket_file == "-" ]]
then
    read bucket
    until [[ $? -eq 1 ]]
    do
        buckets=("${buckets[@]}" "$bucket")
        read bucket
    done

## read from bucket_file
else
    tmpIFS=$IFS
    IFS=$'\n'

    for bucket in `cat $bucket_file`
    do
        buckets=("${buckets[@]}" "$bucket")
    done

    IFS=$tmpIFS
fi

for bucket in "${buckets[@]}"
do
    ($command $bucket "$@")&
done
data/scripts/s3_cataloger/parse_ls.py
@@ -0,0 +1,391 @@
#!/usr/bin/env python

import logging
import sys

# crank this down to info for progress messages. can also use
# "filename=" for that kind of thing. The only reason this is stderr
# is to allow for output redirection.
logging.basicConfig(stream=sys.stderr, level=logging.ERROR)

#-------------------------------------------------------------------------------

def calculate_sizes(parsedHierarchies):
    """
    @param parsedHierarchies dictionary mapping filenames to
                             parsedHierarchies. This is in the same
                             format as the 'subdirs' component of a
                             parsedHierarchy.
    """

    from operator import add
    return reduce(
        add,
        (
            calculate_size(parsedHierarchies[name])
            for name in parsedHierarchies.keys()))


def calculate_size(parsedHierarchy):
    """
    @param parsedHierarchy dictionary in the same format as the one
                           operated on by insert_line
    """

    if 'subdirs' in parsedHierarchy:
        parsedHierarchy['tree_size'] = calculate_sizes(parsedHierarchy['subdirs'])
    elif parsedHierarchy['type'] == 'd':
        parsedHierarchy['tree_size'] = 0

    if 'tree_size' in parsedHierarchy:
        return parsedHierarchy['tree_size']
    else:
        return parsedHierarchy['file_size']

#-------------------------------------------------------------------------------

from sys import stdout
def write_listing_in_json(listing, writer = stdout):
    writer.write('{"basename":"%s"' % listing['basename'])

    from operator import add
    writer.write(reduce(add, (',"%s":%s' % (key,
                                            '"%s"' % listing[key]
                                            if isinstance(listing[key],str)
                                            else listing[key])
                              for key in listing.keys() if key != 'subdirs')))

    writer.write('}\n')

#-------------------------------------------------------------------------------

def each_listing_in_hierarchy(parsedHierarchy):
    """
    @param parsedHierarchy dictionary mapping filenames to
                           parsedHierarchies. This is in the same
                           format as the 'subdirs' component of a
                           parsedHierarchy.

    @return one record for every file listing. Every parsedHierarchy
    will have its 'subdirs' key deleted and will consequently be flat.
    """

    if 'subdirs' in parsedHierarchy:
        subdirs = parsedHierarchy['subdirs']
        del parsedHierarchy['subdirs']
        return [parsedHierarchy] + each_listing_in_subdirs(subdirs)
    else:
        return [parsedHierarchy]

def each_listing_in_subdirs(parsedHierarchies):
    keys = parsedHierarchies.keys()
    keys.sort()
    from operator import add

    return reduce(add,
                  [each_listing_in_hierarchy(parsedHierarchies[f])
                   for f in keys])

#-------------------------------------------------------------------------------

def insert_line(parsedLine,
                parsedHierarchy,
                bucket_name,
                prefix='/',
                s3hdfs = False):
    """
    @param parsedHierarchy A parsed hierarchy is a dictionary that
                           contains the size, date, type, path, and
                           subdirs of a file. It has two special
                           properties: the basename contains no /
                           characters, and the "subdirs" points to a
                           dictionary that maps names to
                           parsedHierarchies underneath this one.
    """

    def insert_subdir(parsedHierarchy, subdir, bucket_name, prefix):
        if 'subdirs' not in parsedHierarchy:
            parsedHierarchy['subdirs'] = {}
        if subdir not in parsedHierarchy['subdirs']:
            parsedHierarchy['subdirs'][subdir] = {}
            parsedHierarchy['subdirs'][subdir]['basename'] = subdir
            parsedHierarchy['subdirs'][subdir]['file_size'] = 0
            parsedHierarchy['subdirs'][subdir]['type'] = 'd'

            prot = 's3' if s3hdfs else 's3n'

            parent_url = (parsedHierarchy['_id'] if '_id' in parsedHierarchy
                          else '%s://%s/' % (prot, bucket_name))

            parsedHierarchy['subdirs'][subdir]['parent_id'] = parent_url

            url = '%s://%s%s%s' % (prot, bucket_name, prefix, subdir)
            parsedHierarchy['subdirs'][subdir]['_id'] = url

            import hashlib
            sha1hasher = hashlib.new('sha1')
            sha1hasher.update(url)

            parsedHierarchy['subdirs'][subdir]['uuid'] = (
                sha1hasher.hexdigest().lower())

    path = parsedLine['path']
    # recursively insert rest of path after /
    if path.find('/') != -1:
        base,rest = path.split('/',1)

        insert_subdir(parsedHierarchy, base, bucket_name, prefix)

        parsedLine['path'] = rest
        insert_line(parsedLine,
                    parsedHierarchy['subdirs'][base],
                    bucket_name,
                    prefix + base + '/')

    # insert one file or directory into "subdirs"
    else:
        insert_subdir(parsedHierarchy, path, bucket_name, prefix)

        # This will also overwrite the default 'type':'d' from insert_subdir
        for k in parsedLine.keys():
            parsedHierarchy['subdirs'][path][k] = parsedLine[k]

        parsedHierarchy['subdirs'][path]['basename'] = \
            parsedHierarchy['subdirs'][path]['path']
        del parsedHierarchy['subdirs'][path]['path']

#-------------------------------------------------------------------------------

def json2ls(json, writer, prefix='/'):
    """
    sanity check. writes json back out to the command line in ls form
    """

    from datetime import datetime
    d =(datetime.fromtimestamp(json['datetime']).strftime("%Y-%m-%d %H:%M")
        if 'datetime' in json else '1970-01-01 00:00')

    writer.write("%s %9d %s\n" % (
        d,
        json['file_size'],
        json['_id'].replace('s3n', 's3')))

#-------------------------------------------------------------------------------

def hdfs_parse_line(bucket_name):

    import re

    def line_parser(line):

        components = re.compile(r"""

        ^
        (
          [d\-]              # directory bit
        )
        (?:[r\-][w\-][xs\-]){2}
        [r\-][w\-][x\-]

        [ \t]*

        (?:-|[0-9]+)         # number of links. ignore.

        [ \t]*

        ([0-9]+)             # size

        [ \t]*

        (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)

        [ \t]*

        (                    # path
          [^ \t]
          [^\n]*
        )

        .*

        $

        """, re.VERBOSE)

        m = components.match(line)
        if not m:
            import sys
            sys.stderr.write("couldn't parse line: %s\n" % (line))
            return None

        typ, fsize, datetime, path = m.groups()

        if typ == '-': typ = 'f'
        if path.startswith('/'): path = path[1:]

        return datetime, fsize, bucket_name, path, typ

    return line_parser

#-------------------------------------------------------------------------------

def s3_parse_line(line):

    import re
    components = re.compile(r"""

    ^
    (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)

    [ \t]*

    ([0-9]+)

    [ \t]*

    (?:
      (?:s3://)
      ([^/]*)
      /
      ([^\n]*)
    )

    .*

    $

    """, re.VERBOSE)

    m = components.match(line)
    if not m:
        import sys
        sys.stderr.write("couldn't parse line: %s\n" % (line))
        return None

    datetime, fsize, bucket_name, parsed_line = m.groups()
    typ = 'f'

    return datetime, fsize, bucket_name, parsed_line, typ

#-------------------------------------------------------------------------------

def ls2json_subdirs(lines, line_parser):

    parsedHierarchy = None

    count = 0
    for line in lines:
        count = count + 1
        if count % 1000 == 0:
            logging.info("inserting line %d" % (count))

        line_tuple = line_parser(line)

        if not line_tuple:
            continue

        parsedLine = {}

        (
            parsedLine['datetime'],
            parsedLine['file_size'],
            bucket_name,
            parsedLine['path'],
            parsedLine['type']
        ) = line_tuple

        if not parsedHierarchy:
            url = "s3n://%s" % (bucket_name)
            import hashlib
            sha1hasher = hashlib.new('sha1')
            sha1hasher.update(url)

            parsedHierarchy = {
                bucket_name : {
                    "subdirs" : {},
                    "basename" : bucket_name,
                    "_id" : url,
                    "type" : "d",
                    "file_size" : 0,
                    "uuid" : sha1hasher.hexdigest(),
                }
            }

        parsedLine['file_size'] = int(parsedLine['file_size'])

        if parsedLine['datetime'] == '1970-01-01 00:00':
            del parsedLine['datetime']
        else:
            from datetime import datetime
            parsedLine['datetime'] = int(datetime.strptime(
                parsedLine['datetime'],
                "%Y-%m-%d %H:%M").strftime("%s"))

        parsedLine['file_size'] = int(parsedLine['file_size'])

        if parsedLine['path'].endswith('/'):
            parsedLine['path'] = parsedLine['path'][:-1]
            parsedLine['type'] = 'd'

        insert_line(parsedLine,
                    parsedHierarchy[bucket_name],
                    bucket_name)

    if not parsedHierarchy: return []

    logging.info("calculating sizes")
    calculate_sizes(parsedHierarchy)

    logging.info("converting hierarchies")
    return each_listing_in_subdirs(parsedHierarchy)

#-------------------------------------------------------------------------------

if __name__ == '__main__':

    from optparse import OptionParser
    parser = OptionParser(usage = "usage: %prog [options] [s3hdfs bucket name]")
    parser.add_option("-i", "--input", dest="infile", default = None,
                      help="input file..")
    parser.add_option("-o", "--output", dest="outfile", default = None,
                      help="output file.")
    parser.add_option("-t", "--test", dest="test", default = False,
                      action="store_true",
                      help="reoutput in ls format. for debugging")

    (options, args) = parser.parse_args()

    import sys
    if len(args) > 1:
        parser.print_usage()
        sys.exit(0)

    if args:
        bucket, = args
        ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
                                                       hdfs_parse_line(bucket))
    else:
        ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
                                                       s3_parse_line)

    def open_or_die(fname, flags="r"):
        try:
            return open(fname, flags)
        except IOError as (errno, strerr):
            sys.stderr.write("Couldn't open %s: %s\n" % (fname, strerr))
            sys.exit(0)

    from sys import stdin, stdout
    instream = open_or_die(options.infile) if options.infile else stdin
    outstream = open_or_die(options.outfile, 'w') if options.outfile else stdout

    if options.test:
        for listing in ls_converter(instream):
            json2ls(listing, outstream)
    else:
        for listing in ls_converter(instream):
            write_listing_in_json(listing, outstream)