wukong 1.5.3 → 1.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.textile +4 -0
- data/bin/hdp-bin +44 -0
- data/bin/hdp-ls +2 -1
- data/docpages/avro/performance.textile +36 -0
- data/examples/cassandra_streaming/avromapper.rb +85 -0
- data/examples/cassandra_streaming/berlitz_for_cassandra.textile +22 -0
- data/examples/cassandra_streaming/cassandra.avpr +468 -0
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +62 -0
- data/examples/cassandra_streaming/catter.sh +45 -0
- data/examples/cassandra_streaming/client_interface_notes.textile +200 -0
- data/examples/cassandra_streaming/client_schema.avpr +211 -0
- data/examples/cassandra_streaming/client_schema.textile +318 -0
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +1 -0
- data/examples/cassandra_streaming/pyreduce.sh +1 -0
- data/examples/cassandra_streaming/smutation.avpr +188 -0
- data/examples/cassandra_streaming/streamer.sh +51 -0
- data/examples/cassandra_streaming/struct_loader.rb +24 -0
- data/examples/cassandra_streaming/tuning.textile +73 -0
- data/examples/emr/README-elastic_map_reduce.textile +26 -0
- data/examples/emr/dot_wukong_dir/credentials.json +7 -0
- data/examples/emr/{emr.yaml → dot_wukong_dir/emr.yaml} +33 -16
- data/{bin/bootstrap.sh → examples/emr/dot_wukong_dir/emr_bootstrap.sh} +1 -1
- data/examples/emr/elastic_mapreduce_example.rb +1 -0
- data/lib/wukong/encoding/asciize.rb +108 -0
- data/lib/wukong/extensions/date_time.rb +33 -7
- data/lib/wukong/extensions/emittable.rb +12 -25
- data/lib/wukong/extensions/hash_like.rb +13 -6
- data/lib/wukong/filename_pattern.rb +8 -7
- data/lib/wukong/schema.rb +47 -0
- data/lib/wukong/script.rb +7 -0
- data/lib/wukong/script/cassandra_loader_script.rb +40 -0
- data/lib/wukong/script/emr_command.rb +74 -43
- data/lib/wukong/script/hadoop_command.rb +89 -72
- data/lib/wukong/store.rb +2 -7
- data/lib/wukong/store/cassandra.rb +10 -0
- data/lib/wukong/store/cassandra/streaming.rb +75 -0
- data/lib/wukong/store/cassandra/struct_loader.rb +21 -0
- data/lib/wukong/store/cassandra_model.rb +90 -0
- data/lib/wukong/store/chh_chunked_flat_file_store.rb +1 -1
- data/lib/wukong/store/chunked_flat_file_store.rb +24 -20
- data/wukong.gemspec +32 -4
- metadata +33 -14
| @@ -0,0 +1,51 @@ | |
| 1 | 
            +
            #!/usr/bin/env bash
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            input_file="$1" 		 ; shift
         | 
| 4 | 
            +
            output_file="$1" 		 ; shift
         | 
| 5 | 
            +
            map_script=${1-/bin/cat}	 ; shift
         | 
| 6 | 
            +
            reduce_script=${1-/usr/bin/uniq} ; shift
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            dest_keyspace=${dest_keyspace-soc_net_tw}
         | 
| 9 | 
            +
            dest_col_family=${dest_col_family-Wordbag}
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            hostname=`hostname`
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            # Path to cassandra and hadoop dirs
         | 
| 14 | 
            +
            script_dir=$(readlink -f `dirname $0`)
         | 
| 15 | 
            +
            CASSANDRA_HOME=${CASSANDRA_HOME-/usr/local/share/cassandra}
         | 
| 16 | 
            +
            HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
         | 
| 17 | 
            +
            avro_file=${avro_file-$CASSANDRA_HOME/interface/avro/cassandra.avpr}
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            ARCHIVES=`/bin/ls -1 $CASSANDRA_HOME/build/apache-cassandra*.jar`
         | 
| 20 | 
            +
            for jar in `/bin/ls -1 $CASSANDRA_HOME/build/lib/jars/*.jar $CASSANDRA_HOME/lib/*.jar`; do
         | 
| 21 | 
            +
                ARCHIVES=$ARCHIVES,$jar
         | 
| 22 | 
            +
            done
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            ${HADOOP_HOME}/bin/hadoop                                                                        \
         | 
| 25 | 
            +
                 jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar                                 \
         | 
| 26 | 
            +
                -D stream.map.output=cassandra_avro_output                                                   \
         | 
| 27 | 
            +
                -D stream.io.identifier.resolver.class=org.apache.cassandra.hadoop.streaming.AvroResolver    \
         | 
| 28 | 
            +
                -D cassandra.output.keyspace="$dest_keyspace"                                                \
         | 
| 29 | 
            +
                -D cassandra.output.columnfamily="$dest_col_family"                                          \
         | 
| 30 | 
            +
                -D cassandra.thrift.address=10.204.41.193,10.204.30.11,10.204.58.238,10.204.239.133,10.196.191.31,10.204.103.21,10.202.74.223,10.202.143.95 \
         | 
| 31 | 
            +
                -D cassandra.partitioner.class=org.apache.cassandra.dht.RandomPartitioner                    \
         | 
| 32 | 
            +
                -D cassandra.thrift.port=9160                                                                \
         | 
| 33 | 
            +
                -D mapreduce.output.columnfamilyoutputformat.batch.threshold=1024                            \
         | 
| 34 | 
            +
                -D mapred.reduce.tasks=0                                                                     \
         | 
| 35 | 
            +
                -D mapred.map.tasks.speculative.execution=false                                              \
         | 
| 36 | 
            +
                -libjars $ARCHIVES                                                                           \
         | 
| 37 | 
            +
                -file $avro_file                                                                             \
         | 
| 38 | 
            +
                -outputformat org.apache.cassandra.hadoop.ColumnFamilyOutputFormat                           \
         | 
| 39 | 
            +
                -mapper  	 "ruby $script_dir/avromapper.rb --map "                                         \
         | 
| 40 | 
            +
                -input       "$input_file"                                                                   \
         | 
| 41 | 
            +
                -output  	 "$output_file"                                                                  \
         | 
| 42 | 
            +
                "$@"
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                # -D cassandra.thrift.address=10.204.54.190,10.244.42.31,10.244.42.176,10.244.42.112,10.244.42.143,10.244.42.79,10.244.42.4,10.204.53.166 \
         | 
| 45 | 
            +
                # -D cassandra.thrift.address=10.204.221.230,10.243.79.223,10.245.19.159,10.242.154.159,10.242.153.155,10.242.153.203 \
         | 
| 46 | 
            +
             | 
| 47 | 
            +
             | 
| 48 | 
            +
            # cat /tmp/mj-flip/chimchim-info.log | cut -f5 | ruby -e 'puts $stdin.readlines.map{|l| l.chomp.gsub(/ip-([0-9\-]+)\..*/,"\\1").gsub(/-/,".") }.join(",")'
         | 
| 49 | 
            +
             | 
| 50 | 
            +
             | 
| 51 | 
            +
             | 
| @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            require 'rubygems'
         | 
| 3 | 
            +
            require 'wukong'
         | 
| 4 | 
            +
            require 'wukong/periodic_monitor'
         | 
| 5 | 
            +
            require 'wukong/store/cassandra'
         | 
| 6 | 
            +
            require 'wukong/script/cassandra_loader_script'
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            Settings.use :commandline
         | 
| 9 | 
            +
            Settings.define :log_interval,    :default => 1
         | 
| 10 | 
            +
            Settings.cassandra_keyspace   = 'soc_net_tw'
         | 
| 11 | 
            +
            Settings.cassandra_col_family = 'TwitterUser'
         | 
| 12 | 
            +
            Settings.cassandra_hosts      = "ip-10-204-41-193.ec2.internal:9160,ip-10-204-30-11.ec2.internal:9160,ip-10-204-58-238.ec2.internal:9160,ip-10-204-239-133.ec2.internal:9160,ip-10-196-191-31.ec2.internal:9160,ip-10-204-103-21.ec2.internal:9160,ip-10-202-74-223.ec2.internal:9160,ip-10-202-143-95.ec2.internal:9160"
         | 
| 13 | 
            +
            Settings.resolve!
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            require 'cassandra/0.7'
         | 
| 16 | 
            +
            require 'wuclan/twitter' ; include Wuclan::Twitter
         | 
| 17 | 
            +
            require 'wuclan/twitter/cassandra_db'
         | 
| 18 | 
            +
            require 'wukong/store/cassandra/streaming'
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            # hdp-catd s3://s3hdfs.infinitemonkeys.info/data/sn/tw/fixd/objects/twitter_user | head
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            # CassandraScript.new(Wukong::Store::Cassandra::StructLoader, nil).run
         | 
| 23 | 
            +
            Wukong::CassandraScript.new(Wukong::Store::Cassandra::StructLoader, nil).run
         | 
| 24 | 
            +
             | 
| @@ -0,0 +1,73 @@ | |
| 1 | 
            +
             | 
| 2 | 
            +
             | 
| 3 | 
            +
            Start
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            5 c1.xlarge
         | 
| 6 | 
            +
              2000 writes/sec
         | 
| 7 | 
            +
              40   clients
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            4 m2.xlarge
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                  :java_max_heap                => "12500M",          #
         | 
| 12 | 
            +
                  # :flush_data_buffer_size     => 32,                # 32,
         | 
| 13 | 
            +
                  # :flush_index_buffer_size    => 8,                 # 8,
         | 
| 14 | 
            +
                  # :binary_memtable_throughput => 256,               # 256,
         | 
| 15 | 
            +
                  # :memtable_flush_after       => 60,                # 60,
         | 
| 16 | 
            +
                  # :memtable_throughput        => 64,                # 64,
         | 
| 17 | 
            +
                  # :memtable_ops               => 0.3,               # 0.3,
         | 
| 18 | 
            +
                  # :column_index_size          => 64,                # 64,
         | 
| 19 | 
            +
                  # :in_memory_compaction_limit => 64                 # 64
         | 
| 20 | 
            +
                  :concurrent_reads             => 8,                 # 8
         | 
| 21 | 
            +
                  :concurrent_writes            => 250,               # 32
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            /usr/bin/java -ea                                                       \
         | 
| 24 | 
            +
                -Xms128M                                                            \
         | 
| 25 | 
            +
                -Xmx12500M                                                          \
         | 
| 26 | 
            +
                -XX:TargetSurvivorRatio=90                                          \
         | 
| 27 | 
            +
                -XX:+AggressiveOpts                                                 \
         | 
| 28 | 
            +
                -XX:+UseParNewGC                                                    \
         | 
| 29 | 
            +
                -XX:+UseConcMarkSweepGC                                             \
         | 
| 30 | 
            +
                -XX:+CMSParallelRemarkEnabled                                       \
         | 
| 31 | 
            +
                -XX:+HeapDumpOnOutOfMemoryError                                     \
         | 
| 32 | 
            +
                -XX:SurvivorRatio=128                                               \
         | 
| 33 | 
            +
                -XX:MaxTenuringThreshold=0                                          \
         | 
| 34 | 
            +
                -Djava.rmi.server.hostname=ec2-184-73-20-37.compute-1.amazonaws.com \
         | 
| 35 | 
            +
                -Dcom.sun.management.jmxremote.port=12345                           \
         | 
| 36 | 
            +
                -Dcom.sun.management.jmxremote.ssl=false                            \
         | 
| 37 | 
            +
                -Dcom.sun.management.jmxremote.authenticate=false                   \
         | 
| 38 | 
            +
                -Dcassandra                                                         \
         | 
| 39 | 
            +
                -Dstorage-config=/etc/cassandra                                     \
         | 
| 40 | 
            +
                -Dcassandra-foreground=yes                                          \
         | 
| 41 | 
            +
                -cp /etc/cassandra:/usr/local/share/cassandra/build/classes:/usr/local/share/cassandra/lib/antlr-3.1.3.jar:/usr/local/share/cassandra/lib/avro-1.3.3-sources~cust1.jar:/usr/local/share/cassandra/lib/avro-1.3.3~cust2.jar:/usr/local/share/cassandra/lib/clhm-production.jar:/usr/local/share/cassandra/lib/commons-cli-1.1.jar:/usr/local/share/cassandra/lib/commons-codec-1.2.jar:/usr/local/share/cassandra/lib/commons-collections-3.2.1.jar:/usr/local/share/cassandra/lib/commons-lang-2.4.jar:/usr/local/share/cassandra/lib/guava-r05.jar:/usr/local/share/cassandra/lib/hadoop-core-0.20.1.jar:/usr/local/share/cassandra/lib/high-scale-lib.jar:/usr/local/share/cassandra/lib/jackson-core-asl-1.4.0.jar:/usr/local/share/cassandra/lib/jackson-mapper-asl-1.4.0.jar:/usr/local/share/cassandra/lib/jetty-6.1.21.jar:/usr/local/share/cassandra/lib/jetty-util-6.1.21.jar:/usr/local/share/cassandra/lib/jline-0.9.94.jar:/usr/local/share/cassandra/lib/json-simple-1.1.jar:/usr/local/share/cassandra/lib/jug-2.0.0.jar:/usr/local/share/cassandra/lib/libthrift-r959516.jar:/usr/local/share/cassandra/lib/log4j-1.2.16.jar:/usr/local/share/cassandra/lib/servlet-api-2.5-20081211.jar:/usr/local/share/cassandra/lib/slf4j-api-1.5.8.jar:/usr/local/share/cassandra/lib/slf4j-log4j12-1.5.8.jar:/usr/local/share/cassandra/lib/snakeyaml-1.6.jar\
         | 
| 42 | 
            +
                org.apache.cassandra.thrift.CassandraDaemon
         | 
| 43 | 
            +
             | 
| 44 | 
            +
             | 
| 45 | 
            +
            avg-cpu:  %user   %nice %system %iowait  %steal   %idle
         | 
| 46 | 
            +
                      81.83    0.00    1.96    0.00    0.00   16.21
         | 
| 47 | 
            +
             | 
| 48 | 
            +
            Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
         | 
| 49 | 
            +
            sda1            155.12         9.45     11450.39         48      58168
         | 
| 50 | 
            +
            sdb               2.76         0.00        22.05          0        112
         | 
| 51 | 
            +
             | 
| 52 | 
            +
            avg-cpu:  %user   %nice %system %iowait  %steal   %idle
         | 
| 53 | 
            +
                      83.72    0.00    3.80    0.20    0.00   12.29
         | 
| 54 | 
            +
             | 
| 55 | 
            +
            Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
         | 
| 56 | 
            +
            sda1             66.53         1.59      3921.91          8      19688
         | 
| 57 | 
            +
            sdb             100.20         0.00      6686.85          0      33568
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            avg-cpu:  %user   %nice %system %iowait  %steal   %idle
         | 
| 60 | 
            +
                      66.40    0.00    5.00    0.80    0.40   27.40
         | 
| 61 | 
            +
             | 
| 62 | 
            +
            Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
         | 
| 63 | 
            +
            sda1              2.40         0.00        19.20          0         96
         | 
| 64 | 
            +
            sdb             186.80         0.00     15318.40          0      76592
         | 
| 65 | 
            +
             | 
| 66 | 
            +
            avg-cpu:  %user   %nice %system %iowait  %steal   %idle
         | 
| 67 | 
            +
                      80.98    0.00    6.08    1.99    0.00   10.96
         | 
| 68 | 
            +
             | 
| 69 | 
            +
            Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
         | 
| 70 | 
            +
            sda1            113.97         0.00      7426.75          0      37208
         | 
| 71 | 
            +
            sdb             360.28         1.60     29232.73          8     146456
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                
         | 
| @@ -0,0 +1,26 @@ | |
| 1 | 
            +
             | 
| 2 | 
            +
            # Download the Amazon elastic-mapreduce runner from http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Create a bucket and path to hold your EMR logs, scripts and other ephemera. For instance you might choose 'emr.yourdomain.com' as the bucket and '/wukong' as a scoping path within that bucket. In that case you will refer to it with a path like s3n://emr.yourdomain.com/wukong (see notes below about s3n:// vs. s3:// URLs).
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            # Copy the contents of wukong/examples/emr/dot_wukong_dir to ~/.wukong
         | 
| 7 | 
            +
            # Edit ~/.wukong/emr.yaml -- the comments in that file explain each setting you need to fill in.
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
| 11 | 
            +
             | 
| 12 | 
            +
             | 
| 13 | 
            +
            h3. s3n:// vs. s3:// URLs
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            Many external tools use a URI convention to address files in S3; they typically use the 's3://' scheme, which makes a lot of sense:
         | 
| 16 | 
            +
              s3://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            Hadoop can maintain an HDFS-style filesystem on Amazon S3: it uses a block structure and has optimizations for streaming, no file size limitation, and other goodness. However, only hadoop tools can interpret the contents of those blocks -- to everything else it just looks like a soup of blocks labelled block_-8675309 and so forth.  Hadoop unfortunately chose the 's3://' scheme for URIs in this filesystem:
         | 
| 19 | 
            +
              s3://s3hdfs.yourcompany.com/path/to/data
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            Hadoop is happy to read s3 native files -- 'native' as in, you can look at them with a browser and upload and download them with any S3 tool out there. There's a 5GB limit on file size, and in some cases a performance hit (but not in our experience enough to worry about).  You refer to these files with the 's3n://' scheme ('n' as in 'native'):
         | 
| 22 | 
            +
              s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-mapper.rb
         | 
| 23 | 
            +
              s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-reducer.rb
         | 
| 24 | 
            +
              s3n://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            Wukong will coerce things to the right scheme when it knows what that scheme should be (eg. code should be s3n://). It will otherwise leave the path alone. Specifically, if you use a URI scheme for input and output paths you must use 's3n://' for normal s3 files.
         | 
| @@ -2,51 +2,68 @@ | |
| 2 2 | 
             
            # Elastic MapReduce config in wukong
         | 
| 3 3 | 
             
            #
         | 
| 4 4 |  | 
| 5 | 
            +
            # ===========================================================================
         | 
| 5 6 | 
             
            #
         | 
| 6 7 | 
             
            # Infrastructure options
         | 
| 7 8 | 
             
            #
         | 
| 8 9 |  | 
| 9 | 
            -
            # == Fill all your information into yet another file with your amazon key | 
| 10 | 
            -
            #     | 
| 10 | 
            +
            # == Fill all your information into yet another file with your amazon key
         | 
| 11 | 
            +
            #    It needs to be in so many stupid places because nobody can agree on a
         | 
| 11 12 | 
             
            #    filename or format.
         | 
| 13 | 
            +
            #
         | 
| 12 14 | 
             
            :emr_credentials_file:          ~/.wukong/credentials.json
         | 
| 15 | 
            +
             | 
| 13 16 | 
             
            #
         | 
| 14 | 
            -
            # == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here | 
| 17 | 
            +
            # == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here
         | 
| 18 | 
            +
            #  
         | 
| 15 19 | 
             
            # :access_key:                  ASDFAHKHASDF
         | 
| 16 20 | 
             
            # :secret_access_key:           ADSGHASDFJASDFASDF
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            # == Path to your keypair file.
         | 
| 17 23 | 
             
            #
         | 
| 18 | 
            -
            # == Path to your keypair file. 
         | 
| 19 24 | 
             
            :key_pair_file:                 ~/.wukong/keypairs/gibbon.pem
         | 
| 20 | 
            -
             | 
| 25 | 
            +
             | 
| 26 | 
            +
            # == Keypair will be named after your file, or force the name
         | 
| 27 | 
            +
            #
         | 
| 21 28 | 
             
            # :key_pair:                    ~
         | 
| 22 29 |  | 
| 23 30 | 
             
            # == Path to the Amazon elastic-mapreduce runner. Get a copy from
         | 
| 24 31 | 
             
            #    http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
         | 
| 32 | 
            +
            #
         | 
| 25 33 | 
             
            :emr_runner:                    ~/ics/hadoop/elastic-mapreduce/elastic-mapreduce
         | 
| 26 34 |  | 
| 35 | 
            +
            # ===========================================================================
         | 
| 36 | 
            +
            #
         | 
| 37 | 
            +
            # Remote Paths
         | 
| 38 | 
            +
            #
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            # == Wukong is opinionated about the paths and locations of scripts and
         | 
| 41 | 
            +
            #    everything. It will organize files by job name within the following path:
         | 
| 42 | 
            +
            #
         | 
| 43 | 
            +
            :emr_root:                      s3://s3n.infinitemonkeys.info/emr
         | 
| 44 | 
            +
             | 
| 45 | 
            +
            # == If you specify the :emr_data_root path, then relative pathnames -- ones that
         | 
| 46 | 
            +
            #    do not look like a URI (s3://yadda/yada) and do not start with a '/' -- will
         | 
| 47 | 
            +
            #    be prefixed with this path prefix.
         | 
| 48 | 
            +
            :emr_data_root:                 s3n://s3n.infinitemonkeys.info/data
         | 
| 49 | 
            +
             | 
| 50 | 
            +
             | 
| 51 | 
            +
            # ===========================================================================
         | 
| 27 52 | 
             
            #
         | 
| 28 53 | 
             
            # Cluster Config
         | 
| 29 54 | 
             
            #
         | 
| 30 55 | 
             
            :num_instances:                 1
         | 
| 31 | 
            -
            :instance_type:                  | 
| 56 | 
            +
            :instance_type:                 m1.small
         | 
| 32 57 | 
             
            :master_instance_type:          ~
         | 
| 33 58 | 
             
            :hadoop_version:                '0.20'
         | 
| 34 59 | 
             
            :availability_zone:             us-east-1b
         | 
| 35 60 |  | 
| 61 | 
            +
            # ===========================================================================
         | 
| 36 62 | 
             
            #
         | 
| 37 63 | 
             
            # Running and reporting options
         | 
| 38 64 | 
             
            #
         | 
| 39 | 
            -
            :alive:                          | 
| 65 | 
            +
            :alive:                         true
         | 
| 40 66 | 
             
            :enable_debugging:              true
         | 
| 41 67 | 
             
            :emr_runner_verbose:            true
         | 
| 42 68 | 
             
            :emr_runner_debug:              ~
         | 
| 43 69 | 
             
            :step_action:                   CANCEL_AND_WAIT         # CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
         | 
| 44 | 
            -
             | 
| 45 | 
            -
            #
         | 
| 46 | 
            -
            # Remote Paths
         | 
| 47 | 
            -
            #
         | 
| 48 | 
            -
            # Wukong is opinionated about the paths and locations of scripts and
         | 
| 49 | 
            -
            # everything. Make an S3 bucket and let the wookiee win -- or hack
         | 
| 50 | 
            -
            # lib/wukong/script/emr_command.rb to be more flexible and send us back a patch.
         | 
| 51 | 
            -
            #
         | 
| 52 | 
            -
            :emr_root:                      s3n://emr.infinitemonkeys.info
         | 
| @@ -24,7 +24,7 @@ sudo apt-get install -y unzip build-essential git-core ruby ruby1.8-dev rubygems | |
| 24 24 | 
             
            echo "`date` Unchaining rubygems from the tyrrany of ubuntu" 
         | 
| 25 25 | 
             
            sudo gem install --no-rdoc --no-ri rubygems-update --version=1.3.7 ; sudo /var/lib/gems/1.8/bin/update_rubygems; sudo gem update --no-rdoc --no-ri --system ; gem --version ;
         | 
| 26 26 |  | 
| 27 | 
            -
            echo "`date` Installing wukong gems" 
         | 
| 27 | 
            +
            echo "`date` Installing wukong and related gems" 
         | 
| 28 28 | 
             
            sudo gem install --no-rdoc --no-ri addressable extlib htmlentities configliere yard wukong right_aws uuidtools cheat
         | 
| 29 29 | 
             
            sudo gem list 
         | 
| 30 30 |  | 
| @@ -0,0 +1,108 @@ | |
| 1 | 
            +
            # -*- coding: utf-8 -*-
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            # http://www.jroller.com/obie/tags/unicode
         | 
| 4 | 
            +
            # http://www.unicode.org/faq/casemap_charprop.html
         | 
| 5 | 
            +
            # http://unicode.org/reports/tr10/#Conformance
         | 
| 6 | 
            +
            # http://intertwingly.net/stories/2009/11/30/asciize.rb
         | 
| 7 | 
            +
            # http://blog.stevenlevithan.com/archives/javascript-regex-and-unicode
         | 
| 8 | 
            +
            #
         | 
| 9 | 
            +
            # http://xregexp.com/tests/unicode.html
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            class String
              #
              # Ordered [pattern, replacement] rules used by #asciize. Every pattern
              # matches raw UTF-8 *bytes*, so each carries the /n flag and must only be
              # applied to a binary-encoded string. Order matters: the digraph rules run
              # before the plain Latin-1 rules that would otherwise claim the same
              # characters (e.g. a-umlaut becomes 'ae', not 'a').
              #
              # Adapted from http://intertwingly.net/stories/2009/11/30/asciize.rb
              #
              ASCIIZE_RULES = [
                # digraphs.  May be culturally sensitive
                [/\xc3\x9f/n,                      'ss'],
                [/\xc3\xa4|a\xcc\x88/n,            'ae'],
                [/\xc3\xa5|a\xcc\x8a/n,            'aa'],
                [/\xc3\xa6/n,                      'ae'],
                [/\xc3\xb1|n\xcc\x83/n,            'ny'],
                [/\xc3\xb6|o\xcc\x88/n,            'oe'],
                [/\xc3\xbc|u\xcc\x88/n,            'ue'],

                # latin 1
                [/\xc3[\xa0-\xa5]/n,               'a'],
                [/\xc3\xa7/n,                      'c'],
                [/\xc3[\xa8-\xab]/n,               'e'],
                [/\xc3[\xac-\xaf]/n,               'i'],
                [/\xc3[\xb2-\xb6]|\xc3\xb8/n,      'o'],
                [/\xc3[\xb9-\xbc]/n,               'u'],
                [/\xc3[\xbd\xbf]/n,                'y'],

                # Latin Extended-A
                [/\xc4[\x80-\x85]/n,               'a'],
                [/\xc4[\x86-\x8d]/n,               'c'],
                [/\xc4[\x8e-\x91]/n,               'd'],
                [/\xc4[\x92-\x9b]/n,               'e'],
                [/\xc4[\x9c-\xa3]/n,               'g'],
                [/\xc4[\xa4-\xa7]/n,               'h'],
                [/\xc4[\xa8-\xb1]/n,               'i'],
                [/\xc4[\xb2-\xb3]/n,               'ij'],
                [/\xc4[\xb4-\xb5]/n,               'j'],
                [/\xc4[\xb6-\xb8]/n,               'k'],
                [/\xc4[\xb9-\xff]|\xc5[\x80-\x82]/n, 'l'],
                [/\xc5[\x83-\x8b]/n,               'n'],
                [/\xc5[\x8c-\x91]/n,               'o'],
                [/\xc5[\x92-\x93]/n,               'oe'],
                [/\xc5[\x94-\x99]/n,               'r'],
                # S variants end at \xa1 (U+0161); \xa2 (U+0162) is T-cedilla and
                # belongs to the 't' rule below.
                [/\xc5[\x9a-\xa1]/n,               's'],
                [/\xc5[\xa2-\xa7]/n,               't'],
                [/\xc5[\xa8-\xb3]/n,               'u'],
                [/\xc5[\xb4-\xb5]/n,               'w'],
                [/\xc5[\xb6-\xb8]/n,               'y'],
                [/\xc5[\xb9-\xbe]/n,               'z'],

                # denormalized diacritics (combining marks U+0300..U+036F)
                [/\xcc[\x80-\xff]|\xcd[\x80-\xaf]/n, '']
              ].freeze

              #
              # Transliterate a UTF-8 string to a plain-ASCII slug.
              #
              # Accented Latin-1 / Latin Extended-A letters are replaced by ASCII
              # look-alikes (see ASCIIZE_RULES), combining diacritics are dropped, and
              # every remaining run of non-word characters (anything outside
              # [A-Za-z0-9_], including untranslated non-ASCII bytes) collapses to a
              # single '-'.
              #
              # @param name [String] text to convert; defaults to the receiver, so both
              #   +str.asciize+ and the original +any.asciize(str)+ call forms work.
              #   The bytes are assumed to be UTF-8.
              # @return [String] a new string; +name+ itself is never modified.
              #
              def asciize(name = self)
                # Work on a copy so frozen or shared strings are safe to pass in.
                ascii = name.dup

                # On Ruby 1.9+ the byte-oriented patterns only match binary strings;
                # Ruby 1.8 strings have no encoding, so the step is skipped there.
                encoding_aware = ascii.respond_to?(:force_encoding)
                ascii.force_encoding('BINARY') if encoding_aware

                # Skip the rule table entirely for input that is already pure ASCII.
                if ascii =~ /[^\x00-\x7F]/n
                  ASCIIZE_RULES.each do |pattern, replacement|
                    ascii.gsub!(pattern, replacement)
                  end
                end

                ascii = ascii.gsub(/[^\w]+/n, '-')

                # The result is pure ASCII, so restoring the caller's encoding is safe.
                ascii.force_encoding(name.encoding) if encoding_aware
                ascii
              end

            end
         | 
| 66 | 
            +
             | 
| 67 | 
            +
            # Demo: when this file is executed directly (rather than required),
            # print a sample accented string next to the result of calling
            # asciize on it.
            if $PROGRAM_NAME == __FILE__
              sample   = "I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xc3\xa0liz\xc3\xa6ti\xc3\xb8n"
              asciized = sample.asciize
              puts "#{sample} => #{asciized}"
            end
         | 
| 71 | 
            +
             | 
| 72 | 
            +
            # http://www.jroller.com/obie/tags/unicode
         | 
| 73 | 
            +
            #
         | 
| 74 | 
            +
            # require 'iconv'
         | 
| 75 | 
            +
            # require 'unicode'
         | 
| 76 | 
            +
            # 
         | 
| 77 | 
            +
            # class String
         | 
| 78 | 
            +
            #   
         | 
| 79 | 
            +
            #   def to_ascii
         | 
| 80 | 
            +
            #     # split in multi-byte aware fashion and translate characters over 127
         | 
| 81 | 
            +
            #     # and dropping characters not in the translation hash
         | 
| 82 | 
            +
            #     self.chars.split('').collect { |c| (c[0] <= 127) ? c : translation_hash[c[0]] }.join
         | 
| 83 | 
            +
            #   end
         | 
| 84 | 
            +
            #     
         | 
| 85 | 
            +
            #   def to_url_format
         | 
| 86 | 
            +
            #     url_format = self.to_ascii
         | 
| 87 | 
            +
            #     url_format = url_format.gsub(/[^A-Za-z0-9]/, '') # all non-word
         | 
| 88 | 
            +
            #     url_format.downcase!
         | 
| 89 | 
            +
            #     url_format
         | 
| 90 | 
            +
            #   end
         | 
| 91 | 
            +
            #   
         | 
| 92 | 
            +
            #   protected
         | 
| 93 | 
            +
            #   
         | 
| 94 | 
            +
            #     def translation_hash
         | 
| 95 | 
            +
            #       @@translation_hash ||= setup_translation_hash      
         | 
| 96 | 
            +
            #     end
         | 
| 97 | 
            +
            #     
         | 
| 98 | 
            +
            #     def setup_translation_hash
         | 
| 99 | 
            +
            #       accented_chars   = "ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüý"
         | 
| 100 | 
            +
            #       unaccented_chars = "AAAAAACEEEEIIIIDNOOOOOxOUUUUYaaaaaaceeeeiiiinoooooouuuuy"
         | 
| 101 | 
            +
            #   
         | 
| 102 | 
            +
            #       translation_hash = Hash.zip(accented_chars.chars, unaccented_chars.chars)
         | 
| 103 | 
            +
            #       translation_hash["Æ".chars[0]] = 'AE'
         | 
| 104 | 
            +
            #       translation_hash["æ".chars[0]] = 'ae'
         | 
| 105 | 
            +
            #       translation_hash
         | 
| 106 | 
            +
            #     end
         | 
| 107 | 
            +
            #     
         | 
| 108 | 
            +
            # end
         | 
| @@ -1,23 +1,31 @@ | |
| 1 1 | 
             
            require 'time'
         | 
| 2 2 | 
             
            require 'date'
         | 
| 3 | 
            -
             | 
| 3 | 
            +
             | 
| 4 | 
            +
            class Time
         | 
| 5 | 
            +
              # strftime() format to flatten a date
         | 
| 6 | 
            +
              FLAT_FORMAT = "%Y%m%d%H%M%S"
         | 
| 7 | 
            +
              # Flatten
         | 
| 8 | 
            +
              # Flattens this time into a digits-only timestamp string, expressed
              # in UTC and formatted with FLAT_FORMAT.
              #
              # Uses +getutc+, which returns a converted copy, instead of +utc+,
              # which converts the receiver in place -- so the caller's Time object
              # keeps its original zone after being flattened.
              #
              # @return [String] the UTC time formatted with FLAT_FORMAT
              def to_flat
                getutc.strftime(FLAT_FORMAT)
              end
         | 
| 11 | 
            +
             | 
| 4 12 | 
             
              #
         | 
| 5 13 | 
             
              # Parses the time but never fails.
         | 
| 6 14 | 
             
              # Return value is always in the UTC time zone.
         | 
| 7 15 | 
             
              #
         | 
| 8 | 
            -
              # A flattened datetime -- a  | 
| 16 | 
            +
              # A flattened datetime -- a 14-digit YYYYmmddHHMMSS -- is fixed to the UTC
         | 
| 9 17 | 
             
              # time zone by parsing it as YYYYmmddHHMMSSZ <- 'Z' at end
         | 
| 10 18 | 
             
              #
         | 
| 11 19 | 
             
              # Parses +dt+ into a UTC time without raising.
              #
              # Relies on +blank?+ and +Log+, which are not defined in this file
              # (presumably supplied by the surrounding project -- confirm).
              #
              # @param dt [Time, String, #to_s] a Time, a flattened 14-digit
              #   timestamp, or any object whose +to_s+ the parser understands
              # @return [Time, nil] the parsed time; nil when +dt+ is blank or
              #   cannot be parsed (the error is logged at debug level)
              def self.parse_safely dt
                return nil if dt.blank?
                begin
                  # * Time input: getutc hands back a converted copy, so the
                  #   caller's object is not switched to UTC behind its back.
                  # * Bare 14-digit stamp: it carries no zone, so a 'Z' is appended
                  #   to make the parser read it as UTC.
                  # * parse is called with the string only: its optional second
                  #   argument is the reference "now" Time, not a boolean flag.
                  case
                  when dt.is_a?(Time)               then dt.getutc
                  when (dt.to_s =~ /\A\d{14}\z/)    then parse(dt.to_s+'Z')
                  else                                   parse(dt.to_s).utc
                  end
                rescue StandardError => e
                  Log.debug e
                  # Always answer nil on failure, whatever the logger returns.
                  nil
                end
              end
         | 
| 23 31 |  | 
| @@ -25,3 +33,21 @@ DateTime.class_eval do | |
| 25 33 | 
             
                parse_safely(str).to_flat
         | 
| 26 34 | 
             
              end
         | 
| 27 35 | 
             
            end
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            class DateTime < Date
              # strftime() format to flatten a datetime: 14 digits, YYYYmmddHHMMSS.
              # Frozen so the shared format string cannot be modified in place.
              FLAT_FORMAT = "%Y%m%d%H%M%S".freeze

              # Flattens this datetime into a digits-only timestamp string.
              #
              # The value is shifted to offset 0 (UTC) before formatting, so the
              # result agrees with Time#to_flat and survives a round trip through
              # a parser that reads flat stamps as UTC. new_offset returns a new
              # object; the receiver is left untouched.
              #
              # @return [String] the UTC datetime formatted with FLAT_FORMAT
              def to_flat
                new_offset(0).strftime(FLAT_FORMAT)
              end
            end
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            class Date
              # strftime() format to flatten a date: 8 digits, YYYYmmdd.
              # Frozen so the shared format string cannot be modified in place.
              FLAT_FORMAT = "%Y%m%d".freeze

              # Flattens this date into a digits-only string.
              #
              # @return [String] the date formatted with FLAT_FORMAT
              def to_flat
                strftime(FLAT_FORMAT)
              end
            end
         |