wukong 1.5.3 → 1.5.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/CHANGELOG.textile +4 -0
  2. data/bin/hdp-bin +44 -0
  3. data/bin/hdp-ls +2 -1
  4. data/docpages/avro/performance.textile +36 -0
  5. data/examples/cassandra_streaming/avromapper.rb +85 -0
  6. data/examples/cassandra_streaming/berlitz_for_cassandra.textile +22 -0
  7. data/examples/cassandra_streaming/cassandra.avpr +468 -0
  8. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +62 -0
  9. data/examples/cassandra_streaming/catter.sh +45 -0
  10. data/examples/cassandra_streaming/client_interface_notes.textile +200 -0
  11. data/examples/cassandra_streaming/client_schema.avpr +211 -0
  12. data/examples/cassandra_streaming/client_schema.textile +318 -0
  13. data/examples/cassandra_streaming/foofile.avr +0 -0
  14. data/examples/cassandra_streaming/pymap.sh +1 -0
  15. data/examples/cassandra_streaming/pyreduce.sh +1 -0
  16. data/examples/cassandra_streaming/smutation.avpr +188 -0
  17. data/examples/cassandra_streaming/streamer.sh +51 -0
  18. data/examples/cassandra_streaming/struct_loader.rb +24 -0
  19. data/examples/cassandra_streaming/tuning.textile +73 -0
  20. data/examples/emr/README-elastic_map_reduce.textile +26 -0
  21. data/examples/emr/dot_wukong_dir/credentials.json +7 -0
  22. data/examples/emr/{emr.yaml → dot_wukong_dir/emr.yaml} +33 -16
  23. data/{bin/bootstrap.sh → examples/emr/dot_wukong_dir/emr_bootstrap.sh} +1 -1
  24. data/examples/emr/elastic_mapreduce_example.rb +1 -0
  25. data/lib/wukong/encoding/asciize.rb +108 -0
  26. data/lib/wukong/extensions/date_time.rb +33 -7
  27. data/lib/wukong/extensions/emittable.rb +12 -25
  28. data/lib/wukong/extensions/hash_like.rb +13 -6
  29. data/lib/wukong/filename_pattern.rb +8 -7
  30. data/lib/wukong/schema.rb +47 -0
  31. data/lib/wukong/script.rb +7 -0
  32. data/lib/wukong/script/cassandra_loader_script.rb +40 -0
  33. data/lib/wukong/script/emr_command.rb +74 -43
  34. data/lib/wukong/script/hadoop_command.rb +89 -72
  35. data/lib/wukong/store.rb +2 -7
  36. data/lib/wukong/store/cassandra.rb +10 -0
  37. data/lib/wukong/store/cassandra/streaming.rb +75 -0
  38. data/lib/wukong/store/cassandra/struct_loader.rb +21 -0
  39. data/lib/wukong/store/cassandra_model.rb +90 -0
  40. data/lib/wukong/store/chh_chunked_flat_file_store.rb +1 -1
  41. data/lib/wukong/store/chunked_flat_file_store.rb +24 -20
  42. data/wukong.gemspec +32 -4
  43. metadata +33 -14
@@ -0,0 +1,51 @@
1
#!/usr/bin/env bash
#
# Launch a map-only Hadoop streaming job whose output is written to Cassandra
# through ColumnFamilyOutputFormat, using avromapper.rb (found next to this
# script) as the mapper.
#
# Usage:
#   streamer.sh input_file output_file [map_script [reduce_script [hadoop args...]]]
#
# Environment overrides (each falls back to the default shown below):
#   dest_keyspace, dest_col_family, cassandra_thrift_address,
#   CASSANDRA_HOME, HADOOP_HOME, avro_file
#

if [ $# -lt 2 ] ; then
  echo "usage: $0 input_file output_file [map_script [reduce_script [hadoop args...]]]" >&2
  exit 1
fi

input_file="$1"  ; shift
output_file="$1" ; shift
# The 3rd and 4th positional arguments are still consumed so that existing
# invocations keep working, but the job below does not use them: the mapper
# is fixed to avromapper.rb and mapred.reduce.tasks is 0.
map_script=${1-/bin/cat}         ; if [ $# -gt 0 ] ; then shift ; fi
reduce_script=${1-/usr/bin/uniq} ; if [ $# -gt 0 ] ; then shift ; fi

dest_keyspace=${dest_keyspace-soc_net_tw}
dest_col_family=${dest_col_family-Wordbag}

# Comma-separated list of cassandra thrift hosts to write to
cassandra_thrift_address=${cassandra_thrift_address-10.204.41.193,10.204.30.11,10.204.58.238,10.204.239.133,10.196.191.31,10.204.103.21,10.202.74.223,10.202.143.95}

# Path to cassandra and hadoop dirs
script_dir=$(readlink -f "$(dirname "$0")")
CASSANDRA_HOME=${CASSANDRA_HOME-/usr/local/share/cassandra}
HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
avro_file=${avro_file-$CASSANDRA_HOME/interface/avro/cassandra.avpr}

# Comma-join every cassandra jar for -libjars. Globbing directly (instead of
# parsing `ls` output) keeps paths with spaces intact and avoids embedded
# newlines or a leading comma when a pattern matches several or no files.
ARCHIVES=""
for jar in "$CASSANDRA_HOME"/build/apache-cassandra*.jar \
           "$CASSANDRA_HOME"/build/lib/jars/*.jar \
           "$CASSANDRA_HOME"/lib/*.jar ; do
  [ -e "$jar" ] || continue     # an unmatched glob is left as the literal pattern
  ARCHIVES="${ARCHIVES:+$ARCHIVES,}$jar"
done

# The streaming-jar glob is deliberately left unquoted so the shell expands it.
"${HADOOP_HOME}/bin/hadoop" \
  jar "${HADOOP_HOME}"/contrib/streaming/hadoop-*streaming*.jar \
  -D stream.map.output=cassandra_avro_output \
  -D stream.io.identifier.resolver.class=org.apache.cassandra.hadoop.streaming.AvroResolver \
  -D cassandra.output.keyspace="$dest_keyspace" \
  -D cassandra.output.columnfamily="$dest_col_family" \
  -D cassandra.thrift.address="$cassandra_thrift_address" \
  -D cassandra.partitioner.class=org.apache.cassandra.dht.RandomPartitioner \
  -D cassandra.thrift.port=9160 \
  -D mapreduce.output.columnfamilyoutputformat.batch.threshold=1024 \
  -D mapred.reduce.tasks=0 \
  -D mapred.map.tasks.speculative.execution=false \
  -libjars "$ARCHIVES" \
  -file "$avro_file" \
  -outputformat org.apache.cassandra.hadoop.ColumnFamilyOutputFormat \
  -mapper "ruby $script_dir/avromapper.rb --map " \
  -input "$input_file" \
  -output "$output_file" \
  "$@"

# Other host lists that have been used for cassandra_thrift_address:
#   10.204.54.190,10.244.42.31,10.244.42.176,10.244.42.112,10.244.42.143,10.244.42.79,10.244.42.4,10.204.53.166
#   10.204.221.230,10.243.79.223,10.245.19.159,10.242.154.159,10.242.153.155,10.242.153.203
#
# One way to build such a list from a log of ec2 internal hostnames:
# cat /tmp/mj-flip/chimchim-info.log | cut -f5 | ruby -e 'puts $stdin.readlines.map{|l| l.chomp.gsub(/ip-([0-9\-]+)\..*/,"\\1").gsub(/-/,".") }.join(",")'
49
+
50
+
51
+
@@ -0,0 +1,24 @@
1
#!/usr/bin/env ruby
#
# Example script: runs a Wukong::CassandraScript job that uses
# Wukong::Store::Cassandra::StructLoader, with the cassandra destination
# configured through the Settings values below.
#
require 'rubygems'
require 'wukong'
require 'wukong/periodic_monitor'
require 'wukong/store/cassandra'
require 'wukong/script/cassandra_loader_script'

# Configuration. These are set before Settings.resolve! is called; the
# keyspace, column family and host list are specific to the author's cluster.
Settings.use :commandline
Settings.define :log_interval, :default => 1
Settings.cassandra_keyspace = 'soc_net_tw'
Settings.cassandra_col_family = 'TwitterUser'
Settings.cassandra_hosts = "ip-10-204-41-193.ec2.internal:9160,ip-10-204-30-11.ec2.internal:9160,ip-10-204-58-238.ec2.internal:9160,ip-10-204-239-133.ec2.internal:9160,ip-10-196-191-31.ec2.internal:9160,ip-10-204-103-21.ec2.internal:9160,ip-10-202-74-223.ec2.internal:9160,ip-10-202-143-95.ec2.internal:9160"
Settings.resolve!

# These are loaded only after the settings have been resolved.
# NOTE(review): presumably because they read Settings at load time -- confirm.
require 'cassandra/0.7'
require 'wuclan/twitter' ; include Wuclan::Twitter
require 'wuclan/twitter/cassandra_db'
require 'wukong/store/cassandra/streaming'

# Sample input can be inspected with:
# hdp-catd s3://s3hdfs.infinitemonkeys.info/data/sn/tw/fixd/objects/twitter_user | head

# StructLoader is the first argument and nil the second.
# NOTE(review): presumably mapper and reducer, i.e. a map-only job -- confirm
# against Wukong::CassandraScript.
# CassandraScript.new(Wukong::Store::Cassandra::StructLoader, nil).run
Wukong::CassandraScript.new(Wukong::Store::Cassandra::StructLoader, nil).run
24
+
@@ -0,0 +1,73 @@
1
+
2
+
3
+ Start
4
+
5
+ 5 c1.xlarge
6
+ 2000 writes/sec
7
+ 40 clients
8
+
9
+ 4 m2.xlarge
10
+
11
+ :java_max_heap => "12500M", #
12
+ # :flush_data_buffer_size => 32, # 32,
13
+ # :flush_index_buffer_size => 8, # 8,
14
+ # :binary_memtable_throughput => 256, # 256,
15
+ # :memtable_flush_after => 60, # 60,
16
+ # :memtable_throughput => 64, # 64,
17
+ # :memtable_ops => 0.3, # 0.3,
18
+ # :column_index_size => 64, # 64,
19
+ # :in_memory_compaction_limit => 64 # 64
20
+ :concurrent_reads => 8, # 8
21
+ :concurrent_writes => 250, # 32
22
+
23
+ /usr/bin/java -ea \
24
+ -Xms128M \
25
+ -Xmx12500M \
26
+ -XX:TargetSurvivorRatio=90 \
27
+ -XX:+AggressiveOpts \
28
+ -XX:+UseParNewGC \
29
+ -XX:+UseConcMarkSweepGC \
30
+ -XX:+CMSParallelRemarkEnabled \
31
+ -XX:+HeapDumpOnOutOfMemoryError \
32
+ -XX:SurvivorRatio=128 \
33
+ -XX:MaxTenuringThreshold=0 \
34
+ -Djava.rmi.server.hostname=ec2-184-73-20-37.compute-1.amazonaws.com \
35
+ -Dcom.sun.management.jmxremote.port=12345 \
36
+ -Dcom.sun.management.jmxremote.ssl=false \
37
+ -Dcom.sun.management.jmxremote.authenticate=false \
38
+ -Dcassandra \
39
+ -Dstorage-config=/etc/cassandra \
40
+ -Dcassandra-foreground=yes \
41
+ -cp /etc/cassandra:/usr/local/share/cassandra/build/classes:/usr/local/share/cassandra/lib/antlr-3.1.3.jar:/usr/local/share/cassandra/lib/avro-1.3.3-sources~cust1.jar:/usr/local/share/cassandra/lib/avro-1.3.3~cust2.jar:/usr/local/share/cassandra/lib/clhm-production.jar:/usr/local/share/cassandra/lib/commons-cli-1.1.jar:/usr/local/share/cassandra/lib/commons-codec-1.2.jar:/usr/local/share/cassandra/lib/commons-collections-3.2.1.jar:/usr/local/share/cassandra/lib/commons-lang-2.4.jar:/usr/local/share/cassandra/lib/guava-r05.jar:/usr/local/share/cassandra/lib/hadoop-core-0.20.1.jar:/usr/local/share/cassandra/lib/high-scale-lib.jar:/usr/local/share/cassandra/lib/jackson-core-asl-1.4.0.jar:/usr/local/share/cassandra/lib/jackson-mapper-asl-1.4.0.jar:/usr/local/share/cassandra/lib/jetty-6.1.21.jar:/usr/local/share/cassandra/lib/jetty-util-6.1.21.jar:/usr/local/share/cassandra/lib/jline-0.9.94.jar:/usr/local/share/cassandra/lib/json-simple-1.1.jar:/usr/local/share/cassandra/lib/jug-2.0.0.jar:/usr/local/share/cassandra/lib/libthrift-r959516.jar:/usr/local/share/cassandra/lib/log4j-1.2.16.jar:/usr/local/share/cassandra/lib/servlet-api-2.5-20081211.jar:/usr/local/share/cassandra/lib/slf4j-api-1.5.8.jar:/usr/local/share/cassandra/lib/slf4j-log4j12-1.5.8.jar:/usr/local/share/cassandra/lib/snakeyaml-1.6.jar\
42
+ org.apache.cassandra.thrift.CassandraDaemon
43
+
44
+
45
+ avg-cpu: %user %nice %system %iowait %steal %idle
46
+ 81.83 0.00 1.96 0.00 0.00 16.21
47
+
48
+ Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
49
+ sda1 155.12 9.45 11450.39 48 58168
50
+ sdb 2.76 0.00 22.05 0 112
51
+
52
+ avg-cpu: %user %nice %system %iowait %steal %idle
53
+ 83.72 0.00 3.80 0.20 0.00 12.29
54
+
55
+ Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
56
+ sda1 66.53 1.59 3921.91 8 19688
57
+ sdb 100.20 0.00 6686.85 0 33568
58
+
59
+ avg-cpu: %user %nice %system %iowait %steal %idle
60
+ 66.40 0.00 5.00 0.80 0.40 27.40
61
+
62
+ Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
63
+ sda1 2.40 0.00 19.20 0 96
64
+ sdb 186.80 0.00 15318.40 0 76592
65
+
66
+ avg-cpu: %user %nice %system %iowait %steal %idle
67
+ 80.98 0.00 6.08 1.99 0.00 10.96
68
+
69
+ Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
70
+ sda1 113.97 0.00 7426.75 0 37208
71
+ sdb 360.28 1.60 29232.73 8 146456
72
+
73
+
@@ -0,0 +1,26 @@
1
+
2
+ # Download the Amazon elastic-mapreduce runner from http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
3
+
4
+ # Create a bucket and path to hold your EMR logs, scripts and other ephemera. For instance you might choose 'emr.yourdomain.com' as the bucket and '/wukong' as a scoping path within that bucket. In that case you will refer to it with a path like s3n://emr.yourdomain.com/wukong (see notes below about s3n:// vs. s3:// URLs).
5
+
6
+ # Copy the contents of wukong/examples/emr/dot_wukong_dir to ~/.wukong
7
+ # Edit emr.yaml -- it has instructions for each of the settings you need to fill in.
8
+
9
+
10
+
11
+
12
+
13
+ h3. s3n:// vs. s3:// URLs
14
+
15
+ Many external tools use a URI convention to address files in S3; they typically use the 's3://' scheme, which makes a lot of sense:
16
+ s3://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
17
+
18
+ Hadoop can maintain an HDFS on the Amazon S3: it uses a block structure and has optimizations for streaming, no file size limitation, and other goodness. However, only hadoop tools can interpret the contents of those blocks -- to everything else it just looks like a soup of blocks labelled block_-8675309 and so forth. Hadoop unfortunately chose the 's3://' scheme for URIs in this filesystem:
19
+ s3://s3hdfs.yourcompany.com/path/to/data
20
+
21
+ Hadoop is happy to read s3 native files -- 'native' as in, you can look at them with a browser and upload them and download them with any S3 tool out there. There's a 5GB limit on file size, and in some cases a performance hit (but not in our experience enough to worry about). You refer to these files with the 's3n://' scheme ('n' as in 'native'):
22
+ s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-mapper.rb
23
+ s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-reducer.rb
24
+ s3n://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
25
+
26
+ Wukong will coerce things to the right scheme when it knows what that scheme should be (eg. code should be s3n://). It will otherwise leave the path alone. Specifically, if you use a URI scheme for input and output paths you must use 's3n://' for normal s3 files.
@@ -0,0 +1,7 @@
1
+ {
2
+ "key-pair": "gibbon",
3
+ "key-pair-file": "/home/your/.wukong/keypairs/gibbon.pem",
4
+ "access-id": "YOURACCESSID",
5
+ "private-key": "YOURPRIVATEKEY",
6
+ "region": "us-east-1"
7
+ }
@@ -2,51 +2,68 @@
2
2
  # Elastic MapReduce config in wukong
3
3
  #
4
4
 
5
+ # ===========================================================================
5
6
  #
6
7
  # Infrastructure options
7
8
  #
8
9
 
9
- # == Fill all your information into yet another file with your amazon key Sorry
10
- # that it needs to be in so many stupid places, nobody can agree on a
10
+ # == Fill all your information into yet another file with your amazon key
11
+ # It needs to be in so many stupid places because nobody can agree on a
11
12
  # filename or format.
13
+ #
12
14
  :emr_credentials_file: ~/.wukong/credentials.json
15
+
13
16
  #
14
- # == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here:
17
+ # == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here
18
+ #
15
19
  # :access_key: ASDFAHKHASDF
16
20
  # :secret_access_key: ADSGHASDFJASDFASDF
21
+
22
+ # == Path to your keypair file.
17
23
  #
18
- # == Path to your keypair file.
19
24
  :key_pair_file: ~/.wukong/keypairs/gibbon.pem
20
- # == Keypair will be named after your file, or force the name:
25
+
26
+ # == Keypair will be named after your file, or force the name
27
+ #
21
28
  # :key_pair: ~
22
29
 
23
30
  # == Path to the Amazon elastic-mapreduce runner. Get a copy from
24
31
  # http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
32
+ #
25
33
  :emr_runner: ~/ics/hadoop/elastic-mapreduce/elastic-mapreduce
26
34
 
35
+ # ===========================================================================
36
+ #
37
+ # Remote Paths
38
+ #
39
+
40
+ # == Wukong is opinionated about the paths and locations of scripts and
41
+ # everything. It will organize files by job name within the following path:
42
+ #
43
+ :emr_root: s3://s3n.infinitemonkeys.info/emr
44
+
45
+ # == If you specify the :emr_data_root path, then relative pathnames -- ones that
46
+ # do not look like a URI (s3://yadda/yada) and do not start with a '/' -- will
47
+ # be prefixed with this path prefix.
48
+ :emr_data_root: s3n://s3n.infinitemonkeys.info/data
49
+
50
+
51
+ # ===========================================================================
27
52
  #
28
53
  # Cluster Config
29
54
  #
30
55
  :num_instances: 1
31
- :instance_type: m2.xlarge
56
+ :instance_type: m1.small
32
57
  :master_instance_type: ~
33
58
  :hadoop_version: '0.20'
34
59
  :availability_zone: us-east-1b
35
60
 
61
+ # ===========================================================================
36
62
  #
37
63
  # Running and reporting options
38
64
  #
39
- :alive: false
65
+ :alive: true
40
66
  :enable_debugging: true
41
67
  :emr_runner_verbose: true
42
68
  :emr_runner_debug: ~
43
69
  :step_action: CANCEL_AND_WAIT # CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
44
-
45
- #
46
- # Remote Paths
47
- #
48
- # Wukong is opinionated about the paths and locations of scripts and
49
- # everything. Make an S3 bucket and let the wookiee win -- or hack
50
- # lib/wukong/script/emr_command.rb to be more flexible and send us back a patch.
51
- #
52
- :emr_root: s3n://emr.infinitemonkeys.info
@@ -24,7 +24,7 @@ sudo apt-get install -y unzip build-essential git-core ruby ruby1.8-dev rubygems
24
24
  echo "`date` Unchaining rubygems from the tyrrany of ubuntu"
25
25
  sudo gem install --no-rdoc --no-ri rubygems-update --version=1.3.7 ; sudo /var/lib/gems/1.8/bin/update_rubygems; sudo gem update --no-rdoc --no-ri --system ; gem --version ;
26
26
 
27
- echo "`date` Installing wukong gems"
27
+ echo "`date` Installing wukong and related gems"
28
28
  sudo gem install --no-rdoc --no-ri addressable extlib htmlentities configliere yard wukong right_aws uuidtools cheat
29
29
  sudo gem list
30
30
 
@@ -24,4 +24,5 @@ class FooStreamer < Wukong::Streamer::LineStreamer
24
24
  end
25
25
  end
26
26
 
27
+ Settings.resolve!
27
28
  Wukong::Script.new(FooStreamer, FooStreamer).run
@@ -0,0 +1,108 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # http://www.jroller.com/obie/tags/unicode
4
+ # http://www.unicode.org/faq/casemap_charprop.html
5
+ # http://unicode.org/reports/tr10/#Conformance
6
+ # http://intertwingly.net/stories/2009/11/30/asciize.rb
7
+ # http://blog.stevenlevithan.com/archives/javascript-regex-and-unicode
8
+ #
9
+ # http://xregexp.com/tests/unicode.html
10
+
11
class String
  #
  # Ordered [pattern, replacement] pairs applied by #asciize. Order matters:
  # the digraph rules must run before the single-letter Latin-1 rules that
  # would otherwise claim the same bytes (e.g. \xc3\xa4 => 'ae', not 'a').
  #
  # Every pattern is a byte-level match against UTF-8 encoded text, hence the
  # /n flag: without it, a lone lead byte followed by a byte class is an
  # "invalid multibyte escape" on ruby 1.9 and later.
  #
  # Taken from http://intertwingly.net/stories/2009/11/30/asciize.rb
  #
  ASCIIZE_RULES = [
    # digraphs. May be culturally sensitive
    [/\xc3\x9f/n,                          'ss'],
    [/\xc3\xa4|a\xcc\x88/n,                'ae'],
    [/\xc3\xa5|a\xcc\x8a/n,                'aa'],
    [/\xc3\xa6/n,                          'ae'],
    [/\xc3\xb1|n\xcc\x83/n,                'ny'],
    [/\xc3\xb6|o\xcc\x88/n,                'oe'],
    [/\xc3\xbc|u\xcc\x88/n,                'ue'],

    # latin 1
    [/\xc3[\xa0-\xa5]/n,                   'a'],
    [/\xc3\xa7/n,                          'c'],
    [/\xc3[\xa8-\xab]/n,                   'e'],
    [/\xc3[\xac-\xaf]/n,                   'i'],
    [/\xc3[\xb2-\xb6]|\xc3\xb8/n,          'o'],
    [/\xc3[\xb9-\xbc]/n,                   'u'],
    [/\xc3[\xbd\xbf]/n,                    'y'],

    # Latin Extended-A
    [/\xc4[\x80-\x85]/n,                   'a'],
    [/\xc4[\x86-\x8d]/n,                   'c'],
    [/\xc4[\x8e-\x91]/n,                   'd'],
    [/\xc4[\x92-\x9b]/n,                   'e'],
    [/\xc4[\x9c-\xa3]/n,                   'g'],
    [/\xc4[\xa4-\xa7]/n,                   'h'],
    [/\xc4[\xa8-\xb1]/n,                   'i'],
    [/\xc4[\xb2-\xb3]/n,                   'ij'],
    [/\xc4[\xb4-\xb5]/n,                   'j'],
    [/\xc4[\xb6-\xb8]/n,                   'k'],
    [/\xc4[\xb9-\xff]|\xc5[\x80-\x82]/n,   'l'],
    [/\xc5[\x83-\x8b]/n,                   'n'],
    [/\xc5[\x8c-\x91]/n,                   'o'],
    [/\xc5[\x92-\x93]/n,                   'oe'],
    [/\xc5[\x94-\x99]/n,                   'r'],
    # s stops at \xa1: \xc5\xa2 is U+0162 (T with cedilla) and belongs to 't'
    [/\xc5[\x9a-\xa1]/n,                   's'],
    [/\xc5[\xa2-\xa7]/n,                   't'],
    [/\xc5[\xa8-\xb3]/n,                   'u'],
    [/\xc5[\xb4-\xb5]/n,                   'w'],
    [/\xc5[\xb6-\xb8]/n,                   'y'],
    [/\xc5[\xb9-\xbe]/n,                   'z'],

    # denormalized diacritics
    [/\xcc[\x80-\xff]|\xcd[\x80-\xaf]/n,   ''],
  ].freeze

  #
  # Transliterates UTF-8 text to plain ASCII: known accented letters are
  # replaced using ASCIIZE_RULES, then every remaining run of non-word
  # characters (anything outside [a-zA-Z0-9_]) is collapsed to a single '-'.
  #
  # Bytes with no rule -- for example the upper-case Latin-1 letters -- fall
  # through to that final step and come out as '-'.
  #
  # @param name [String] the text to convert; defaults to the receiver, so
  #   both "str".asciize and the older asciize(other_string) form work.
  # @return [String] a new ASCII-only string. Neither the receiver nor
  #   +name+ is modified.
  #
  def asciize(name = self)
    # Work on a private copy so the caller's string is never mutated.
    name = name.dup

    # On ruby 1.9+ switch the copy to raw bytes so the byte-level rules can
    # match it; ruby 1.8 strings are already plain byte sequences.
    original_encoding = nil
    if name.respond_to?(:force_encoding)
      original_encoding = name.encoding
      name.force_encoding('BINARY')
    end

    if name =~ /[^\x00-\x7F]/n
      ASCIIZE_RULES.each do |pattern, replacement|
        name.gsub!(pattern, replacement)
      end
    end

    result = name.gsub(/[^\w]+/, '-')

    # The result is pure ASCII, so it can be relabelled with the original
    # encoding (or US-ASCII if that encoding is not ASCII-compatible).
    if original_encoding
      result.force_encoding(original_encoding.ascii_compatible? ? original_encoding : 'US-ASCII')
    end
    result
  end

end
66
+
67
+ if __FILE__ == $PROGRAM_NAME
68
+ i18n = "I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xc3\xa0liz\xc3\xa6ti\xc3\xb8n"
69
+ puts "#{i18n} => #{i18n.asciize}"
70
+ end
71
+
72
+ # http://www.jroller.com/obie/tags/unicode
73
+ #
74
+ # require 'iconv'
75
+ # require 'unicode'
76
+ #
77
+ # class String
78
+ #
79
+ # def to_ascii
80
+ # # split in muti-byte aware fashion and translate characters over 127
81
+ # # and dropping characters not in the translation hash
82
+ # self.chars.split('').collect { |c| (c[0] <= 127) ? c : translation_hash[c[0]] }.join
83
+ # end
84
+ #
85
+ # def to_url_format
86
+ # url_format = self.to_ascii
87
+ # url_format = url_format.gsub(/[^A-Za-z0-9]/, '') # all non-word
88
+ # url_format.downcase!
89
+ # url_format
90
+ # end
91
+ #
92
+ # protected
93
+ #
94
+ # def translation_hash
95
+ # @@translation_hash ||= setup_translation_hash
96
+ # end
97
+ #
98
+ # def setup_translation_hash
99
+ # accented_chars = "ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüý"
100
+ # unaccented_chars = "AAAAAACEEEEIIIIDNOOOOOxOUUUUYaaaaaaceeeeiiiinoooooouuuuy"
101
+ #
102
+ # translation_hash = Hash.zip(accented_chars.chars, unaccented_chars.chars)
103
+ # translation_hash["Æ".chars[0]] = 'AE'
104
+ # translation_hash["æ".chars[0]] = 'ae'
105
+ # translation_hash
106
+ # end
107
+ #
108
+ # end
@@ -1,23 +1,31 @@
1
1
  require 'time'
2
2
  require 'date'
3
- DateTime.class_eval do
3
+
4
+ class Time
5
+ # strftime() format to flatten a date
6
+ FLAT_FORMAT = "%Y%m%d%H%M%S"
7
+ # Flatten
8
+ def to_flat
9
+ utc.strftime(FLAT_FORMAT)
10
+ end
11
+
4
12
  #
5
13
  # Parses the time but never fails.
6
14
  # Return value is always in the UTC time zone.
7
15
  #
8
- # A flattened datetime -- a 12-digit YYYYmmddHHMMMSS -- is fixed to the UTC
16
+ # A flattened datetime -- a 14-digit YYYYmmddHHMMSS -- is fixed to the UTC
9
17
  # time zone by parsing it as YYYYmmddHHMMSSZ <- 'Z' at end
10
18
  #
11
19
  def self.parse_safely dt
12
20
  return nil if dt.blank?
13
21
  begin
14
- if dt.to_s =~ /\A\d{12}Z?\z/
15
- parse(dt+'Z', true)
16
- else
17
- parse(dt, true).utc
22
+ case
23
+ when dt.is_a?(Time) then dt.utc
24
+ when (dt.to_s =~ /\A\d{14}\z/) then parse(dt.to_s+'Z', true)
25
+ else parse(dt.to_s, true).utc
18
26
  end
19
27
  rescue StandardError => e
20
- Log.info e
28
+ Log.debug e
21
29
  end
22
30
  end
23
31
 
@@ -25,3 +33,21 @@ DateTime.class_eval do
25
33
  parse_safely(str).to_flat
26
34
  end
27
35
  end
36
+
37
class DateTime < Date
  # Pattern handed to strftime() by #to_flat: year, month, day, hour,
  # minute and second, with no separators.
  FLAT_FORMAT = "%Y%m%d%H%M%S"

  # Render this datetime as a flat digit string using FLAT_FORMAT.
  #
  # NOTE(review): unlike Time#to_flat there is no conversion to UTC here, so
  # the digits reflect whatever offset the receiver carries -- confirm that
  # this is intended.
  #
  # @return [String] the formatted digits
  def to_flat
    self.strftime(FLAT_FORMAT)
  end
end
45
+
46
class Date
  # Pattern handed to strftime() by #to_flat: year, month and day, with no
  # separators.
  FLAT_FORMAT = "%Y%m%d"

  # Render this date as a flat digit string using FLAT_FORMAT.
  #
  # @return [String] the formatted digits
  def to_flat
    self.strftime(FLAT_FORMAT)
  end
end
53
+ end