wukong 1.5.3 → 1.5.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.textile +4 -0
- data/bin/hdp-bin +44 -0
- data/bin/hdp-ls +2 -1
- data/docpages/avro/performance.textile +36 -0
- data/examples/cassandra_streaming/avromapper.rb +85 -0
- data/examples/cassandra_streaming/berlitz_for_cassandra.textile +22 -0
- data/examples/cassandra_streaming/cassandra.avpr +468 -0
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +62 -0
- data/examples/cassandra_streaming/catter.sh +45 -0
- data/examples/cassandra_streaming/client_interface_notes.textile +200 -0
- data/examples/cassandra_streaming/client_schema.avpr +211 -0
- data/examples/cassandra_streaming/client_schema.textile +318 -0
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +1 -0
- data/examples/cassandra_streaming/pyreduce.sh +1 -0
- data/examples/cassandra_streaming/smutation.avpr +188 -0
- data/examples/cassandra_streaming/streamer.sh +51 -0
- data/examples/cassandra_streaming/struct_loader.rb +24 -0
- data/examples/cassandra_streaming/tuning.textile +73 -0
- data/examples/emr/README-elastic_map_reduce.textile +26 -0
- data/examples/emr/dot_wukong_dir/credentials.json +7 -0
- data/examples/emr/{emr.yaml → dot_wukong_dir/emr.yaml} +33 -16
- data/{bin/bootstrap.sh → examples/emr/dot_wukong_dir/emr_bootstrap.sh} +1 -1
- data/examples/emr/elastic_mapreduce_example.rb +1 -0
- data/lib/wukong/encoding/asciize.rb +108 -0
- data/lib/wukong/extensions/date_time.rb +33 -7
- data/lib/wukong/extensions/emittable.rb +12 -25
- data/lib/wukong/extensions/hash_like.rb +13 -6
- data/lib/wukong/filename_pattern.rb +8 -7
- data/lib/wukong/schema.rb +47 -0
- data/lib/wukong/script.rb +7 -0
- data/lib/wukong/script/cassandra_loader_script.rb +40 -0
- data/lib/wukong/script/emr_command.rb +74 -43
- data/lib/wukong/script/hadoop_command.rb +89 -72
- data/lib/wukong/store.rb +2 -7
- data/lib/wukong/store/cassandra.rb +10 -0
- data/lib/wukong/store/cassandra/streaming.rb +75 -0
- data/lib/wukong/store/cassandra/struct_loader.rb +21 -0
- data/lib/wukong/store/cassandra_model.rb +90 -0
- data/lib/wukong/store/chh_chunked_flat_file_store.rb +1 -1
- data/lib/wukong/store/chunked_flat_file_store.rb +24 -20
- data/wukong.gemspec +32 -4
- metadata +33 -14
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env bash
#
# Launches a map-only Hadoop streaming job (mapred.reduce.tasks=0) whose mapper
# is avromapper.rb from this script's directory and whose output format is
# Cassandra's ColumnFamilyOutputFormat.
#
# Usage:
#   streamer.sh input_file output_file [map_script [reduce_script [extra hadoop args...]]]
#
# Environment overrides (each falls back to the default shown below):
#   dest_keyspace, dest_col_family, cassandra_thrift_address,
#   CASSANDRA_HOME, HADOOP_HOME, avro_file
#

if [ $# -lt 2 ]; then
  echo "usage: $0 input_file output_file [map_script [reduce_script [hadoop args...]]]" >&2
  exit 1
fi

input_file="$1"  ; shift
output_file="$1" ; shift
# map_script and reduce_script are still consumed from the argument list so that
# "$@" below keeps meaning "everything after the fourth argument"; the job itself
# does not use them (the mapper is fixed and there are no reduce tasks).
map_script=${1-/bin/cat}         ; [ $# -gt 0 ] && shift
reduce_script=${1-/usr/bin/uniq} ; [ $# -gt 0 ] && shift

dest_keyspace=${dest_keyspace-soc_net_tw}
dest_col_family=${dest_col_family-Wordbag}

# Comma-separated Thrift endpoints of the destination cluster.
cassandra_thrift_address=${cassandra_thrift_address-10.204.41.193,10.204.30.11,10.204.58.238,10.204.239.133,10.196.191.31,10.204.103.21,10.202.74.223,10.202.143.95}

# Path to cassandra and hadoop dirs
script_dir=$(readlink -f "$(dirname "$0")")
CASSANDRA_HOME=${CASSANDRA_HOME-/usr/local/share/cassandra}
HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
avro_file=${avro_file-$CASSANDRA_HOME/interface/avro/cassandra.avpr}

# Build the comma-separated -libjars list straight from globs rather than from
# `ls` output, so several matching jars (or paths containing spaces) cannot
# split into extra arguments.
ARCHIVES=""
for jar in "$CASSANDRA_HOME"/build/apache-cassandra*.jar \
           "$CASSANDRA_HOME"/build/lib/jars/*.jar \
           "$CASSANDRA_HOME"/lib/*.jar ; do
  [ -e "$jar" ] || continue        # an unmatched glob is left as a literal pattern
  ARCHIVES="${ARCHIVES:+$ARCHIVES,}$jar"
done

if [ -z "$ARCHIVES" ]; then
  echo "$0: no Cassandra jars found under $CASSANDRA_HOME" >&2
  exit 1
fi

# The streaming jar pattern is deliberately left unquoted so the shell expands it.
"${HADOOP_HOME}/bin/hadoop" \
  jar "${HADOOP_HOME}"/contrib/streaming/hadoop-*streaming*.jar \
  -D stream.map.output=cassandra_avro_output \
  -D stream.io.identifier.resolver.class=org.apache.cassandra.hadoop.streaming.AvroResolver \
  -D cassandra.output.keyspace="$dest_keyspace" \
  -D cassandra.output.columnfamily="$dest_col_family" \
  -D cassandra.thrift.address="$cassandra_thrift_address" \
  -D cassandra.partitioner.class=org.apache.cassandra.dht.RandomPartitioner \
  -D cassandra.thrift.port=9160 \
  -D mapreduce.output.columnfamilyoutputformat.batch.threshold=1024 \
  -D mapred.reduce.tasks=0 \
  -D mapred.map.tasks.speculative.execution=false \
  -libjars "$ARCHIVES" \
  -file "$avro_file" \
  -outputformat org.apache.cassandra.hadoop.ColumnFamilyOutputFormat \
  -mapper "ruby $script_dir/avromapper.rb --map " \
  -input "$input_file" \
  -output "$output_file" \
  "$@"
|
43
|
+
|
44
|
+
# -D cassandra.thrift.address=10.204.54.190,10.244.42.31,10.244.42.176,10.244.42.112,10.244.42.143,10.244.42.79,10.244.42.4,10.204.53.166 \
|
45
|
+
# -D cassandra.thrift.address=10.204.221.230,10.243.79.223,10.245.19.159,10.242.154.159,10.242.153.155,10.242.153.203 \
|
46
|
+
|
47
|
+
|
48
|
+
# cat /tmp/mj-flip/chimchim-info.log | cut -f5 | ruby -e 'puts $stdin.readlines.map{|l| l.chomp.gsub(/ip-([0-9\-]+)\..*/,"\\1").gsub(/-/,".") }.join(",")'
|
49
|
+
|
50
|
+
|
51
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wukong/periodic_monitor'
|
5
|
+
require 'wukong/store/cassandra'
|
6
|
+
require 'wukong/script/cassandra_loader_script'
|
7
|
+
|
8
|
+
# Settings: use :commandline, declare log_interval (default 1), then set the
# Cassandra keyspace, column family and host list before resolving.
Settings.use(:commandline)
Settings.define(:log_interval, :default => 1)
Settings.cassandra_keyspace   = 'soc_net_tw'
Settings.cassandra_col_family = 'TwitterUser'
Settings.cassandra_hosts      = "ip-10-204-41-193.ec2.internal:9160,ip-10-204-30-11.ec2.internal:9160,ip-10-204-58-238.ec2.internal:9160,ip-10-204-239-133.ec2.internal:9160,ip-10-196-191-31.ec2.internal:9160,ip-10-204-103-21.ec2.internal:9160,ip-10-202-74-223.ec2.internal:9160,ip-10-202-143-95.ec2.internal:9160"
Settings.resolve!

# These libraries are required only after Settings.resolve!, in the same order
# as before.
require 'cassandra/0.7'
require 'wuclan/twitter'
include Wuclan::Twitter
require 'wuclan/twitter/cassandra_db'
require 'wukong/store/cassandra/streaming'

# Example command for peeking at the input data:
#   hdp-catd s3://s3hdfs.infinitemonkeys.info/data/sn/tw/fixd/objects/twitter_user | head

# Build the loader script around StructLoader (second argument nil) and run it.
loader_script = Wukong::CassandraScript.new(Wukong::Store::Cassandra::StructLoader, nil)
loader_script.run
|
24
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
Start
|
4
|
+
|
5
|
+
5 c1.xlarge
|
6
|
+
2000 writes/sec
|
7
|
+
40 clients
|
8
|
+
|
9
|
+
4 m2.xlarge
|
10
|
+
|
11
|
+
:java_max_heap => "12500M", #
|
12
|
+
# :flush_data_buffer_size => 32, # 32,
|
13
|
+
# :flush_index_buffer_size => 8, # 8,
|
14
|
+
# :binary_memtable_throughput => 256, # 256,
|
15
|
+
# :memtable_flush_after => 60, # 60,
|
16
|
+
# :memtable_throughput => 64, # 64,
|
17
|
+
# :memtable_ops => 0.3, # 0.3,
|
18
|
+
# :column_index_size => 64, # 64,
|
19
|
+
# :in_memory_compaction_limit => 64 # 64
|
20
|
+
:concurrent_reads => 8, # 8
|
21
|
+
:concurrent_writes => 250, # 32
|
22
|
+
|
23
|
+
/usr/bin/java -ea \
|
24
|
+
-Xms128M \
|
25
|
+
-Xmx12500M \
|
26
|
+
-XX:TargetSurvivorRatio=90 \
|
27
|
+
-XX:+AggressiveOpts \
|
28
|
+
-XX:+UseParNewGC \
|
29
|
+
-XX:+UseConcMarkSweepGC \
|
30
|
+
-XX:+CMSParallelRemarkEnabled \
|
31
|
+
-XX:+HeapDumpOnOutOfMemoryError \
|
32
|
+
-XX:SurvivorRatio=128 \
|
33
|
+
-XX:MaxTenuringThreshold=0 \
|
34
|
+
-Djava.rmi.server.hostname=ec2-184-73-20-37.compute-1.amazonaws.com \
|
35
|
+
-Dcom.sun.management.jmxremote.port=12345 \
|
36
|
+
-Dcom.sun.management.jmxremote.ssl=false \
|
37
|
+
-Dcom.sun.management.jmxremote.authenticate=false \
|
38
|
+
-Dcassandra \
|
39
|
+
-Dstorage-config=/etc/cassandra \
|
40
|
+
-Dcassandra-foreground=yes \
|
41
|
+
-cp /etc/cassandra:/usr/local/share/cassandra/build/classes:/usr/local/share/cassandra/lib/antlr-3.1.3.jar:/usr/local/share/cassandra/lib/avro-1.3.3-sources~cust1.jar:/usr/local/share/cassandra/lib/avro-1.3.3~cust2.jar:/usr/local/share/cassandra/lib/clhm-production.jar:/usr/local/share/cassandra/lib/commons-cli-1.1.jar:/usr/local/share/cassandra/lib/commons-codec-1.2.jar:/usr/local/share/cassandra/lib/commons-collections-3.2.1.jar:/usr/local/share/cassandra/lib/commons-lang-2.4.jar:/usr/local/share/cassandra/lib/guava-r05.jar:/usr/local/share/cassandra/lib/hadoop-core-0.20.1.jar:/usr/local/share/cassandra/lib/high-scale-lib.jar:/usr/local/share/cassandra/lib/jackson-core-asl-1.4.0.jar:/usr/local/share/cassandra/lib/jackson-mapper-asl-1.4.0.jar:/usr/local/share/cassandra/lib/jetty-6.1.21.jar:/usr/local/share/cassandra/lib/jetty-util-6.1.21.jar:/usr/local/share/cassandra/lib/jline-0.9.94.jar:/usr/local/share/cassandra/lib/json-simple-1.1.jar:/usr/local/share/cassandra/lib/jug-2.0.0.jar:/usr/local/share/cassandra/lib/libthrift-r959516.jar:/usr/local/share/cassandra/lib/log4j-1.2.16.jar:/usr/local/share/cassandra/lib/servlet-api-2.5-20081211.jar:/usr/local/share/cassandra/lib/slf4j-api-1.5.8.jar:/usr/local/share/cassandra/lib/slf4j-log4j12-1.5.8.jar:/usr/local/share/cassandra/lib/snakeyaml-1.6.jar\
|
42
|
+
org.apache.cassandra.thrift.CassandraDaemon
|
43
|
+
|
44
|
+
|
45
|
+
avg-cpu: %user %nice %system %iowait %steal %idle
|
46
|
+
81.83 0.00 1.96 0.00 0.00 16.21
|
47
|
+
|
48
|
+
Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
|
49
|
+
sda1 155.12 9.45 11450.39 48 58168
|
50
|
+
sdb 2.76 0.00 22.05 0 112
|
51
|
+
|
52
|
+
avg-cpu: %user %nice %system %iowait %steal %idle
|
53
|
+
83.72 0.00 3.80 0.20 0.00 12.29
|
54
|
+
|
55
|
+
Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
|
56
|
+
sda1 66.53 1.59 3921.91 8 19688
|
57
|
+
sdb 100.20 0.00 6686.85 0 33568
|
58
|
+
|
59
|
+
avg-cpu: %user %nice %system %iowait %steal %idle
|
60
|
+
66.40 0.00 5.00 0.80 0.40 27.40
|
61
|
+
|
62
|
+
Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
|
63
|
+
sda1 2.40 0.00 19.20 0 96
|
64
|
+
sdb 186.80 0.00 15318.40 0 76592
|
65
|
+
|
66
|
+
avg-cpu: %user %nice %system %iowait %steal %idle
|
67
|
+
80.98 0.00 6.08 1.99 0.00 10.96
|
68
|
+
|
69
|
+
Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
|
70
|
+
sda1 113.97 0.00 7426.75 0 37208
|
71
|
+
sdb 360.28 1.60 29232.73 8 146456
|
72
|
+
|
73
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
# Download the Amazon elastic-mapreduce runner from http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
|
3
|
+
|
4
|
+
# Create a bucket and path to hold your EMR logs, scripts and other ephemera. For instance you might choose 'emr.yourdomain.com' as the bucket and '/wukong' as a scoping path within that bucket. In that case you will refer to it with a path like s3n://emr.yourdomain.com/wukong (see notes below about s3n:// vs. s3:// URLs).
|
5
|
+
|
6
|
+
# Copy the contents of wukong/examples/emr/dot_wukong_dir to ~/.wukong
|
7
|
+
# Edit emr.yaml -- it has instructions for each of the settings you need to fill in.
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
h3. s3n:// vs. s3:// URLs
|
14
|
+
|
15
|
+
Many external tools use a URI convention to address files in S3; they typically use the 's3://' scheme, which makes a lot of sense:
|
16
|
+
s3://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
|
17
|
+
|
18
|
+
Hadoop can maintain an HDFS on Amazon S3: it uses a block structure and has optimizations for streaming, no file size limitation, and other goodness. However, only hadoop tools can interpret the contents of those blocks -- to everything else it just looks like a soup of blocks labelled block_-8675309 and so forth. Hadoop unfortunately chose the same 's3://' scheme for URIs in this filesystem:
|
19
|
+
s3://s3hdfs.yourcompany.com/path/to/data
|
20
|
+
|
21
|
+
Hadoop is happy to read s3 native files -- 'native' as in, you can look at them with a browser and upload and download them with any S3 tool out there. There's a 5GB limit on file size, and in some cases a performance hit (but not in our experience enough to worry about). You refer to these files with the 's3n://' scheme ('n' as in 'native'):
|
22
|
+
s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-mapper.rb
|
23
|
+
s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-reducer.rb
|
24
|
+
s3n://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
|
25
|
+
|
26
|
+
Wukong will coerce things to the right scheme when it knows what that scheme should be (eg. code should be s3n://). It will otherwise leave the path alone. Specifically, if you use a URI scheme for input and output paths you must use 's3n://' for normal s3 files.
|
@@ -2,51 +2,68 @@
|
|
2
2
|
# Elastic MapReduce config in wukong
|
3
3
|
#
|
4
4
|
|
5
|
+
# ===========================================================================
|
5
6
|
#
|
6
7
|
# Infrastructure options
|
7
8
|
#
|
8
9
|
|
9
|
-
# == Fill all your information into yet another file with your amazon key
|
10
|
-
#
|
10
|
+
# == Fill all your information into yet another file with your amazon key
|
11
|
+
# It needs to be in so many stupid places because nobody can agree on a
|
11
12
|
# filename or format.
|
13
|
+
#
|
12
14
|
:emr_credentials_file: ~/.wukong/credentials.json
|
15
|
+
|
13
16
|
#
|
14
|
-
# == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here
|
17
|
+
# == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here
|
18
|
+
#
|
15
19
|
# :access_key: ASDFAHKHASDF
|
16
20
|
# :secret_access_key: ADSGHASDFJASDFASDF
|
21
|
+
|
22
|
+
# == Path to your keypair file.
|
17
23
|
#
|
18
|
-
# == Path to your keypair file.
|
19
24
|
:key_pair_file: ~/.wukong/keypairs/gibbon.pem
|
20
|
-
|
25
|
+
|
26
|
+
# == Keypair will be named after your file, or force the name
|
27
|
+
#
|
21
28
|
# :key_pair: ~
|
22
29
|
|
23
30
|
# == Path to the Amazon elastic-mapreduce runner. Get a copy from
|
24
31
|
# http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
|
32
|
+
#
|
25
33
|
:emr_runner: ~/ics/hadoop/elastic-mapreduce/elastic-mapreduce
|
26
34
|
|
35
|
+
# ===========================================================================
|
36
|
+
#
|
37
|
+
# Remote Paths
|
38
|
+
#
|
39
|
+
|
40
|
+
# == Wukong is opinionated about the paths and locations of scripts and
|
41
|
+
# everything. It will organize files by job name within the following path:
|
42
|
+
#
|
43
|
+
:emr_root: s3://s3n.infinitemonkeys.info/emr
|
44
|
+
|
45
|
+
# == If you specify the :emr_data_root path, then relative pathnames -- ones that
|
46
|
+
# do not look like a URI (s3://yadda/yada) and do not start with a '/' -- will
|
47
|
+
# be prefixed with this path prefix.
|
48
|
+
:emr_data_root: s3n://s3n.infinitemonkeys.info/data
|
49
|
+
|
50
|
+
|
51
|
+
# ===========================================================================
|
27
52
|
#
|
28
53
|
# Cluster Config
|
29
54
|
#
|
30
55
|
:num_instances: 1
|
31
|
-
:instance_type:
|
56
|
+
:instance_type: m1.small
|
32
57
|
:master_instance_type: ~
|
33
58
|
:hadoop_version: '0.20'
|
34
59
|
:availability_zone: us-east-1b
|
35
60
|
|
61
|
+
# ===========================================================================
|
36
62
|
#
|
37
63
|
# Running and reporting options
|
38
64
|
#
|
39
|
-
:alive:
|
65
|
+
:alive: true
|
40
66
|
:enable_debugging: true
|
41
67
|
:emr_runner_verbose: true
|
42
68
|
:emr_runner_debug: ~
|
43
69
|
:step_action: CANCEL_AND_WAIT # CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
|
44
|
-
|
45
|
-
#
|
46
|
-
# Remote Paths
|
47
|
-
#
|
48
|
-
# Wukong is opinionated about the paths and locations of scripts and
|
49
|
-
# everything. Make an S3 bucket and let the wookiee win -- or hack
|
50
|
-
# lib/wukong/script/emr_command.rb to be more flexible and send us back a patch.
|
51
|
-
#
|
52
|
-
:emr_root: s3n://emr.infinitemonkeys.info
|
@@ -24,7 +24,7 @@ sudo apt-get install -y unzip build-essential git-core ruby ruby1.8-dev rubygems
|
|
24
24
|
echo "`date` Unchaining rubygems from the tyrrany of ubuntu"
|
25
25
|
sudo gem install --no-rdoc --no-ri rubygems-update --version=1.3.7 ; sudo /var/lib/gems/1.8/bin/update_rubygems; sudo gem update --no-rdoc --no-ri --system ; gem --version ;
|
26
26
|
|
27
|
-
echo "`date` Installing wukong gems"
|
27
|
+
echo "`date` Installing wukong and related gems"
|
28
28
|
sudo gem install --no-rdoc --no-ri addressable extlib htmlentities configliere yard wukong right_aws uuidtools cheat
|
29
29
|
sudo gem list
|
30
30
|
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
# http://www.jroller.com/obie/tags/unicode
|
4
|
+
# http://www.unicode.org/faq/casemap_charprop.html
|
5
|
+
# http://unicode.org/reports/tr10/#Conformance
|
6
|
+
# http://intertwingly.net/stories/2009/11/30/asciize.rb
|
7
|
+
# http://blog.stevenlevithan.com/archives/javascript-regex-and-unicode
|
8
|
+
#
|
9
|
+
# http://xregexp.com/tests/unicode.html
|
10
|
+
|
11
|
+
class String
  #
  # Returns an ASCII-only approximation of UTF-8 text in which every run of
  # non-word characters is collapsed to a single '-'.
  #
  # Taken from http://intertwingly.net/stories/2009/11/30/asciize.rb
  #
  # name:: the text to convert. Defaults to the receiver, so both
  #        "some string".asciize and asciize(other_string) work.
  #
  # Returns a new String; neither the receiver nor +name+ is modified.
  #
  # The substitutions are byte-level patterns over the UTF-8 encoding:
  # * a handful of letters become digraphs (sharp s => 'ss', a-umlaut => 'ae', ...)
  # * lowercase accented Latin-1 letters and the Latin Extended-A letters
  #   handled below are replaced by a plain lowercase base letter
  # * combining diacritical marks are dropped
  # * anything still outside [A-Za-z0-9_] -- including uppercase accented
  #   Latin-1 letters, which have no rule here -- ends up as '-'
  #
  def asciize(name = self)
    # Work on a private copy (the gsub! calls would otherwise edit the caller's
    # string) tagged as raw bytes, so the byte patterns below are legal and
    # match regardless of the encoding the string arrived with.
    name = name.dup
    name.force_encoding('BINARY') if name.respond_to?(:force_encoding)

    if name =~ /[^\x00-\x7F]/n
      # digraphs.  May be culturally sensitive
      name.gsub!(/\xc3\x9f/n,          'ss')
      name.gsub!(/\xc3\xa4|a\xcc\x88/n, 'ae')
      name.gsub!(/\xc3\xa5|a\xcc\x8a/n, 'aa')
      name.gsub!(/\xc3\xa6/n,          'ae')
      name.gsub!(/\xc3\xb1|n\xcc\x83/n, 'ny')
      name.gsub!(/\xc3\xb6|o\xcc\x88/n, 'oe')
      name.gsub!(/\xc3\xbc|u\xcc\x88/n, 'ue')

      # latin 1
      name.gsub!(/\xc3[\xa0-\xa5]/n,         'a')
      name.gsub!(/\xc3\xa7/n,                'c')
      name.gsub!(/\xc3[\xa8-\xab]/n,         'e')
      name.gsub!(/\xc3[\xac-\xaf]/n,         'i')
      name.gsub!(/\xc3[\xb2-\xb6]|\xc3\xb8/n, 'o')
      name.gsub!(/\xc3[\xb9-\xbc]/n,         'u')
      name.gsub!(/\xc3[\xbd\xbf]/n,          'y')

      # Latin Extended-A
      name.gsub!(/\xc4[\x80-\x85]/n, 'a')
      name.gsub!(/\xc4[\x86-\x8d]/n, 'c')
      name.gsub!(/\xc4[\x8e-\x91]/n, 'd')
      name.gsub!(/\xc4[\x92-\x9b]/n, 'e')
      name.gsub!(/\xc4[\x9c-\xa3]/n, 'g')
      name.gsub!(/\xc4[\xa4-\xa7]/n, 'h')
      name.gsub!(/\xc4[\xa8-\xb1]/n, 'i')
      name.gsub!(/\xc4[\xb2-\xb3]/n, 'ij')
      name.gsub!(/\xc4[\xb4-\xb5]/n, 'j')
      name.gsub!(/\xc4[\xb6-\xb8]/n, 'k')
      name.gsub!(/\xc4[\xb9-\xff]|\xc5[\x80-\x82]/n, 'l')
      name.gsub!(/\xc5[\x83-\x8b]/n, 'n')
      name.gsub!(/\xc5[\x8c-\x91]/n, 'o')
      name.gsub!(/\xc5[\x92-\x93]/n, 'oe')
      name.gsub!(/\xc5[\x94-\x99]/n, 'r')
      # The S forms stop at \xa1; \xa2 (U+0162, T with cedilla) belongs to 't'.
      name.gsub!(/\xc5[\x9a-\xa1]/n, 's')
      name.gsub!(/\xc5[\xa2-\xa7]/n, 't')
      name.gsub!(/\xc5[\xa8-\xb3]/n, 'u')
      name.gsub!(/\xc5[\xb4-\xb5]/n, 'w')
      name.gsub!(/\xc5[\xb6-\xb8]/n, 'y')
      name.gsub!(/\xc5[\xb9-\xbe]/n, 'z')

      # denormalized diacritics
      name.gsub!(/\xcc[\x80-\xff]|\xcd[\x80-\xaf]/n, '')
    end

    # Whatever is left that is not a word character becomes a separator; the
    # result is pure ASCII, so it is safe to tag it as UTF-8 again.
    name = name.gsub(/[^\w]+/, '-')
    name.force_encoding('UTF-8') if name.respond_to?(:force_encoding)
    name
  end

end
|
66
|
+
|
67
|
+
if __FILE__ == $PROGRAM_NAME
|
68
|
+
i18n = "I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xc3\xa0liz\xc3\xa6ti\xc3\xb8n"
|
69
|
+
puts "#{i18n} => #{i18n.asciize}"
|
70
|
+
end
|
71
|
+
|
72
|
+
# http://www.jroller.com/obie/tags/unicode
|
73
|
+
#
|
74
|
+
# require 'iconv'
|
75
|
+
# require 'unicode'
|
76
|
+
#
|
77
|
+
# class String
|
78
|
+
#
|
79
|
+
# def to_ascii
|
80
|
+
# # split in multi-byte aware fashion and translate characters over 127
|
81
|
+
# # and dropping characters not in the translation hash
|
82
|
+
# self.chars.split('').collect { |c| (c[0] <= 127) ? c : translation_hash[c[0]] }.join
|
83
|
+
# end
|
84
|
+
#
|
85
|
+
# def to_url_format
|
86
|
+
# url_format = self.to_ascii
|
87
|
+
# url_format = url_format.gsub(/[^A-Za-z0-9]/, '') # all non-word
|
88
|
+
# url_format.downcase!
|
89
|
+
# url_format
|
90
|
+
# end
|
91
|
+
#
|
92
|
+
# protected
|
93
|
+
#
|
94
|
+
# def translation_hash
|
95
|
+
# @@translation_hash ||= setup_translation_hash
|
96
|
+
# end
|
97
|
+
#
|
98
|
+
# def setup_translation_hash
|
99
|
+
# accented_chars = "ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüý"
|
100
|
+
# unaccented_chars = "AAAAAACEEEEIIIIDNOOOOOxOUUUUYaaaaaaceeeeiiiinoooooouuuuy"
|
101
|
+
#
|
102
|
+
# translation_hash = Hash.zip(accented_chars.chars, unaccented_chars.chars)
|
103
|
+
# translation_hash["Æ".chars[0]] = 'AE'
|
104
|
+
# translation_hash["æ".chars[0]] = 'ae'
|
105
|
+
# translation_hash
|
106
|
+
# end
|
107
|
+
#
|
108
|
+
# end
|
@@ -1,23 +1,31 @@
|
|
1
1
|
require 'time'
|
2
2
|
require 'date'
|
3
|
-
|
3
|
+
|
4
|
+
class Time
|
5
|
+
# strftime() format to flatten a date
|
6
|
+
FLAT_FORMAT = "%Y%m%d%H%M%S"
|
7
|
+
# Flatten
|
8
|
+
def to_flat
|
9
|
+
utc.strftime(FLAT_FORMAT)
|
10
|
+
end
|
11
|
+
|
4
12
|
#
|
5
13
|
# Parses the time but never fails.
|
6
14
|
# Return value is always in the UTC time zone.
|
7
15
|
#
|
8
|
-
# A flattened datetime -- a
|
16
|
+
# A flattened datetime -- a 14-digit YYYYmmddHHMMSS -- is fixed to the UTC
|
9
17
|
# time zone by parsing it as YYYYmmddHHMMSSZ <- 'Z' at end
|
10
18
|
#
|
11
19
|
def self.parse_safely dt
|
12
20
|
return nil if dt.blank?
|
13
21
|
begin
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
22
|
+
case
|
23
|
+
when dt.is_a?(Time) then dt.utc
|
24
|
+
when (dt.to_s =~ /\A\d{14}\z/) then parse(dt.to_s+'Z', true)
|
25
|
+
else parse(dt.to_s, true).utc
|
18
26
|
end
|
19
27
|
rescue StandardError => e
|
20
|
-
Log.
|
28
|
+
Log.debug e
|
21
29
|
end
|
22
30
|
end
|
23
31
|
|
@@ -25,3 +33,21 @@ DateTime.class_eval do
|
|
25
33
|
parse_safely(str).to_flat
|
26
34
|
end
|
27
35
|
end
|
36
|
+
|
37
|
+
class DateTime < Date
  # strftime() format to flatten a datetime: 14 digits, YYYYmmddHHMMSS
  FLAT_FORMAT = "%Y%m%d%H%M%S"

  # Flattens this datetime to a 14-digit YYYYmmddHHMMSS string in UTC.
  #
  # The value is shifted to offset 0 first, matching Time#to_flat (which
  # flattens utc) and Time.parse_safely (which reads a 14-digit flat value as
  # UTC). Without the shift, a DateTime carrying a non-zero offset would not
  # survive a to_flat / parse_safely round trip.
  #
  # Returns a String.
  def to_flat
    new_offset(0).strftime(FLAT_FORMAT)
  end
end

class Date
  # strftime() format to flatten a date: 8 digits, YYYYmmdd
  FLAT_FORMAT = "%Y%m%d"

  # Flattens this date to an 8-digit YYYYmmdd string.
  #
  # Returns a String.
  def to_flat
    strftime(FLAT_FORMAT)
  end
end
|