wukong 0.1.1
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
@@ -0,0 +1,77 @@
#!/usr/bin/env ruby

dir_to_rename = ARGV[0]
dest_ext      = '.tsv'

unless dir_to_rename && (! dir_to_rename.empty?)
  warn "Need a directory or file spec to rename."
  exit
end

#
# Setup
#
warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"

#
# Examine the files
#
file_listings = `hdp-ls #{dir_to_rename}`.split("\n")
command_lists = { }
file_listings[1..-1].each do |file_listing|
  m = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}.match(file_listing)
  if !m then warn "Couldn't grok #{file_listing}" ; next ; end
  size, filename = m.captures
  case
  when size.to_i == 0 then (command_lists[:deletes]||=[]) << filename
  else
    firstline   = `hdp-cat #{filename} | head -qn1 `
    file_key, _ = firstline.split("\t", 2)
    unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
      warn "Don't want to rename to '#{file_key}'... skipping"
      next
    end
    dirname  = File.dirname(filename)
    destfile = File.join(dirname, file_key)+dest_ext
    (command_lists[:moves]||=[]) << "hdp-mv #{filename} #{destfile}"
  end
end

#
# Execute the command_lists
#
command_lists.each do |type, command_list|
  case type
  when :deletes
    command = "hdp-rm #{command_list.join(" ")}"
    puts command
    `#{command}`
  when :moves
    command_list.each do |command|
      puts command
      `#{command}`
    end
  end
end


# -rw-r--r--   3 flip supergroup   0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010

# # Killing empty files
# find . -size 0 -print -exec rm {} \;
#
# for foo in part-0* ; do
#   newname=`
#     head -n1 $foo |
#     cut -d' ' -f1 |
#     ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
#     `.tsv ;
#   echo "moving $foo to $newname"
#   mv "$foo" "$newname"
# done
#
# # dir=`basename $PWD`
# # for foo in *.tsv ; do
# #   echo "Compressing $dir"
# #   bzip2 -c $foo > ../$dir-bz2/$foo.bz2
# # done
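This is evidently data/bin/hdp-parts_to_keys.rb, the only +77 entry in the file list above: it scans an HDFS output directory, queues zero-length part files for deletion, and renames every other part file after the key found on its first line. A hedged sketch of how it might be invoked (the path is borrowed from the comment in the script and is an example only):

    # rename part-00000, part-00001, ... under this HDFS path to <first-key>.tsv,
    # deleting any empty part files along the way
    hdp-parts_to_keys.rb /user/flip/out/sorted-tweets-20081220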
data/bin/hdp-ps ADDED
data/bin/hdp-put ADDED
data/bin/hdp-rm ADDED
data/bin/hdp-sort ADDED
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# hadoop dfs -rmr out/parsed-followers

input_file=${1}                  ; shift
output_file=${1}                 ; shift
map_script=${1-/bin/cat}         ; shift
reduce_script=${1-/usr/bin/uniq} ; shift
fields=${1-2}                    ; shift

if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi

HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}

${HADOOP_HOME}/bin/hadoop \
    jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -jobconf map.output.key.field.separator='\t' \
    -jobconf num.key.fields.for.partition=1 \
    -jobconf stream.map.output.field.separator='\t' \
    -jobconf stream.num.map.output.key.fields="$fields" \
    -mapper "$map_script" \
    -reducer "$reduce_script" \
    -input "$input_file" \
    -output "$output_file" \
    "$@"


# -jobconf mapred.map.tasks=3 \
# -jobconf mapred.reduce.tasks=3 \
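hdp-sort is a thin wrapper around Hadoop Streaming: it partitions on the first tab-separated key field, sorts on the number of key fields given as the fifth argument, and defaults the mapper to /bin/cat and the reducer to /usr/bin/uniq. A hedged usage sketch (the HDFS paths are hypothetical; anything after the fifth positional argument is handed straight to the streaming jar, as the trailing comments suggest):

    # identity mapper, uniq reducer, sort on the first two tab-separated fields
    hdp-sort out/parsed-followers out/sorted-followers /bin/cat /usr/bin/uniq 2
    # pass extra streaming options through after the positional arguments
    hdp-sort out/parsed-followers out/sorted-followers /bin/cat /usr/bin/uniq 1 -jobconf mapred.reduce.tasks=10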
data/bin/hdp-stream ADDED
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# hadoop dfs -rmr out/parsed-followers

input_file=${1}                  ; shift
output_file=${1}                 ; shift
map_script=${1-/bin/cat}         ; shift
reduce_script=${1-/usr/bin/uniq} ; shift
fields=${1-2}                    ; shift

if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi

HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}

${HADOOP_HOME}/bin/hadoop \
    jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -jobconf map.output.key.field.separator='\t' \
    -jobconf num.key.fields.for.partition=1 \
    -jobconf stream.map.output.field.separator='\t' \
    -jobconf stream.num.map.output.key.fields="$fields" \
    -mapper "$map_script" \
    -reducer "$reduce_script" \
    -input "$input_file" \
    -output "$output_file" \
    "$@"


# -jobconf mapred.map.tasks=3 \
# -jobconf mapred.reduce.tasks=3 \
data/bin/hdp-stream-flat ADDED
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

input_file=${1}                  ; shift
output_file=${1}                 ; shift
map_script=${1-/bin/cat}         ; shift
reduce_script=${1-/usr/bin/uniq} ; shift

if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi

hadoop jar /home/flip/hadoop/h/contrib/streaming/hadoop-*-streaming.jar \
    -mapper "$map_script" \
    -reducer "$reduce_script" \
    -input "$input_file" \
    -output "$output_file" \
    "$@"

# -jobconf mapred.map.tasks=3 \
# -jobconf mapred.reduce.tasks=3 \
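hdp-stream-flat takes the same first four arguments as hdp-sort and hdp-stream but omits the partitioner and key-field settings, so it suits jobs whose mapper and reducer manage their own record structure. A hedged sketch (the script names are hypothetical placeholders for your own streaming executables):

    # run a custom mapper and reducer over an HDFS input directory
    hdp-stream-flat rawd/apache_logs out/log_histogram ./my_mapper.rb ./my_reducer.rb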
data/bin/hdp-sync ADDED
@@ -0,0 +1,17 @@
#!/usr/bin/env ruby
require 'wukong'

src_dir, dest_dir = ARGV[0..1]
src_files  = Dir[src_dir + '/*']
dest_files = Wukong::Dfs.list_files dest_dir
Wukong::Dfs.compare_listings(src_files, dest_files) do |comparison, src_file, dest_file|
  case comparison
  when :missing
    dest_filename = "%s/%s" % [dest_dir, dest_file]
    puts "Copying #{src_file} #{dest_filename}"
    puts `hadoop dfs -put #{src_file} #{dest_filename}`
  when :differ
    src_ls = `ls -l #{src_file}`.split(/\s+/).join("\t")
    puts "Differ: #{src_ls} \n#{dest_file}"
  end
end
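hdp-sync compares a local directory against an HDFS directory through Wukong::Dfs, copying across whatever is :missing and reporting files whose listings :differ. A hedged invocation sketch (both paths are hypothetical):

    # push any local files not yet present in the HDFS directory
    hdp-sync /data/ripd/tweets-20081220 out/tweets-20081220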
data/bin/hdp-wc ADDED
@@ -0,0 +1,67 @@
#!/usr/bin/env ruby
require 'wukong'
NEWLINE_LENGTH = $/.length # KLUDGE

#
#
#
# !! The +words+ count comes out higher than that of +wc+ -- don't know
# why. (It's close: a 10GB, 1M line dataset it showed 367833839 vs. 367713271)
#
class WcMapper < Wukong::Streamer::LineStreamer
  attr_accessor :lines, :fields, :words, :chars, :bytes

  def before_stream
    self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
  end

  def process line
    return unless line
    self.lines  += 1
    self.fields += 1 + line.count("\t")
    self.words  += 1 + line.strip.scan(/\s+/).length unless line.blank?
    self.chars  += line.chars.to_a.length + NEWLINE_LENGTH
    self.bytes  += line.bytesize + NEWLINE_LENGTH
    $stderr.puts line if (line.chars.to_a.length != line.bytesize)
  end

  def after_stream
    emit [lines, fields, words, chars, bytes]
  end
end

#
#
class WcReducer < Wukong::Streamer::Base
  attr_accessor :lines, :fields, :words, :chars, :bytes

  def before_stream
    self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
  end

  def process m_lines, m_fields, m_words, m_chars, m_bytes
    self.lines  += m_lines.to_i
    self.fields += m_fields.to_i
    self.words  += m_words.to_i
    self.chars  += m_chars.to_i
    self.bytes  += m_bytes.to_i
  end

  def after_stream
    emit [lines, fields, words, chars, bytes]
  end
end

Wukong::Script.new(WcMapper, WcReducer, :reduce_tasks => 1).run

# class FooScript < Wukong::Script
#   def map_command
#     '/usr/bin/wc'
#   end
#   def reduce_command
#     '/bin/cat'
#   end
# end
# FooScript.new(nil, nil, :reduce_tasks => 1).run
#
# ruby -ne 'wc_v = `echo "#{$_.chomp}" | wc`; gr_v=($_.strip.empty? ? 0 : $_.strip.scan(/\s+/).length + 1 ) ; puts [wc_v.chomp, " ", gr_v, $_.chomp].join("\t")'
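Because hdp-wc is built on Wukong::Script, it is launched like the other Wukong examples rather than run once per file. If memory of the 0.x launcher serves (treat the --run flag and the paths as assumptions; nothing in this diff confirms them), the invocation looks roughly like:

    # run the WcMapper/WcReducer pair over an HDFS input path (flag and paths are assumptions)
    hdp-wc --run out/tweets-parsed out/tweets-wc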
data/bin/md5sort ADDED
@@ -0,0 +1,20 @@
#!/usr/bin/env python
""" sorts lines (or tab-sep records) by md5. (e.g. for train/test splits).
optionally prepends with the md5 id too.
brendan o'connor - anyall.org - gist.github.com/brendano """

import hashlib,sys,optparse
p = optparse.OptionParser()
p.add_option('-k', type='int', default=False)
p.add_option('-p', action='store_true')
opts,args = p.parse_args()

lines = sys.stdin.readlines()
getter = lambda s: hashlib.md5(s[:-1]).hexdigest()
if opts.k:
  getter = lambda s: hashlib.md5(s[:-1].split("\t")[opts.k-1]).hexdigest()

lines.sort(key=lambda s: getter(s))
for line in lines:
  if opts.p: line = getter(line) + "\t" + line
  print line,
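Both options registered on the parser are useful on their own: -k picks which tab-separated column to hash, and -p prepends the md5 to each output line. A hedged example of deterministically shuffling a TSV file (the file names are hypothetical):

    # order records deterministically but pseudo-randomly by the md5 of column 1, keeping the hash
    md5sort -k 1 -p < records.tsv > records-md5sorted.tsv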
data/bin/tabchar ADDED
data/bin/uniqc ADDED
data/bin/wu-hist ADDED
data/bin/wu-lign ADDED
@@ -0,0 +1,177 @@
#!/usr/bin/env ruby

USAGE= %Q{
# h1. wulign -- format a tab-separated file as aligned columns
#
# wulign will intelligently reformat a tab-separated file into a tab-separated,
# space aligned file that is still suitable for further processing. For example,
# given the log-file input
#
# <pre><code>
#   2009-07-21T21:39:40   day             65536   3.15479       68750   1171316
#   2009-07-21T21:39:45   doing           65536   1.04533       26230   1053956
#   2009-07-21T21:41:53   hapaxlegomenon  65536   0.87574e-05   23707   10051141
#   2009-07-21T21:44:00   concert         500     0.29290       13367   9733414
#   2009-07-21T21:44:29   world           65536   1.09110       32850   200916
#   2009-07-21T21:44:39   world+series    65536   0.49380       9929    7972025
#   2009-07-21T21:44:54   iranelection    65536   2.91775       14592   136342
# </code></pre>
#
# wulign will reformat it to read
#
# <pre><code>
#   2009-07-21T21:39:40   day             65536   3.154791234   68750   1171316
#   2009-07-21T21:39:45   doing           65536   1.045330000   26230   1053956
#   2009-07-21T21:41:53   hapaxlegomenon  65536   0.000008757   23707   10051141
#   2009-07-21T21:44:00   concert           500   0.292900000   13367   9733414
#   2009-07-21T21:44:29   world           65536   1.091100000   32850   200916
#   2009-07-21T21:44:39   world+series    65536   0.493800000    9929   7972025
#   2009-07-21T21:44:54   iranelection    65536   2.917750000   14592   136342
# </code></pre>
#
# The fields are still tab-delimited by exactly one tab -- only spaces are used to
# pad out fields. You can still use cuttab and friends to manipulate columns.
#
# wulign isn't intended to be smart, or correct, or reliable -- only to be
# useful for previewing and organizing tab-formatted files. In general
# @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically
# equivalent to its input. (That is, the only changes should be insertion of
# spaces and re-formatting of numerics.) But still -- reserve its use for human
# inspection only.
#
# (Note: tab characters in this source code file have been converted to spaces;
# replace whitespace with tab in the first example if you'd like to play along at
# home.)
#
# h2. How it works
#
# Wulign takes the first 1000 lines, splits by TAB characters into fields, and
# tries to guess the format -- int, float, or string -- for each. It builds a
# consensus of the width and type for corresponding columns in the chunk. If a
# column has mixed numeric and string formats it degrades to :mixed, which is
# basically treated as :string. If a column has mixed :float and :int elements all
# of them are formatted as float.
#
# h2. Command-line arguments
#
# You can give sprintf-style positional arguments on the command line that will be
# applied to the corresponding columns. (Blank args are used for placeholding and
# auto-formatting is still applied). So with the example above,
#
# @cat foo | wulign '' '' '' '%8.4e'@
#
# will format the fourth column with "%8.4e", while the first three columns and
# fifth-and-higher columns are formatted as usual.
#
# <pre><code>
#   ...
#   2009-07-21T21:39:45   doing           65536   1.0453e+00   26230   1053956
#   2009-07-21T21:41:53   hapaxlegomenon  65536   8.7574e-06   23707   10051141
#   2009-07-21T21:44:00   concert           500   2.9290e-01   13367   9733414
#   ....
# </code></pre>
#
# h2. Notes
#
# * It has no knowledge of header rows. An all-text first line will screw everything up.
#
# * It also requires a unanimous vote. One screwy line can coerce the whole mess
#   to :mixed; width formatting will still be applied, though.
#
# * It won't set columns wider than 70 chars -- this allows for the occasional
#   super-wide column without completely breaking your screen.
#
# * For :float values, wulign tries to guess at the right number of significant
#   digits to the left and right of the decimal point.
#
# * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
#   or escaping; every tab delimits a field, every newline a record.
}

if ARGV[0] == '--help'
  puts $0
  puts USAGE
  exit
end

#
# How many initial lines to use to guess formatting. Lines after this are
# simply reformatted according to the consensus of the initial
# FORMAT_GUESSING_LINES.
#
FORMAT_GUESSING_LINES = 500
# widest column to set
MAX_MAX_WIDTH = 70

INT_RE   = /\A\d+\z/
FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/

def consensus_type val, alltype
  return :mixed if alltype == :mixed
  case
  when val == ''       then type = nil
  when val =~ INT_RE   then type = :int
  when val =~ FLOAT_RE then type = :float
  else                      type = :str end
  return if ! type
  case
  when alltype.nil?    then type
  when alltype == type then type
  when ( ((alltype==:float) && (type == :int)) || ((alltype == :int) && (type == :float)) )
    :float
  else :mixed
  end
end

def f_width str
  str =~ FLOAT_RE or return 0
  [$1.length, $2 ? $2.length : 0]
end

maxw       = []
col_types  = []
col_minmag = []
col_maxmag = []
rows       = []
skip_col   = []
ARGV.each_with_index{|v,i| next if (v == '') ; maxw[i] = 0; skip_col[i] = true }
FORMAT_GUESSING_LINES.times do
  line = $stdin.readline rescue nil
  break unless line
  cols       = line.chomp.split("\t").map{|s| s.strip }
  col_widths = cols.map{|col| col.length }
  col_widths.each_with_index{|cw,i| maxw[i] = [[cw,maxw[i]].compact.max, MAX_MAX_WIDTH].min }
  cols.each_with_index{|col,i|
    next if skip_col[i]
    col_types[i] = consensus_type(col, col_types[i])
    if col_types[i] == :float
      mantissa, radix = f_width(col)
      col_minmag[i] = [radix,    col_minmag[i], 1].compact.max
      col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
    end
  }
  # p [maxw, col_types, col_minmag, col_maxmag, col_widths, cols]
  rows << cols
end

format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
  next(lambda{|s| default % s rescue s }) if default.to_s != ''
  case type
  when :mixed, nil then lambda{|s| "%-#{width}s" % s }
  when :str        then lambda{|s| "%-#{width}s" % s }
  when :int        then lambda{|s| "%#{width}d" % s.to_i }
  when :float      then lambda{|s| "%#{maxmag+minmag+1}.#{minmag}f" % s.to_f }
  else raise "oops type #{type}" end
end
# p [maxw, col_types, col_minmag, col_maxmag, format]

pad = [''] * maxw.length
rows.each do |row|
  # note -- strips trailing columns
  puts row.zip(format).map{|c,f| f.call(c) }.join("\t")
end
$stdin.each do |line|
  cols = line.chomp.split("\t").map{|s| s.strip }
  # note -- strips trailing columns
  puts cols.zip(format).map{|c,f| f.call(c) rescue c }.join("\t")
end
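Since wu-lign (the usage text calls it wulign) only pads with spaces and keeps the single-tab delimiters, it slots naturally at the end of an HDFS inspection pipeline. A hedged example combining it with the hdp-cat helper listed above (the path is hypothetical):

    # eyeball a sample of a tab-separated HDFS output, aligned into columns
    hdp-cat out/word_counts/part-00000 | head -1000 | wu-lign | less -S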