wukong 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
#
# Renames every non-empty file found in an `hdp-ls` listing after the first
# tab-separated field of that file's first line (plus a '.tsv' extension), and
# removes every zero-length file from the same listing.
#
# Usage: hdp-parts_to_keys.rb dir_or_file_spec
#
require 'shellwords'

dir_to_rename = ARGV[0]
dest_ext      = '.tsv'

unless dir_to_rename && (! dir_to_rename.empty?)
  warn "Need a directory or file spec to rename."
  exit
end

#
# Setup
#
warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"

# One line of listing output; captures the size column and the path.
listing_re = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}

#
# Examine the files
#
file_listings = `hdp-ls #{Shellwords.escape(dir_to_rename)}`.split("\n")
deletes = []   # paths of zero-length files
moves   = []   # [source, destination] pairs

# The first listing line is skipped (presumably a summary header -- confirm
# against hdp-ls output). #drop is safe when the listing is empty, whereas
# [1..-1] would return nil.
file_listings.drop(1).each do |file_listing|
  m = listing_re.match(file_listing)
  unless m
    warn "Couldn't grok #{file_listing}"
    next
  end
  size, filename = m.captures

  if size.to_i == 0
    deletes << filename
    next
  end

  # Only the first line of the file is needed; chomp it so a line holding a
  # single field (no tab) still yields a clean key.
  firstline = `hdp-cat #{Shellwords.escape(filename)} | head -qn1 `
  file_key, _ = firstline.chomp.split("\t", 2)
  unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
    warn "Don't want to rename to '#{file_key}'... skipping"
    next
  end
  destfile = File.join(File.dirname(filename), file_key) + dest_ext
  moves << [filename, destfile]
end

#
# Execute the commands: one hdp-rm for all empty files, one hdp-mv per rename.
# Every path is shell-escaped so names with spaces or metacharacters are passed
# through as single, literal arguments.
#
unless deletes.empty?
  command = "hdp-rm #{deletes.map { |path| Shellwords.escape(path) }.join(' ')}"
  puts command
  `#{command}`
end

moves.each do |source, destination|
  command = "hdp-mv #{Shellwords.escape(source)} #{Shellwords.escape(destination)}"
  puts command
  `#{command}`
end
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# -rw-r--r-- 3 flip supergroup 0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010
|
|
59
|
+
|
|
60
|
+
# # Killing empty files
|
|
61
|
+
# find . -size 0 -print -exec rm {} \;
|
|
62
|
+
#
|
|
63
|
+
# for foo in part-0* ; do
|
|
64
|
+
# newname=`
|
|
65
|
+
# head -n1 $foo |
|
|
66
|
+
# cut -d' ' -f1 |
|
|
67
|
+
# ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
|
|
68
|
+
# `.tsv ;
|
|
69
|
+
# echo "moving $foo to $newname"
|
|
70
|
+
# mv "$foo" "$newname"
|
|
71
|
+
# done
|
|
72
|
+
#
|
|
73
|
+
# # dir=`basename $PWD`
|
|
74
|
+
# # for foo in *.tsv ; do
|
|
75
|
+
# # echo "Compressing $dir"
|
|
76
|
+
# # bzip2 -c $foo > ../$dir-bz2/$foo.bz2
|
|
77
|
+
# # done
|
data/bin/hdp-ps
ADDED
data/bin/hdp-put
ADDED
data/bin/hdp-rm
ADDED
data/bin/hdp-sort
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# hadoop dfs -rmr out/parsed-followers
#
# Runs a Hadoop streaming job that partitions on the first tab-separated field
# and treats the first $fields fields as the map output key.
#
# Usage: hdp-sort input_file output_file [mapper] [reducer] [sort_fields] [args]
#   mapper      defaults to /bin/cat
#   reducer     defaults to /usr/bin/uniq
#   sort_fields defaults to 2
#   args        any remaining arguments are passed through to hadoop

input_file=${1} ; shift
output_file=${1} ; shift
map_script=${1-/bin/cat} ; shift
reduce_script=${1-/usr/bin/uniq} ; shift
fields=${1-2} ; shift

# input and output have no defaults, so they are what must be checked; the
# reducer check is kept for callers that pass an explicit empty string.
if [ -z "$input_file" ] || [ -z "$output_file" ] || [ -z "$reduce_script" ] ; then
  echo "$0 input_file output_file [mapper] [reducer] [sort_fields] [args]" >&2
  exit 1
fi

HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}

# HADOOP_HOME is quoted so a path with spaces survives; the jar glob is left
# outside the quotes so it still expands.
"${HADOOP_HOME}/bin/hadoop" \
  jar "${HADOOP_HOME}"/contrib/streaming/hadoop-*-streaming.jar \
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
  -jobconf map.output.key.field.separator='\t' \
  -jobconf num.key.fields.for.partition=1 \
  -jobconf stream.map.output.field.separator='\t' \
  -jobconf stream.num.map.output.key.fields="$fields" \
  -mapper "$map_script" \
  -reducer "$reduce_script" \
  -input "$input_file" \
  -output "$output_file" \
  "$@"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# -jobconf mapred.map.tasks=3 \
|
|
29
|
+
# -jobconf mapred.reduce.tasks=3 \
|
data/bin/hdp-stream
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# hadoop dfs -rmr out/parsed-followers
#
# Runs a Hadoop streaming job that partitions on the first tab-separated field
# and treats the first $fields fields as the map output key.
#
# Usage: hdp-stream input_file output_file [mapper] [reducer] [sort_fields] [args]
#   mapper      defaults to /bin/cat
#   reducer     defaults to /usr/bin/uniq
#   sort_fields defaults to 2
#   args        any remaining arguments are passed through to hadoop

input_file=${1} ; shift
output_file=${1} ; shift
map_script=${1-/bin/cat} ; shift
reduce_script=${1-/usr/bin/uniq} ; shift
fields=${1-2} ; shift

# input and output have no defaults, so they are what must be checked; the
# reducer check is kept for callers that pass an explicit empty string.
if [ -z "$input_file" ] || [ -z "$output_file" ] || [ -z "$reduce_script" ] ; then
  echo "$0 input_file output_file [mapper] [reducer] [sort_fields] [args]" >&2
  exit 1
fi

HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}

# HADOOP_HOME is quoted so a path with spaces survives; the jar glob is left
# outside the quotes so it still expands.
"${HADOOP_HOME}/bin/hadoop" \
  jar "${HADOOP_HOME}"/contrib/streaming/hadoop-*-streaming.jar \
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
  -jobconf map.output.key.field.separator='\t' \
  -jobconf num.key.fields.for.partition=1 \
  -jobconf stream.map.output.field.separator='\t' \
  -jobconf stream.num.map.output.key.fields="$fields" \
  -mapper "$map_script" \
  -reducer "$reduce_script" \
  -input "$input_file" \
  -output "$output_file" \
  "$@"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# -jobconf mapred.map.tasks=3 \
|
|
29
|
+
# -jobconf mapred.reduce.tasks=3 \
|
data/bin/hdp-stream-flat
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#!/usr/bin/env bash
#
# Runs a plain Hadoop streaming job (no partitioner or key-field settings).
#
# Usage: hdp-stream-flat input_file output_file [mapper] [reducer] [args]
#   mapper  defaults to /bin/cat
#   reducer defaults to /usr/bin/uniq
#   args    any remaining arguments are passed through to hadoop

input_file=${1} ; shift
output_file=${1} ; shift
map_script=${1-/bin/cat} ; shift
reduce_script=${1-/usr/bin/uniq} ; shift

# input and output have no defaults, so they are what must be checked; the
# reducer check is kept for callers that pass an explicit empty string.
if [ -z "$input_file" ] || [ -z "$output_file" ] || [ -z "$reduce_script" ] ; then
  echo "$0 input_file output_file [mapper] [reducer] [args]" >&2
  exit 1
fi

# Locate the streaming jar through HADOOP_HOME rather than one developer's
# home directory. The `hadoop` launcher is still taken from PATH.
HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}

hadoop jar "${HADOOP_HOME}"/contrib/streaming/hadoop-*-streaming.jar \
  -mapper "$map_script" \
  -reducer "$reduce_script" \
  -input "$input_file" \
  -output "$output_file" \
  "$@"
|
|
16
|
+
|
|
17
|
+
# -jobconf mapred.map.tasks=3 \
|
|
18
|
+
# -jobconf mapred.reduce.tasks=3 \
|
data/bin/hdp-sync
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
#
# Compares the files in a local directory with a DFS directory listing, copies
# the ones reported as :missing to the DFS, and prints the ones reported as
# :differ.
#
# Usage: hdp-sync src_dir dest_dir
#
require 'wukong'
require 'shellwords'

src_dir, dest_dir = ARGV[0..1]
# Without both arguments `src_dir + '/*'` would die with a NoMethodError.
if src_dir.to_s.empty? || dest_dir.to_s.empty?
  abort "usage: #{File.basename($0)} src_dir dest_dir"
end

src_files  = Dir[src_dir + '/*']
dest_files = Wukong::Dfs.list_files dest_dir
Wukong::Dfs.compare_listings(src_files, dest_files) do |comparison, src_file, dest_file|
  case comparison
  when :missing
    dest_filename = "%s/%s" % [dest_dir, dest_file]
    puts "Copying #{src_file} #{dest_filename}"
    # Paths are shell-escaped so spaces and metacharacters stay literal.
    puts `hadoop dfs -put #{Shellwords.escape(src_file)} #{Shellwords.escape(dest_filename)}`
  when :differ
    src_ls = `ls -l #{Shellwords.escape(src_file)}`.split(/\s+/).join("\t")
    puts "Differ: #{src_ls} \n#{dest_file}"
  end
end
|
data/bin/hdp-wc
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'wukong'
|
|
3
|
+
NEWLINE_LENGTH = $/.length # KLUDGE
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
#
|
|
7
|
+
#
|
|
8
|
+
# !! The +words+ count comes out higher than that of +wc+ -- don't know
|
|
9
|
+
# why. (It's close: a 10GB, 1M line dataset it showed 367833839 vs. 367713271)
|
|
10
|
+
#
|
|
11
|
+
# Mapper half of the word count: keeps running tallies of lines, tab-separated
# fields, whitespace-separated words, characters and bytes over every line it
# is handed, and emits the five totals once the stream is done.
class WcMapper < Wukong::Streamer::LineStreamer
  attr_accessor :lines, :fields, :words, :chars, :bytes

  # Zero all five tallies.
  def before_stream
    self.lines  = 0
    self.fields = 0
    self.words  = 0
    self.chars  = 0
    self.bytes  = 0
  end

  # Fold one line into the tallies. NEWLINE_LENGTH is added to the char and
  # byte counts on every line.
  def process line
    return unless line
    char_count = line.chars.to_a.length
    byte_count = line.bytesize
    self.lines  += 1
    self.fields += line.count("\t") + 1
    self.words  += line.strip.scan(/\s+/).length + 1 unless line.blank?
    self.chars  += char_count + NEWLINE_LENGTH
    self.bytes  += byte_count + NEWLINE_LENGTH
    # Lines whose character and byte counts differ are echoed to stderr.
    $stderr.puts line if char_count != byte_count
  end

  # Emit the totals as a single record.
  def after_stream
    emit [lines, fields, words, chars, bytes]
  end
end
|
|
32
|
+
|
|
33
|
+
#
|
|
34
|
+
#
|
|
35
|
+
# Reducer half of the word count: adds up the five per-mapper totals it
# receives and emits the grand totals once the stream is done.
class WcReducer < Wukong::Streamer::Base
  attr_accessor :lines, :fields, :words, :chars, :bytes

  # Zero all five totals.
  def before_stream
    self.lines  = 0
    self.fields = 0
    self.words  = 0
    self.chars  = 0
    self.bytes  = 0
  end

  # Add one record of mapper totals; each value is converted with #to_i.
  def process m_lines, m_fields, m_words, m_chars, m_bytes
    self.lines  = lines  + m_lines.to_i
    self.fields = fields + m_fields.to_i
    self.words  = words  + m_words.to_i
    self.chars  = chars  + m_chars.to_i
    self.bytes  = bytes  + m_bytes.to_i
  end

  # Emit the grand totals as a single record.
  def after_stream
    emit [lines, fields, words, chars, bytes]
  end
end
|
|
54
|
+
|
|
55
|
+
Wukong::Script.new(WcMapper, WcReducer, :reduce_tasks => 1).run
|
|
56
|
+
|
|
57
|
+
# class FooScript < Wukong::Script
|
|
58
|
+
# def map_command
|
|
59
|
+
# '/usr/bin/wc'
|
|
60
|
+
# end
|
|
61
|
+
# def reduce_command
|
|
62
|
+
# '/bin/cat'
|
|
63
|
+
# end
|
|
64
|
+
# end
|
|
65
|
+
# FooScript.new(nil, nil, :reduce_tasks => 1).run
|
|
66
|
+
#
|
|
67
|
+
# ruby -ne 'wc_v = `echo "#{$_.chomp}" | wc`; gr_v=($_.strip.empty? ? 0 : $_.strip.scan(/\s+/).length + 1 ) ; puts [wc_v.chomp, " ", gr_v, $_.chomp].join("\t")'
|
data/bin/md5sort
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!/usr/bin/env python
""" sorts lines (or tab-sep records) by md5.  (e.g. for train/test splits).
optionally prepends with the md5 id too.
brendan o'connor - anyall.org - gist.github.com/brendano

options:
  -k N   hash only the N-th (1-based) tab-separated field, not the whole line
  -p     prefix every output line with its md5 hex digest and a tab
"""

import hashlib,sys,optparse
p = optparse.OptionParser()
p.add_option('-k', type='int', default=False)
p.add_option('-p', action='store_true')
opts,args=p.parse_args()

# Work on raw bytes: Python 3 exposes them as sys.stdin.buffer, while on
# Python 2 the plain stream already yields byte strings.
stdin  = getattr(sys.stdin,  'buffer', sys.stdin)
stdout = getattr(sys.stdout, 'buffer', sys.stdout)

# Strip the line terminator explicitly instead of slicing off the last byte,
# so a final line without a trailing newline keeps its last character.
records = [s.rstrip(b"\n") for s in stdin.readlines()]

def getter(rec):
  """md5 hex digest of the whole record, or of field opts.k when -k is given."""
  key = rec.split(b"\t")[opts.k-1] if opts.k else rec
  return hashlib.md5(key).hexdigest()

records.sort(key=getter)
for rec in records:
  if opts.p: rec = getter(rec).encode('ascii') + b"\t" + rec
  # Always terminate the record, so one that arrived without a newline is not
  # glued to the record that follows it.
  stdout.write(rec + b"\n")
|
data/bin/tabchar
ADDED
data/bin/uniqc
ADDED
data/bin/wu-hist
ADDED
data/bin/wu-lign
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
USAGE= %Q{
|
|
4
|
+
# h1. wulign -- format a tab-separated file as aligned columns
|
|
5
|
+
#
|
|
6
|
+
# wulign will intelligently reformat a tab-separated file into a tab-separated,
|
|
7
|
+
# space aligned file that is still suitable for further processing. For example,
|
|
8
|
+
# given the log-file input
|
|
9
|
+
#
|
|
10
|
+
# <pre><code>
|
|
11
|
+
# 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
|
|
12
|
+
# 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
|
|
13
|
+
# 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
|
|
14
|
+
# 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
|
|
15
|
+
# 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
|
|
16
|
+
# 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
|
|
17
|
+
# 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
|
|
18
|
+
# </code></pre>
|
|
19
|
+
#
|
|
20
|
+
# wulign will reformat it to read
|
|
21
|
+
#
|
|
22
|
+
# <pre><code>
|
|
23
|
+
# 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
|
|
24
|
+
# 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
|
|
25
|
+
# 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
|
|
26
|
+
# 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
|
|
27
|
+
# 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
|
|
28
|
+
# 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
|
|
29
|
+
# 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
|
|
30
|
+
# </code></pre>
|
|
31
|
+
#
|
|
32
|
+
# The fields are still tab-delimited by exactly one tab -- only spaces are used to
|
|
33
|
+
# pad out fields. You can still use cuttab and friends to manipulate columns.
|
|
34
|
+
#
|
|
35
|
+
# wulign isn't intended to be smart, or correct, or reliable -- only to be
|
|
36
|
+
# useful for previewing and organizing tab-formatted files. In general
|
|
37
|
+
# @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically
|
|
38
|
+
# equivalent to its input. (That is, the only changes should be insertion of
|
|
39
|
+
# spaces and re-formatting of numerics.) But still -- reserve its use for human
|
|
40
|
+
# inspection only.
|
|
41
|
+
#
|
|
42
|
+
# (Note: tab characters in this source code file have been converted to spaces;
|
|
43
|
+
# replace whitespace with tab in the first example if you'd like to play along at
|
|
44
|
+
# home.)
|
|
45
|
+
#
|
|
46
|
+
# h2. How it works
|
|
47
|
+
#
|
|
48
|
+
# Wulign takes the first 500 lines, splits by TAB characters into fields, and
|
|
49
|
+
# tries to guess the format -- int, float, or string -- for each. It builds a
|
|
50
|
+
# consensus of the width and type for corresponding columns in the chunk. If a
|
|
51
|
+
# column has mixed numeric and string formats it degrades to :mixed, which is
|
|
52
|
+
# basically treated as :string. If a column has mixed :float and :int elements all
|
|
53
|
+
# of them are formatted as float.
|
|
54
|
+
#
|
|
55
|
+
# h2. Command-line arguments
|
|
56
|
+
#
|
|
57
|
+
# You can give sprintf-style positional arguments on the command line that will be
|
|
58
|
+
# applied to the corresponding columns. (Blank args are used for placeholding and
|
|
59
|
+
# auto-formatting is still applied). So with the example above,
|
|
60
|
+
#
|
|
61
|
+
# @cat foo | wulign '' '' '' '%8.4e'@
|
|
62
|
+
#
|
|
63
|
+
# will format the fourth column with "%8.4e", while the first three columns and
|
|
64
|
+
# fifth-and-higher columns are formatted as usual.
|
|
65
|
+
#
|
|
66
|
+
# <pre><code>
|
|
67
|
+
# ...
|
|
68
|
+
# 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
|
|
69
|
+
# 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
|
|
70
|
+
# 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
|
|
71
|
+
# ....
|
|
72
|
+
# </code></pre>
|
|
73
|
+
#
|
|
74
|
+
# h2. Notes
|
|
75
|
+
#
|
|
76
|
+
# * It has no knowledge of header rows. An all-text first line will screw everything up.
|
|
77
|
+
#
|
|
78
|
+
# * It also requires a unanimous vote. One screwy line can coerce the whole mess
|
|
79
|
+
# to :mixed; width formatting will still be applied, though.
|
|
80
|
+
#
|
|
81
|
+
# * It won't set columns wider than 70 chars -- this allows for the occasional
|
|
82
|
+
# super-wide column without completely breaking your screen.
|
|
83
|
+
#
|
|
84
|
+
# * For :float values, wulign tries to guess at the right number of significant
|
|
85
|
+
# digits to the left and right of the decimal point.
|
|
86
|
+
#
|
|
87
|
+
# * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
|
|
88
|
+
# or escaping; every tab delimits a field, every newline a record.
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
if ARGV[0] == '--help'
|
|
92
|
+
puts $0
|
|
93
|
+
puts USAGE
|
|
94
|
+
exit
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
#
|
|
98
|
+
# How many initial lines to use to guess formatting. Lines after this are
|
|
99
|
+
# simply reformatted according to the consensus of the initial
|
|
100
|
+
# FORMAT_GUESSING_LINES.
|
|
101
|
+
#
|
|
102
|
+
FORMAT_GUESSING_LINES = 500
|
|
103
|
+
# widest column to set
|
|
104
|
+
MAX_MAX_WIDTH = 70
|
|
105
|
+
|
|
106
|
+
# An integer cell: digits only, with an optional leading minus sign.
INT_RE   = /\A-?\d+\z/
# A float cell: optional minus sign, integer digits, optional fraction and
# optional exponent (e.g. "1.0453e+00", "0.87574e-05").
# Capture 1 is the integer part including any sign; capture 2 is the fraction
# digits (nil when there is no fraction).
FLOAT_RE = /\A(-?\d+)(?:\.(\d+))?(?:[eE][-+]?\d+)?\z/
|
|
108
|
+
|
|
109
|
+
#
# Folds one cell into the running type guess for its column.
#
# val     -- the cell text
# alltype -- the consensus so far: nil (nothing seen yet), :int, :float, :str
#            or :mixed
#
# Returns the new consensus:
# * :mixed is sticky -- once a column is :mixed it stays :mixed.
# * a blank cell says nothing about the type, so the consensus is returned
#   unchanged (it must not be reset, or a later cell could re-type the column)
# * :int and :float together settle on :float
# * any other disagreement gives :mixed
#
def consensus_type val, alltype
  return :mixed if alltype == :mixed
  type =
    case val
    when ''       then nil
    when INT_RE   then :int
    when FLOAT_RE then :float
    else               :str
    end
  return alltype if type.nil?
  return type    if alltype.nil? || alltype == type
  numeric_pair = [alltype, type].include?(:int) && [alltype, type].include?(:float)
  numeric_pair ? :float : :mixed
end
|
|
125
|
+
|
|
126
|
+
#
# Measures a float-looking cell.
#
# Returns a two-element array: the length of FLOAT_RE's first capture (the
# digits before the decimal point) and the length of its second capture (the
# digits after it, 0 when there is no fraction).
#
# A cell that does not match FLOAT_RE gives [0, 0], so the result can always
# be destructured into two numbers.
#
def f_width str
  m = FLOAT_RE.match(str.to_s) or return [0, 0]
  [m[1].length, m[2] ? m[2].length : 0]
end
|
|
130
|
+
|
|
131
|
+
#
# Builds the formatter for one column: a lambda that takes the cell text and
# returns the padded / re-formatted text.
#
# width   -- widest cell seen in the column (already capped)
# type    -- consensus type: :int, :float, :str, :mixed or nil
# minmag  -- digits to print after the decimal point (:float columns)
# maxmag  -- digits to allow before the decimal point (:float columns)
# default -- sprintf format given on the command line; when non-blank it
#            overrides everything else, and a cell it cannot format is passed
#            through unchanged
#
# Blank cells in :int and :float columns are padded with spaces rather than
# coerced to 0, so an empty field stays empty in the output.
#
def column_formatter width, type, minmag, maxmag, default
  unless default.to_s == ''
    return lambda do |s|
      begin
        default % s
      rescue StandardError
        s
      end
    end
  end

  case type
  when :mixed, :str, nil
    lambda { |s| "%-#{width}s" % s }
  when :int
    lambda { |s| s.to_s.empty? ? "%#{width}s" % '' : "%#{width}d" % s.to_i }
  when :float
    total = maxmag.to_i + minmag.to_i + 1   # +1 for the decimal point
    lambda { |s| s.to_s.empty? ? "%#{total}s" % '' : "%#{total}.#{minmag.to_i}f" % s.to_f }
  else
    raise "oops type #{type}"
  end
end

maxw       = []   # per-column widest cell, capped at MAX_MAX_WIDTH
col_types  = []   # per-column consensus type
col_minmag = []   # per-column digits after the decimal point
col_maxmag = []   # per-column digits before the decimal point
rows       = []   # the buffered guessing rows, already split and stripped
skip_col   = []   # true for columns with an explicit command-line format

# Columns given a non-blank format on the command line are not type-guessed.
ARGV.each_with_index do |fmt, i|
  next if fmt == ''
  maxw[i]     = 0
  skip_col[i] = true
end

#
# Pass 1: read up to FORMAT_GUESSING_LINES lines, recording width and type
# consensus for every column.
#
FORMAT_GUESSING_LINES.times do
  line = $stdin.gets   # nil at end of input
  break unless line
  cols = line.chomp.split("\t").map { |s| s.strip }
  cols.each_with_index do |col, i|
    maxw[i] = [[col.length, maxw[i]].compact.max, MAX_MAX_WIDTH].min
    next if skip_col[i]
    col_types[i] = consensus_type(col, col_types[i])
    if col_types[i] == :float
      mantissa, radix = f_width(col)
      col_minmag[i] = [radix,    col_minmag[i], 1].compact.max
      col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
    end
  end
  rows << cols
end

format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
  column_formatter(width, type, minmag, maxmag, default)
end

#
# Pass 2: print the buffered rows, then stream the rest of the input through
# the same formatters.
#
rows.each do |row|
  # note -- strips trailing columns
  puts row.zip(format).map { |c, f| f.call(c) }.join("\t")
end

$stdin.each do |line|
  cols = line.chomp.split("\t").map { |s| s.strip }
  # note -- strips trailing columns
  formatted = cols.zip(format).map do |c, f|
    begin
      # f is nil for columns beyond those seen while guessing; such cells, and
      # any cell its formatter rejects, are passed through unchanged.
      f.call(c)
    rescue StandardError
      c
    end
  end
  puts formatted.join("\t")
end
|