wukong 0.1.1

Files changed (143)
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
data/bin/hdp-parts_to_keys.rb ADDED
@@ -0,0 +1,77 @@
+ #!/usr/bin/env ruby
+
+ dir_to_rename = ARGV[0]
+ dest_ext = '.tsv'
+
+ unless dir_to_rename && (! dir_to_rename.empty?)
+ warn "Need a directory or file spec to rename."
+ exit
+ end
+
+ #
+ # Setup
+ #
+ warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"
+
+ #
+ # Examine the files
+ #
+ file_listings = `hdp-ls #{dir_to_rename}`.split("\n")
+ command_lists = { }
+ file_listings[1..-1].each do |file_listing|
+ m = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}.match(file_listing)
+ if !m then warn "Couldn't grok #{file_listing}" ; next ; end
+ size, filename = m.captures
+ case
+ when size.to_i == 0 then (command_lists[:deletes]||=[]) << filename
+ else
+ firstline = `hdp-cat #{filename} | head -qn1 `
+ file_key, _ = firstline.split("\t", 2)
+ unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
+ warn "Don't want to rename to '#{file_key}'... skipping"
+ next
+ end
+ dirname = File.dirname(filename)
+ destfile = File.join(dirname, file_key)+dest_ext
+ (command_lists[:moves]||=[]) << "hdp-mv #{filename} #{destfile}"
+ end
+ end
+
+ #
+ # Execute the command_lists
+ #
+ command_lists.each do |type, command_list|
+ case type
+ when :deletes
+ command = "hdp-rm #{command_list.join(" ")}"
+ puts command
+ `#{command}`
+ when :moves
+ command_list.each do |command|
+ puts command
+ `#{command}`
+ end
+ end
+ end
+
+
+ # -rw-r--r-- 3 flip supergroup 0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010
+
+ # # Killing empty files
+ # find . -size 0 -print -exec rm {} \;
+ #
+ # for foo in part-0* ; do
+ # newname=`
+ # head -n1 $foo |
+ # cut -d' ' -f1 |
+ # ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
+ # `.tsv ;
+ # echo "moving $foo to $newname"
+ # mv "$foo" "$newname"
+ # done
+ #
+ # # dir=`basename $PWD`
+ # # for foo in *.tsv ; do
+ # # echo "Compressing $dir"
+ # # bzip2 -c $foo > ../$dir-bz2/$foo.bz2
+ # # done
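A usage sketch for the script above, assuming a standard `hadoop dfs -ls` job-output listing (the sample line in the trailing comment shows the layout the regex expects); the directory path is taken from that same comment:

    # rename the part-NNNNN files under an HDFS output directory to <first_key>.tsv
    hdp-parts_to_keys.rb /user/flip/out/sorted-tweets-20081220
    # empty part files are handed to hdp-rm; the rest are renamed with hdp-mv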
data/bin/hdp-ps ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ hadoop job -list all
data/bin/hdp-put ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ hadoop dfs -put "$1" "$2"
data/bin/hdp-rm ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/env bash
+
+ if [ "$1" == "-r" ] ; then
+ shift
+ action=rmr
+ else
+ action=rm
+ fi
+ echo hadoop dfs -$action "$@"
+ # read -p "Hit ctrl-C to abort or enter to do this...."
+ hadoop dfs -$action "$@"
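A usage sketch; the optional -r flag switches the underlying command from `hadoop dfs -rm` to `hadoop dfs -rmr` for recursive deletes (the paths here are hypothetical):

    hdp-rm out/old-run/part-00000     # remove a single HDFS file
    hdp-rm -r out/old-run             # remove a directory tree recursively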
data/bin/hdp-sort ADDED
@@ -0,0 +1,29 @@
+ #!/usr/bin/env bash
+ # hadoop dfs -rmr out/parsed-followers
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+ map_script=${1-/bin/cat} ; shift
+ reduce_script=${1-/usr/bin/uniq} ; shift
+ fields=${1-2} ; shift
+
+ if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ ${HADOOP_HOME}/bin/hadoop \
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
+ -jobconf map.output.key.field.separator='\t' \
+ -jobconf num.key.fields.for.partition=1 \
+ -jobconf stream.map.output.field.separator='\t' \
+ -jobconf stream.num.map.output.key.fields="$fields" \
+ -mapper "$map_script" \
+ -reducer "$reduce_script" \
+ -input "$input_file" \
+ -output "$output_file" \
+ "$@"
+
+
+ # -jobconf mapred.map.tasks=3 \
+ # -jobconf mapred.reduce.tasks=3 \
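A usage sketch, following the positional order the script actually reads (input, output, mapper, reducer, number of key fields) and assuming a stock streaming jar under HADOOP_HOME; paths and scripts are hypothetical:

    # partition on the first field and sort on the first two, with the default
    # /bin/cat mapper and /usr/bin/uniq reducer
    hdp-sort input/tweets out/tweets-sorted

    # explicit mapper and reducer, sorting on 3 key fields
    hdp-sort input/tweets out/tweets-uniqd /bin/cat /usr/bin/uniq 3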
data/bin/hdp-stream ADDED
@@ -0,0 +1,29 @@
+ #!/usr/bin/env bash
+ # hadoop dfs -rmr out/parsed-followers
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+ map_script=${1-/bin/cat} ; shift
+ reduce_script=${1-/usr/bin/uniq} ; shift
+ fields=${1-2} ; shift
+
+ if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ ${HADOOP_HOME}/bin/hadoop \
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
+ -jobconf map.output.key.field.separator='\t' \
+ -jobconf num.key.fields.for.partition=1 \
+ -jobconf stream.map.output.field.separator='\t' \
+ -jobconf stream.num.map.output.key.fields="$fields" \
+ -mapper "$map_script" \
+ -reducer "$reduce_script" \
+ -input "$input_file" \
+ -output "$output_file" \
+ "$@"
+
+
+ # -jobconf mapred.map.tasks=3 \
+ # -jobconf mapred.reduce.tasks=3 \
data/bin/hdp-stream-flat ADDED
@@ -0,0 +1,18 @@
+ #!/usr/bin/env bash
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+ map_script=${1-/bin/cat} ; shift
+ reduce_script=${1-/usr/bin/uniq} ; shift
+
+ if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi
+
+ hadoop jar /home/flip/hadoop/h/contrib/streaming/hadoop-*-streaming.jar \
+ -mapper "$map_script" \
+ -reducer "$reduce_script" \
+ -input "$input_file" \
+ -output "$output_file" \
+ "$@"
+
+ # -jobconf mapred.map.tasks=3 \
+ # -jobconf mapred.reduce.tasks=3 \
data/bin/hdp-sync ADDED
@@ -0,0 +1,17 @@
+ #!/usr/bin/env ruby
+ require 'wukong'
+
+ src_dir, dest_dir = ARGV[0..1]
+ src_files = Dir[src_dir + '/*']
+ dest_files = Wukong::Dfs.list_files dest_dir
+ Wukong::Dfs.compare_listings(src_files, dest_files) do |comparison, src_file, dest_file|
+ case comparison
+ when :missing
+ dest_filename = "%s/%s" % [dest_dir, dest_file]
+ puts "Copying #{src_file} #{dest_filename}"
+ puts `hadoop dfs -put #{src_file} #{dest_filename}`
+ when :differ
+ src_ls = `ls -l #{src_file}`.split(/\s+/).join("\t")
+ puts "Differ: #{src_ls} \n#{dest_file}"
+ end
+ end
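A usage sketch: the script takes a local source directory and an HDFS destination directory, copies anything missing on the HDFS side with `hadoop dfs -put`, and reports files whose listings differ (paths are hypothetical):

    hdp-sync /data/ripd/tweets /user/flip/ripd/tweets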
data/bin/hdp-wc ADDED
@@ -0,0 +1,67 @@
+ #!/usr/bin/env ruby
+ require 'wukong'
+ NEWLINE_LENGTH = $/.length # KLUDGE
+
+ #
+ #
+ #
+ # !! The +words+ count comes out higher than that of +wc+ -- don't know
+ # why. (It's close: a 10GB, 1M line dataset it showed 367833839 vs. 367713271)
+ #
+ class WcMapper < Wukong::Streamer::LineStreamer
+ attr_accessor :lines, :fields, :words, :chars, :bytes
+
+ def before_stream
+ self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
+ end
+
+ def process line
+ return unless line
+ self.lines += 1
+ self.fields += 1 + line.count("\t")
+ self.words += 1 + line.strip.scan(/\s+/).length unless line.blank?
+ self.chars += line.chars.to_a.length + NEWLINE_LENGTH
+ self.bytes += line.bytesize + NEWLINE_LENGTH
+ $stderr.puts line if (line.chars.to_a.length != line.bytesize)
+ end
+
+ def after_stream
+ emit [lines, fields, words, chars, bytes]
+ end
+ end
+
+ #
+ #
+ class WcReducer < Wukong::Streamer::Base
+ attr_accessor :lines, :fields, :words, :chars, :bytes
+
+ def before_stream
+ self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
+ end
+
+ def process m_lines, m_fields, m_words, m_chars, m_bytes
+ self.lines += m_lines.to_i
+ self.fields += m_fields.to_i
+ self.words += m_words.to_i
+ self.chars += m_chars.to_i
+ self.bytes += m_bytes.to_i
+ end
+
+ def after_stream
+ emit [lines, fields, words, chars, bytes]
+ end
+ end
+
+ Wukong::Script.new(WcMapper, WcReducer, :reduce_tasks => 1).run
+
+ # class FooScript < Wukong::Script
+ # def map_command
+ # '/usr/bin/wc'
+ # end
+ # def reduce_command
+ # '/bin/cat'
+ # end
+ # end
+ # FooScript.new(nil, nil, :reduce_tasks => 1).run
+ #
+ # ruby -ne 'wc_v = `echo "#{$_.chomp}" | wc`; gr_v=($_.strip.empty? ? 0 : $_.strip.scan(/\s+/).length + 1 ) ; puts [wc_v.chomp, " ", gr_v, $_.chomp].join("\t")'
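A local smoke test, assuming the usual Wukong convention that running a script with --map or --reduce makes it act as the streaming job's mapper or reducer (the file name is hypothetical):

    # emulate the streaming job in a shell pipe: map, shuffle-sort, reduce
    cat sample.tsv | ./hdp-wc --map | sort | ./hdp-wc --reduce
    # => one tab-separated row of totals: lines, fields, words, chars, bytes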
data/bin/md5sort ADDED
@@ -0,0 +1,20 @@
+ #!/usr/bin/env python
+ """ sorts lines (or tab-sep records) by md5. (e.g. for train/test splits).
+ optionally prepends with the md5 id too.
+ brendan o'connor - anyall.org - gist.github.com/brendano """
+
+ import hashlib,sys,optparse
+ p = optparse.OptionParser()
+ p.add_option('-k', type='int', default=False)
+ p.add_option('-p', action='store_true')
+ opts,args=p.parse_args()
+
+ lines = sys.stdin.readlines()
+ getter=lambda s: hashlib.md5(s[:-1]).hexdigest()
+ if opts.k:
+ getter=lambda s: hashlib.md5(s[:-1].split("\t")[opts.k-1]).hexdigest()
+
+ lines.sort(key=lambda s: getter(s))
+ for line in lines:
+ if opts.p: line = getter(line) + "\t" + line
+ print line,
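A usage sketch; -k picks which tab-separated field to hash and -p prepends the md5 id to each output line (file names are hypothetical):

    # deterministic shuffle by the md5 of field 1, e.g. for a reproducible train/test split
    cat ratings.tsv | md5sort -k 1 -p > ratings-shuffled.tsv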
data/bin/tabchar ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/env bash
+ # insert a tab char from the command line:
+ # echo "hi$(tabchar)there"
+ # # => "hi there"
+ echo -n -e '\t'
data/bin/uniqc ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ uniq -c | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ "%15s\t" % $1 }'
data/bin/wu-hist ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ sort | uniq -c | sort -rn | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ $1+"\t" }'
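A usage sketch: wu-hist turns any stream of values into a tab-separated frequency table, most frequent first (the input column is hypothetical):

    cut -f6 access_log.tsv | wu-hist | head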
data/bin/wu-lign ADDED
@@ -0,0 +1,177 @@
+ #!/usr/bin/env ruby
+
+ USAGE= %Q{
+ # h1. wulign -- format a tab-separated file as aligned columns
+ #
+ # wulign will intelligently reformat a tab-separated file into a tab-separated,
+ # space aligned file that is still suitable for further processing. For example,
+ # given the log-file input
+ #
+ # <pre><code>
+ # 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
+ # 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
+ # 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
+ # 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
+ # 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
+ # 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
+ # </code></pre>
+ #
+ # wulign will reformat it to read
+ #
+ # <pre><code>
+ # 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
+ # 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
+ # 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
+ # 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
+ # 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
+ # 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
+ # </code></pre>
+ #
+ # The fields are still tab-delimited by exactly one tab -- only spaces are used to
+ # pad out fields. You can still use cuttab and friends to manipulate columns.
+ #
+ # wulign isn't intended to be smart, or correct, or reliable -- only to be
+ # useful for previewing and organizing tab-formatted files. In general
+ # @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically
+ # equivalent to its input. (That is, the only changes should be insertion of
+ # spaces and re-formatting of numerics.) But still -- reserve its use for human
+ # inspection only.
+ #
+ # (Note: tab characters in this source code file have been converted to spaces;
+ # replace whitespace with tab in the first example if you'd like to play along at
+ # home.)
+ #
+ # h2. How it works
+ #
+ # Wulign takes the first 1000 lines, splits by TAB characters into fields, and
+ # tries to guess the format -- int, float, or string -- for each. It builds a
+ # consensus of the width and type for corresponding columns in the chunk. If a
+ # column has mixed numeric and string formats it degrades to :mixed, which is
+ # basically treated as :string. If a column has mixed :float and :int elements all
+ # of them are formatted as float.
+ #
+ # h2. Command-line arguments
+ #
+ # You can give sprintf-style positional arguments on the command line that will be
+ # applied to the corresponding columns. (Blank args are used for placeholding and
+ # auto-formatting is still applied). So with the example above,
+ #
+ # @cat foo | wulign '' '' '' '%8.4e'@
+ #
+ # will format the fourth column with "%8.4e", while the first three columns and
+ # fifth-and-higher columns are formatted as usual.
+ #
+ # <pre><code>
+ # ...
+ # 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
+ # 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
+ # ....
+ # </code></pre>
+ #
+ # h2. Notes
+ #
+ # * It has no knowledge of header rows. An all-text first line will screw everything up.
+ #
+ # * It also requires a unanimous vote. One screwy line can coerce the whole mess
+ # to :mixed; width formatting will still be applied, though.
+ #
+ # * It won't set columns wider than 70 chars -- this allows for the occasional
+ # super-wide column without completely breaking your screen.
+ #
+ # * For :float values, wulign tries to guess at the right number of significant
+ # digits to the left and right of the decimal point.
+ #
+ # * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
+ # or escaping; every tab delimits a field, every newline a record.
+ }
+
+ if ARGV[0] == '--help'
+ puts $0
+ puts USAGE
+ exit
+ end
+
+ #
+ # How many initial lines to use to guess formatting. Lines after this are
+ # simply reformatted according to the consensus of the initial
+ # FORMAT_GUESSING_LINES.
+ #
+ FORMAT_GUESSING_LINES = 500
+ # widest column to set
+ MAX_MAX_WIDTH = 70
+
+ INT_RE = /\A\d+\z/
+ FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/
+
+ def consensus_type val, alltype
+ return :mixed if alltype == :mixed
+ case
+ when val == '' then type = nil
+ when val =~ INT_RE then type = :int
+ when val =~ FLOAT_RE then type = :float
+ else type = :str end
+ return if ! type
+ case
+ when alltype.nil? then type
+ when alltype == type then type
+ when ( ((alltype==:float) && (type == :int)) || ((alltype == :int) && (type == :float)) )
+ :float
+ else :mixed
+ end
+ end
+
+ def f_width str
+ str =~ FLOAT_RE or return 0
+ [$1.length, $2 ? $2.length : 0]
+ end
+
+ maxw = []
+ col_types = []
+ col_minmag = []
+ col_maxmag = []
+ rows = []
+ skip_col = []
+ ARGV.each_with_index{|v,i| next if (v == '') ; maxw[i] = 0; skip_col[i] = true }
+ FORMAT_GUESSING_LINES.times do
+ line = $stdin.readline rescue nil
+ break unless line
+ cols = line.chomp.split("\t").map{|s| s.strip }
+ col_widths = cols.map{|col| col.length }
+ col_widths.each_with_index{|cw,i| maxw[i] = [[cw,maxw[i]].compact.max, MAX_MAX_WIDTH].min }
+ cols.each_with_index{|col,i|
+ next if skip_col[i]
+ col_types[i] = consensus_type(col, col_types[i])
+ if col_types[i] == :float
+ mantissa, radix = f_width(col)
+ col_minmag[i] = [radix, col_minmag[i], 1].compact.max
+ col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
+ end
+ }
+ # p [maxw, col_types, col_minmag, col_maxmag, col_widths, cols]
+ rows << cols
+ end
+
+ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
+ next(lambda{|s| default % s rescue s }) if default.to_s != ''
+ case type
+ when :mixed, nil then lambda{|s| "%-#{width}s" % s }
+ when :str then lambda{|s| "%-#{width}s" % s }
+ when :int then lambda{|s| "%#{width}d" % s.to_i }
+ when :float then lambda{|s| "%#{maxmag+minmag+1}.#{minmag}f" % s.to_f }
+ else raise "oops type #{type}" end
+ end
+ # p [maxw, col_types, col_minmag, col_maxmag, format]
+
+ pad = [''] * maxw.length
+ rows.each do |row|
+ # note -- strips trailing columns
+ puts row.zip(format).map{|c,f| f.call(c) }.join("\t")
+ end
+ $stdin.each do |line|
+ cols = line.chomp.split("\t").map{|s| s.strip }
+ # note -- strips trailing columns
+ puts cols.zip(format).map{|c,f| f.call(c) rescue c }.join("\t")
+ end