wukong 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ dir_to_rename = ARGV[0]
4
+ dest_ext = '.tsv'
5
+
6
+ unless dir_to_rename && (! dir_to_rename.empty?)
7
+ warn "Need a directory or file spec to rename."
8
+ exit
9
+ end
10
+
11
+ #
12
+ # Setup
13
+ #
14
+ warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"
15
+
16
+ #
17
+ # Examine the files
18
+ #
19
+ file_listings = `hdp-ls #{dir_to_rename}`.split("\n")
20
+ command_lists = { }
21
+ file_listings[1..-1].each do |file_listing|
22
+ m = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}.match(file_listing)
23
+ if !m then warn "Couldn't grok #{file_listing}" ; next ; end
24
+ size, filename = m.captures
25
+ case
26
+ when size.to_i == 0 then (command_lists[:deletes]||=[]) << filename
27
+ else
28
+ firstline = `hdp-cat #{filename} | head -qn1 `
29
+ file_key, _ = firstline.split("\t", 2)
30
+ unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
31
+ warn "Don't want to rename to '#{file_key}'... skipping"
32
+ next
33
+ end
34
+ dirname = File.dirname(filename)
35
+ destfile = File.join(dirname, file_key)+dest_ext
36
+ (command_lists[:moves]||=[]) << "hdp-mv #{filename} #{destfile}"
37
+ end
38
+ end
39
+
40
+ #
41
+ # Execute the command_lists
42
+ #
43
+ command_lists.each do |type, command_list|
44
+ case type
45
+ when :deletes
46
+ command = "hdp-rm #{command_list.join(" ")}"
47
+ puts command
48
+ `#{command}`
49
+ when :moves
50
+ command_list.each do |command|
51
+ puts command
52
+ `#{command}`
53
+ end
54
+ end
55
+ end
56
+
57
+
58
+ # -rw-r--r-- 3 flip supergroup 0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010
59
+
60
+ # # Killing empty files
61
+ # find . -size 0 -print -exec rm {} \;
62
+ #
63
+ # for foo in part-0* ; do
64
+ # newname=`
65
+ # head -n1 $foo |
66
+ # cut -d' ' -f1 |
67
+ # ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
68
+ # `.tsv ;
69
+ # echo "moving $foo to $newname"
70
+ # mv "$foo" "$newname"
71
+ # done
72
+ #
73
+ # # dir=`basename $PWD`
74
+ # # for foo in *.tsv ; do
75
+ # # echo "Compressing $dir"
76
+ # # bzip2 -c $foo > ../$dir-bz2/$foo.bz2
77
+ # # done
data/bin/hdp-ps ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ hadoop job -list all
data/bin/hdp-put ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ hadoop dfs -put "$1" "$2"
data/bin/hdp-rm ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env bash
2
+
3
+ if [ "$1" == "-r" ] ; then
4
+ shift
5
+ action=rmr
6
+ else
7
+ action=rm
8
+ fi
9
+ echo hadoop dfs -$action "$@"
10
+ # read -p "Hit ctrl-C to abort or enter to do this...."
11
+ hadoop dfs -$action "$@"
data/bin/hdp-sort ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env bash
2
+ # hadoop dfs -rmr out/parsed-followers
3
+
4
+ input_file=${1} ; shift
5
+ output_file=${1} ; shift
6
+ map_script=${1-/bin/cat} ; shift
7
+ reduce_script=${1-/usr/bin/uniq} ; shift
8
+ fields=${1-2} ; shift
9
+
10
+ if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi
11
+
12
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
13
+
14
+ ${HADOOP_HOME}/bin/hadoop \
15
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
16
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
17
+ -jobconf map.output.key.field.separator='\t' \
18
+ -jobconf num.key.fields.for.partition=1 \
19
+ -jobconf stream.map.output.field.separator='\t' \
20
+ -jobconf stream.num.map.output.key.fields="$fields" \
21
+ -mapper "$map_script" \
22
+ -reducer "$reduce_script" \
23
+ -input "$input_file" \
24
+ -output "$output_file" \
25
+ "$@"
26
+
27
+
28
+ # -jobconf mapred.map.tasks=3 \
29
+ # -jobconf mapred.reduce.tasks=3 \
data/bin/hdp-stream ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env bash
2
+ # hadoop dfs -rmr out/parsed-followers
3
+
4
+ input_file=${1} ; shift
5
+ output_file=${1} ; shift
6
+ map_script=${1-/bin/cat} ; shift
7
+ reduce_script=${1-/usr/bin/uniq} ; shift
8
+ fields=${1-2} ; shift
9
+
10
+ if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi
11
+
12
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
13
+
14
+ ${HADOOP_HOME}/bin/hadoop \
15
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
16
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
17
+ -jobconf map.output.key.field.separator='\t' \
18
+ -jobconf num.key.fields.for.partition=1 \
19
+ -jobconf stream.map.output.field.separator='\t' \
20
+ -jobconf stream.num.map.output.key.fields="$fields" \
21
+ -mapper "$map_script" \
22
+ -reducer "$reduce_script" \
23
+ -input "$input_file" \
24
+ -output "$output_file" \
25
+ "$@"
26
+
27
+
28
+ # -jobconf mapred.map.tasks=3 \
29
+ # -jobconf mapred.reduce.tasks=3 \
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env bash
2
+
3
+ input_file=${1} ; shift
4
+ output_file=${1} ; shift
5
+ map_script=${1-/bin/cat} ; shift
6
+ reduce_script=${1-/usr/bin/uniq} ; shift
7
+
8
+ if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi
9
+
10
+ hadoop jar /home/flip/hadoop/h/contrib/streaming/hadoop-*-streaming.jar \
11
+ -mapper "$map_script" \
12
+ -reducer "$reduce_script" \
13
+ -input "$input_file" \
14
+ -output "$output_file" \
15
+ "$@"
16
+
17
+ # -jobconf mapred.map.tasks=3 \
18
+ # -jobconf mapred.reduce.tasks=3 \
data/bin/hdp-sync ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+ require 'wukong'
3
+
4
+ src_dir, dest_dir = ARGV[0..1]
5
+ src_files = Dir[src_dir + '/*']
6
+ dest_files = Wukong::Dfs.list_files dest_dir
7
+ Wukong::Dfs.compare_listings(src_files, dest_files) do |comparison, src_file, dest_file|
8
+ case comparison
9
+ when :missing
10
+ dest_filename = "%s/%s" % [dest_dir, dest_file]
11
+ puts "Copying #{src_file} #{dest_filename}"
12
+ puts `hadoop dfs -put #{src_file} #{dest_filename}`
13
+ when :differ
14
+ src_ls = `ls -l #{src_file}`.split(/\s+/).join("\t")
15
+ puts "Differ: #{src_ls} \n#{dest_file}"
16
+ end
17
+ end
data/bin/hdp-wc ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env ruby
2
+ require 'wukong'
3
+ NEWLINE_LENGTH = $/.length # KLUDGE
4
+
5
+ #
6
+ #
7
+ #
8
+ # !! The +words+ count comes out higher than that of +wc+ -- don't know
9
+ # why. (It's close: a 10GB, 1M line dataset it showed 367833839 vs. 367713271)
10
+ #
11
+ class WcMapper < Wukong::Streamer::LineStreamer
12
+ attr_accessor :lines, :fields, :words, :chars, :bytes
13
+
14
+ def before_stream
15
+ self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
16
+ end
17
+
18
+ def process line
19
+ return unless line
20
+ self.lines += 1
21
+ self.fields += 1 + line.count("\t")
22
+ self.words += 1 + line.strip.scan(/\s+/).length unless line.blank?
23
+ self.chars += line.chars.to_a.length + NEWLINE_LENGTH
24
+ self.bytes += line.bytesize + NEWLINE_LENGTH
25
+ $stderr.puts line if (line.chars.to_a.length != line.bytesize)
26
+ end
27
+
28
+ def after_stream
29
+ emit [lines, fields, words, chars, bytes]
30
+ end
31
+ end
32
+
33
+ #
34
+ #
35
+ class WcReducer < Wukong::Streamer::Base
36
+ attr_accessor :lines, :fields, :words, :chars, :bytes
37
+
38
+ def before_stream
39
+ self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
40
+ end
41
+
42
+ def process m_lines, m_fields, m_words, m_chars, m_bytes
43
+ self.lines += m_lines.to_i
44
+ self.fields += m_fields.to_i
45
+ self.words += m_words.to_i
46
+ self.chars += m_chars.to_i
47
+ self.bytes += m_bytes.to_i
48
+ end
49
+
50
+ def after_stream
51
+ emit [lines, fields, words, chars, bytes]
52
+ end
53
+ end
54
+
55
+ Wukong::Script.new(WcMapper, WcReducer, :reduce_tasks => 1).run
56
+
57
+ # class FooScript < Wukong::Script
58
+ # def map_command
59
+ # '/usr/bin/wc'
60
+ # end
61
+ # def reduce_command
62
+ # '/bin/cat'
63
+ # end
64
+ # end
65
+ # FooScript.new(nil, nil, :reduce_tasks => 1).run
66
+ #
67
+ # ruby -ne 'wc_v = `echo "#{$_.chomp}" | wc`; gr_v=($_.strip.empty? ? 0 : $_.strip.scan(/\s+/).length + 1 ) ; puts [wc_v.chomp, " ", gr_v, $_.chomp].join("\t")'
data/bin/md5sort ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env python
2
+ """ sorts lines (or tab-sep records) by md5. (e.g. for train/test splits).
3
+ optionally prepends with the md5 id too.
4
+ brendan o'connor - anyall.org - gist.github.com/brendano """
5
+
6
+ import hashlib,sys,optparse
7
+ p = optparse.OptionParser()
8
+ p.add_option('-k', type='int', default=False)
9
+ p.add_option('-p', action='store_true')
10
+ opts,args=p.parse_args()
11
+
12
+ lines = sys.stdin.readlines()
13
+ getter=lambda s: hashlib.md5(s[:-1]).hexdigest()
14
+ if opts.k:
15
+ getter=lambda s: hashlib.md5(s[:-1].split("\t")[opts.k-1]).hexdigest()
16
+
17
+ lines.sort(key=lambda s: getter(s))
18
+ for line in lines:
19
+ if opts.p: line = getter(line) + "\t" + line
20
+ print line,
data/bin/tabchar ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env bash
2
+ # insert a tab char from the command line:
3
+ # echo "hi$(tabchar)there"
4
+ # # => "hi there"
5
+ echo -n -e '\t'
data/bin/uniqc ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ uniq -c | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ "%15s\t" % $1 }'
data/bin/wu-hist ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ sort | uniq -c | sort -rn | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ $1+"\t" }'
data/bin/wu-lign ADDED
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ USAGE= %Q{
4
+ # h1. wulign -- format a tab-separated file as aligned columns
5
+ #
6
+ # wulign will intelligently reformat a tab-separated file into a tab-separated,
7
+ # space aligned file that is still suitable for further processing. For example,
8
+ # given the log-file input
9
+ #
10
+ # <pre><code>
11
+ # 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
12
+ # 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
13
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
14
+ # 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
15
+ # 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
16
+ # 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
17
+ # 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
18
+ # </code></pre>
19
+ #
20
+ # wulign will reformat it to read
21
+ #
22
+ # <pre><code>
23
+ # 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
24
+ # 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
25
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
26
+ # 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
27
+ # 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
28
+ # 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
29
+ # 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
30
+ # </code></pre>
31
+ #
32
+ # The fields are still tab-delimited by exactly one tab -- only spaces are used to
33
+ # pad out fields. You can still use cuttab and friends to manipulate columns.
34
+ #
35
+ # wulign isn't intended to be smart, or correct, or reliable -- only to be
36
+ # useful for previewing and organizing tab-formatted files. In general
37
+ # @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically
38
+ # equivalent to its input. (That is, the only changes should be insertion of
39
+ # spaces and re-formatting of numerics.) But still -- reserve its use for human
40
+ # inspection only.
41
+ #
42
+ # (Note: tab characters in this source code file have been converted to spaces;
43
+ # replace whitespace with tab in the first example if you'd like to play along at
44
+ # home.)
45
+ #
46
+ # h2. How it works
47
+ #
48
+ # Wulign takes the first 1000 lines, splits by TAB characters into fields, and
49
+ # tries to guess the format -- int, float, or string -- for each. It builds a
50
+ # consensus of the width and type for corresponding columns in the chunk. If a
51
+ # column has mixed numeric and string formats it degrades to :mixed, which is
52
+ # basically treated as :string. If a column has mixed :float and :int elements all
53
+ # of them are formatted as float.
54
+ #
55
+ # h2. Command-line arguments
56
+ #
57
+ # You can give sprintf-style positional arguments on the command line that will be
58
+ # applied to the corresponding columns. (Blank args are used for placeholding and
59
+ # auto-formatting is still applied). So with the example above,
60
+ #
61
+ # @cat foo | wulign '' '' '' '%8.4e'@
62
+ #
63
+ # will format the fourth column with "%8.4e", while the first three columns and
64
+ # fifth-and-higher columns are formatted as usual.
65
+ #
66
+ # <pre><code>
67
+ # ...
68
+ # 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
69
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
70
+ # 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
71
+ # ....
72
+ # </code></pre>
73
+ #
74
+ # h2. Notes
75
+ #
76
+ # * It has no knowledge of header rows. An all-text first line will screw everything up.
77
+ #
78
+ # * It also requires a unanimous vote. One screwy line can coerce the whole mess
79
+ # to :mixed; width formatting will still be applied, though.
80
+ #
81
+ # * It won't set columns wider than 70 chars -- this allows for the occasional
82
+ # super-wide column without completely breaking your screen.
83
+ #
84
+ # * For :float values, wulign tries to guess at the right number of significant
85
+ # digits to the left and right of the decimal point.
86
+ #
87
+ # * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
88
+ # or escaping; every tab delimits a field, every newline a record.
89
+ }
90
+
91
+ if ARGV[0] == '--help'
92
+ puts $0
93
+ puts USAGE
94
+ exit
95
+ end
96
+
97
+ #
98
+ # How many initial lines to use to guess formatting. Lines after this are
99
+ # simply reformatted according to the consensus of the initial
100
+ # FORMAT_GUESSING_LINES.
101
+ #
102
+ FORMAT_GUESSING_LINES = 500
103
+ # widest column to set
104
+ MAX_MAX_WIDTH = 70
105
+
106
+ INT_RE = /\A\d+\z/
107
+ FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/
108
+
109
+ def consensus_type val, alltype
110
+ return :mixed if alltype == :mixed
111
+ case
112
+ when val == '' then type = nil
113
+ when val =~ INT_RE then type = :int
114
+ when val =~ FLOAT_RE then type = :float
115
+ else type = :str end
116
+ return if ! type
117
+ case
118
+ when alltype.nil? then type
119
+ when alltype == type then type
120
+ when ( ((alltype==:float) && (type == :int)) || ((alltype == :int) && (type == :float)) )
121
+ :float
122
+ else :mixed
123
+ end
124
+ end
125
+
126
+ def f_width str
127
+ str =~ FLOAT_RE or return 0
128
+ [$1.length, $2 ? $2.length : 0]
129
+ end
130
+
131
+ maxw = []
132
+ col_types = []
133
+ col_minmag = []
134
+ col_maxmag = []
135
+ rows = []
136
+ skip_col = []
137
+ ARGV.each_with_index{|v,i| next if (v == '') ; maxw[i] = 0; skip_col[i] = true }
138
+ FORMAT_GUESSING_LINES.times do
139
+ line = $stdin.readline rescue nil
140
+ break unless line
141
+ cols = line.chomp.split("\t").map{|s| s.strip }
142
+ col_widths = cols.map{|col| col.length }
143
+ col_widths.each_with_index{|cw,i| maxw[i] = [[cw,maxw[i]].compact.max, MAX_MAX_WIDTH].min }
144
+ cols.each_with_index{|col,i|
145
+ next if skip_col[i]
146
+ col_types[i] = consensus_type(col, col_types[i])
147
+ if col_types[i] == :float
148
+ mantissa, radix = f_width(col)
149
+ col_minmag[i] = [radix, col_minmag[i], 1].compact.max
150
+ col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
151
+ end
152
+ }
153
+ # p [maxw, col_types, col_minmag, col_maxmag, col_widths, cols]
154
+ rows << cols
155
+ end
156
+
157
+ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
158
+ next(lambda{|s| default % s rescue s }) if default.to_s != ''
159
+ case type
160
+ when :mixed, nil then lambda{|s| "%-#{width}s" % s }
161
+ when :str then lambda{|s| "%-#{width}s" % s }
162
+ when :int then lambda{|s| "%#{width}d" % s.to_i }
163
+ when :float then lambda{|s| "%#{maxmag+minmag+1}.#{minmag}f" % s.to_f }
164
+ else raise "oops type #{type}" end
165
+ end
166
+ # p [maxw, col_types, col_minmag, col_maxmag, format]
167
+
168
+ pad = [''] * maxw.length
169
+ rows.each do |row|
170
+ # note -- strips trailing columns
171
+ puts row.zip(format).map{|c,f| f.call(c) }.join("\t")
172
+ end
173
+ $stdin.each do |line|
174
+ cols = line.chomp.split("\t").map{|s| s.strip }
175
+ # note -- strips trailing columns
176
+ puts cols.zip(format).map{|c,f| f.call(c) rescue c }.join("\t")
177
+ end