ul-wukong 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +60 -0
  3. data/.gitmodules +6 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +19 -0
  6. data/.yardopts +6 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +17 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE.md +95 -0
  11. data/NOTES-travis.md +31 -0
  12. data/README-old.md +422 -0
  13. data/README.md +1308 -0
  14. data/Rakefile +28 -0
  15. data/TODO.md +99 -0
  16. data/bin/cutc +30 -0
  17. data/bin/cuttab +5 -0
  18. data/bin/greptrue +6 -0
  19. data/bin/md5sort +20 -0
  20. data/bin/setcat +11 -0
  21. data/bin/tabchar +5 -0
  22. data/bin/uniq-ord +59 -0
  23. data/bin/uniqc +3 -0
  24. data/bin/wu +34 -0
  25. data/bin/wu-clean-encoding +31 -0
  26. data/bin/wu-date +13 -0
  27. data/bin/wu-datetime +13 -0
  28. data/bin/wu-hist +3 -0
  29. data/bin/wu-lign +186 -0
  30. data/bin/wu-local +4 -0
  31. data/bin/wu-plus +9 -0
  32. data/bin/wu-source +5 -0
  33. data/bin/wu-sum +31 -0
  34. data/diagrams/wu_local.dot +39 -0
  35. data/diagrams/wu_local.dot.png +0 -0
  36. data/examples/Gemfile +38 -0
  37. data/examples/README.md +9 -0
  38. data/examples/basic/string_reverser.rb +23 -0
  39. data/examples/basic/tiny_count.rb +8 -0
  40. data/examples/basic/word_count/accumulator.rb +26 -0
  41. data/examples/basic/word_count/tokenizer.rb +13 -0
  42. data/examples/basic/word_count/word_count.rb +6 -0
  43. data/examples/dataflow/scraper_macro_flow.rb +28 -0
  44. data/examples/deploy_pack/Gemfile +6 -0
  45. data/examples/deploy_pack/README.md +6 -0
  46. data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
  47. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  48. data/examples/deploy_pack/config/environment.rb +1 -0
  49. data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
  50. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  51. data/examples/dsl/dataflow/simple.rb +12 -0
  52. data/examples/dsl/dataflow/telegram.rb +45 -0
  53. data/examples/dsl/workflow/cherry_pie.dot +97 -0
  54. data/examples/dsl/workflow/cherry_pie.md +104 -0
  55. data/examples/dsl/workflow/cherry_pie.png +0 -0
  56. data/examples/dsl/workflow/cherry_pie.rb +101 -0
  57. data/examples/empty/.gitkeep +0 -0
  58. data/examples/examples_helper.rb +9 -0
  59. data/examples/geo.rb +4 -0
  60. data/examples/geo/geo_grids.numbers +0 -0
  61. data/examples/geo/geolocated.rb +331 -0
  62. data/examples/geo/quadtile.rb +69 -0
  63. data/examples/geo/spec/geolocated_spec.rb +247 -0
  64. data/examples/geo/tile_fetcher.rb +77 -0
  65. data/examples/graph/implied_geolocation/README.md +63 -0
  66. data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
  67. data/examples/improver/tweet_summary.rb +73 -0
  68. data/examples/loadable.rb +2 -0
  69. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  70. data/examples/munging/airline_flights/airplane.rb +0 -0
  71. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  72. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  73. data/examples/munging/airline_flights/indexable.rb +75 -0
  74. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  75. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  76. data/examples/munging/airline_flights/tasks.rake +83 -0
  77. data/examples/munging/airline_flights/topcities.rb +167 -0
  78. data/examples/munging/geo/geo_json.rb +54 -0
  79. data/examples/munging/geo/geo_models.rb +69 -0
  80. data/examples/munging/geo/geonames_models.rb +107 -0
  81. data/examples/munging/geo/iso_codes.rb +172 -0
  82. data/examples/munging/geo/reconcile_countries.rb +124 -0
  83. data/examples/munging/geo/tasks.rake +71 -0
  84. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  85. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  86. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  87. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  88. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  89. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  90. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  91. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
  92. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  93. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  94. data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
  95. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  96. data/examples/rake_helper.rb +97 -0
  97. data/examples/ruby_project/Gemfile +6 -0
  98. data/examples/ruby_project/README.md +6 -0
  99. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  100. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  101. data/examples/server_logs/logline.rb +95 -0
  102. data/examples/server_logs/models.rb +66 -0
  103. data/examples/server_logs/page_counts.pig +48 -0
  104. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  105. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  106. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  107. data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
  108. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  109. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  110. data/examples/serverlogs/models/logline.rb +102 -0
  111. data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
  112. data/examples/serverlogs/visit_paths/common.rb +4 -0
  113. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  114. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  115. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  116. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  117. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  118. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  119. data/examples/splitter.rb +94 -0
  120. data/examples/string_reverser.rb +7 -0
  121. data/examples/text/pig_latin/pig_latinizer.rb +35 -0
  122. data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
  123. data/examples/text/regional_flavor/README.md +14 -0
  124. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  125. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  126. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  127. data/examples/twitter.rb +5 -0
  128. data/lib/hanuman.rb +36 -0
  129. data/lib/hanuman/graph.rb +97 -0
  130. data/lib/hanuman/graphvizzer.rb +206 -0
  131. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  132. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  133. data/lib/hanuman/link.rb +35 -0
  134. data/lib/hanuman/registry.rb +46 -0
  135. data/lib/hanuman/stage.rb +128 -0
  136. data/lib/hanuman/tree.rb +67 -0
  137. data/lib/wu/geo.rb +4 -0
  138. data/lib/wu/geo/geo_grids.numbers +0 -0
  139. data/lib/wu/geo/geolocated.rb +331 -0
  140. data/lib/wu/geo/quadtile.rb +69 -0
  141. data/lib/wu/graph/union_find.rb +62 -0
  142. data/lib/wu/model/reconcilable.rb +63 -0
  143. data/lib/wu/munging.rb +71 -0
  144. data/lib/wu/social/models/twitter.rb +31 -0
  145. data/lib/wu/wikipedia/models.rb +20 -0
  146. data/lib/wukong.rb +54 -0
  147. data/lib/wukong/dataflow.rb +43 -0
  148. data/lib/wukong/doc_helpers.rb +14 -0
  149. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  150. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  151. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  152. data/lib/wukong/driver.rb +214 -0
  153. data/lib/wukong/driver/event_machine_driver.rb +15 -0
  154. data/lib/wukong/driver/wiring.rb +68 -0
  155. data/lib/wukong/local.rb +42 -0
  156. data/lib/wukong/local/runner.rb +96 -0
  157. data/lib/wukong/local/stdio_driver.rb +104 -0
  158. data/lib/wukong/logger.rb +102 -0
  159. data/lib/wukong/model/faker.rb +136 -0
  160. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  161. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  162. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  163. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  164. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  165. data/lib/wukong/plugin.rb +48 -0
  166. data/lib/wukong/processor.rb +110 -0
  167. data/lib/wukong/rake_helper.rb +6 -0
  168. data/lib/wukong/runner.rb +169 -0
  169. data/lib/wukong/runner/boot_sequence.rb +123 -0
  170. data/lib/wukong/runner/code_loader.rb +52 -0
  171. data/lib/wukong/runner/command_runner.rb +44 -0
  172. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  173. data/lib/wukong/runner/help_message.rb +42 -0
  174. data/lib/wukong/source.rb +33 -0
  175. data/lib/wukong/source/source_driver.rb +74 -0
  176. data/lib/wukong/source/source_runner.rb +38 -0
  177. data/lib/wukong/spec_helpers.rb +74 -0
  178. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  179. data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
  180. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  181. data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
  182. data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
  183. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
  184. data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
  185. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
  186. data/lib/wukong/version.rb +3 -0
  187. data/lib/wukong/widget/echo.rb +55 -0
  188. data/lib/wukong/widget/extract.rb +122 -0
  189. data/lib/wukong/widget/filters.rb +452 -0
  190. data/lib/wukong/widget/logger.rb +56 -0
  191. data/lib/wukong/widget/operators.rb +82 -0
  192. data/lib/wukong/widget/reducers.rb +10 -0
  193. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  194. data/lib/wukong/widget/reducers/bin.rb +368 -0
  195. data/lib/wukong/widget/reducers/count.rb +73 -0
  196. data/lib/wukong/widget/reducers/group.rb +128 -0
  197. data/lib/wukong/widget/reducers/group_concat.rb +98 -0
  198. data/lib/wukong/widget/reducers/improver.rb +71 -0
  199. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  200. data/lib/wukong/widget/reducers/moments.rb +72 -0
  201. data/lib/wukong/widget/reducers/sort.rb +180 -0
  202. data/lib/wukong/widget/reducers/uniq.rb +91 -0
  203. data/lib/wukong/widget/serializers.rb +317 -0
  204. data/lib/wukong/widget/utils.rb +46 -0
  205. data/lib/wukong/widgets.rb +7 -0
  206. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  207. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  208. data/spec/examples/dataflow/parsing_spec.rb +14 -0
  209. data/spec/examples/dataflow/simple_spec.rb +34 -0
  210. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  211. data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
  212. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  213. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  214. data/spec/examples/text/pig_latin_spec.rb +18 -0
  215. data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
  216. data/spec/hanuman/graph_spec.rb +119 -0
  217. data/spec/hanuman/hanuman_spec.rb +10 -0
  218. data/spec/hanuman/registry_spec.rb +123 -0
  219. data/spec/hanuman/stage_spec.rb +81 -0
  220. data/spec/hanuman/tree_spec.rb +119 -0
  221. data/spec/spec.opts +1 -0
  222. data/spec/spec_helper.rb +43 -0
  223. data/spec/support/example_test_helpers.rb +95 -0
  224. data/spec/support/hanuman_test_helpers.rb +92 -0
  225. data/spec/support/integration_helper.rb +38 -0
  226. data/spec/support/model_test_helpers.rb +115 -0
  227. data/spec/support/shared_context_for_graphs.rb +57 -0
  228. data/spec/support/shared_context_for_reducers.rb +37 -0
  229. data/spec/support/shared_examples_for_builders.rb +94 -0
  230. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  231. data/spec/wu/model/reconcilable_spec.rb +152 -0
  232. data/spec/wukong/dataflow_spec.rb +87 -0
  233. data/spec/wukong/driver_spec.rb +154 -0
  234. data/spec/wukong/local/runner_spec.rb +29 -0
  235. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  236. data/spec/wukong/local_spec.rb +6 -0
  237. data/spec/wukong/logger_spec.rb +49 -0
  238. data/spec/wukong/model/faker_spec.rb +132 -0
  239. data/spec/wukong/processor_spec.rb +21 -0
  240. data/spec/wukong/runner_spec.rb +132 -0
  241. data/spec/wukong/source_spec.rb +6 -0
  242. data/spec/wukong/widget/extract_spec.rb +101 -0
  243. data/spec/wukong/widget/filters_spec.rb +79 -0
  244. data/spec/wukong/widget/logger_spec.rb +23 -0
  245. data/spec/wukong/widget/operators_spec.rb +25 -0
  246. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  247. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  248. data/spec/wukong/widget/reducers/group_spec.rb +21 -0
  249. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  250. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  251. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  252. data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
  253. data/spec/wukong/widget/serializers_spec.rb +114 -0
  254. data/spec/wukong/widget/sink_spec.rb +19 -0
  255. data/spec/wukong/widget/source_spec.rb +65 -0
  256. data/spec/wukong/wu-local_spec.rb +109 -0
  257. data/spec/wukong/wu-source_spec.rb +32 -0
  258. data/spec/wukong/wu_spec.rb +14 -0
  259. data/spec/wukong/wukong_spec.rb +10 -0
  260. data/wukong.gemspec +35 -0
  261. metadata +465 -0
@@ -0,0 +1,28 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
3
+
4
+ require 'rspec/core/rake_task'
5
+ RSpec::Core::RakeTask.new(:specs)
6
+
7
+ require 'yard'
8
+ YARD::Rake::YardocTask.new
9
+
10
+ desc 'Run RSpec with code coverage'
11
+ task :cov do
12
+ ENV['WUKONG_COV'] = 'true' # ENV values must be Strings; assigning the boolean `true` raises TypeError
13
+ Rake::Task[:specs].execute
14
+ end
15
+
16
+ task :default => :specs
17
+
18
+ desc "Create a TAGS file for this project"
19
+ task :tags do
20
+ files = [%w[Gemfile Guardfile Rakefile README.md].map { |b| File.join(File.dirname(__FILE__), b) }]
21
+ %w[bin examples lib spec].each do |dir|
22
+ files << Dir[File.join(File.dirname(__FILE__), "#{dir}/**/*.rb")]
23
+ end
24
+ files.each do |arry|
25
+ sh "etags", '-a', *arry unless arry.empty?
26
+ end
27
+ end
28
+
data/TODO.md ADDED
@@ -0,0 +1,99 @@
1
+ * Driving OR iterated
2
+
3
+ * Runner? Executor?
4
+ - hooks up source to flow,
5
+ - if iterated source, drives it, otherwise sits in the flow
6
+
7
+ * these set the contract for the inbound products
8
+
9
+ ### slots
10
+
11
+ Typical case: one input, `:input`, one output `:output`
12
+
13
+ * there are as many products as
14
+ - the total number of action stage outputs
15
+ - the concrete input products
16
+ * The number of rsrc->action edges is at most the total number of input slots
17
+ - (you cannot wire multiple products to the same input slot)
18
+
19
+
20
+ 1. action stage B wires up to an action stage A (which really means "the full set of A's outputs")
21
+ 2. I wire action A's output as production product X
22
+ 3.
23
+
24
+ 4. How do I address other stages?
25
+ - `mapper.cat` a great name for action stage `cat` inside chain `mapper`
26
+ - `mapper.cat:output`? `mapper.cat_output`?
27
+
28
+ Subgraphs own their contents
29
+
30
+
31
+ ## Configuration
32
+
33
+ * options on processor become options on flow -- so, in the telegram example,
34
+ :break_length becomes configurable (somehow)
35
+
36
+ ## Tests
37
+
38
+ * add an examples helper -- runs script on file from data dir, diffs the output.
39
+
40
+
41
+
42
+ __________________________________________________________________________
43
+
44
+
45
+
46
+ * [Ruby-Graphviz](https://github.com/glejeune/Ruby-Graphviz.git) Ruby interface to the GraphViz graphing tool
47
+ * [Ruby GraphML Parser](https://github.com/willcannings/ruby-graphml.git)
48
+
49
+
50
+
51
+ * everything accessible from clean (non-magical) methods.
52
+
53
+ * inputs and outputs:
54
+ - inputs and outputs become an array of symbols
55
+
56
+
57
+ * You can only have as many macro edges as inputs
58
+
59
+ * action stage 'ports'
60
+ - a list of names for them
61
+ - can also have an edge going to a
62
+
63
+
64
+ _____
65
+ |
66
+ --v--
67
+ |
68
+ |
69
+ __^____^__
70
+ | x | y |
71
+ | foo |
72
+ ----------
73
+
74
+ create a product with no action? action with anonymous product, wired up later?
75
+
76
+
77
+ * connections:
78
+
79
+ - action -> action:
80
+
81
+ act_a -> actb
82
+
83
+
84
+
85
+
86
+ act_a :o1 -> rsrc_x
87
+ act_a :o2 -> rsrc_y
88
+
89
+ act_b :i <- act_a
90
+
91
+
92
+
93
+ * references:
94
+ -
95
+
96
+
97
+
98
+
99
+
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env bash
2
+
3
+ #
4
+ # cut 1
5
+ #
6
+ # Example:
7
+ #
8
+ # A quickie histogram of timestamps; say that for the object in the foo/bar
9
+ # directory, field 3 holds a flat timestamp (YYYYmmddHHMMSS) and you want a
10
+ # histogram by hour (and that foo/bar is small enough to be worth sucking
11
+ # through a single machine):
12
+ #
13
+ # hdp-catd foo/bar | cuttab 3 | cutc 12 | sort | uniq -c
14
+ #
15
+ # If foo/bar is already sorted leave out the call to sort.
16
+ #
17
+
18
+
19
+ #
20
+ # Set it to cut up to $1 (if defined), or if not, up to $CUTC_MAX (if defined), or 200 chars as a fallback.
21
+ #
22
+ CUTC_MAX=${CUTC_MAX-200}
23
+ CUTC_MAX=${1-$CUTC_MAX}
24
+ cutchars="1-${CUTC_MAX}"
25
+ shift
26
+
27
+ #
28
+ # Do the cuttin'
29
+ #
30
+ exec cut -c"${cutchars}" "$@"
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env bash
2
+
3
+ fields=${1-"1-"}
4
+ shift
5
+ exec cut -d' ' -f"$fields" "$@"
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # runs grep but always returns a true exit status. (Otherwise hadoop vomits)
4
+ # You can set a command line var in hadoop instead, but we'll leave this around
5
+ grep "$@"
6
+ true
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env python
2
+ """ sorts lines (or tab-sep records) by md5. (e.g. for train/test splits).
3
+ optionally prepends with the md5 id too.
4
+ brendan o'connor - anyall.org - gist.github.com/brendano """
5
+
6
+ import hashlib,sys,optparse
7
+ p = optparse.OptionParser()
8
+ p.add_option('-k', type='int', default=False)
9
+ p.add_option('-p', action='store_true')
10
+ opts,args=p.parse_args()
11
+
12
+ lines = sys.stdin.readlines()
13
+ getter=lambda s: hashlib.md5(s[:-1]).hexdigest()
14
+ if opts.k:
15
+ getter=lambda s: hashlib.md5(s[:-1].split("\t")[opts.k-1]).hexdigest()
16
+
17
+ lines.sort(key=lambda s: getter(s))
18
+ for line in lines:
19
+ if opts.p: line = getter(line) + "\t" + line
20
+ print line,
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env bash
2
+
3
+ #
4
+ # This script is useful for debugging. it dumps your environment to STDERR
5
+ # and otherwise runs as `cat`
6
+ #
7
+
8
+ set >&2
9
+
10
+ cat
11
+ true
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env bash
2
+ # insert a tab char from the command line:
3
+ # echo "hi$(tabchar)there"
4
+ # # => "hi there"
5
+ exec echo -n -e '\t'
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: ASCII-8BIT
3
+ require 'set'
4
+
5
+ unless ARGV.empty?
6
+ unless ARGV.include?('--help')
7
+ puts "\n**\nSorry, uniq-ord only works in-line: cat foo.txt bar.tsv | uniq-ord\n**" ; puts
8
+ end
9
+ puts <<USAGE
10
+ uniq-ord is ike the uniq command but doesn't depend on prior sorting: it tracks
11
+ each line and only emits the first-seen instance of that line.
12
+
13
+ The algorithm is /very/ simplistic: it uses ruby's built-in hash to track lines.
14
+ This can produce false positives, meaning that a line of output might be removed
15
+ even if it hasn't been seen before. It may also consume an unbounded amount of
16
+ memory (though less than the input text). With a million lines it will consume
17
+ about 70 MB of memory and have more than 1 in a million chance of false
18
+ positive. On a billion lines it will consume many GB and have over 25% odds of
19
+ incorrectly skipping a line.
20
+
21
+ However, it's really handy for dealing with in-order lists from the command line.
22
+ USAGE
23
+ exit(0)
24
+ end
25
+
26
+ # # Logging
27
+ #
28
+ # MB = 1024*1024
29
+ # LOG_INTERVAL = 100_000
30
+ # $start = Time.now; $iter = 0; $size = 0
31
+ # def log_line
32
+ # elapsed = (Time.now - $start).to_f
33
+ # $stderr.puts("%5d s\t%10.1f l/s\t%5dk<\t%5dk>\t%5d MB\t%9.1f MB/s\t%11d b/l"%[ elapsed, $iter/elapsed, $iter/1000, LINES.count/1000, $size/MB, ($size/MB)/elapsed, $size/$iter ])
34
+ # end
35
+
36
+ LINES = Set.new
37
+ $stdin.each do |line|
38
+ next if LINES.include?(line.hash)
39
+ puts line
40
+ LINES << line.hash
41
+ # $iter += 1 ; $size += line.length
42
+ # log_line if ($iter % LOG_INTERVAL == 0)
43
+ end
44
+ # log_line
45
+
46
+ #
47
+ # # 2.1 GB data, 1M lines, 2000 avg chars/line
48
+ #
49
+ # # Used: RSS: 71_988 kB VSZ: 2_509_152 kB
50
+ # # Stats: 38 s 25_859.1 l/s 1000k< 1000k> 1976 MB 51.1 MB/s 2072 b/l
51
+ # # Time: real 0m41.4 s user 0m31.6 s sys 0m8.3 s pct 96.48
52
+ #
53
+ # # 4.1 GB data, 5.6M lines, 800 avg chars/line
54
+ #
55
+ # # Used: RSS: 330_644 kB VSZ: 2_764_236 kB
56
+ # # Stats: 861 6_538.2 l/s 5632k< 5632k> 4158 MB 4.8 MB/s 774 b/l
57
+ # # Time: real 14m24.6 s user 13m8.8 s sys 0m12. s pct 92.61
58
+ #
59
+
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ uniq -c | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ "%15s\t" % $1 }'
data/bin/wu ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+ require 'shellwords'
3
+ now=Time.now.strftime("%Y-%m-%d %H:%M:%S")
4
+ if ARGV.empty?
5
+ abort "ERROR #{now} [wu ] -- Must provide a Wukong command to run. Try the --help option."
6
+ else
7
+ if ARGV.size == 1 && ARGV.first == '--help'
8
+ abort <<EOF
9
+ usage: wu COMMAND [OPTIONS] [ARG] ...
10
+
11
+ wu is a wrapper for easy use of Wukong's command-line tools. It takes
12
+ your arguments, constructs the name of the proper wu-tool to call, and
13
+ prepends a call to bundle exec.
14
+
15
+ $ wu local ...
16
+
17
+ is equivalent to
18
+
19
+ $ bundle exec wu-local ...
20
+
21
+ You can run any of the wu-tools this way:
22
+
23
+ wu-local wu-source
24
+ wu-hadoop wu-storm
25
+ wu-deploy wu-load
26
+ EOF
27
+ else
28
+ if ARGV.first =~ /^-/
29
+ abort "ERROR ${now} [wu ] -- First argument must be the name of a wu tool to run, got <${1}>"
30
+ else
31
+ Kernel.exec "bundle exec wu-#{Shellwords.join(ARGV)}"
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding:UTF-8
3
+
4
+ if ARGV.include?('--help')
5
+ puts <<USAGE
6
+ wu-clean-encoding cleans malformed characters from stdin.
7
+
8
+ If a character is malformed, as defined by valid_encoding?,
9
+ it is replaced with a '�'.
10
+
11
+ wu-clean-encoding was built to work with UTF-8, and no
12
+ guarantees are provided for other encodings.
13
+ USAGE
14
+ exit(0)
15
+ end
16
+
17
+ ARGF.each do |line|
18
+ if line.valid_encoding?
19
+ $stdout.write line
20
+ else
21
+ repaired_line = []
22
+ line.each_char do |char|
23
+ if char.valid_encoding?
24
+ repaired_line << char
25
+ else
26
+ repaired_line << "�"
27
+ end
28
+ end
29
+ $stdout.write repaired_line.join
30
+ end
31
+ end
@@ -0,0 +1,13 @@
1
+ #!/bin/sh
2
+
3
+ #
4
+ # Outputs a compact wukong-style date:
5
+ #
6
+ #
7
+ # $ date
8
+ # Sun Nov 8 03:21:37 CST 2009
9
+ # $ wu-date
10
+ # 20091108
11
+ #
12
+
13
+ exec date +"%Y%m%d"
@@ -0,0 +1,13 @@
1
+ #!/bin/sh
2
+
3
+ #
4
+ # Outputs a compact wukong-style datetime:
5
+ #
6
+ #
7
+ # $ date
8
+ # Sun Nov 8 03:21:37 CST 2009
9
+ # $ wu-datetime
10
+ # 20091108032137
11
+ #
12
+
13
+ exec date +"%Y%m%d%H%M%S"
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ sort | uniq -c | sort -rn | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ $1+"\t" }'
@@ -0,0 +1,186 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ USAGE= %Q{
4
+ # h1. wulign -- format a tab-separated file as aligned columns
5
+ #
6
+ # wulign will intelligently reformat a tab-separated file into a tab-separated,
7
+ # space aligned file that is still suitable for further processing. For example,
8
+ # given the log-file input
9
+ #
10
+ # # cat tag_usage.tsv
11
+ # 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
12
+ # 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
13
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
14
+ # 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
15
+ # 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
16
+ # 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
17
+ # 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
18
+ #
19
+ # wulign will reformat it to read
20
+ #
21
+ # # cat tag_usage.tsv | wu-lign
22
+ # 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
23
+ # 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
24
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
25
+ # 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
26
+ # 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
27
+ # 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
28
+ # 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
29
+ #
30
+ # The fields are still tab-delimited by exactly one tab -- only spaces are used to
31
+ # pad out fields. You can still use cuttab and friends to manipulate columns.
32
+ #
33
+ # h2. Command-line arguments
34
+ #
35
+ # You can give sprintf-style positional arguments on the command line that will be
36
+ # applied to the corresponding columns. (Blank args are used for placeholding and
37
+ # auto-formatting is still applied). So with the example above,
38
+ #
39
+ # cat foo | wulign '' '' '' '%8.4e'
40
+ #
41
+ # will format the fourth column with "%8.4e", while the first three columns and
42
+ # fifth-and-higher columns are formatted as usual.
43
+ #
44
+ # ...
45
+ # 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
46
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
47
+ # 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
48
+ # ....
49
+ #
50
+ # h2. How it works
51
+ #
52
+ # Wu-lign takes the first 500ish lines, splits into fields on TAB characters,
53
+ # and tries to guess the format (int, float, or string) for each. It builds a
54
+ # consensus of the width and type for corresponding columns in the chunk. If a
55
+ # column has mixed numeric and string formats it degrades to :mixed, which is
56
+ # basically treated as :string. If a column has mixed :float and :int elements all
57
+ # of them are formatted as float.
58
+ #
59
+ # h2. Notes
60
+ #
61
+ # * Header rows: the first line is used for width alignment but not for type detection.
62
+ # This means that an initial row of text headers will inform column spacing
63
+ # but still allow a column of floats (say) to be properly aligned as floats.
64
+ #
65
+ # * It requires a unanimous vote. One screwy line can coerce the whole mess to
66
+ # :mixed; width formatting will still be applied, though.
67
+ #
68
+ # * It won't set columns wider than 100 chars -- this allows for the occasional
69
+ # super-wide column without completely breaking your screen.
70
+ #
71
+ # * For :float values, wulign tries to guess at the right number of significant
72
+ # digits to the left and right of the decimal point.
73
+ #
74
+ # * wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab
75
+ # delimits a field, every newline a record.
76
+ #
77
+ # wulign isn't intended to be smart, or correct, or reliable -- only to be
78
+ # useful for previewing and organizing tab-formatted files. In general
79
+ # wulign(foo).split("\t").map(&:strip) *should* give output semantically
80
+ # equivalent to its input. (That is, the only changes should be insertion of
81
+ # spaces and re-formatting of numerics.) But still -- reserve its use for human
82
+ # inspection only.
83
+ #
84
+ }
85
+
86
+ if ARGV[0] == '--help'
87
+ puts $0
88
+ puts USAGE
89
+ exit
90
+ end
91
+
92
+ #
93
+ # How many initial lines to use to guess formatting. Lines after this are
94
+ # simply reformatted according to the consensus of the initial
95
+ # FORMAT_GUESSING_LINES.
96
+ #
97
+ FORMAT_GUESSING_LINES = 500
98
+ # widest column to set
99
+ MAX_MAX_WIDTH = 100
100
+
101
+ INT_RE = /\A[\d,]+\z/
102
+ FLOAT_RE = /\A([\d,]+)(?:\.(\d+))?(?:e-?\d+)?\z/
103
+
104
+ def get_type val
105
+ case
106
+ when val == '' then type = nil
107
+ when val =~ INT_RE then type = :int
108
+ when val =~ FLOAT_RE then type = :float
109
+ else type = :str end
110
+ end
111
+
112
+ def consensus_type val, alltype, is_first
113
+ return :mixed if alltype == :mixed
114
+ type = get_type(val) or return alltype # blank cell: keep the consensus so far instead of resetting it to nil
115
+ case
116
+ when alltype.nil? then type
117
+ when is_first && (alltype == :str) then type
118
+ when alltype == type then type
119
+ when ( ((alltype==:float) && (type == :int)) || ((alltype == :int) && (type == :float)) )
120
+ :float
121
+ else :mixed
122
+ end
123
+ end
124
+
125
+ def f_width str
126
+ str =~ FLOAT_RE or return 0
127
+ [$1.length, $2 ? $2.length : 0]
128
+ end
129
+
130
+ maxw = []
131
+ col_types = []
132
+ col_minmag = []
133
+ col_maxmag = []
134
+ rows = []
135
+ skip_col = []
136
+ has_header = false
137
+ ARGV.each_with_index{|v,i| next if (v == '') ; maxw[i] = 0; skip_col[i] = true }
138
+ FORMAT_GUESSING_LINES.times do
139
+ line = $stdin.readline rescue nil
140
+ break unless line
141
+ row = line.chomp.split("\t").map{|s| s.strip }
142
+ col_widths = row.map{|col| col.length }
143
+ col_widths.each_with_index{|cw,i| maxw[i] = [[cw,maxw[i]].compact.max, MAX_MAX_WIDTH].min }
144
+ row.each_with_index{|col,i|
145
+ next if skip_col[i]
146
+ # Let the first row be text (headers)
147
+ col_types[i] = consensus_type(col, col_types[i], rows.length == 1)
148
+ if col_types[i] == :float
149
+ mantissa, radix = f_width(col)
150
+ col_minmag[i] = [radix, col_minmag[i], 1].compact.max
151
+ col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
152
+ end
153
+ }
154
+ # p [rows.length, has_header, maxw, col_types, col_minmag, col_maxmag, col_widths, row]
155
+ has_header = true if row.all?{|col| get_type(col) == :str } && rows.length == 0
156
+ rows << row
157
+ end
158
+
159
+ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
160
+ next(lambda{|s| default % s rescue s }) if default.to_s != ''
161
+ case type
162
+ when :mixed, nil then lambda{|s| "%-#{width}s" % s }
163
+ when :str then lambda{|s| "%-#{width}s" % s }
164
+ when :int then lambda{|s| "%#{width}d" % s.gsub(/[^\d\-\+]+/, "").to_i }
165
+ when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.gsub(/[^\d\.eE\-\+]+/, "").to_f }
166
+ else raise "oops type #{type}" end
167
+ end
168
+
169
+ def dump_row row, format
170
+ puts row.zip(format).map{|c,f| f.call(c) rescue c }.join("\t")
171
+ end
172
+ def dump_header row, maxw
173
+ puts row.zip(maxw).map{|col, width| "%-#{width}s" % col.to_s }.join("\t")
174
+ end
175
+
176
+ pad = [''] * maxw.length
177
+ dump_header(rows.shift, maxw) if has_header
178
+ rows.each do |row|
179
+ # note -- strips trailing columns
180
+ dump_row(row, format)
181
+ end
182
+ $stdin.each do |line|
183
+ row = line.chomp.split("\t").map{|s| s.strip }
184
+ # note -- strips trailing columns
185
+ dump_row(row, format)
186
+ end