ul-wukong 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +60 -0
  3. data/.gitmodules +6 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +19 -0
  6. data/.yardopts +6 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +17 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE.md +95 -0
  11. data/NOTES-travis.md +31 -0
  12. data/README-old.md +422 -0
  13. data/README.md +1308 -0
  14. data/Rakefile +28 -0
  15. data/TODO.md +99 -0
  16. data/bin/cutc +30 -0
  17. data/bin/cuttab +5 -0
  18. data/bin/greptrue +6 -0
  19. data/bin/md5sort +20 -0
  20. data/bin/setcat +11 -0
  21. data/bin/tabchar +5 -0
  22. data/bin/uniq-ord +59 -0
  23. data/bin/uniqc +3 -0
  24. data/bin/wu +34 -0
  25. data/bin/wu-clean-encoding +31 -0
  26. data/bin/wu-date +13 -0
  27. data/bin/wu-datetime +13 -0
  28. data/bin/wu-hist +3 -0
  29. data/bin/wu-lign +186 -0
  30. data/bin/wu-local +4 -0
  31. data/bin/wu-plus +9 -0
  32. data/bin/wu-source +5 -0
  33. data/bin/wu-sum +31 -0
  34. data/diagrams/wu_local.dot +39 -0
  35. data/diagrams/wu_local.dot.png +0 -0
  36. data/examples/Gemfile +38 -0
  37. data/examples/README.md +9 -0
  38. data/examples/basic/string_reverser.rb +23 -0
  39. data/examples/basic/tiny_count.rb +8 -0
  40. data/examples/basic/word_count/accumulator.rb +26 -0
  41. data/examples/basic/word_count/tokenizer.rb +13 -0
  42. data/examples/basic/word_count/word_count.rb +6 -0
  43. data/examples/dataflow/scraper_macro_flow.rb +28 -0
  44. data/examples/deploy_pack/Gemfile +6 -0
  45. data/examples/deploy_pack/README.md +6 -0
  46. data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
  47. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  48. data/examples/deploy_pack/config/environment.rb +1 -0
  49. data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
  50. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  51. data/examples/dsl/dataflow/simple.rb +12 -0
  52. data/examples/dsl/dataflow/telegram.rb +45 -0
  53. data/examples/dsl/workflow/cherry_pie.dot +97 -0
  54. data/examples/dsl/workflow/cherry_pie.md +104 -0
  55. data/examples/dsl/workflow/cherry_pie.png +0 -0
  56. data/examples/dsl/workflow/cherry_pie.rb +101 -0
  57. data/examples/empty/.gitkeep +0 -0
  58. data/examples/examples_helper.rb +9 -0
  59. data/examples/geo.rb +4 -0
  60. data/examples/geo/geo_grids.numbers +0 -0
  61. data/examples/geo/geolocated.rb +331 -0
  62. data/examples/geo/quadtile.rb +69 -0
  63. data/examples/geo/spec/geolocated_spec.rb +247 -0
  64. data/examples/geo/tile_fetcher.rb +77 -0
  65. data/examples/graph/implied_geolocation/README.md +63 -0
  66. data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
  67. data/examples/improver/tweet_summary.rb +73 -0
  68. data/examples/loadable.rb +2 -0
  69. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  70. data/examples/munging/airline_flights/airplane.rb +0 -0
  71. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  72. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  73. data/examples/munging/airline_flights/indexable.rb +75 -0
  74. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  75. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  76. data/examples/munging/airline_flights/tasks.rake +83 -0
  77. data/examples/munging/airline_flights/topcities.rb +167 -0
  78. data/examples/munging/geo/geo_json.rb +54 -0
  79. data/examples/munging/geo/geo_models.rb +69 -0
  80. data/examples/munging/geo/geonames_models.rb +107 -0
  81. data/examples/munging/geo/iso_codes.rb +172 -0
  82. data/examples/munging/geo/reconcile_countries.rb +124 -0
  83. data/examples/munging/geo/tasks.rake +71 -0
  84. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  85. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  86. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  87. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  88. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  89. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  90. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  91. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
  92. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  93. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  94. data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
  95. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  96. data/examples/rake_helper.rb +97 -0
  97. data/examples/ruby_project/Gemfile +6 -0
  98. data/examples/ruby_project/README.md +6 -0
  99. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  100. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  101. data/examples/server_logs/logline.rb +95 -0
  102. data/examples/server_logs/models.rb +66 -0
  103. data/examples/server_logs/page_counts.pig +48 -0
  104. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  105. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  106. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  107. data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
  108. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  109. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  110. data/examples/serverlogs/models/logline.rb +102 -0
  111. data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
  112. data/examples/serverlogs/visit_paths/common.rb +4 -0
  113. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  114. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  115. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  116. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  117. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  118. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  119. data/examples/splitter.rb +94 -0
  120. data/examples/string_reverser.rb +7 -0
  121. data/examples/text/pig_latin/pig_latinizer.rb +35 -0
  122. data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
  123. data/examples/text/regional_flavor/README.md +14 -0
  124. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  125. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  126. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  127. data/examples/twitter.rb +5 -0
  128. data/lib/hanuman.rb +36 -0
  129. data/lib/hanuman/graph.rb +97 -0
  130. data/lib/hanuman/graphvizzer.rb +206 -0
  131. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  132. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  133. data/lib/hanuman/link.rb +35 -0
  134. data/lib/hanuman/registry.rb +46 -0
  135. data/lib/hanuman/stage.rb +128 -0
  136. data/lib/hanuman/tree.rb +67 -0
  137. data/lib/wu/geo.rb +4 -0
  138. data/lib/wu/geo/geo_grids.numbers +0 -0
  139. data/lib/wu/geo/geolocated.rb +331 -0
  140. data/lib/wu/geo/quadtile.rb +69 -0
  141. data/lib/wu/graph/union_find.rb +62 -0
  142. data/lib/wu/model/reconcilable.rb +63 -0
  143. data/lib/wu/munging.rb +71 -0
  144. data/lib/wu/social/models/twitter.rb +31 -0
  145. data/lib/wu/wikipedia/models.rb +20 -0
  146. data/lib/wukong.rb +54 -0
  147. data/lib/wukong/dataflow.rb +43 -0
  148. data/lib/wukong/doc_helpers.rb +14 -0
  149. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  150. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  151. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  152. data/lib/wukong/driver.rb +214 -0
  153. data/lib/wukong/driver/event_machine_driver.rb +15 -0
  154. data/lib/wukong/driver/wiring.rb +68 -0
  155. data/lib/wukong/local.rb +42 -0
  156. data/lib/wukong/local/runner.rb +96 -0
  157. data/lib/wukong/local/stdio_driver.rb +104 -0
  158. data/lib/wukong/logger.rb +102 -0
  159. data/lib/wukong/model/faker.rb +136 -0
  160. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  161. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  162. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  163. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  164. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  165. data/lib/wukong/plugin.rb +48 -0
  166. data/lib/wukong/processor.rb +110 -0
  167. data/lib/wukong/rake_helper.rb +6 -0
  168. data/lib/wukong/runner.rb +169 -0
  169. data/lib/wukong/runner/boot_sequence.rb +123 -0
  170. data/lib/wukong/runner/code_loader.rb +52 -0
  171. data/lib/wukong/runner/command_runner.rb +44 -0
  172. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  173. data/lib/wukong/runner/help_message.rb +42 -0
  174. data/lib/wukong/source.rb +33 -0
  175. data/lib/wukong/source/source_driver.rb +74 -0
  176. data/lib/wukong/source/source_runner.rb +38 -0
  177. data/lib/wukong/spec_helpers.rb +74 -0
  178. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  179. data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
  180. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  181. data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
  182. data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
  183. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
  184. data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
  185. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
  186. data/lib/wukong/version.rb +3 -0
  187. data/lib/wukong/widget/echo.rb +55 -0
  188. data/lib/wukong/widget/extract.rb +122 -0
  189. data/lib/wukong/widget/filters.rb +452 -0
  190. data/lib/wukong/widget/logger.rb +56 -0
  191. data/lib/wukong/widget/operators.rb +82 -0
  192. data/lib/wukong/widget/reducers.rb +10 -0
  193. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  194. data/lib/wukong/widget/reducers/bin.rb +368 -0
  195. data/lib/wukong/widget/reducers/count.rb +73 -0
  196. data/lib/wukong/widget/reducers/group.rb +128 -0
  197. data/lib/wukong/widget/reducers/group_concat.rb +98 -0
  198. data/lib/wukong/widget/reducers/improver.rb +71 -0
  199. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  200. data/lib/wukong/widget/reducers/moments.rb +72 -0
  201. data/lib/wukong/widget/reducers/sort.rb +180 -0
  202. data/lib/wukong/widget/reducers/uniq.rb +91 -0
  203. data/lib/wukong/widget/serializers.rb +317 -0
  204. data/lib/wukong/widget/utils.rb +46 -0
  205. data/lib/wukong/widgets.rb +7 -0
  206. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  207. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  208. data/spec/examples/dataflow/parsing_spec.rb +14 -0
  209. data/spec/examples/dataflow/simple_spec.rb +34 -0
  210. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  211. data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
  212. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  213. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  214. data/spec/examples/text/pig_latin_spec.rb +18 -0
  215. data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
  216. data/spec/hanuman/graph_spec.rb +119 -0
  217. data/spec/hanuman/hanuman_spec.rb +10 -0
  218. data/spec/hanuman/registry_spec.rb +123 -0
  219. data/spec/hanuman/stage_spec.rb +81 -0
  220. data/spec/hanuman/tree_spec.rb +119 -0
  221. data/spec/spec.opts +1 -0
  222. data/spec/spec_helper.rb +43 -0
  223. data/spec/support/example_test_helpers.rb +95 -0
  224. data/spec/support/hanuman_test_helpers.rb +92 -0
  225. data/spec/support/integration_helper.rb +38 -0
  226. data/spec/support/model_test_helpers.rb +115 -0
  227. data/spec/support/shared_context_for_graphs.rb +57 -0
  228. data/spec/support/shared_context_for_reducers.rb +37 -0
  229. data/spec/support/shared_examples_for_builders.rb +94 -0
  230. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  231. data/spec/wu/model/reconcilable_spec.rb +152 -0
  232. data/spec/wukong/dataflow_spec.rb +87 -0
  233. data/spec/wukong/driver_spec.rb +154 -0
  234. data/spec/wukong/local/runner_spec.rb +29 -0
  235. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  236. data/spec/wukong/local_spec.rb +6 -0
  237. data/spec/wukong/logger_spec.rb +49 -0
  238. data/spec/wukong/model/faker_spec.rb +132 -0
  239. data/spec/wukong/processor_spec.rb +21 -0
  240. data/spec/wukong/runner_spec.rb +132 -0
  241. data/spec/wukong/source_spec.rb +6 -0
  242. data/spec/wukong/widget/extract_spec.rb +101 -0
  243. data/spec/wukong/widget/filters_spec.rb +79 -0
  244. data/spec/wukong/widget/logger_spec.rb +23 -0
  245. data/spec/wukong/widget/operators_spec.rb +25 -0
  246. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  247. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  248. data/spec/wukong/widget/reducers/group_spec.rb +21 -0
  249. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  250. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  251. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  252. data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
  253. data/spec/wukong/widget/serializers_spec.rb +114 -0
  254. data/spec/wukong/widget/sink_spec.rb +19 -0
  255. data/spec/wukong/widget/source_spec.rb +65 -0
  256. data/spec/wukong/wu-local_spec.rb +109 -0
  257. data/spec/wukong/wu-source_spec.rb +32 -0
  258. data/spec/wukong/wu_spec.rb +14 -0
  259. data/spec/wukong/wukong_spec.rb +10 -0
  260. data/wukong.gemspec +35 -0
  261. metadata +465 -0
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Source do
4
+ it_behaves_like 'a plugin'
5
+ end
6
+
@@ -0,0 +1,101 @@
1
+ require 'spec_helper'
2
+
3
+ describe :extract do
4
+
5
+ let(:hsh) { { "hi" => "there", "top" => { "lower" => { "lowest" => "value" } } } }
6
+ let(:ary) { ['1', 2, 'three'] }
7
+
8
+ subject { processor(:extract) }
9
+
10
+ it_behaves_like 'a processor', :named => :extract
11
+
12
+ context "on a string" do
13
+ it "emits the string with no arguments" do
14
+ processor(:extract).given('hi there', 'buddy').should emit('hi there', 'buddy')
15
+ end
16
+ end
17
+ context "on a Fixnum" do
18
+ it "emits the number with no arguments" do
19
+ processor(:extract).given(3, 3.0).should emit(3, 3.0)
20
+ end
21
+ end
22
+ context "on a Hash" do
23
+ it "emits the hash with no arguments" do
24
+ processor(:extract).given(hsh).should emit(hsh)
25
+ end
26
+ it "can extract a key" do
27
+ processor(:extract, part: 'hi').given(hsh).should emit('there')
28
+ end
29
+ it "emits nil when the value of the key is nil" do
30
+ processor(:extract, part: 'bye').given(hsh).should emit(nil)
31
+ end
32
+ it "can extract a nested key" do
33
+ processor(:extract, part: 'top.lower.lowest').given(hsh).should emit('value')
34
+ end
35
+ it "emits nil when the value of this nested key is nil" do
36
+ processor(:extract, part: 'foo.bar.baz').given(hsh).should emit(nil)
37
+ end
38
+ end
39
+ context "on an Array" do
40
+ it "emits the array with no arguments" do
41
+ processor(:extract).given(ary).should emit(ary)
42
+ end
43
+ it "can extract the nth value with an integer argument" do
44
+ processor(:extract, part: 2).given(ary).should emit(2)
45
+ end
46
+ it "can extract the nth value with a string argument" do
47
+ processor(:extract, part: '2').given(ary).should emit(2)
48
+ end
49
+ end
50
+ context "on JSON" do
51
+ let(:garbage) { '{"239823:' }
52
+ it "emits the JSON with no arguments" do
53
+ processor(:extract).given_json(hsh).should emit_json(hsh)
54
+ end
55
+ it "will skip badly formed records" do
56
+ processor(:extract).given(garbage).should emit(garbage)
57
+ end
58
+ it "can extract a key" do
59
+ processor(:extract, part: 'hi').given_json(hsh).should emit('there')
60
+ end
61
+ it "can extract a nested key" do
62
+ processor(:extract, part: 'top.lower.lowest').given_json(hsh).should emit('value')
63
+ end
64
+ it "emits nil when the record is missing the key" do
65
+ processor(:extract, part: 'foo.bar.baz').given_json(hsh).should emit(nil)
66
+ end
67
+ end
68
+ context "on delimited data" do
69
+ it "emits the row with no arguments" do
70
+ processor(:extract).given_delimited('|', ary).should emit(ary.map(&:to_s).join('|'))
71
+ end
72
+ it "can extract the nth value with an integer argument" do
73
+ processor(:extract, part: 2, separator: '|').given_delimited('|', ary).should emit('2')
74
+ end
75
+ it "can extract nth value with a string argument" do
76
+ processor(:extract, part: '2', separator: '|').given_delimited('|', ary).should emit('2')
77
+ end
78
+ end
79
+ context "on TSV" do
80
+ it "emits the TSV with no arguments" do
81
+ processor(:extract).given_tsv(ary).should emit(ary.map(&:to_s).join("\t"))
82
+ end
83
+ it "can extract the nth value with an integer argument" do
84
+ processor(:extract, part: 2).given_tsv(ary).should emit('2')
85
+ end
86
+ it "can extract the nth value with a string argument" do
87
+ processor(:extract, part: '2').given_tsv(ary).should emit('2')
88
+ end
89
+ end
90
+ context "on CSV" do
91
+ it "emits the CSV with no arguments" do
92
+ processor(:extract).given_csv(ary).should emit(ary.map(&:to_s).join(","))
93
+ end
94
+ it "can extract the nth value with an integer argument" do
95
+ processor(:extract, part: 2, separator: ',').given_csv(ary).should emit('2')
96
+ end
97
+ it "can extract the nth value with a string argument" do
98
+ processor(:extract, part: '2', separator: ',').given_csv(ary).should emit('2')
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,79 @@
1
+ require 'spec_helper'
2
+
3
+ describe "Filters" do
4
+
5
+ describe :null do
6
+ it_behaves_like 'a processor', :named => :null
7
+ it "should not pass anything, ever" do
8
+ processor.given('', 3, 'hi', nil).should emit(0).records
9
+ end
10
+ end
11
+
12
+ describe :identity do
13
+ it_behaves_like 'a processor', :named => :identity
14
+ it "should pass everything, always" do
15
+ processor.given('', 3, 'hi', nil).should emit('', 3, 'hi', nil)
16
+ end
17
+ end
18
+
19
+ describe :regexp do
20
+ it_behaves_like 'a processor', :named => :regexp
21
+ it "should pass everything given no 'match' argument" do
22
+ processor.given('snap', 'crackle', 'pop').should emit('snap', 'crackle', 'pop')
23
+ end
24
+ it "should pass everything its 'match' argument matches" do
25
+ processor(match: /a/).given('snap', 'crackle', 'pop').should emit('snap', 'crackle')
26
+ end
27
+ end
28
+
29
+ describe :not_regexp do
30
+ it_behaves_like 'a processor', :named => :not_regexp
31
+ it "should pass everything given no 'match' argument" do
32
+ processor.given('snap', 'crackle', 'pop').should emit('snap', 'crackle', 'pop')
33
+ end
34
+ it "should pass everything its 'match' argument matches" do
35
+ processor(match: /a/).given('snap', 'crackle', 'pop').should emit('pop')
36
+ end
37
+ end
38
+
39
+ describe :limit do
40
+ it_behaves_like 'a processor', :named => :limit
41
+ it "should pass everything given no 'max' argument" do
42
+ processor.given('snap', 'crackle', 'pop').should emit('snap', 'crackle', 'pop')
43
+ end
44
+ it "should pass only as many records as its 'max' argument" do
45
+ processor(max: 2).given('snap', 'crackle', 'pop', 'whoa').should emit('snap', 'crackle')
46
+ end
47
+ end
48
+
49
+ describe :sample do
50
+ it_behaves_like 'a processor', :named => :sample
51
+ it "should pass everything given no 'fraction' argument" do
52
+ processor.given('snap', 'crackle', 'pop').should emit('snap', 'crackle', 'pop')
53
+ end
54
+ it "should pass a fraction of records matching its 'fraction' argument" do
55
+ processor(:fraction => 0.5) { |proc| proc.should_receive(:rand).and_return(0.7, 0.1, 0.6) }.given('snap', 'crackle', 'pop').should emit('crackle')
56
+ end
57
+ end
58
+
59
+ describe :head do
60
+ it_behaves_like 'a processor', :named => :head
61
+ it "should pass the first 10 records given no argument" do
62
+ processor.given(*(1..100).to_a).should emit(10).records
63
+ end
64
+ it "should pass the first n records" do
65
+ processor(:n => 5).given(*(1..100).to_a).should emit(5).records
66
+ end
67
+ end
68
+
69
+ describe :tail do
70
+ it_behaves_like 'a processor', :named => :tail
71
+ it "should pass all records given no argument" do
72
+ processor.given(*(1..100).to_a).should emit(100).records
73
+ end
74
+ it "should skip the first n records" do
75
+ processor(:n => 5).given(*(1..100).to_a).should emit(95).records
76
+ end
77
+ end
78
+
79
+ end
@@ -0,0 +1,23 @@
1
+ require 'spec_helper'
2
+
3
+ describe :logger do
4
+ it_behaves_like "a processor", :named => :logger
5
+
6
+ it "logs each event at the 'info' level by default" do
7
+ log = double("logger")
8
+ log.should_receive(:info).with('hi there')
9
+ log.should_receive(:info).with('buddy')
10
+ processor(:logger) do |proc|
11
+ proc.stub(:log).and_return(log)
12
+ end.given('hi there', 'buddy').should emit(0).records
13
+ end
14
+
15
+ it "logs each event at the a desired level set with an argument" do
16
+ log = double("logger")
17
+ log.should_receive(:debug).with('hi there')
18
+ log.should_receive(:debug).with('buddy')
19
+ processor(:logger, level: :debug) do |proc|
20
+ proc.stub(:log).and_return(log)
21
+ end.given('hi there', 'buddy').should emit(0).records
22
+ end
23
+ end
@@ -0,0 +1,25 @@
1
+ require 'spec_helper'
2
+
3
+ describe "Operators" do
4
+
5
+ describe :map do
6
+ it_behaves_like 'a processor', :named => :map
7
+ it "performs an action on each input record" do
8
+ processor(:map, action: ->(input_record) { input_record.upcase }).given('snap', 'crackle', 'pop').should emit('SNAP', 'CRACKLE', 'POP')
9
+ end
10
+
11
+ it "can simultaneously filter out records" do
12
+ processor(:map, compact: true, action: ->(input_record) { input_record + 1 if input_record > 0 }).given(2, -4, 6).should emit(3, 7)
13
+ end
14
+ end
15
+
16
+ describe :flatten do
17
+ it_behaves_like 'a processor', :named => :flatten
18
+
19
+ it "yields each input record or its contents" do
20
+ processor(:flatten).given('foo', ['bar', 'baz'], 'bing').should emit('foo', 'bar', 'baz', 'bing')
21
+ end
22
+
23
+ end
24
+
25
+ end
@@ -0,0 +1,92 @@
1
+ require 'spec_helper'
2
+
3
+ describe "Reducers" do
4
+ describe :bin do
5
+ include_context "reducers"
6
+ it_behaves_like 'a processor', :named => :bin
7
+
8
+ let(:bins) {
9
+ [
10
+ ['0.0', '2.000', '9.000'],
11
+ ['2.000', '4.000', '9.000'],
12
+ ['4.000', '6.000', '8.000'],
13
+ ['6.000', '8.000', '11.000'],
14
+ ['8.000', '10.000', '13.000']
15
+ ]
16
+ }
17
+
18
+ it "raises an error when called with a non-positive-definite number of bins" do
19
+ lambda { processor(num_bins: -1) }.should raise_error(Wukong::Error)
20
+ end
21
+
22
+ it "raises an error when called with a a minimum that's less than or equal to the maximum" do
23
+ lambda { processor(min: 10, max: 0) }.should raise_error(Wukong::Error)
24
+ end
25
+
26
+ it "will bin 50 numbers into 7 bins (uses the square root)" do
27
+ processor.given(*nums).should emit(7).records
28
+ end
29
+
30
+ it "will bin 50 numbers into 5 bins if asked" do
31
+ processor(num_bins: 10).given(*nums).should emit(10).records
32
+ end
33
+
34
+ it "counts correctly in each bin" do
35
+ processor(num_bins: 5).given(*nums).should emit(*bins)
36
+ end
37
+
38
+ it "can express counts logarithmically" do
39
+ row = processor(num_bins: 5, log_counts: true).given(*nums).output.first
40
+ row.size.should == 3
41
+ row[2].to_f.should be_within(0.1).of(2.197)
42
+ end
43
+
44
+ it "can add a normalized frequency" do
45
+ row = processor(num_bins: 5, normalize: true).given(*nums).output.first
46
+ row.size.should == 4
47
+ row[3].to_f.should be_within(0.1).of(0.18)
48
+ end
49
+
50
+ it "can add a normalized frequency and express counts logarithmically" do
51
+ row = processor(num_bins: 5, normalize: true, log_counts: true).given(*nums).output.first
52
+ row.size.should == 4
53
+ row[2].to_f.should be_within(0.1).of(2.197)
54
+ row[3].to_f.should be_within(0.1).of(-1.715)
55
+ end
56
+
57
+ it "can bin on the fly given min, max, and num_bins options" do
58
+ output = processor(min: -30, max: 30, num_bins: 3) do |proc|
59
+ # we can bin on the fly
60
+ proc.values.should_not_receive(:<<)
61
+ proc.should_not_receive(:bin!)
62
+ end.given(*nums).output
63
+
64
+ output.size.should == 3
65
+ output.first[0].to_f.should be_within(0.1).of(-30)
66
+ output.last[1].to_f.should be_within(0.1).of(30)
67
+ end
68
+
69
+ it "can bin on the fly given fixed bin edges" do
70
+ output = processor(edges: [0,1,5,10]) do |proc|
71
+ # we can bin on the fly
72
+ proc.values.should_not_receive(:<<)
73
+ proc.should_not_receive(:bin!)
74
+ end.given(*nums).output
75
+ output.size.should == 3
76
+ output[0][0].to_f.should be_within(0.1).of(0.0)
77
+ output[0][1].to_f.should be_within(0.1).of(1.0)
78
+ output[1][0].to_f.should be_within(0.1).of(1.0)
79
+ output[1][1].to_f.should be_within(0.1).of(5.0)
80
+ output[2][0].to_f.should be_within(0.1).of(5.0)
81
+ output[2][1].to_f.should be_within(0.1).of(10.0)
82
+ end
83
+
84
+ it "can extract the value to bin by from an object" do
85
+ output = processor(by: 'data.n', min: 0).given(*json).output
86
+ output.size.should == 2
87
+ output.first[0].to_f.should be_within(0.1).of(0.0)
88
+ output.last[1].to_f.should be_within(0.1).of(100.0)
89
+ end
90
+
91
+ end
92
+ end
@@ -0,0 +1,11 @@
1
+ require 'spec_helper'
2
+
3
+ describe "Reducers" do
4
+ describe :count do
5
+ include_context "reducers"
6
+ it_behaves_like 'a processor', :named => :count
7
+ it "should emit the total count of records" do
8
+ processor.given(*strings).should emit(4)
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,21 @@
1
+ require 'spec_helper'
2
+
3
+ describe "Reducers" do
4
+ describe :group do
5
+ include_context "reducers"
6
+ it_behaves_like 'a processor', :named => :group
7
+
8
+ let(:grouped_strings) { [['apple', 2], ['banana', 1], ['cookie', 1]] }
9
+ let(:grouped_nums_from_json) { [[nil, 2], [1, 1], [5, 1], [10, 1], [100, 1]] }
10
+ let(:grouped_nums_from_tsv) { [[nil, 2], ['1', 1], ['5', 1], ['10', 1], ['100', 1]] }
11
+ it "will group single values" do
12
+ processor(:group).given(*strings.sort).should emit(*grouped_strings)
13
+ end
14
+ it "can group from within a JSON hash" do
15
+ proc = processor(:group, by: 'data.n').given(*json_sorted_n).should emit(*grouped_nums_from_json)
16
+ end
17
+ it "can group from within a TSV row" do
18
+ proc = processor(:group, by: '3').given(*tsv_sorted).should emit(*grouped_nums_from_tsv)
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,25 @@
1
+ require 'spec_helper'
2
+
3
+ describe "Reducers" do
4
+ describe :join_xml do
5
+ include_context "reducers"
6
+ it_behaves_like 'a processor', :named => :join_xml
7
+
8
+ it "joins XML spread out over multiple lines" do
9
+ processor(:join_xml).given('<xml>first line', 'second line', 'third line</xml>').should emit("<xml>first line\nsecond line\nthird line</xml>")
10
+ end
11
+
12
+ it "joins XML one-per-line" do
13
+ processor(:join_xml).given('<xml>first line</xml>', '<xml>second line</xml>', '<xml>third line</xml>').should emit('<xml>first line</xml>', '<xml>second line</xml>', '<xml>third line</xml>')
14
+ end
15
+
16
+ it "joins XML split in the middle of a line" do
17
+ processor(:join_xml).given('<xml>first line', 'second</xml><xml> line', 'third line</xml>').should emit("<xml>first line\nsecond</xml>", "<xml> line\nthird line</xml>")
18
+ end
19
+
20
+ it "joins XML with a custom tag" do
21
+ processor(:join_xml, root: 'foobar').given('<foobar>first line', 'second line', 'third line</foobar>').should emit("<foobar>first line\nsecond line\nthird line</foobar>")
22
+ end
23
+
24
+ end
25
+ end
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
+ describe "Reducers" do
4
+ describe :moments do
5
+ include_context "reducers"
6
+ it_behaves_like 'a processor', :named => :moments
7
+
8
+ it "behaves like group when not called with any arguments" do
9
+ processor(:moments).given(*strings.sort).should emit(
10
+ {group: 'apple', count: 2, results: {}},
11
+ {group: 'banana', count: 1, results: {}},
12
+ {group: 'cookie', count: 1, results: {}}
13
+ )
14
+ end
15
+
16
+ it "behaves calculates the moments of numeric fields" do
17
+ processor(:moments, group_by: 'outer', of: 'data.n').given(*json_sorted_outer).should emit(
18
+ {group: nil, count: 2, results: {"data.n" => {}}},
19
+ {group: 'apple', count: 2, results: {"data.n"=>{:count=>2, :mean=>3.0, :std_dev=>2.0}}},
20
+ {group: 'banana', count: 1, results: {"data.n"=>{:count=>1, :mean=>100.0, :std_dev=>0.0}}},
21
+ {group: 'cookie', count: 1, results: {"data.n"=>{:count=>1, :mean=>10.0, :std_dev=>0.0}}}
22
+ )
23
+ end
24
+
25
+ it "will leave off the standard deviation if desired" do
26
+ processor(:moments, group_by: 'outer', of: 'data.n', no_std_dev: true).given(*json_sorted_outer).should emit(
27
+ {group: nil, count: 2, results: {"data.n" => {}}},
28
+ {group: 'apple', count: 2, results: {"data.n"=>{:count=>2, :mean=>3.0 }}},
29
+ {group: 'banana', count: 1, results: {"data.n"=>{:count=>1, :mean=>100.0 }}},
30
+ {group: 'cookie', count: 1, results: {"data.n"=>{:count=>1, :mean=>10.0 }}}
31
+ )
32
+ end
33
+
34
+ end
35
+ end
36
+
@@ -0,0 +1,26 @@
1
+ require 'spec_helper'
2
+
3
+ describe "Reducers" do
4
+ describe :sort do
5
+ include_context "reducers"
6
+ it_behaves_like 'a processor', :named => :sort
7
+ it "will use ascending order by default" do
8
+ processor(:sort).given(*strings).should emit(*strings.sort)
9
+ end
10
+ it "can sort in reversed (descending) order" do
11
+ processor(:sort, reverse: true).given(*strings).should emit(*strings.sort.reverse)
12
+ end
13
+ it "will use lexical order by default" do
14
+ processor(:sort).given(*nums).should emit(*nums.sort)
15
+ end
16
+ it "can sort in numerical order" do
17
+ processor(:sort, numeric: true).given(*nums).should emit(*nums.map(&:to_i).sort.map(&:to_s))
18
+ end
19
+ it "can sort from within a JSON hash" do
20
+ proc = processor(:sort, numeric: true, on: 'data.n').given(*json).should emit(*json_sorted_n)
21
+ end
22
+ it "can sort from within a TSV row" do
23
+ proc = processor(:sort, numeric: true, on: '3').given(*tsv).should emit(*tsv_sorted)
24
+ end
25
+ end
26
+ end