ul-wukong 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +60 -0
  3. data/.gitmodules +6 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +19 -0
  6. data/.yardopts +6 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +17 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE.md +95 -0
  11. data/NOTES-travis.md +31 -0
  12. data/README-old.md +422 -0
  13. data/README.md +1308 -0
  14. data/Rakefile +28 -0
  15. data/TODO.md +99 -0
  16. data/bin/cutc +30 -0
  17. data/bin/cuttab +5 -0
  18. data/bin/greptrue +6 -0
  19. data/bin/md5sort +20 -0
  20. data/bin/setcat +11 -0
  21. data/bin/tabchar +5 -0
  22. data/bin/uniq-ord +59 -0
  23. data/bin/uniqc +3 -0
  24. data/bin/wu +34 -0
  25. data/bin/wu-clean-encoding +31 -0
  26. data/bin/wu-date +13 -0
  27. data/bin/wu-datetime +13 -0
  28. data/bin/wu-hist +3 -0
  29. data/bin/wu-lign +186 -0
  30. data/bin/wu-local +4 -0
  31. data/bin/wu-plus +9 -0
  32. data/bin/wu-source +5 -0
  33. data/bin/wu-sum +31 -0
  34. data/diagrams/wu_local.dot +39 -0
  35. data/diagrams/wu_local.dot.png +0 -0
  36. data/examples/Gemfile +38 -0
  37. data/examples/README.md +9 -0
  38. data/examples/basic/string_reverser.rb +23 -0
  39. data/examples/basic/tiny_count.rb +8 -0
  40. data/examples/basic/word_count/accumulator.rb +26 -0
  41. data/examples/basic/word_count/tokenizer.rb +13 -0
  42. data/examples/basic/word_count/word_count.rb +6 -0
  43. data/examples/dataflow/scraper_macro_flow.rb +28 -0
  44. data/examples/deploy_pack/Gemfile +6 -0
  45. data/examples/deploy_pack/README.md +6 -0
  46. data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
  47. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  48. data/examples/deploy_pack/config/environment.rb +1 -0
  49. data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
  50. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  51. data/examples/dsl/dataflow/simple.rb +12 -0
  52. data/examples/dsl/dataflow/telegram.rb +45 -0
  53. data/examples/dsl/workflow/cherry_pie.dot +97 -0
  54. data/examples/dsl/workflow/cherry_pie.md +104 -0
  55. data/examples/dsl/workflow/cherry_pie.png +0 -0
  56. data/examples/dsl/workflow/cherry_pie.rb +101 -0
  57. data/examples/empty/.gitkeep +0 -0
  58. data/examples/examples_helper.rb +9 -0
  59. data/examples/geo.rb +4 -0
  60. data/examples/geo/geo_grids.numbers +0 -0
  61. data/examples/geo/geolocated.rb +331 -0
  62. data/examples/geo/quadtile.rb +69 -0
  63. data/examples/geo/spec/geolocated_spec.rb +247 -0
  64. data/examples/geo/tile_fetcher.rb +77 -0
  65. data/examples/graph/implied_geolocation/README.md +63 -0
  66. data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
  67. data/examples/improver/tweet_summary.rb +73 -0
  68. data/examples/loadable.rb +2 -0
  69. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  70. data/examples/munging/airline_flights/airplane.rb +0 -0
  71. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  72. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  73. data/examples/munging/airline_flights/indexable.rb +75 -0
  74. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  75. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  76. data/examples/munging/airline_flights/tasks.rake +83 -0
  77. data/examples/munging/airline_flights/topcities.rb +167 -0
  78. data/examples/munging/geo/geo_json.rb +54 -0
  79. data/examples/munging/geo/geo_models.rb +69 -0
  80. data/examples/munging/geo/geonames_models.rb +107 -0
  81. data/examples/munging/geo/iso_codes.rb +172 -0
  82. data/examples/munging/geo/reconcile_countries.rb +124 -0
  83. data/examples/munging/geo/tasks.rake +71 -0
  84. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  85. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  86. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  87. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  88. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  89. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  90. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  91. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
  92. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  93. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  94. data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
  95. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  96. data/examples/rake_helper.rb +97 -0
  97. data/examples/ruby_project/Gemfile +6 -0
  98. data/examples/ruby_project/README.md +6 -0
  99. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  100. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  101. data/examples/server_logs/logline.rb +95 -0
  102. data/examples/server_logs/models.rb +66 -0
  103. data/examples/server_logs/page_counts.pig +48 -0
  104. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  105. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  106. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  107. data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
  108. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  109. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  110. data/examples/serverlogs/models/logline.rb +102 -0
  111. data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
  112. data/examples/serverlogs/visit_paths/common.rb +4 -0
  113. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  114. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  115. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  116. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  117. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  118. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  119. data/examples/splitter.rb +94 -0
  120. data/examples/string_reverser.rb +7 -0
  121. data/examples/text/pig_latin/pig_latinizer.rb +35 -0
  122. data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
  123. data/examples/text/regional_flavor/README.md +14 -0
  124. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  125. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  126. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  127. data/examples/twitter.rb +5 -0
  128. data/lib/hanuman.rb +36 -0
  129. data/lib/hanuman/graph.rb +97 -0
  130. data/lib/hanuman/graphvizzer.rb +206 -0
  131. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  132. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  133. data/lib/hanuman/link.rb +35 -0
  134. data/lib/hanuman/registry.rb +46 -0
  135. data/lib/hanuman/stage.rb +128 -0
  136. data/lib/hanuman/tree.rb +67 -0
  137. data/lib/wu/geo.rb +4 -0
  138. data/lib/wu/geo/geo_grids.numbers +0 -0
  139. data/lib/wu/geo/geolocated.rb +331 -0
  140. data/lib/wu/geo/quadtile.rb +69 -0
  141. data/lib/wu/graph/union_find.rb +62 -0
  142. data/lib/wu/model/reconcilable.rb +63 -0
  143. data/lib/wu/munging.rb +71 -0
  144. data/lib/wu/social/models/twitter.rb +31 -0
  145. data/lib/wu/wikipedia/models.rb +20 -0
  146. data/lib/wukong.rb +54 -0
  147. data/lib/wukong/dataflow.rb +43 -0
  148. data/lib/wukong/doc_helpers.rb +14 -0
  149. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  150. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  151. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  152. data/lib/wukong/driver.rb +214 -0
  153. data/lib/wukong/driver/event_machine_driver.rb +15 -0
  154. data/lib/wukong/driver/wiring.rb +68 -0
  155. data/lib/wukong/local.rb +42 -0
  156. data/lib/wukong/local/runner.rb +96 -0
  157. data/lib/wukong/local/stdio_driver.rb +104 -0
  158. data/lib/wukong/logger.rb +102 -0
  159. data/lib/wukong/model/faker.rb +136 -0
  160. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  161. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  162. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  163. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  164. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  165. data/lib/wukong/plugin.rb +48 -0
  166. data/lib/wukong/processor.rb +110 -0
  167. data/lib/wukong/rake_helper.rb +6 -0
  168. data/lib/wukong/runner.rb +169 -0
  169. data/lib/wukong/runner/boot_sequence.rb +123 -0
  170. data/lib/wukong/runner/code_loader.rb +52 -0
  171. data/lib/wukong/runner/command_runner.rb +44 -0
  172. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  173. data/lib/wukong/runner/help_message.rb +42 -0
  174. data/lib/wukong/source.rb +33 -0
  175. data/lib/wukong/source/source_driver.rb +74 -0
  176. data/lib/wukong/source/source_runner.rb +38 -0
  177. data/lib/wukong/spec_helpers.rb +74 -0
  178. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  179. data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
  180. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  181. data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
  182. data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
  183. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
  184. data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
  185. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
  186. data/lib/wukong/version.rb +3 -0
  187. data/lib/wukong/widget/echo.rb +55 -0
  188. data/lib/wukong/widget/extract.rb +122 -0
  189. data/lib/wukong/widget/filters.rb +452 -0
  190. data/lib/wukong/widget/logger.rb +56 -0
  191. data/lib/wukong/widget/operators.rb +82 -0
  192. data/lib/wukong/widget/reducers.rb +10 -0
  193. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  194. data/lib/wukong/widget/reducers/bin.rb +368 -0
  195. data/lib/wukong/widget/reducers/count.rb +73 -0
  196. data/lib/wukong/widget/reducers/group.rb +128 -0
  197. data/lib/wukong/widget/reducers/group_concat.rb +98 -0
  198. data/lib/wukong/widget/reducers/improver.rb +71 -0
  199. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  200. data/lib/wukong/widget/reducers/moments.rb +72 -0
  201. data/lib/wukong/widget/reducers/sort.rb +180 -0
  202. data/lib/wukong/widget/reducers/uniq.rb +91 -0
  203. data/lib/wukong/widget/serializers.rb +317 -0
  204. data/lib/wukong/widget/utils.rb +46 -0
  205. data/lib/wukong/widgets.rb +7 -0
  206. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  207. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  208. data/spec/examples/dataflow/parsing_spec.rb +14 -0
  209. data/spec/examples/dataflow/simple_spec.rb +34 -0
  210. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  211. data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
  212. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  213. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  214. data/spec/examples/text/pig_latin_spec.rb +18 -0
  215. data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
  216. data/spec/hanuman/graph_spec.rb +119 -0
  217. data/spec/hanuman/hanuman_spec.rb +10 -0
  218. data/spec/hanuman/registry_spec.rb +123 -0
  219. data/spec/hanuman/stage_spec.rb +81 -0
  220. data/spec/hanuman/tree_spec.rb +119 -0
  221. data/spec/spec.opts +1 -0
  222. data/spec/spec_helper.rb +43 -0
  223. data/spec/support/example_test_helpers.rb +95 -0
  224. data/spec/support/hanuman_test_helpers.rb +92 -0
  225. data/spec/support/integration_helper.rb +38 -0
  226. data/spec/support/model_test_helpers.rb +115 -0
  227. data/spec/support/shared_context_for_graphs.rb +57 -0
  228. data/spec/support/shared_context_for_reducers.rb +37 -0
  229. data/spec/support/shared_examples_for_builders.rb +94 -0
  230. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  231. data/spec/wu/model/reconcilable_spec.rb +152 -0
  232. data/spec/wukong/dataflow_spec.rb +87 -0
  233. data/spec/wukong/driver_spec.rb +154 -0
  234. data/spec/wukong/local/runner_spec.rb +29 -0
  235. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  236. data/spec/wukong/local_spec.rb +6 -0
  237. data/spec/wukong/logger_spec.rb +49 -0
  238. data/spec/wukong/model/faker_spec.rb +132 -0
  239. data/spec/wukong/processor_spec.rb +21 -0
  240. data/spec/wukong/runner_spec.rb +132 -0
  241. data/spec/wukong/source_spec.rb +6 -0
  242. data/spec/wukong/widget/extract_spec.rb +101 -0
  243. data/spec/wukong/widget/filters_spec.rb +79 -0
  244. data/spec/wukong/widget/logger_spec.rb +23 -0
  245. data/spec/wukong/widget/operators_spec.rb +25 -0
  246. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  247. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  248. data/spec/wukong/widget/reducers/group_spec.rb +21 -0
  249. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  250. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  251. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  252. data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
  253. data/spec/wukong/widget/serializers_spec.rb +114 -0
  254. data/spec/wukong/widget/sink_spec.rb +19 -0
  255. data/spec/wukong/widget/source_spec.rb +65 -0
  256. data/spec/wukong/wu-local_spec.rb +109 -0
  257. data/spec/wukong/wu-source_spec.rb +32 -0
  258. data/spec/wukong/wu_spec.rb +14 -0
  259. data/spec/wukong/wukong_spec.rb +10 -0
  260. data/wukong.gemspec +35 -0
  261. metadata +465 -0
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env ruby
2
+ require_relative './common'
3
+
4
# Mapper: turns each parsed Logline into an [ip, unix-time, path] record.
class BreadcrumbsMapper < Wukong::Streamer::ModelStreamer
  self.model_klass = Logline

  # Emits one record per visit.
  #
  # @param visit the parsed log line (responds to ip, requested_at, path)
  # @yield [Array] ip, requested_at as an integer, and path
  def process(visit, *args)
    # return unless Settings.page_types.include?(visit.page_type)
    requested_at = visit.requested_at.to_i
    yield [visit.ip, requested_at, visit.path]
  end
end
11
+
12
# Reducer: collects the distinct paths seen for one key (the IP) and emits an
# edge for every pair of those paths, in both directions.
class BreadcrumbEdgesReducer < Wukong::Streamer::Reducer
  # Records are grouped on the IP alone; the timestamp and path are ignored
  # for grouping.
  def get_key(ip, itime, path)
    [ip]
  end

  # Resets the per-key path collection before handing off to the superclass.
  def start!(*args)
    @paths = Set.new
    super
  end

  # Remembers the path; the Set drops repeats of the same path.
  def accumulate(ip, itime, path)
    @paths << path
  end

  # For each pair of paths, emit the edge in both directions.
  #
  # Uses Array#combination instead of destructively shifting @paths, so the
  # accumulated set is left intact and no non-core helper (`present?`) is
  # needed. Pairs come out in the same order as the original shift-and-each
  # loop: the first path against every later one, then the second, and so on.
  #
  # @yield [Array] key, from-path, into-path
  def finalize
    @paths.to_a.combination(2) do |from, into|
      yield [key, from, into]
      yield [key, into, from]
    end
  end
end
36
+
37
+
38
# Script entry point: run the mapper/reducer pair with :sort_fields set to 2.
Wukong.run(BreadcrumbsMapper, BreadcrumbEdgesReducer, sort_fields: 2)
@@ -0,0 +1,94 @@
1
+ require 'wukong'
2
+
3
# Mixin of placeholder hooks included by the example processors in this file.
# Every method is a no-op that returns nil; the logging calls are disabled.
module Verbose
  # Always nil (falsy): the predicate has no implementation yet.
  def verbose?
    nil
  end

  # No-op setup hook.
  def setup
    # log.info("Setting up #{label}")
    nil
  end

  # No-op finalize hook.
  def finalize
    # log.info("Finalizing #{label}")
    nil
  end
end
16
+
17
# Emits the upper-cased input string.
Wukong.processor(:upcaser) do
  include Verbose

  def process(string)
    # log.info("#process #{string}")
    shouted = string.upcase
    yield shouted
  end
end

# Emits the lower-cased input string.
Wukong.processor(:downcaser) do
  include Verbose

  def process(string)
    # log.info("#process #{string}")
    lowered = string.downcase
    yield lowered
  end
end

# Splits the input on whitespace and emits each token separately.
Wukong.processor(:tokenizer) do
  include Verbose

  def process(string)
    # log.info("#process #{string}")
    string.split.each do |token|
      yield token
    end
  end
end

# Emits the input with every character that is neither a word character
# nor whitespace removed.
Wukong.processor(:stripper) do
  include Verbose

  def process(string)
    # log.info("#process #{string}")
    cleaned = string.gsub(/[^\w\s]/, '')
    yield cleaned
  end
end

# Emits the input with the vowels a, e, i, o, u (either case) removed.
Wukong.processor(:devoweler) do
  include Verbose

  def process(string)
    # log.info("#process #{string}")
    consonants_only = string.gsub(/[aeiou]/i, '')
    yield consonants_only
  end
end
56
+
57
+ # stripper = Wukong.registry.retrieve(:stripper)
58
+ # tokenizer = Wukong.registry.retrieve(:tokenizer)
59
+ # upcaser = Wukong.registry.retrieve(:upcaser)
60
+ # downcaser = Wukong.registry.retrieve(:downcaser)
61
+ # devoweler = Wukong.registry.retrieve(:devoweler)
62
+
63
+ # Splitter = Class.new(Wukong::Dataflow)
64
+ # builder = Wukong::DataflowBuilder.receive({label: :splitter,
65
+ # for_class: Splitter,
66
+ # stages: {
67
+ # stripper: stripper,
68
+ # tokenizer: tokenizer,
69
+ # upcaser: upcaser,
70
+ # downcaser: downcaser,
71
+ # devoweler: devoweler,
72
+ # },
73
+ # links: [
74
+ # Hanuman::LinkFactory.connect(:simple, :stripper, :tokenizer),
75
+ # Hanuman::LinkFactory.connect(:simple, :tokenizer, :upcaser),
76
+ # Hanuman::LinkFactory.connect(:simple, :tokenizer, :downcaser),
77
+ # Hanuman::LinkFactory.connect(:simple, :upcaser, :devoweler),
78
+ # ]})
79
+
80
+ # builder.extract_links!
81
+ # Splitter.set_builder(builder)
82
+ # Splitter.register
83
+
84
# Example dataflow: stripper feeds tokenizer, whose output is piped into an
# array of two branches (presumably a fan-out in the dataflow DSL -- confirm):
#   * upcaser | devoweler, itself piped into [regexp | count, identity]
#   * downcaser followed by a reject stage that drops the word 'hell'
Wukong.dataflow(:splitter) do
  stripper | tokenizer |
    [
      upcaser | devoweler |
        [
          regexp | count,
          identity
        ],
      downcaser | reject { |token| token == 'hell' }
    ]
end
@@ -0,0 +1,7 @@
1
+ # A simple processor in its own little file.
2
# Processor that emits each input line reversed; registered under the
# label :string_reverser.
class StringReverser < Wukong::Processor
  # @param line the incoming record (must respond to #reverse)
  # @yield the reversed line
  def process(line)
    reversed = line.reverse
    yield reversed
  end

  register(:string_reverser)
end
@@ -0,0 +1,35 @@
1
+ require File.expand_path('../examples_helper', File.dirname(__FILE__))
2
+
3
# Processor that rewrites every word of a line into pig latin.
Wukong.processor :pig_latinize do

  CONSONANTS = "bcdfghjklmnpqrstvwxz"
  UPPERCASE  = /^[A-Z]/

  # Regular expression to identify the parts of a pig-latin-izable word
  PIG_LATIN_WORD_RE = %r{
    \b                  # word boundary
    ([#{CONSONANTS}]*)  # all initial consonants
    ([\w\']+)           # remaining word characters
  }xi

  # Moves each word's leading consonants to its end and appends "ay".
  # A word with no leading consonants gets "w" as its moved part, and the
  # result is capitalized when the moved part began with an upper-case letter.
  #
  # @param line [String] text to translate
  # @return [String] the translated text
  def latinize(line)
    line.gsub(PIG_LATIN_WORD_RE) do
      word_match = Regexp.last_match
      onset      = word_match[1]
      remainder  = word_match[2]
      onset = 'w' if onset.blank?
      remainder.capitalize! if onset =~ UPPERCASE
      "#{remainder}#{onset.downcase}ay"
    end
  end

  # Emits the pig-latin form of the incoming line.
  def process(line)
    translated = latinize(line)
    emit translated
  end

end
29
+
30
# Example dataflow: reads the sample text, runs it through pig_latinize and
# writes the result under the :tmp path.
ExampleUniverse.dataflow(:pig_latin) do
  source_path = Pathname.path_to(:data, 'text/gift_of_the_magi.txt')
  set_input(:default, file_source(source_path))

  sink_path = Pathname.path_to(:tmp, 'text/pig_latin/gift_of_the_magi.txt')
  set_output(:default, file_sink(sink_path))

  input(:default) > pig_latinize > output(:default)
end
@@ -0,0 +1,16 @@
1
# Widget-style dataflow: one HTTP input and two mysql outputs, wired so the
# raw text goes to :original_texts and, via pig_latinizer, to :latinized_texts.
Wukong.dataflow(:pig_latinizer) do
  doc <<-DOC
    Accepts plaintext documents posted to its HTTP listener,
    translates it into pig latin, and archives both the
    translated and original texts into a mysql database
  DOC

  input(:raw_texts, http_listener(port: 8300))
  output(:original_texts, mysql_sink)
  output(:latinized_texts, mysql_sink)

  input(:raw_texts) > many_to_many([
    :original_texts,
    pig_latinizer > :latinized_texts
  ])
end
@@ -0,0 +1,14 @@
1
+ # Find the Regional Flavor of topics using Geolocated Wikipedia Articles
2
+
3
+ (Chapter 1 of "Big Data for Chimps")
4
+
5
+ 1. article -> wordbag
6
+ 2. join on page data to get geolocation
7
+ 3. use pagelinks to get larger pool of implied geolocations
8
+ 4. turn geolocations into quadtile keys
9
+ 5. aggregate topics by quadtile
10
+ 6. take summary statistics aggregated over term and quadkey
11
+ 7. combine those statistics to identify terms that occur more frequently than the base rate would predict
12
+ 8. explore and validate the results
13
+ 9. filter to find strongly-flavored words, and other reductions of the data for visualization
14
+
@@ -0,0 +1,39 @@
1
+ %declare wukong_dir '/Users/flip/ics/core/wukong'
2
+ %declare data_dir '$wukong_dir/data'
3
+ %declare dsfp_dir '/Users/flip/ics/data_science_fun_pack'
4
+ ;
5
+
6
+ register '$dsfp_dir/pig/varaha/target/varaha-1.0-SNAPSHOT.jar';
7
+ register '$dsfp_dir/pig/varaha/lib/mallet-2.0.7-RC2.jar';
8
+ register '$dsfp_dir/pig/varaha/lib/trove-2.0.4.jar';
9
+ register '$dsfp_dir/pig/varaha/lib/lucene-core-3.1.0.jar';
10
+ register '$dsfp_dir/pig/varaha/lib/pygmalion-1.1.0-SNAPSHOT.jar';
11
+ register '$dsfp_dir/pig/pigsy/target/pigsy-2.1.0-SNAPSHOT.jar';
12
+ register '$dsfp_dir/pig/datafu/dist/datafu-0.0.6-SNAPSHOT.jar';
13
+
14
+ define JsonStrToString pigsy.pig.json.JsonStrToString();
15
+ define TokenizeText varaha.text.TokenizeText();
16
+ define LDATopics varaha.topic.LDATopics();
17
+ define RangeConcat org.pygmalion.udf.RangeBasedStringConcat('0', ' ');
18
+
19
+ -- Load the string-handling torture-test fixtures (each row holds a string plus its JSON-encoded and escaped forms)
20
+ torture_strings = LOAD '$data_dir/helpers/torture/string_handling_test.tsv' AS (
21
+ desc:chararray, len:int, bytesize:int, str:chararray, has_str:int, jsonized_str:chararray,
22
+ escaped_chars:chararray, escaped_bytes:chararray, chars_list:chararray, bytes_list:chararray
23
+ );
24
+
25
+ -- Decode each JSON-encoded string and flag whether it matches the original string and its recorded length
26
+ decoded = FOREACH torture_strings {
27
+ unjsonized_str = JsonStrToString(jsonized_str);
28
+ is_str_equal = (((str == unjsonized_str) OR (has_str == 0)) ? 1 : 0);
29
+ is_len_equal = ( len == SIZE(unjsonized_str) ? 1 : 0 );
30
+ GENERATE
31
+ is_str_equal,
32
+ is_len_equal,
33
+ str AS str,
34
+ unjsonized_str AS unjsonized_str
35
+ ;
36
+ };
37
+
38
+ -- take a dump on your terminal window
39
+ DUMP decoded;
@@ -0,0 +1,4 @@
1
+
2
+ #
3
+ # Summarize each article as a wordbag.
4
+ #
@@ -0,0 +1,27 @@
1
+ %declare wukong_dir '/Users/flip/ics/core/wukong'
2
+ %declare data_dir '$wukong_dir/data'
3
+ %declare dsfp_dir '/Users/flip/ics/data_science_fun_pack'
4
+ ;
5
+
6
+ register '$dsfp_dir/pig/datafu/dist/datafu-0.0.6-SNAPSHOT.jar';
7
+ register '$dsfp_dir/pig/pigsy/target/pigsy-2.1.0-SNAPSHOT.jar';
8
+
9
+ define RandInt datafu.pig.numbers.RandInt();
10
+ define ConcatBag com.infochimps.hadoop.pig.ConcatBag();
11
+
12
+ -- Load the data: the integers from 0 .. 1023
13
+ ones = LOAD '$data_dir/helpers/numbers/integers-1ki.tsv' AS (val:int);
14
+
15
+ -- Generate a random integer between 0 and n
16
+ rands = FOREACH ones GENERATE val, RandInt(0,val) AS rand_val;
17
+ rand_str_g = GROUP rands ALL;
18
+ rand_str_s = FOREACH rand_str_g {
19
+ joined_str = ConcatBag(rands.(rand_val));
20
+ len = SIZE(joined_str);
21
+ GENERATE joined_str AS joined_str, len AS len;
22
+ };
23
+
24
+ -- Store the result under /tmp/dump
25
+ STORE rand_str_s INTO '/tmp/dump';
26
+
27
+ DESCRIBE rand_str_s;
@@ -0,0 +1,5 @@
1
# Twitter stream dataflow: parse each record as JSON and drop records that
# carry a "delete" key.
#
# The original ended in a dangling `|` followed by an unclosed `[`, so the
# file did not parse. The pipeline now stops after the reject stage.
# TODO: add the downstream branches that the unfinished `[` was meant to hold.
Wukong.dataflow(:twitter) do
  from_json | reject { |obj| obj["delete"] }
end
@@ -0,0 +1,36 @@
1
+ require 'gorillib/some'
2
+ require 'gorillib/model'
3
+
4
+ require 'hanuman/registry'
5
+ require 'hanuman/link'
6
+ require 'hanuman/stage'
7
+ require 'hanuman/graph'
8
+ require 'hanuman/tree'
9
+
10
module Hanuman
  # Helpers that expose builders through short module-level methods
  # (Hanuman.stage, Hanuman.graph).
  module Shortcuts

    # Looks the label up in the global registry, or creates a fresh builder of
    # the given type, then runs the builder's `define` with the remaining
    # arguments and block.
    #
    # @param builder_type builder class used when the label is not registered
    # @param label        name of the stage or graph
    # @return whatever the builder's `define` returns
    def builder_shortcut(builder_type, label, *args, &blk)
      builder =
        if GlobalRegistry.registered?(label)
          GlobalRegistry.retrieve(label)
        else
          builder_type.receive(label: label)
        end
      # Only graph builders are decorated with the registry.
      GlobalRegistry.decorate_with_registry(builder) if builder.is_a?(GraphBuilder)
      builder.define(*args, &blk)
    end

    # Defines a singleton method `method_name` on the receiver that forwards
    # to builder_shortcut with the given builder type.
    def add_shortcut_method_for(method_name, builder_type)
      define_singleton_method(method_name) do |label, *args, &blk|
        builder_shortcut(builder_type, label, *args, &blk)
      end
    end

    # @return the registry shared by all shortcuts
    def registry
      Hanuman::GlobalRegistry
    end

  end

  extend Hanuman::Shortcuts

  add_shortcut_method_for(:stage, StageBuilder)
  add_shortcut_method_for(:graph, GraphBuilder)

end
@@ -0,0 +1,97 @@
1
+ module Hanuman
2
+
3
# Behaviour shared by Graph and GraphBuilder. Relies on the including class
# providing `stages` (a Hash keyed by label) and `links` (an Array).
module GraphInstanceMethods
  # Iterates over every stage in the graph.
  def each_stage(&block)
    stages.values.each(&block)
  end

  # Stages at the `into` end of links leaving `stage`; with no argument,
  # the `into` end of every link. Labels with no known stage are dropped.
  def descendents(stage = nil)
    outgoing = links.select { |link| !stage || link.from == stage.label }
    labels   = outgoing.map(&:into).uniq
    labels.map { |label| stages[label] }.compact
  end

  # Stages at the `from` end of links arriving at `stage`; with no argument,
  # the `from` end of every link. Labels with no known stage are dropped.
  def ancestors(stage = nil)
    incoming = links.select { |link| !stage || link.into == stage.label }
    labels   = incoming.map(&:from).uniq
    labels.map { |label| stages[label] }.compact
  end

  # Stores the stage under its label, replacing any stage already there.
  def add_stage(stage)
    stages[stage.label] = stage
  end

  # @return [Boolean] true when a link runs from `from` to `into`
  def has_link?(from, into)
    links.any? { |link| link.from == from.label && link.into == into.label }
  end

  # Registers both stages and appends a link of the given type between them.
  def add_link(type, from, into)
    add_stage(from)
    add_stage(into)
    new_link = Hanuman::LinkFactory.connect(type, from.linkable_name(:in), into.linkable_name(:out))
    links << new_link
  end
end
34
+
35
# A stage that is itself made of stages and the links between them.
class Graph < Stage
  include GraphInstanceMethods

  field :stages, Hash,  default: {}
  field :links,  Array, default: []
end
41
+
42
# Builder that assembles a graph out of stage builders and their links.
class GraphBuilder < StageBuilder

  include GraphInstanceMethods

  field :stages, Hash,  :default => {}
  field :links,  Array, :default => []

  # Evaluates the DSL block against this builder, recollects the links from
  # its stages, and registers the graph class.
  #
  # @return whatever `register` on the graph class returns
  def define(&blk)
    graph = for_class || define_class(label)
    instance_eval(&blk) if block_given?
    extract_links!
    graph.register
  end

  # Builds every stage with its stage-specific options, then instantiates
  # the graph class with the built stages.
  #
  # @param options [Hash] shared options; a value stored under a stage's name
  #   is merged in for that stage only
  # @return the object returned by `for_class.receive`
  def build(options = {})
    # stage_specific_options deletes keys from the hash it is handed. Work on
    # a copy so the caller's hash does not lose its per-stage scopes; each
    # stage still receives exactly the options it received before.
    options = options.dup
    attrs   = serialize
    built   = attrs.delete(:stages).each_with_object({}) do |(name, builder), acc|
      acc[name] = builder.build(stage_specific_options(name, options))
    end
    for_class.receive attrs.merge(stages: built)
  end

  # Returns the options with the entry stored under `stage` merged over them.
  # NOTE: removes that entry from the `options` hash passed in.
  def stage_specific_options(stage, options)
    scope = options.delete(stage) || {}
    options.merge(scope)
  end

  def namespace() Hanuman::Graph ; end

  # Merges trailing option-hash arguments into the stage and returns it.
  def handle_dsl_arguments_for(stage, *args, &blk)
    options = args.extract_options!
    stage.merge!(options)
    stage
  end

  # Rebuilds this builder's links, in place, from the links of its stages.
  def extract_links!
    self.links.replace([])
    stages.each_pair { |_name, builder| links << builder.links }
    links.flatten!
  end

  # Attributes with :args folded into the top level and :for_class removed.
  def serialize
    attrs = attributes
    args  = attrs.delete(:args)
    attrs.delete(:for_class)
    attrs.merge(args)
  end

  # Copies this builder: attributes and links are dup'ed, and every stage
  # builder is cloned.
  #
  # The original mapped over `stages` as [name, builder] pairs and cloned the
  # pair, which left the clone sharing builder objects with the original.
  # Each builder is now cloned under its own name.
  def clone
    # Some values cannot be dup'ed on older Rubies; fall back to the object.
    dup_or_self   = lambda { |obj| obj.dup rescue obj }
    plain_attrs   = serialize.reject { |key, _val| key == :stages }
    cloned_attrs  = Hash[plain_attrs.map { |key, val| [dup_or_self.call(key), dup_or_self.call(val)] }]
    cloned_links  = links.map { |link| link.dup }
    cloned_stages = Hash[stages.map { |name, builder| [name, builder.clone] }]
    self.class.receive(cloned_attrs.merge(links: cloned_links).merge(stages: cloned_stages).merge(for_class: for_class))
  end

end
96
+
97
+ end
@@ -0,0 +1,206 @@
1
+ require 'graphviz'
2
+
3
module Hanuman
  # Renders a Hanuman graph as a Graphviz digraph image.
  class Graphvizzer

    attr_accessor :hanuman_graph, :visual

    # @param hanuman_graph graph to draw; must respond to label,
    #   directed_sort and links
    def initialize(hanuman_graph)
      @hanuman_graph = hanuman_graph
      @visual = GraphViz.new(@hanuman_graph.label, :type => :digraph)
    end

    # Adds one node per entry of directed_sort and one edge per link, then
    # writes the drawing as a PNG.
    #
    # @param path [String] destination file; defaults to 'test.png', the
    #   location that used to be hard-coded
    # @return whatever GraphViz#output returns
    def create(path = 'test.png')
      hanuman_graph.directed_sort.each { |stage| visual.add_nodes(stage.to_s) }
      hanuman_graph.links.each { |link| visual.add_edges(link.from.to_s, link.into.to_s) }
      visual.output(:png => path)
    end

  end
end
21
+
22
+ # module Hanuman
23
+ # module Graphvizzer
24
+ # include Gorillib::Builder
25
+
26
+ # class Item
27
+ # include Gorillib::Builder
28
+ # alias_method :configurate, :receive!
29
+
30
+ # field :name, Symbol
31
+ # field :label, String, :default => ->{ name }
32
+ # field :owner, Item
33
+
34
+ # def initialize(attrs={}, &block)
35
+ # receive!(attrs, &block)
36
+ # end
37
+
38
+ # def depth
39
+ # owner.depth + 1
40
+ # end
41
+
42
+ # def indent(adj=0)
43
+ # " " * (depth + adj)
44
+ # end
45
+
46
+ # def quote(str)
47
+ # return str if str.to_s.include?('"')
48
+ # %Q{"#{str}"}
49
+ # end
50
+
51
+ # def line(str, attrs={}, term=';')
52
+ # if attrs.empty?
53
+ # attr_strs = ''
54
+ # else
55
+ # width = 40 - indent.length
56
+ # str = "%-#{width}s" % str
57
+ # attr_strs = attrs.map{|attr, val| attrib(attr, val) }
58
+ # attr_strs = "[ #{attr_strs.join(",")} ]"
59
+ # end
60
+ # [indent, str, attr_strs, term].join
61
+ # end
62
+
63
+ # def attrib(attr, val)
64
+ # "#{attr} = #{val}"
65
+ # end
66
+
67
+ # def brace(str)
68
+ # "#{indent}#{str} {"
69
+ # end
70
+ # def close_brace
71
+ # "#{indent}}"
72
+ # end
73
+
74
+ # def pad_with_attributes(text, attrs=nil)
75
+ # width = 40 - (2 * graph.depth)
76
+ # if attrs then
77
+ # attr_strs = attrs.map{|attr, val| attribute_str(attr, val) }
78
+ # "%-#{width}s [ %s ]" % [text, attr_strs.join(',')]
79
+ # else
80
+ # text
81
+ # end
82
+ # end
83
+ # end
84
+
85
+ # class Node < Item
86
+ # field :inslots, Array, :default => []
87
+ # field :outslots, Array, :default => []
88
+ # field :shape, Symbol, :default => :Mrecord
89
+
90
+ # def graph_attribs
91
+ # {
92
+ # :shape => shape,
93
+ # :label => quote(shape =~ /record/ ? structured_label : label),
94
+ # # :fixedsize => true, :width => "1.0",
95
+ # }
96
+ # end
97
+
98
+ # def inslots_str
99
+ # inslots.map{|slot| "<#{slot}>#{slot[0..0]}"}.join("|")
100
+ # end
101
+
102
+ # def outslots_str
103
+ # outslots.map{|slot| "<out_#{slot}>#{slot[0..0]}"}.join("|")
104
+ # end
105
+
106
+ # def label
107
+ # super.to_s.gsub(/_\d+$/, '').gsub(/[_\.]+/, "\\n")
108
+ # end
109
+
110
+ # def structured_label
111
+ # str = "{"
112
+ # str << "{" << inslots_str << "}|" unless inslots.empty?
113
+ # str << label
114
+ # str << "|{" << outslots_str << "}" unless outslots.empty?
115
+ # str << "}"
116
+ # end
117
+
118
+ # def to_s
119
+ # str = []
120
+ # str << line(quote(name), graph_attribs)
121
+ # str.join("\n")
122
+ # end
123
+ # end
124
+
125
+ # class Edge < Item
126
+ # magic :from, String
127
+ # magic :into, String
128
+
129
+ # def to_s
130
+ # str = ""
131
+ # str << quote(from)
132
+ # str << " -> "
133
+ # str << quote(into)
134
+ # line(str)
135
+ # end
136
+ # end
137
+
138
+ # class Graph < Item
139
+ # field :items, Array, :default => []
140
+ # field :edges, Array, :default => []
141
+
142
+ # def graph(name, attrs={})
143
+ # obj = Graph.new(attrs.merge(:name => name, :owner => self))
144
+ # items << obj
145
+ # yield(obj) if block_given?
146
+ # obj
147
+ # end
148
+
149
+ # def node(name, attrs={})
150
+ # obj = Node.new(attrs.merge(:name => name, :owner => self))
151
+ # items << obj
152
+ # yield(obj) if block_given?
153
+ # obj
154
+ # end
155
+
156
+ # def edge(from, into, from_slot=nil, into_slot=nil)
157
+ # obj = Edge.new(
158
+ # :name => name, :owner => self,
159
+ # :from => from, :into => into,
160
+ # :from_slot => from_slot, :into_slot => into_slot)
161
+ # edges << obj
162
+ # yield(obj) if block_given?
163
+ # obj
164
+ # end
165
+
166
+ # def to_s
167
+ # str = []
168
+ # str << brace("subgraph #{quote("cluster_#{name}")}")
169
+ # str << line(attrib(" label", quote(label)))
170
+ # items.each do |item|
171
+ # str << item.to_s
172
+ # end
173
+ # edges.each do |edge|
174
+ # str << edge.to_s
175
+ # end
176
+ # str << close_brace
177
+ # str.join("\n")
178
+ # end
179
+ # end
180
+
181
+ # class Universe < Graph
182
+ # field :orient, Symbol, :doc => 'one of :TB, :BT, :LR, :RL', :default => :TB
183
+ # field :engine, Symbol, :default => :dot
184
+
185
+ # def to_s
186
+ # str = []
187
+ # str << brace("digraph #{name}")
188
+ # str << line(" rankdir = #{orient}")
189
+ # items.each do |item|
190
+ # str << item.to_s
191
+ # end
192
+ # str << close_brace
193
+ # str.join("\n")
194
+ # end
195
+
196
+ # def depth() 0; end
197
+
198
+ # def save(path, type=nil)
199
+ # File.open "#{path}.dot", "w" do |f|
200
+ # f.puts self.to_s
201
+ # end
202
+ # system "#{engine} -T#{type} #{path}.dot > #{path}.#{type}" if type
203
+ # end
204
+ # end
205
+ # end
206
+ # end