wukong 3.0.0.pre2 → 3.0.0.pre3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. data/Gemfile +13 -0
  2. data/README.md +182 -6
  3. data/bin/wu-local +13 -5
  4. data/bin/wu-server +1 -1
  5. data/examples/Gemfile +2 -1
  6. data/examples/basic/string_reverser.rb +23 -0
  7. data/examples/{tiny_count.rb → basic/tiny_count.rb} +0 -0
  8. data/examples/{word_count → basic/word_count}/accumulator.rb +0 -0
  9. data/examples/{word_count → basic/word_count}/tokenizer.rb +0 -0
  10. data/examples/{word_count → basic/word_count}/word_count.rb +0 -0
  11. data/examples/deploy_pack/Gemfile +7 -0
  12. data/examples/deploy_pack/README.md +6 -0
  13. data/examples/{text/latinize_text.rb → deploy_pack/a/b/c/.gitkeep} +0 -0
  14. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  15. data/examples/deploy_pack/config/environment.rb +1 -0
  16. data/examples/{dataflow → dsl/dataflow}/fibonacci_series.rb +0 -0
  17. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  18. data/examples/{dataflow → dsl/dataflow}/simple.rb +0 -0
  19. data/examples/{dataflow → dsl/dataflow}/telegram.rb +0 -0
  20. data/examples/{workflow → dsl/workflow}/cherry_pie.dot +0 -0
  21. data/examples/{workflow → dsl/workflow}/cherry_pie.md +0 -0
  22. data/examples/{workflow → dsl/workflow}/cherry_pie.png +0 -0
  23. data/examples/{workflow → dsl/workflow}/cherry_pie.rb +0 -0
  24. data/examples/empty/.gitkeep +0 -0
  25. data/examples/graph/implied_geolocation/README.md +63 -0
  26. data/examples/graph/{minimum_spanning_tree.rb → minimum_spanning_tree/airfares_graphviz.rb} +0 -0
  27. data/examples/munging/airline_flights/indexable.rb +75 -0
  28. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  29. data/examples/munging/geo/geonames_models.rb +29 -0
  30. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +1 -0
  31. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  32. data/examples/munging/wikipedia/dbpedia/extract_links.rb +213 -146
  33. data/examples/rake_helper.rb +12 -0
  34. data/examples/ruby_project/Gemfile +7 -0
  35. data/examples/ruby_project/README.md +6 -0
  36. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  37. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  38. data/examples/serverlogs/models/logline.rb +102 -0
  39. data/examples/{dataflow/parse_apache_logs.rb → serverlogs/parser/apache_parser_widget.rb} +0 -0
  40. data/examples/serverlogs/visit_paths/common.rb +4 -0
  41. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  42. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  43. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  44. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  45. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  46. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  47. data/examples/text/{pig_latin.rb → pig_latin/pig_latinizer.rb} +0 -0
  48. data/examples/{dataflow/pig_latinizer.rb → text/pig_latin/pig_latinizer_widget.rb} +0 -0
  49. data/lib/hanuman/graph.rb +6 -1
  50. data/lib/wu/geo.rb +4 -0
  51. data/lib/wu/geo/geo_grids.numbers +0 -0
  52. data/lib/wu/geo/geolocated.rb +331 -0
  53. data/lib/wu/geo/quadtile.rb +69 -0
  54. data/{examples → lib/wu}/graph/union_find.rb +0 -0
  55. data/lib/wu/model/reconcilable.rb +63 -0
  56. data/{examples/munging/wikipedia/utils/munging_utils.rb → lib/wu/munging.rb} +7 -4
  57. data/lib/wu/social/models/twitter.rb +31 -0
  58. data/{examples/models/wikipedia.rb → lib/wu/wikipedia/models.rb} +0 -0
  59. data/lib/wukong.rb +9 -4
  60. data/lib/wukong/boot.rb +10 -1
  61. data/lib/wukong/driver.rb +65 -71
  62. data/lib/wukong/logger.rb +93 -0
  63. data/lib/wukong/processor.rb +38 -29
  64. data/lib/wukong/runner.rb +144 -0
  65. data/lib/wukong/server.rb +119 -0
  66. data/lib/wukong/spec_helpers.rb +1 -0
  67. data/lib/wukong/spec_helpers/integration_driver.rb +22 -9
  68. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +26 -4
  69. data/lib/wukong/spec_helpers/processor_helpers.rb +4 -10
  70. data/lib/wukong/spec_helpers/shared_examples.rb +12 -13
  71. data/lib/wukong/version.rb +1 -1
  72. data/lib/wukong/widget/processors.rb +13 -0
  73. data/lib/wukong/widget/serializers.rb +55 -65
  74. data/lib/wukong/widgets.rb +0 -2
  75. data/spec/hanuman/graph_spec.rb +14 -0
  76. data/spec/spec_helper.rb +4 -30
  77. data/spec/support/{wukong_test_helpers.rb → example_test_helpers.rb} +29 -2
  78. data/spec/support/integration_helper.rb +38 -0
  79. data/spec/support/model_test_helpers.rb +115 -0
  80. data/spec/wu/geo/geolocated_spec.rb +247 -0
  81. data/spec/wu/model/reconcilable_spec.rb +152 -0
  82. data/spec/wukong/widget/processors_spec.rb +0 -1
  83. data/spec/wukong/widget/serializers_spec.rb +88 -62
  84. data/spec/wukong/wu_local_spec.rb +125 -0
  85. data/wukong.gemspec +3 -16
  86. metadata +72 -266
  87. data/examples/dataflow/apache_log_line.rb +0 -100
  88. data/examples/jabberwocky.txt +0 -36
  89. data/examples/munging/Gemfile +0 -8
  90. data/examples/munging/airline_flights/airline.rb +0 -57
  91. data/examples/munging/airline_flights/airport.rb +0 -211
  92. data/examples/munging/airline_flights/flight.rb +0 -156
  93. data/examples/munging/airline_flights/models.rb +0 -4
  94. data/examples/munging/airline_flights/parse.rb +0 -26
  95. data/examples/munging/airline_flights/route.rb +0 -35
  96. data/examples/munging/airline_flights/timezone_fixup.rb +0 -62
  97. data/examples/munging/airports/40_wbans.txt +0 -40
  98. data/examples/munging/airports/filter_weather_reports.rb +0 -37
  99. data/examples/munging/airports/join.pig +0 -31
  100. data/examples/munging/airports/to_tsv.rb +0 -33
  101. data/examples/munging/airports/usa_wbans.pig +0 -19
  102. data/examples/munging/airports/usa_wbans.txt +0 -2157
  103. data/examples/munging/airports/wbans.pig +0 -19
  104. data/examples/munging/airports/wbans.txt +0 -2310
  105. data/examples/munging/rake_helper.rb +0 -62
  106. data/examples/munging/weather/.gitignore +0 -1
  107. data/examples/munging/weather/Gemfile +0 -4
  108. data/examples/munging/weather/Rakefile +0 -28
  109. data/examples/munging/weather/extract_ish.rb +0 -13
  110. data/examples/munging/weather/models/weather.rb +0 -119
  111. data/examples/munging/weather/utils/noaa_downloader.rb +0 -46
  112. data/examples/munging/wikipedia/README.md +0 -34
  113. data/examples/munging/wikipedia/Rakefile +0 -193
  114. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +0 -18
  115. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +0 -21
  116. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +0 -27
  117. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +0 -29
  118. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +0 -14
  119. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +0 -25
  120. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +0 -29
  121. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +0 -32
  122. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +0 -85
  123. data/examples/munging/wikipedia/pig_style_guide.md +0 -25
  124. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +0 -19
  125. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +0 -23
  126. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +0 -24
  127. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +0 -22
  128. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +0 -22
  129. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +0 -26
  130. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +0 -29
  131. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +0 -24
  132. data/examples/munging/wikipedia/utils/get_namespaces.rb +0 -86
  133. data/examples/munging/wikipedia/utils/namespaces.json +0 -1
  134. data/examples/string_reverser.rb +0 -26
  135. data/examples/twitter/locations.rb +0 -29
  136. data/examples/twitter/models.rb +0 -24
  137. data/examples/twitter/pt1-fiddle.pig +0 -8
  138. data/examples/twitter/pt2-simple_parse.pig +0 -31
  139. data/examples/twitter/pt2-simple_parse.rb +0 -18
  140. data/examples/twitter/pt3-join_on_zips.pig +0 -39
  141. data/examples/twitter/pt4-strong_links.rb +0 -20
  142. data/examples/twitter/pt5-lnglat_and_strong_links.pig +0 -16
  143. data/examples/twitter/states.tsv +0 -50
  144. data/examples/workflow/package_gem.rb +0 -55
  145. data/lib/wukong/widget/sink.rb +0 -16
  146. data/lib/wukong/widget/source.rb +0 -14
data/Gemfile CHANGED
@@ -1,3 +1,16 @@
1
1
  source :rubygems
2
2
 
3
3
  gemspec
4
+
5
+ group :development do
6
+ gem 'rake', '>= 0.9'
7
+ gem 'rspec', '>= 2.8'
8
+ gem 'guard', '>= 1.0'
9
+ gem 'guard-rspec', '>= 0.6'
10
+ gem 'simplecov', '>= 0.5'
11
+ gem 'pry'
12
+ gem 'yard'
13
+ gem 'redcarpet'
14
+ gem 'addressable'
15
+ gem 'htmlentities'
16
+ end
data/README.md CHANGED
@@ -131,7 +131,7 @@ the last example:
131
131
  ```
132
132
  # in find_t_words.rb
133
133
  Wukong.dataflow(:find_t_words) do
134
- tokenizer > regexp(match: /^t/)
134
+ tokenizer | regexp(match: /^t/)
135
135
  end
136
136
  ```
137
137
 
@@ -196,7 +196,7 @@ beginning and at the end
196
196
 
197
197
  ```ruby
198
198
  Wukong.dataflow(:complicated) do
199
- from_json > proc_1 > proc_2 > proc_3 ... proc_n > to_json
199
+ from_json | proc_1 | proc_2 | proc_3 ... proc_n | to_json
200
200
  end
201
201
  ```
202
202
 
@@ -222,11 +222,11 @@ arguments
222
222
 
223
223
  ```ruby
224
224
  Wukong.processor(:log_everything) do
225
- proc_1 > proc_2 > ... > logger
225
+ proc_1 | proc_2 | ... | logger
226
226
  end
227
227
 
228
228
  Wukong.processor(:log_everything_important) do
229
- proc_1 > proc_2 > ... > regexp(match: /important/i) > logger
229
+ proc_1 | proc_2 | ... | regexp(match: /important/i) | logger
230
230
  end
231
231
  ```
232
232
 
@@ -234,7 +234,7 @@ Other widgets require a block to define their action:
234
234
 
235
235
  ```ruby
236
236
  Wukong.processor(:log_everything_important) do
237
- parser > select { |record| record.priority =~ /important/i } > logger
237
+ parser | select { |record| record.priority =~ /important/i } | logger
238
238
  end
239
239
  ```
240
240
 
@@ -278,6 +278,182 @@ You can also use these within a more complicated dataflow:
278
278
 
279
279
  ```ruby
280
280
  Wukong.dataflow(:word_count) do
281
- tokenize > remove_stopwords > sort > group
281
+ tokenize | remove_stopwords | sort | group
282
282
  end
283
283
  ```
284
+
285
+ ## Testing
286
+
287
+ Wukong comes with several helpers to make writing specs using
288
+ [RSpec](http://rspec.info/) easier.
289
+
290
+ The only method that you need to test in a Processor is the `process`
291
+ method. The rest of the processor's methods and functionality are
292
+ provided by Wukong and are already tested.
293
+
294
+ You may want to test this process method in two ways:
295
+
296
+ * unit tests of the class itself in various contexts
297
+ * integration tests of running the class with the `wu-local` (or other) command-line runner
298
+
299
+ ### Unit Tests
300
+
301
+ Let's start with a simple processor
302
+
303
+ ```ruby
304
+ # in tokenizer.rb
305
+ Wukong.processor(:tokenizer) do
306
+ def process text
307
+ text.downcase.gsub(/[^\s\w]/,'').split.each do |token|
308
+ yield token
309
+ end
310
+ end
311
+ end
312
+ ```
313
+
314
+ You could test this processor directly:
315
+
316
+ ```ruby
317
+ # in spec/tokenizer_spec.rb
318
+ require 'spec_helper'
319
+ describe :tokenizer do
320
+ subject { Wukong::Processor::Tokenizer.new }
321
+ before { subject.setup }
322
+ after { subject.finalize ; subject.stop }
323
+ it "correctly counts tokens" do
324
+ expect { |b| subject.process("Hi there, Wukong!", &b) }.to yield_successive_args('hi', 'there', 'wukong')
325
+ end
326
+ end
327
+ ```
328
+
329
+ but having to handle the yield from the block yourself can lead to
330
+ verbose and unreadable tests. Wukong defines some helpers for this
331
+ case. Require and include them first in your `spec_helper.rb`:
332
+
333
+ ```ruby
334
+ # spec/spec_helper.rb
335
+ require 'wukong'
336
+ require 'wukong/spec_helpers'
337
+ RSpec.configure do |config|
338
+ config.include(Wukong::SpecHelpers)
339
+ end
340
+ ```
341
+
342
+ and then use them in your test
343
+
344
+ ```ruby
345
+ # in spec/tokenizer_spec.rb
346
+ require 'spec_helper'
347
+ describe :tokenizer do
348
+ it_behaves_like 'a processor', :named => :tokenizer
349
+ it "emits the correct number of tokens" do
350
+ processor.given("Hi there.\nMy name is Wukong!").should emit(6).records
351
+ end
352
+ it "eliminates all punctuation" do
353
+ processor.given("Never!").output.first.should_not include(',')
354
+ end
355
+ it "downcases all input text" do
356
+ processor.given("Whatever").output.first.should match(/^w/)
357
+ end
358
+ end
359
+ ```
360
+
361
+ Let's look at each kind of helper:
362
+
363
+ * The `a processor` shared example (invoked with RSpec's
364
+ `it_behaves_like` helper) adds some tests that ensure that the
365
+ processor conforms to the API of a Wukong::Processor.
366
+
367
+ * The `processor` method instantiates a processor very similarly to
368
+ the way `wu-local` instantiates one on the command-line. It accepts
369
+ a (registered) processor name and options and creates a new
370
+ processor. If no name is given, the argument of the enclosing
371
+ `describe` or `context` block is used. The object returned by
372
+ `processor` is the Wukong::Processor you're testing so you can
373
+ directly introspect on it or declare expectations about its
374
+ behavior.
375
+
376
+ * The `given` method (and other helpers like `given_json`,
377
+ `given_tsv`, &c.) is added to the Processor class when
378
+ Wukong::SpecHelpers is required. It's a way of lazily feeding
379
+ records to a processor, without having to go through the `process`
380
+ method directly and having to handle the block or the processor's
381
+ lifecycle as in the prior example.
382
+
383
+ * The `output` and `emit` matchers will `process` all previously
384
+ `given` records when they are called. This lets you separate
385
+ instantiation, input, expectations, and output. Here's a more
386
+ complicated example:
387
+
388
+ The same helpers can be used to test dataflows as well as
389
+ processors. For complete details, see documentation for the
390
+ Wukong::SpecHelpers module.
391
+
392
+ ### Integration Tests
393
+
394
+ Sometimes unit tests aren't enough and you need to test your
395
+ processors or flows as they will be run in production using
396
+ `wu-local`.
397
+
398
+ For these use cases, Wukong provides some integration helpers that
399
+ make testing command line processes easier.
400
+
401
+ ```ruby
402
+ # spec/integration/tokenizer_spec.rb
403
+ context "running the tokenizer with wu-local" do
404
+ subject { command("wu-local tokenizer") < "hi there" }
405
+ it { should exit_with(0) }
406
+ it { should have_stdout("hi", "there") }
407
+ end
408
+
409
+ context "interpreting its arguments" do
410
+ context "with a valid --match argument" do
411
+ subject { command("wu-local tokenizer --match='^hi'") < "hi there" }
412
+ it { should exit_with(0) }
413
+ it { should have_stdout("hi") }
414
+ it { should_not have_stdout("there") }
415
+ end
416
+ context "with a malformed --match argument" do
417
+ # invalid b/c the regexp is broken...
418
+ subject { command("wu-local tokenizer --match='^[h'") < "hi there" }
419
+ it { should exit_with(:non_zero) }
420
+ it { should have_stderr(/invalid/) }
421
+ end
422
+ end
423
+ ```
424
+
425
+ Let's go through the helpers:
426
+
427
+ * The `command` helper creates a wrapper around a command-line that will be launched. The command's environment and working directory will be taken from the current values of `ENV` and `Dir.pwd`, unless they are overridden in one of the following ways:
428
+
429
+ * The `in` or `using` arguments are chained with `command` to specify the working directory and environment:
430
+
431
+ ```ruby
432
+ command("some-command with --args").in("/my/working/directory").using("THIS" => "ENV_HASH", "WILL_BE" => "MERGED_OVER_EXISTING_ENV")
433
+ ```
434
+
435
+ * The scope in which the `command` helper is called defines methods `integration_cwd` and `integration_env`. This can be done through including a module in your `spec_helper.rb`:
436
+
437
+ ```ruby
438
+ # in spec/support/integration_helper.rb
439
+ module IntegrationHelper
440
+ def integration_cwd
441
+ "/my/working/directory"
442
+ end
443
+ def integration_env
444
+ { "THIS" => "ENV_HASH", "WILL_BE" => "MERGED_OVER_EXISTING_ENV" }
445
+ end
446
+ end
447
+
448
+ # in spec/spec_helper.rb
449
+ require_relative("support/integration_helper")
450
+ RSpec.configure do |config|
451
+ config.include(IntegrationHelper)
452
+ end
453
+ ```
454
+
455
+ * The `command` helper can accept input with the `<` method. Input can be either a String or an Array of strings. It will be passed to the command over STDIN.
456
+
457
+ * The `have_stdout` and `have_stderr` matchers let you test the STDOUT or STDERR of the command for particular strings or regular expressions.
458
+
459
+ * The `exit_with` matcher lets you test the exit code of the command. You can pass the symbol `:non_zero` to set the expectation of _any_ non-zero exit code.
data/bin/wu-local CHANGED
@@ -42,8 +42,8 @@ again test locally:
42
42
  clever
43
43
  EOF
44
44
 
45
- settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of the given path.", flag: 'r'
46
-
45
+ settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of the given path.", flag: 'r'
46
+ # settings.define :tcp_server, description: "Run locally as a tcp server on a specified port", default: false, flag: 't'
47
47
  require 'wukong/boot' ; Wukong.boot!(settings)
48
48
 
49
49
  thing = settings.rest.first
@@ -60,10 +60,18 @@ else
60
60
  settings.dump_help
61
61
  exit(2)
62
62
  end
63
- # p settings
63
+
64
+
65
+
64
66
  begin
65
- Wukong::LocalDriver.run(processor.to_sym, settings)
66
- rescue Wukong::Error => e
67
+ # EM.run do
68
+ # settings.tcp_server ? Wu::TCPServer.start(processor.to_sym, settings) : Wu::StdioServer.start(processor.to_sym, settings)
69
+ # end
70
+ StupidServer.new(processor.to_sym, settings).run!
71
+ rescue Wu::Error => e
67
72
  $stderr.puts e.message
68
73
  exit(3)
69
74
  end
75
+
76
+ # One day, it will be this easy...
77
+ # Wukong::LocalRunner.run!
data/bin/wu-server CHANGED
@@ -67,4 +67,4 @@ end
67
67
  EM::run {
68
68
  EM::start_server(Settings[:host], Settings[:port], WukongMachine)
69
69
  puts "Listening on #{Settings[:host]}:#{Settings[:port]}"
70
- }
70
+ }
data/examples/Gemfile CHANGED
@@ -11,7 +11,7 @@ gem "log4r"
11
11
  group :examples do
12
12
  gem "forgery"
13
13
  gem "nokogiri"
14
- # gem "sanitize"
14
+ gem "sanitize"
15
15
  gem "addressable"
16
16
  gem "forgery"
17
17
  gem "crack"
@@ -28,6 +28,7 @@ group :development do
28
28
  gem "simplecov", '>= 0.5'
29
29
  gem "pry"
30
30
  gem "ap"
31
+ gem "ruby-progressbar"
31
32
  end
32
33
 
33
34
  group :docs do
@@ -0,0 +1,23 @@
1
+ Wukong.processor(:string_reverser) do
2
+
3
+ def setup
4
+ log.info("Inside the setup method")
5
+ @count = 0
6
+ EM.add_periodic_timer(10){ notify('metrics', count: @count) }
7
+ end
8
+
9
+ def process(record)
10
+ @count += 1
11
+ yield record.reverse
12
+ yield nil
13
+ end
14
+
15
+ def finalize
16
+ log.info("Finalizing flow")
17
+ end
18
+
19
+ def stop
20
+ log.info("Inside the stop method")
21
+ end
22
+
23
+ end
File without changes
@@ -0,0 +1,7 @@
1
+ # This is a minimal Gemfile that lets Wukong boot up (but not work).
2
+ gem 'configliere'
3
+ gem 'gorillib'
4
+ gem 'multi_json'
5
+ gem 'vayacondios-client'
6
+ gem 'log4r'
7
+ gem 'eventmachine'
@@ -0,0 +1,6 @@
1
+ This directory contains the skeleton of a Wukong deploy pack. It's
2
+ used to test the behavior of Wukong's command line tools within such a
3
+ context.
4
+
5
+ The Gemfile here contains the minimal subset of gems that let Wukong
6
+ boot.
@@ -0,0 +1,5 @@
1
+ Wukong.processor(:string_reverser) do
2
+ def process string
3
+ yield string.reverse
4
+ end
5
+ end
@@ -0,0 +1 @@
1
+ require_relative("../app/processors/string_reverser.rb")
@@ -0,0 +1,28 @@
1
+ require 'wukong/widgets/sinks/hbase_record_sink.rb'
2
+
3
+ Wukong.chain(:friend_graph) do
4
+ tail(:scrapables) do
5
+ directory 'scrapables/ids-%{t:ymd}.tsv'
6
+ end
7
+
8
+ requester = decorator('tw_requester.rb') do
9
+ input :scrape_url, Url
10
+ output :raw_json_request, JsonString
11
+ config do
12
+ define :request_types, :default => [:follower_ids, :friend_ids], :doc => 'which requests to make: follower_ids, user_timeline, etc'
13
+ end
14
+ end
15
+
16
+ retriable_requester = retriable do
17
+ with :timeouts => [1,2,3]
18
+ on_failure :sleep
19
+ guest requester
20
+ end
21
+
22
+ tail(:scrapables)> retriable_requester > processor('tw_parse.rb') > hbase_record_sink
23
+ end
24
+
25
+ Wukong.processor(:tw_parse) do
26
+ def process
27
+ end
28
+ end
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,63 @@
1
+ # Implied Geolocation
2
+
3
+ * Some objects are explicitly geolocated: "Austin, Texas", "Cornell University", the "USS_Constitution".
4
+ * Some objects are not only geolocated, they are 'places' -- present as well in the geonames dataset.
5
+
6
+ The estimator is as follows:
7
+
8
+ * a best-estimate longitude and latitude
9
+ * the radius of uncertainty for the point
10
+ * the likelihood the point is erroneous
11
+
12
+ 12000 krec articles
13
+ 7000 krec geonames
14
+ 400 krec dbpedia-geo_coordinates_en.json
15
+ 87 krec dbpedia-geonames_links.json
16
+
17
+
18
+
19
+ ### dispatch geolocation estimates along links
20
+
21
+ * Send every neighbor your geoestimate
22
+
23
+ accumulate all neighbors' geoestimates.
24
+
25
+
26
+ In this drawing, the vertical bars show implied locations; six reasonably nearby each other and two with large error.
27
+
28
+ | | | | || | |
29
+ ----+------+-+-------+--++------- // ----+---- // --+-----
30
+
31
+ But of course in some places I _know_ the location
32
+
33
+ | X | | | || | |
34
+ ----+----X-+-+-------+--++------- // ----+---- // --+-----
35
+ X
36
+ `-- actual location
37
+
38
+
39
+ Why are the estimates spread from the actual?
40
+
41
+ * intrinsic size of the actual: the graph neighbors of "Texas" are spread over a much larger area than the graph neighbors of "Yee-Haw Junction, FL".
42
+ * strength of the relationship: for example, this naive model can't tell the difference between "X is located in Y" and "X borders Y"
43
+ * errors in the relationship: the link might be irrelevant or not explanatory for any reason -- anything from "X has the same area as Virginia" to a hacked page.
44
+ * multi-modal location: Davey Crockett (TODO: verify) was from XXX to XXX the representative of Tennessee (location #1) to the US Congress in Washington, DC (location #2). Upon losing re-election, he famously said "You can all go to hell, I am going to Texas"; he died during the battle of the Alamo. The most robust assignment of a geolocation to "Davey Crockett" would look something like the following cartoon:
45
+
46
+ ____
47
+ / \ ------
48
+ / \ / \ +-+
49
+ | |_____| |____/ \
50
+
51
+ Tennessee Texas DC
52
+
53
+
54
+ So what we're going to do is track two separate types of error:
55
+
56
+ * the likelihood the estimate is drawn from purely irrelevant points
57
+ * assuming the estimates are relevant, the fuzziness of the implied geolocation.
58
+
59
+
60
+
61
+ * ?? only use estimates with some strength ??
62
+ * For all known points, the number of neighbors that are irrelevant
63
+