wukong 3.0.0.pre2 → 3.0.0.pre3

Files changed (146)
  1. data/Gemfile +13 -0
  2. data/README.md +182 -6
  3. data/bin/wu-local +13 -5
  4. data/bin/wu-server +1 -1
  5. data/examples/Gemfile +2 -1
  6. data/examples/basic/string_reverser.rb +23 -0
  7. data/examples/{tiny_count.rb → basic/tiny_count.rb} +0 -0
  8. data/examples/{word_count → basic/word_count}/accumulator.rb +0 -0
  9. data/examples/{word_count → basic/word_count}/tokenizer.rb +0 -0
  10. data/examples/{word_count → basic/word_count}/word_count.rb +0 -0
  11. data/examples/deploy_pack/Gemfile +7 -0
  12. data/examples/deploy_pack/README.md +6 -0
  13. data/examples/{text/latinize_text.rb → deploy_pack/a/b/c/.gitkeep} +0 -0
  14. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  15. data/examples/deploy_pack/config/environment.rb +1 -0
  16. data/examples/{dataflow → dsl/dataflow}/fibonacci_series.rb +0 -0
  17. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  18. data/examples/{dataflow → dsl/dataflow}/simple.rb +0 -0
  19. data/examples/{dataflow → dsl/dataflow}/telegram.rb +0 -0
  20. data/examples/{workflow → dsl/workflow}/cherry_pie.dot +0 -0
  21. data/examples/{workflow → dsl/workflow}/cherry_pie.md +0 -0
  22. data/examples/{workflow → dsl/workflow}/cherry_pie.png +0 -0
  23. data/examples/{workflow → dsl/workflow}/cherry_pie.rb +0 -0
  24. data/examples/empty/.gitkeep +0 -0
  25. data/examples/graph/implied_geolocation/README.md +63 -0
  26. data/examples/graph/{minimum_spanning_tree.rb → minimum_spanning_tree/airfares_graphviz.rb} +0 -0
  27. data/examples/munging/airline_flights/indexable.rb +75 -0
  28. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  29. data/examples/munging/geo/geonames_models.rb +29 -0
  30. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +1 -0
  31. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  32. data/examples/munging/wikipedia/dbpedia/extract_links.rb +213 -146
  33. data/examples/rake_helper.rb +12 -0
  34. data/examples/ruby_project/Gemfile +7 -0
  35. data/examples/ruby_project/README.md +6 -0
  36. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  37. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  38. data/examples/serverlogs/models/logline.rb +102 -0
  39. data/examples/{dataflow/parse_apache_logs.rb → serverlogs/parser/apache_parser_widget.rb} +0 -0
  40. data/examples/serverlogs/visit_paths/common.rb +4 -0
  41. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  42. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  43. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  44. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  45. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  46. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  47. data/examples/text/{pig_latin.rb → pig_latin/pig_latinizer.rb} +0 -0
  48. data/examples/{dataflow/pig_latinizer.rb → text/pig_latin/pig_latinizer_widget.rb} +0 -0
  49. data/lib/hanuman/graph.rb +6 -1
  50. data/lib/wu/geo.rb +4 -0
  51. data/lib/wu/geo/geo_grids.numbers +0 -0
  52. data/lib/wu/geo/geolocated.rb +331 -0
  53. data/lib/wu/geo/quadtile.rb +69 -0
  54. data/{examples → lib/wu}/graph/union_find.rb +0 -0
  55. data/lib/wu/model/reconcilable.rb +63 -0
  56. data/{examples/munging/wikipedia/utils/munging_utils.rb → lib/wu/munging.rb} +7 -4
  57. data/lib/wu/social/models/twitter.rb +31 -0
  58. data/{examples/models/wikipedia.rb → lib/wu/wikipedia/models.rb} +0 -0
  59. data/lib/wukong.rb +9 -4
  60. data/lib/wukong/boot.rb +10 -1
  61. data/lib/wukong/driver.rb +65 -71
  62. data/lib/wukong/logger.rb +93 -0
  63. data/lib/wukong/processor.rb +38 -29
  64. data/lib/wukong/runner.rb +144 -0
  65. data/lib/wukong/server.rb +119 -0
  66. data/lib/wukong/spec_helpers.rb +1 -0
  67. data/lib/wukong/spec_helpers/integration_driver.rb +22 -9
  68. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +26 -4
  69. data/lib/wukong/spec_helpers/processor_helpers.rb +4 -10
  70. data/lib/wukong/spec_helpers/shared_examples.rb +12 -13
  71. data/lib/wukong/version.rb +1 -1
  72. data/lib/wukong/widget/processors.rb +13 -0
  73. data/lib/wukong/widget/serializers.rb +55 -65
  74. data/lib/wukong/widgets.rb +0 -2
  75. data/spec/hanuman/graph_spec.rb +14 -0
  76. data/spec/spec_helper.rb +4 -30
  77. data/spec/support/{wukong_test_helpers.rb → example_test_helpers.rb} +29 -2
  78. data/spec/support/integration_helper.rb +38 -0
  79. data/spec/support/model_test_helpers.rb +115 -0
  80. data/spec/wu/geo/geolocated_spec.rb +247 -0
  81. data/spec/wu/model/reconcilable_spec.rb +152 -0
  82. data/spec/wukong/widget/processors_spec.rb +0 -1
  83. data/spec/wukong/widget/serializers_spec.rb +88 -62
  84. data/spec/wukong/wu_local_spec.rb +125 -0
  85. data/wukong.gemspec +3 -16
  86. metadata +72 -266
  87. data/examples/dataflow/apache_log_line.rb +0 -100
  88. data/examples/jabberwocky.txt +0 -36
  89. data/examples/munging/Gemfile +0 -8
  90. data/examples/munging/airline_flights/airline.rb +0 -57
  91. data/examples/munging/airline_flights/airport.rb +0 -211
  92. data/examples/munging/airline_flights/flight.rb +0 -156
  93. data/examples/munging/airline_flights/models.rb +0 -4
  94. data/examples/munging/airline_flights/parse.rb +0 -26
  95. data/examples/munging/airline_flights/route.rb +0 -35
  96. data/examples/munging/airline_flights/timezone_fixup.rb +0 -62
  97. data/examples/munging/airports/40_wbans.txt +0 -40
  98. data/examples/munging/airports/filter_weather_reports.rb +0 -37
  99. data/examples/munging/airports/join.pig +0 -31
  100. data/examples/munging/airports/to_tsv.rb +0 -33
  101. data/examples/munging/airports/usa_wbans.pig +0 -19
  102. data/examples/munging/airports/usa_wbans.txt +0 -2157
  103. data/examples/munging/airports/wbans.pig +0 -19
  104. data/examples/munging/airports/wbans.txt +0 -2310
  105. data/examples/munging/rake_helper.rb +0 -62
  106. data/examples/munging/weather/.gitignore +0 -1
  107. data/examples/munging/weather/Gemfile +0 -4
  108. data/examples/munging/weather/Rakefile +0 -28
  109. data/examples/munging/weather/extract_ish.rb +0 -13
  110. data/examples/munging/weather/models/weather.rb +0 -119
  111. data/examples/munging/weather/utils/noaa_downloader.rb +0 -46
  112. data/examples/munging/wikipedia/README.md +0 -34
  113. data/examples/munging/wikipedia/Rakefile +0 -193
  114. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +0 -18
  115. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +0 -21
  116. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +0 -27
  117. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +0 -29
  118. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +0 -14
  119. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +0 -25
  120. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +0 -29
  121. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +0 -32
  122. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +0 -85
  123. data/examples/munging/wikipedia/pig_style_guide.md +0 -25
  124. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +0 -19
  125. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +0 -23
  126. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +0 -24
  127. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +0 -22
  128. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +0 -22
  129. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +0 -26
  130. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +0 -29
  131. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +0 -24
  132. data/examples/munging/wikipedia/utils/get_namespaces.rb +0 -86
  133. data/examples/munging/wikipedia/utils/namespaces.json +0 -1
  134. data/examples/string_reverser.rb +0 -26
  135. data/examples/twitter/locations.rb +0 -29
  136. data/examples/twitter/models.rb +0 -24
  137. data/examples/twitter/pt1-fiddle.pig +0 -8
  138. data/examples/twitter/pt2-simple_parse.pig +0 -31
  139. data/examples/twitter/pt2-simple_parse.rb +0 -18
  140. data/examples/twitter/pt3-join_on_zips.pig +0 -39
  141. data/examples/twitter/pt4-strong_links.rb +0 -20
  142. data/examples/twitter/pt5-lnglat_and_strong_links.pig +0 -16
  143. data/examples/twitter/states.tsv +0 -50
  144. data/examples/workflow/package_gem.rb +0 -55
  145. data/lib/wukong/widget/sink.rb +0 -16
  146. data/lib/wukong/widget/source.rb +0 -14
data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig (deleted)
@@ -1,18 +0,0 @@
- /*
- * This script generates the list of all nodes in the 1-neighborhood of the specified node.
- *
- * Output Format:
- * node_id:int
- */
-
- %default UNDIRECTED_PAGELINKS '/data/results/wikipedia/full/undirected_pagelinks' -- all edges in the pagelink graph
- -- %default HUB1 13692155 -- Philosophy
- %default HUB1 494528786 -- Chimpanzee
- %default HUB2 482846027 -- Elephant
- %default N1_NODES_OUT '/data/results/wikipedia/mini/nodes' -- where output will be stored
-
- undirected_pagelinks = LOAD '$UNDIRECTED_PAGELINKS' AS (node_a:int, node_b:int, a_into_b:int, b_into_a:int, is_symmetric:int);
- spokes = FILTER undirected_pagelinks BY (node_a == $HUB1) OR (node_b == $HUB1) OR (node_a == $HUB2) OR (node_b == $HUB2);
- neighbors = FOREACH spokes GENERATE ((node_a == $HUB) ? node_b : node_a) AS node;
- distinct_neighbors = DISTINCT neighbors;
- STORE distinct_neighbors INTO '$N1_NODES_OUT';
data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb (deleted)
@@ -1,21 +0,0 @@
- #!/usr/bin/env ruby
-
- # Sample pig load statement:
- #
- # page_metadata = LOAD '$page_metadata' AS (id:int, namespace:int, title:chararray,
- # restrictions:chararray, counter:long, is_redirect:int, is_new:int, random:float,
- # touched:int, page_latest:int, len:int);
-
- require 'wukong'
- require 'wukong/streamer/sql_streamer'
- require 'wukong/streamer/encoding_cleaner'
-
- module PageMetadataExtractor
- class Mapper < Wukong::Streamer::SQLStreamer
- include Wukong::Streamer::EncodingCleaner
- columns [:int, :int, :string, :string, :int,
- :int, :int, :float, :string, :int, :int]
- end
- end
-
- Wukong::Script.new(PageMetadataExtractor::Mapper, nil).run
data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old (deleted)
@@ -1,27 +0,0 @@
- #!/usr/bin/env ruby
-
- require 'wukong'
-
- load '/home/dlaw/dev/wukong/examples/wikipedia/munging_utils.rb'
-
- module PagesToTSV
- class Mapper < Wukong::Streamer::LineStreamer
-
- COLUMNS= [:int, :int, :string, :string, :int,
- :int, :int, :float, :string, :int, :int]
-
- def initialize
- @sql_parser = MungingUtils::SQLParser.new(COLUMNS)
- end
-
- def process(line, &blk)
- @sql_parser.parse(line,&blk)
- end
- end
- end
-
- # go to town
- Wukong::Script.new(
- PagesToTSV::Mapper,
- nil
- ).run
data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig (deleted)
@@ -1,29 +0,0 @@
- /*
- A script to generate Wikipedia page graph edge list
- Accepts as input 2 tsvs: list of pages and list of links
- Link table should initially be formatted as from_page_id, into_namespace, into_title
- Assumes that the combination of namespace and title uniquely identifies a page
-
- Output Format:
- from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
- */
-
- %default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- page metadata for all Wikipedia pages
- %default EXTRACTED_PAGELINKS '/data/scratch/wikipedia/full/pagelinks' -- raw extracted pagelinks
- %default AUGMENTED_PAGELINKS_OUT '/data/results/wikipedia/full/pagelinks' -- augmented pagelinks
-
- page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray,
- restrictions:chararray, counter:long, is_redirect:int, is_new:int,
- random:float, touched:int, page_latest:int, len:int);
- links = LOAD '$EXTRACTED_PAGELINKS' AS (from_id:int, into_namespace:int, into_title:chararray);
-
- first_pass_j = JOIN page_metadata BY id RIGHT OUTER, links BY from_id;
- first_pass = FOREACH first_pass_j GENERATE
- links::from_id AS from_id, page_metadata::namespace AS from_namespace, page_metadata::title AS from_title,
- links::into_namespace AS into_namespace, links::into_title AS into_title;
- second_pass_j = JOIN page_metadata BY (namespace, title) RIGHT OUTER, first_pass BY (into_namespace, into_title);
- second_pass = FOREACH second_pass_j GENERATE
- first_pass::from_id, page_metadata::id,
- first_pass::from_namespace, first_pass::from_title,
- first_pass::into_namespace, first_pass::into_title;
- STORE second_pass INTO '$AUGMENTED_PAGELINKS_OUT';
data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb (deleted)
@@ -1,14 +0,0 @@
- #!/usr/bin/env ruby
-
- require 'wukong'
- require 'wukong/streamer/sql_streamer'
- require 'wukong/streamer/encoding_cleaner'
-
- module PagelinksExtractor
- class Mapper < Wukong::Streamer::SQLStreamer
- include Wukong::Streamer::EncodingCleaner
- columns [:int, :int, :string]
- end
- end
-
- Wukong::Script.new(PagelinksExtractor::Mapper, nil).run
data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old (deleted)
@@ -1,25 +0,0 @@
- #!/usr/bin/env ruby
- require 'wukong'
-
- load '/home/dlaw/dev/wukong/examples/wikipedia/munging_utils.rb'
-
- module PagelinksToTSV
- class Mapper < Wukong::Streamer::LineStreamer
-
- COLUMNS = [:int, :int, :string]
-
- def initialize
- @sql_parser = MungingUtils::SQLParser.new(COLUMNS)
- end
-
- def process(line, &blk)
- @sql_parser.parse(line, &blk)
- end
- end
- end
-
- # go to town
- Wukong::Script.new(
- PagelinksToTSV::Mapper,
- nil
- ).run
data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig (deleted)
@@ -1,29 +0,0 @@
- /*
- * Takes a directed edge list and transforms it into an undirected edge list
- * that stores edge direction as metadata.
- *
- * Input table should be of the format (from_id:int, into_id:int ... )
- *
- * Output format:
- * from_id:int, into_id:int, a_into_b:int , b_into_a:int, symmetric:int
- *
- * a_into_b, b_into_a, and symmetric are really booleans.
- */
-
- %default AUGMENTED_PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all wikipedia pagelinks (see augment_pagelinks.pig)
- %default UNDIRECTED_PAGELINKS_OUT '/data/results/wikipedia/full/undirected_pagelinks' -- undirected pagelinks
-
- edges = LOAD '$AUGMENTED_PAGELINKS' AS (from:int, into:int);
- edges_sorted = FOREACH edges GENERATE
- ((from <= into)? from : into) AS node_a,
- ((from <= into)? into : from) AS node_b,
- ((from <= into)? 1 : 0) AS a_to_b,
- ((from <= into)? 0 : 1) AS b_to_a;
- edges_grouped = GROUP edges_sorted by (node_a, node_b);
- edges_final = FOREACH edges_grouped GENERATE
- group.node_a AS node_a,
- group.node_b AS node_b,
- ((SUM(edges.$2) > 0) ? 1:0) AS a_into_b,
- ((SUM(edges.$3) > 0) ? 1:0) AS b_into_a,
- ((SUM(edges.$2) > 0 AND SUM(edges.$3) > 0) ? 1:0) as symmetric:int;
- STORE edges final INTO '$UNDIRECTED_PAGELINKS_OUT';
data/examples/munging/wikipedia/pageviews/augment_pageviews.pig (deleted)
@@ -1,32 +0,0 @@
- /*
- * Augments raw pageview data with page ID.
- * Pageview stats are *theoretically* uniquely keyed by namespace
- * and title, so that is what is used to join pageviews with page_metadata.
- *
- * In practice, the original pageview stats only give the URL visited, and
- * reliably extracting namespace and title from the URL is difficult. Additionally,
- * page names change, redirects happen, and many other small things can go
- * wrong with the join. All pageview data is kept in the final table, but
- * the page id will be blank in rows where the join failed.
- *
- * Output format:
- * page_id:int, namespace:int, title:chararray, num_visitors:long,
- * date:int, time:int, epoch_time:long, day_of_week:int
- */
-
- %default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- page metadata for all Wikipedia pages
- %default EXTRACTED_PAGEVIEWS '/data/scratch/wikipedia/full/pageviews' -- raw extracted pageview stats (see extract_pageviews.rb)
- %default AUGMENTED_PAGEVIEWS_OUT '/data/results/wikipedia/full/pageviews' -- where output will be stored
-
- page_metadata = LOAD '$PAGE_METADATA' AS
- (id:int, namespace:int, title:chararray,
- restrictions:chararray, counter:long, is_redirect:int, is_new:int,
- random:float, touched:int, page_latest:int, len:int);
- pageviews = LOAD '$EXTRACTED_PAGEVIEWS' AS (namespace:int, title:chararray,
- num_visitors:long, date:int, time:int, epoch_time:long, day_of_week:int);
-
- first_join = JOIN page_metadata BY (namespace, title) RIGHT OUTER, pageviews BY (namespace, title);
- final = FOREACH first_join GENERATE
- page_metadata::id, pageviews::namespace, pageviews::title, pageviews::num_visitors,
- pageviews::date, pageviews::time, pageviews::epoch_time, pageviews::day_of_week;
- STORE final INTO '$AUGMENTED_PAGEVIEWS_OUT';
data/examples/munging/wikipedia/pageviews/extract_pageviews.rb (deleted)
@@ -1,85 +0,0 @@
- #!/usr/bin/env ruby
- # encoding:UTF-8
-
- # Pig output format:
- # namespace:int, title:chararray, num_visitors:long, date:int, time:int, epoch_time:long, day_of_week:int
-
- $:.unshift '/home/dlaw/dev/wukong_og/lib'
- $:.unshift '/home/dlaw/dev/gorillib/lib'
-
- require 'uri'
- require 'pathname'
- require 'json'
- require 'wukong'
- require 'wukong/streamer'
- require 'wukong/streamer/encoding_cleaner'
- load '/home/dlaw/dev/wukong/examples/munging/wikipedia/utils/munging_utils.rb'
-
- ENV['map_input_file'] ||= 'pagecounts-20071222-100000.gz'
-
- class String
- def is_enwiki?
- return (not (self =~ /^en /).nil?)
- end
-
- def is_after_enwiki?
- return (not (self =~ /^(e[o-z][a-z]*|[f-z][a-z]+) /).nil?)
- end
- end
-
- module PageviewsExtractor
- class Mapper < Wukong::Streamer::LineStreamer
- include Wukong::Streamer::EncodingCleaner
- include MungingUtils
-
- ns_json_file = File.open("/home/dlaw/dev/wukong/examples/munging/wikipedia/utils/namespaces.json",'r:UTF-8')
- NAMESPACES = JSON.parse(ns_json_file.read)
-
- # the filename strings are formatted as
- # pagecounts-YYYYMMDD-HH0000.gz
- def time_from_filename(filename)
- parts = filename.split('-')
- year = parts[1][0..3].to_i
- month = parts[1][4..5].to_i
- day = parts[1][6..7].to_i
- hour = parts[2][0..1].to_i
- return Time.new(year,month,day,hour)
- end
-
- def process line
- # we only want enwiki lines
- return if @done
- if line.is_after_enwiki?
- @done = true
- return
- end
- return if not line.is_enwiki?
- # we have an enwiki line - process it!
- fields = line.split(' ')[1..-1]
- out_fields = []
- # add the namespace
- namespace = nil
- if fields[0].include? ':'
- namespace = NAMESPACES[fields[0].split(':')[0]]
- out_fields << (namespace || '0')
- else
- out_fields << '0'
- end
- # add the title
- if namespace.nil?
- out_fields << URI.unescape(fields[0])
- else
- out_fields << URI.unescape(fields[0][(fields[0].index(':')||-1)+1..-1])
- end
- # add number of visitors in the hour
- out_fields << fields[2]
- # grab date info from filename
- file = Pathname.new(ENV['map_input_file']).basename
- time = time_from_filename(file.to_s)
- out_fields += time_columns_from_time(time)
- yield out_fields
- end
- end
- end
-
- Wukong::Script.new(PageviewsExtractor::Mapper, Wukong::Streamer::LineStreamer).run
data/examples/munging/wikipedia/pig_style_guide.md (deleted)
@@ -1,25 +0,0 @@
- # Pig Style Guide
-
- - Everything except names should be in all caps. E.g.
-
-     first_join = JOIN pages BY (namespace,title)
-         RIGHT OUTER, pageviews BY (namespace, title);
-
- - Group and align columns in the script in ways that make sense. Don't be afraid of newlines. E.g.
-
-     second_pass = FOREACH second_pass_j GENERATE
-         first_pass::from_id, pages::id,
-         first_pass::from_namespace, first_pass::from_title,
-         first_pass::into_namespace, first_pass::into_title;
-
- - Columns that form an important sub-set of the table's data should be easily accessible as a unit.
-
- E.g. The edge list above has the from and into ids in the first and second columns, making it easy to just get an edge list of ids without the additional metadata.
-
- - When at all possible, you should include sample LOAD statements in the comments for your script. This makes it easy to use the output of your script.
-
- - Parameterize as much as possible. All paths should be parameterized.
-
- - Parameters should be in all caps, e.g. $NODE.
-
- - Parameters should have defaults if at all possible. When you define the default, also include a comment describing the parameter.
data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig (deleted)
@@ -1,19 +0,0 @@
- /*
- * Filters the page metadata table, leaving only pages that
- * are redirects.
- *
- * Output Format (same as page_metadata):
- * (id:int, namespace:int, title:chararray, restrictions:chararray,
- * counter:long, is_redirect:int, is_new:int, random:float, touched:int,
- * page_latest:int, len:int)
- */
-
- %default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- page metdata for all pages in Wikipedia
- %default REDIRECTS_OUT '/data/results/wikipedia/full/redirect_page_metadata' -- place to store page metdata for redirects
-
- page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray,
- restrictions:chararray, counter:long, is_redirect:int, is_new:int, random:float,
- touched:int, page_latest:int, len:int);
-
- redirects = FILTER page_metadata BY (is_redirect == 1);
- STORE redirects INTO '$REDIRECTS_OUT';
data/examples/munging/wikipedia/subuniverse/sub_articles.pig (deleted)
@@ -1,23 +0,0 @@
- /*
- * This script filters the articles table, leaving only the articles
- * in the specified subuniverse.
- *
- * Output format:
- * page_id:int, title:chararray, namespace:int, rev_date:int, rev_time:int,
- * rev_epoch_time:long, rev_dow:int, article_text:chararray
- */
-
- %default ARTICLES '/data/results/wikipedia/full/articles' -- all articles in the wikipedia corpus
- %default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
- %default SUB_ARTICLES_OUT '/data/results/wikipedia/mini/articles' -- where output will be stored
-
- articles = LOAD '$ARTICLES' AS (page_id:int, title:chararray, namespace:int,
- rev_date:int, rev_time:int, rev_epoch_time:long, rev_dow:int, article_text:chararray);
- sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
- sub_articles_unfiltered = JOIN articles BY id, sub_nodes BY node_id;
- sub_articles = FOREACH sub_articles_unfiltered GENERATE
- articles::page_id AS page_id, articles::title AS title, articles::namespace AS namespace,
- articles::rev_date AS rev_date, articles::rev_time AS rev_time,
- articles::rev_epoch_time AS rev_epoch_time, articles::rev_dow AS rev_dow,
- articles::article_text AS article_text;
- STORE sub_articles INTO '$SUB_ARTICLES_OUT';
data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig (deleted)
@@ -1,24 +0,0 @@
- /*
- * This script filters the page metadata table, leaving only the pages
- * in the specified subuniverse.
- *
- * Output format (same as page_metadata):
- * id:int, namespace:int, title:chararray, restrictions:chararray, counter:long,
- * is_redirect:int, is_new:int, random:float, touched:int, page_latest:int, len:int
- */
-
- %default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- metadata for all pages in the wikipedia corpus
- %default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
- %default SUB_PAGE_METADATA_OUT '/data/results/wikipedia/mini/page_metadata' -- where output will be stored
-
- page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray,
- restrictions:chararray, counter:long, is_redirect:int, is_new:int, random:float,
- touched:int, page_latest:int, len:int);
- sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
- sub_page_metadata_unfiltered = JOIN page_metadata BY id, sub_nodes BY node_id;
- sub_page_metadata = FOREACH sub_page_metadata_unfiltered GENERATE
- page_metadata::id, page_metadata::namespace, page_metadata::title,
- page_metadata::restrictions, page_metadata::counter, page_metadata::is_redirect,
- page_metadata::is_new, page_metadata::random, page_metadata::touched,
- page_metadata::page_latest, page_metadata::len;
- STORE sub_page_metadata INTO '$SUB_PAGE_METADATA_OUT';
data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig (deleted)
@@ -1,22 +0,0 @@
- /*
- * This script filters the pagelinks table, leaving only the pagelinks
- * that start within supplied subuniverse.
- *
- * Output format (same as augmented_pagelinks):
- * from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
- */
-
- %default PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all edges in the pagelink graph (must be *directed*)
- %default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
- %default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
-
- all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
- from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
- sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-
- sub_pagelinks_from = JOIN all_pagelinks BY from_id, sub_nodes BY node_id;
- sub_pagelinks = FOREACH sub_pagelinks_from GENERATE
- all_pagelinks::from_id, all_pagelinks::into_id,
- all_pagelinks::from_namespace, all_pagelinks::from_title,
- all_pagelinks::into_namespace, all_pagelinks::into_title;
- STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig (deleted)
@@ -1,22 +0,0 @@
- /*
- * This script filters the pagelinks table, leaving only the pagelinks
- * that terminate within supplied subuniverse.
- *
- * Output format (same as augment_pagelinks):
- * node_a:int, node_b:int, a_into_b:int, b_into_a:int, is_symmetric:int
- */
-
- %default PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all edges in the pagelink graph (must be *directed*)
- %default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
- %default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
-
- all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
- from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
- sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-
- sub_pagelinks_into = JOIN all_pagelinks BY into_id, sub_nodes BY node_id;
- sub_pagelinks = FOREACH sub_pagelinks_into GENERATE
- all_pagelinks::from_id, all_pagelinks::into_id,
- all_pagelinks::from_namespace, all_pagelinks::from_title,
- all_pagelinks::into_namespace, all_pagelinks::into_title;
- STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig (deleted)
@@ -1,26 +0,0 @@
- /*
- * This script filters the pagelinks table, leaving only the pagelinks
- * that start and end within supplied subuniverse.
- *
- * Output format (same as augment_pagelinks):
- * from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
- */
-
- %default PAGELINKS '/data/results/wikipedia/full/undirected_pagelinks' -- all edges in the pagelink graph
- %default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
- %default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
-
- all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
- from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
- sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-
- sub_pagelinks_in = JOIN all_pagelinks BY from_id, sub_nodes BY node_id;
- sub_pagelinks_unfiltered = JOIN sub_pagelinks_in BY into_id, sub_nodes BY node_id;
- sub_pagelinks = FOREACH sub_pagelinks_unfiltered GENERATE
- sub_pagelinks_in::all_pagelinks::from_id,
- sub_pagelinks_in::all_pagelinks::into_id,
- sub_pagelinks_in::all_pagelinks::from_namespace,
- sub_pagelinks_in::all_pagelinks::from_title,
- sub_pagelinks_in::all_pagelinks::into_namespace,
- sub_pagelinks_in::all_pagelinks::into_title;
- STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';