wukong 3.0.0.pre2 → 3.0.0.pre3
- data/Gemfile +13 -0
- data/README.md +182 -6
- data/bin/wu-local +13 -5
- data/bin/wu-server +1 -1
- data/examples/Gemfile +2 -1
- data/examples/basic/string_reverser.rb +23 -0
- data/examples/{tiny_count.rb → basic/tiny_count.rb} +0 -0
- data/examples/{word_count → basic/word_count}/accumulator.rb +0 -0
- data/examples/{word_count → basic/word_count}/tokenizer.rb +0 -0
- data/examples/{word_count → basic/word_count}/word_count.rb +0 -0
- data/examples/deploy_pack/Gemfile +7 -0
- data/examples/deploy_pack/README.md +6 -0
- data/examples/{text/latinize_text.rb → deploy_pack/a/b/c/.gitkeep} +0 -0
- data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
- data/examples/deploy_pack/config/environment.rb +1 -0
- data/examples/{dataflow → dsl/dataflow}/fibonacci_series.rb +0 -0
- data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/{dataflow → dsl/dataflow}/simple.rb +0 -0
- data/examples/{dataflow → dsl/dataflow}/telegram.rb +0 -0
- data/examples/{workflow → dsl/workflow}/cherry_pie.dot +0 -0
- data/examples/{workflow → dsl/workflow}/cherry_pie.md +0 -0
- data/examples/{workflow → dsl/workflow}/cherry_pie.png +0 -0
- data/examples/{workflow → dsl/workflow}/cherry_pie.rb +0 -0
- data/examples/empty/.gitkeep +0 -0
- data/examples/graph/implied_geolocation/README.md +63 -0
- data/examples/graph/{minimum_spanning_tree.rb → minimum_spanning_tree/airfares_graphviz.rb} +0 -0
- data/examples/munging/airline_flights/indexable.rb +75 -0
- data/examples/munging/airline_flights/indexable_spec.rb +90 -0
- data/examples/munging/geo/geonames_models.rb +29 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +1 -0
- data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +213 -146
- data/examples/rake_helper.rb +12 -0
- data/examples/ruby_project/Gemfile +7 -0
- data/examples/ruby_project/README.md +6 -0
- data/examples/ruby_project/a/b/c/.gitkeep +0 -0
- data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/serverlogs/models/logline.rb +102 -0
- data/examples/{dataflow/parse_apache_logs.rb → serverlogs/parser/apache_parser_widget.rb} +0 -0
- data/examples/serverlogs/visit_paths/common.rb +4 -0
- data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
- data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
- data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
- data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
- data/examples/text/{pig_latin.rb → pig_latin/pig_latinizer.rb} +0 -0
- data/examples/{dataflow/pig_latinizer.rb → text/pig_latin/pig_latinizer_widget.rb} +0 -0
- data/lib/hanuman/graph.rb +6 -1
- data/lib/wu/geo.rb +4 -0
- data/lib/wu/geo/geo_grids.numbers +0 -0
- data/lib/wu/geo/geolocated.rb +331 -0
- data/lib/wu/geo/quadtile.rb +69 -0
- data/{examples → lib/wu}/graph/union_find.rb +0 -0
- data/lib/wu/model/reconcilable.rb +63 -0
- data/{examples/munging/wikipedia/utils/munging_utils.rb → lib/wu/munging.rb} +7 -4
- data/lib/wu/social/models/twitter.rb +31 -0
- data/{examples/models/wikipedia.rb → lib/wu/wikipedia/models.rb} +0 -0
- data/lib/wukong.rb +9 -4
- data/lib/wukong/boot.rb +10 -1
- data/lib/wukong/driver.rb +65 -71
- data/lib/wukong/logger.rb +93 -0
- data/lib/wukong/processor.rb +38 -29
- data/lib/wukong/runner.rb +144 -0
- data/lib/wukong/server.rb +119 -0
- data/lib/wukong/spec_helpers.rb +1 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +22 -9
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +26 -4
- data/lib/wukong/spec_helpers/processor_helpers.rb +4 -10
- data/lib/wukong/spec_helpers/shared_examples.rb +12 -13
- data/lib/wukong/version.rb +1 -1
- data/lib/wukong/widget/processors.rb +13 -0
- data/lib/wukong/widget/serializers.rb +55 -65
- data/lib/wukong/widgets.rb +0 -2
- data/spec/hanuman/graph_spec.rb +14 -0
- data/spec/spec_helper.rb +4 -30
- data/spec/support/{wukong_test_helpers.rb → example_test_helpers.rb} +29 -2
- data/spec/support/integration_helper.rb +38 -0
- data/spec/support/model_test_helpers.rb +115 -0
- data/spec/wu/geo/geolocated_spec.rb +247 -0
- data/spec/wu/model/reconcilable_spec.rb +152 -0
- data/spec/wukong/widget/processors_spec.rb +0 -1
- data/spec/wukong/widget/serializers_spec.rb +88 -62
- data/spec/wukong/wu_local_spec.rb +125 -0
- data/wukong.gemspec +3 -16
- metadata +72 -266
- data/examples/dataflow/apache_log_line.rb +0 -100
- data/examples/jabberwocky.txt +0 -36
- data/examples/munging/Gemfile +0 -8
- data/examples/munging/airline_flights/airline.rb +0 -57
- data/examples/munging/airline_flights/airport.rb +0 -211
- data/examples/munging/airline_flights/flight.rb +0 -156
- data/examples/munging/airline_flights/models.rb +0 -4
- data/examples/munging/airline_flights/parse.rb +0 -26
- data/examples/munging/airline_flights/route.rb +0 -35
- data/examples/munging/airline_flights/timezone_fixup.rb +0 -62
- data/examples/munging/airports/40_wbans.txt +0 -40
- data/examples/munging/airports/filter_weather_reports.rb +0 -37
- data/examples/munging/airports/join.pig +0 -31
- data/examples/munging/airports/to_tsv.rb +0 -33
- data/examples/munging/airports/usa_wbans.pig +0 -19
- data/examples/munging/airports/usa_wbans.txt +0 -2157
- data/examples/munging/airports/wbans.pig +0 -19
- data/examples/munging/airports/wbans.txt +0 -2310
- data/examples/munging/rake_helper.rb +0 -62
- data/examples/munging/weather/.gitignore +0 -1
- data/examples/munging/weather/Gemfile +0 -4
- data/examples/munging/weather/Rakefile +0 -28
- data/examples/munging/weather/extract_ish.rb +0 -13
- data/examples/munging/weather/models/weather.rb +0 -119
- data/examples/munging/weather/utils/noaa_downloader.rb +0 -46
- data/examples/munging/wikipedia/README.md +0 -34
- data/examples/munging/wikipedia/Rakefile +0 -193
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +0 -18
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +0 -21
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +0 -27
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +0 -29
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +0 -14
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +0 -25
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +0 -29
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +0 -32
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +0 -85
- data/examples/munging/wikipedia/pig_style_guide.md +0 -25
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +0 -19
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +0 -23
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +0 -24
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +0 -22
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +0 -22
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +0 -26
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +0 -29
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +0 -24
- data/examples/munging/wikipedia/utils/get_namespaces.rb +0 -86
- data/examples/munging/wikipedia/utils/namespaces.json +0 -1
- data/examples/string_reverser.rb +0 -26
- data/examples/twitter/locations.rb +0 -29
- data/examples/twitter/models.rb +0 -24
- data/examples/twitter/pt1-fiddle.pig +0 -8
- data/examples/twitter/pt2-simple_parse.pig +0 -31
- data/examples/twitter/pt2-simple_parse.rb +0 -18
- data/examples/twitter/pt3-join_on_zips.pig +0 -39
- data/examples/twitter/pt4-strong_links.rb +0 -20
- data/examples/twitter/pt5-lnglat_and_strong_links.pig +0 -16
- data/examples/twitter/states.tsv +0 -50
- data/examples/workflow/package_gem.rb +0 -55
- data/lib/wukong/widget/sink.rb +0 -16
- data/lib/wukong/widget/source.rb +0 -14

--- a/data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * This script generates the list of all nodes in the 1-neighborhood of the specified node.
- *
- * Output Format:
- * node_id:int
- */
-
-%default UNDIRECTED_PAGELINKS '/data/results/wikipedia/full/undirected_pagelinks' -- all edges in the pagelink graph
--- %default HUB1 13692155 -- Philosophy
-%default HUB1 494528786 -- Chimpanzee
-%default HUB2 482846027 -- Elephant
-%default N1_NODES_OUT '/data/results/wikipedia/mini/nodes' -- where output will be stored
-
-undirected_pagelinks = LOAD '$UNDIRECTED_PAGELINKS' AS (node_a:int, node_b:int, a_into_b:int, b_into_a:int, is_symmetric:int);
-spokes = FILTER undirected_pagelinks BY (node_a == $HUB1) OR (node_b == $HUB1) OR (node_a == $HUB2) OR (node_b == $HUB2);
-neighbors = FOREACH spokes GENERATE ((node_a == $HUB) ? node_b : node_a) AS node;
-distinct_neighbors = DISTINCT neighbors;
-STORE distinct_neighbors INTO '$N1_NODES_OUT';
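
Note that the `neighbors` statement in the deleted script references `$HUB`, which is never defined (only `$HUB1` and `$HUB2` have defaults), so it would fail as written. A plain-Ruby sketch of the intended 1-neighborhood filter, assuming edges arrive as `[node_a, node_b]` integer pairs (hypothetical input; the hub ids are the script's defaults):

```ruby
# Sketch of the 1-neighborhood extraction: keep edges touching a hub
# ("spokes"), emit the far endpoint, and de-duplicate (DISTINCT).
HUBS = [494528786, 482846027] # Chimpanzee, Elephant

def n1_nodes(edges)
  edges
    .select { |a, b| HUBS.include?(a) || HUBS.include?(b) } # spokes
    .map    { |a, b| HUBS.include?(a) ? b : a }             # far endpoint
    .uniq                                                   # DISTINCT
end

p n1_nodes([[494528786, 42], [42, 7], [482846027, 42]]) # => [42]
```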

--- a/data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env ruby
-
-# Sample pig load statement:
-#
-# page_metadata = LOAD '$page_metadata' AS (id:int, namespace:int, title:chararray,
-#   restrictions:chararray, counter:long, is_redirect:int, is_new:int, random:float,
-#   touched:int, page_latest:int, len:int);
-
-require 'wukong'
-require 'wukong/streamer/sql_streamer'
-require 'wukong/streamer/encoding_cleaner'
-
-module PageMetadataExtractor
-  class Mapper < Wukong::Streamer::SQLStreamer
-    include Wukong::Streamer::EncodingCleaner
-    columns [:int, :int, :string, :string, :int,
-             :int, :int, :float, :string, :int, :int]
-  end
-end
-
-Wukong::Script.new(PageMetadataExtractor::Mapper, nil).run

--- a/data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'wukong'
-
-load '/home/dlaw/dev/wukong/examples/wikipedia/munging_utils.rb'
-
-module PagesToTSV
-  class Mapper < Wukong::Streamer::LineStreamer
-
-    COLUMNS= [:int, :int, :string, :string, :int,
-              :int, :int, :float, :string, :int, :int]
-
-    def initialize
-      @sql_parser = MungingUtils::SQLParser.new(COLUMNS)
-    end
-
-    def process(line, &blk)
-      @sql_parser.parse(line,&blk)
-    end
-  end
-end
-
-# go to town
-Wukong::Script.new(
-  PagesToTSV::Mapper,
-  nil
-).run

--- a/data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- A script to generate Wikipedia page graph edge list
- Accepts as input 2 tsvs: list of pages and list of links
- Link table should initially be formatted as from_page_id, into_namespace, into_title
- Assumes that the combination of namespace and title uniquely identifies a page
-
- Output Format:
- from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
- */
-
-%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- page metadata for all Wikipedia pages
-%default EXTRACTED_PAGELINKS '/data/scratch/wikipedia/full/pagelinks' -- raw extracted pagelinks
-%default AUGMENTED_PAGELINKS_OUT '/data/results/wikipedia/full/pagelinks' -- augmented pagelinks
-
-page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray,
-    restrictions:chararray, counter:long, is_redirect:int, is_new:int,
-    random:float, touched:int, page_latest:int, len:int);
-links = LOAD '$EXTRACTED_PAGELINKS' AS (from_id:int, into_namespace:int, into_title:chararray);
-
-first_pass_j = JOIN page_metadata BY id RIGHT OUTER, links BY from_id;
-first_pass = FOREACH first_pass_j GENERATE
-    links::from_id AS from_id, page_metadata::namespace AS from_namespace, page_metadata::title AS from_title,
-    links::into_namespace AS into_namespace, links::into_title AS into_title;
-second_pass_j = JOIN page_metadata BY (namespace, title) RIGHT OUTER, first_pass BY (into_namespace, into_title);
-second_pass = FOREACH second_pass_j GENERATE
-    first_pass::from_id, page_metadata::id,
-    first_pass::from_namespace, first_pass::from_title,
-    first_pass::into_namespace, first_pass::into_title;
-STORE second_pass INTO '$AUGMENTED_PAGELINKS_OUT';

--- a/data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'wukong'
-require 'wukong/streamer/sql_streamer'
-require 'wukong/streamer/encoding_cleaner'
-
-module PagelinksExtractor
-  class Mapper < Wukong::Streamer::SQLStreamer
-    include Wukong::Streamer::EncodingCleaner
-    columns [:int, :int, :string]
-  end
-end
-
-Wukong::Script.new(PagelinksExtractor::Mapper, nil).run

--- a/data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env ruby
-require 'wukong'
-
-load '/home/dlaw/dev/wukong/examples/wikipedia/munging_utils.rb'
-
-module PagelinksToTSV
-  class Mapper < Wukong::Streamer::LineStreamer
-
-    COLUMNS = [:int, :int, :string]
-
-    def initialize
-      @sql_parser = MungingUtils::SQLParser.new(COLUMNS)
-    end
-
-    def process(line, &blk)
-      @sql_parser.parse(line, &blk)
-    end
-  end
-end
-
-# go to town
-Wukong::Script.new(
-  PagelinksToTSV::Mapper,
-  nil
-).run

--- a/data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Takes a directed edge list and transforms it into an undirected edge list
- * that stores edge direction as metadata.
- *
- * Input table should be of the format (from_id:int, into_id:int ... )
- *
- * Output format:
- * from_id:int, into_id:int, a_into_b:int , b_into_a:int, symmetric:int
- *
- * a_into_b, b_into_a, and symmetric are really booleans.
- */
-
-%default AUGMENTED_PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all wikipedia pagelinks (see augment_pagelinks.pig)
-%default UNDIRECTED_PAGELINKS_OUT '/data/results/wikipedia/full/undirected_pagelinks' -- undirected pagelinks
-
-edges = LOAD '$AUGMENTED_PAGELINKS' AS (from:int, into:int);
-edges_sorted = FOREACH edges GENERATE
-    ((from <= into)? from : into) AS node_a,
-    ((from <= into)? into : from) AS node_b,
-    ((from <= into)? 1 : 0) AS a_to_b,
-    ((from <= into)? 0 : 1) AS b_to_a;
-edges_grouped = GROUP edges_sorted by (node_a, node_b);
-edges_final = FOREACH edges_grouped GENERATE
-    group.node_a AS node_a,
-    group.node_b AS node_b,
-    ((SUM(edges.$2) > 0) ? 1:0) AS a_into_b,
-    ((SUM(edges.$3) > 0) ? 1:0) AS b_into_a,
-    ((SUM(edges.$2) > 0 AND SUM(edges.$3) > 0) ? 1:0) as symmetric:int;
-STORE edges_final INTO '$UNDIRECTED_PAGELINKS_OUT';
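
The deleted script uses a standard edge canonicalization: order each edge's endpoints, record which direction(s) were observed, then aggregate per pair. A plain-Ruby sketch of the same transformation, assuming directed edges arrive as `[from, into]` integer pairs (illustrative input, not part of the gem):

```ruby
# Canonicalize directed edges to undirected edges with direction flags:
# [node_a, node_b, a_into_b, b_into_a, symmetric] with node_a <= node_b.
def undirect(edges)
  edges
    .group_by { |from, into| [from, into].minmax } # canonical (node_a, node_b) key
    .map do |(node_a, node_b), pairs|
      a_into_b  = pairs.any? { |from, _| from == node_a } ? 1 : 0
      b_into_a  = pairs.any? { |from, _| from == node_b } ? 1 : 0
      symmetric = (a_into_b == 1 && b_into_a == 1) ? 1 : 0
      [node_a, node_b, a_into_b, b_into_a, symmetric]
    end
end

p undirect([[1, 2], [2, 1], [3, 1]])
# => [[1, 2, 1, 1, 1], [1, 3, 0, 1, 0]]
```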

--- a/data/examples/munging/wikipedia/pageviews/augment_pageviews.pig
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Augments raw pageview data with page ID.
- * Pageview stats are *theoretically* uniquely keyed by namespace
- * and title, so that is what is used to join pageviews with page_metadata.
- *
- * In practice, the original pageview stats only give the URL visited, and
- * reliably extracting namespace and title from the URL is difficult. Additionally,
- * page names change, redirects happen, and many other small things can go
- * wrong with the join. All pageview data is kept in the final table, but
- * the page id will be blank in rows where the join failed.
- *
- * Output format:
- * page_id:int, namespace:int, title:chararray, num_visitors:long,
- * date:int, time:int, epoch_time:long, day_of_week:int
- */
-
-%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- page metadata for all Wikipedia pages
-%default EXTRACTED_PAGEVIEWS '/data/scratch/wikipedia/full/pageviews' -- raw extracted pageview stats (see extract_pageviews.rb)
-%default AUGMENTED_PAGEVIEWS_OUT '/data/results/wikipedia/full/pageviews' -- where output will be stored
-
-page_metadata = LOAD '$PAGE_METADATA' AS
-    (id:int, namespace:int, title:chararray,
-    restrictions:chararray, counter:long, is_redirect:int, is_new:int,
-    random:float, touched:int, page_latest:int, len:int);
-pageviews = LOAD '$EXTRACTED_PAGEVIEWS' AS (namespace:int, title:chararray,
-    num_visitors:long, date:int, time:int, epoch_time:long, day_of_week:int);
-
-first_join = JOIN page_metadata BY (namespace, title) RIGHT OUTER, pageviews BY (namespace, title);
-final = FOREACH first_join GENERATE
-    page_metadata::id, pageviews::namespace, pageviews::title, pageviews::num_visitors,
-    pageviews::date, pageviews::time, pageviews::epoch_time, pageviews::day_of_week;
-STORE final INTO '$AUGMENTED_PAGEVIEWS_OUT';
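
The header comment explains the join choice: a RIGHT OUTER join keeps every pageview row and leaves the page id blank when the (namespace, title) lookup fails. A hash-based Ruby sketch of that behavior, with made-up sample rows:

```ruby
# Emulate the RIGHT OUTER join: every pageview survives; the id is nil
# (blank) when no (namespace, title) match exists in page_metadata.
page_ids = { [0, "Chimpanzee"] => 494528786 } # (namespace, title) => page_id

pageviews = [
  [0, "Chimpanzee",   123],
  [0, "No_Such_Page",   4],
]

augmented = pageviews.map do |ns, title, num_visitors|
  [page_ids[[ns, title]], ns, title, num_visitors]
end

p augmented
# => [[494528786, 0, "Chimpanzee", 123], [nil, 0, "No_Such_Page", 4]]
```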

--- a/data/examples/munging/wikipedia/pageviews/extract_pageviews.rb
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env ruby
-# encoding:UTF-8
-
-# Pig output format:
-# namespace:int, title:chararray, num_visitors:long, date:int, time:int, epoch_time:long, day_of_week:int
-
-$:.unshift '/home/dlaw/dev/wukong_og/lib'
-$:.unshift '/home/dlaw/dev/gorillib/lib'
-
-require 'uri'
-require 'pathname'
-require 'json'
-require 'wukong'
-require 'wukong/streamer'
-require 'wukong/streamer/encoding_cleaner'
-load '/home/dlaw/dev/wukong/examples/munging/wikipedia/utils/munging_utils.rb'
-
-ENV['map_input_file'] ||= 'pagecounts-20071222-100000.gz'
-
-class String
-  def is_enwiki?
-    return (not (self =~ /^en /).nil?)
-  end
-
-  def is_after_enwiki?
-    return (not (self =~ /^(e[o-z][a-z]*|[f-z][a-z]+) /).nil?)
-  end
-end
-
-module PageviewsExtractor
-  class Mapper < Wukong::Streamer::LineStreamer
-    include Wukong::Streamer::EncodingCleaner
-    include MungingUtils
-
-    ns_json_file = File.open("/home/dlaw/dev/wukong/examples/munging/wikipedia/utils/namespaces.json",'r:UTF-8')
-    NAMESPACES = JSON.parse(ns_json_file.read)
-
-    # the filename strings are formatted as
-    # pagecounts-YYYYMMDD-HH0000.gz
-    def time_from_filename(filename)
-      parts = filename.split('-')
-      year  = parts[1][0..3].to_i
-      month = parts[1][4..5].to_i
-      day   = parts[1][6..7].to_i
-      hour  = parts[2][0..1].to_i
-      return Time.new(year,month,day,hour)
-    end
-
-    def process line
-      # we only want enwiki lines
-      return if @done
-      if line.is_after_enwiki?
-        @done = true
-        return
-      end
-      return if not line.is_enwiki?
-      # we have an enwiki line - process it!
-      fields = line.split(' ')[1..-1]
-      out_fields = []
-      # add the namespace
-      namespace = nil
-      if fields[0].include? ':'
-        namespace = NAMESPACES[fields[0].split(':')[0]]
-        out_fields << (namespace || '0')
-      else
-        out_fields << '0'
-      end
-      # add the title
-      if namespace.nil?
-        out_fields << URI.unescape(fields[0])
-      else
-        out_fields << URI.unescape(fields[0][(fields[0].index(':')||-1)+1..-1])
-      end
-      # add number of visitors in the hour
-      out_fields << fields[2]
-      # grab date info from filename
-      file = Pathname.new(ENV['map_input_file']).basename
-      time = time_from_filename(file.to_s)
-      out_fields += time_columns_from_time(time)
-      yield out_fields
-    end
-  end
-end
-
-Wukong::Script.new(PageviewsExtractor::Mapper, Wukong::Streamer::LineStreamer).run
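
The `@done` short-circuit in the deleted mapper works because pagecounts files are sorted by project prefix: once a line matches `is_after_enwiki?`, no more `en ` lines can follow in that split. A quick check of the two regexes, with made-up sample lines in the standard `project title count bytes` format:

```ruby
# Exercise the mapper's prefix regexes on sample pagecounts lines.
ENWIKI       = /^en /
AFTER_ENWIKI = /^(e[o-z][a-z]*|[f-z][a-z]+) /

["en Chimpanzee 123 4567",  # enwiki line: processed
 "eo Cimpanzo 5 678",       # sorts after "en ": sets @done, mapper stops
 "de Schimpanse 99 1011",   # sorts before "en ": silently skipped
].each do |line|
  puts format("%-24s enwiki=%-5s after_enwiki=%s",
              line, !!(line =~ ENWIKI), !!(line =~ AFTER_ENWIKI))
end
```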

--- a/data/examples/munging/wikipedia/pig_style_guide.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# Pig Style Guide
-
-- Everything except names should be in all caps. E.g.
-
-      first_join = JOIN pages BY (namespace,title)
-          RIGHT OUTER, pageviews BY (namespace, title);
-
-- Group and align columns in the script in ways that make sense. Don't be afraid of newlines. E.g.
-
-      second_pass = FOREACH second_pass_j GENERATE
-          first_pass::from_id, pages::id,
-          first_pass::from_namespace, first_pass::from_title,
-          first_pass::into_namespace, first_pass::into_title;
-
-- Columns that form an important sub-set of the table's data should be easily accessible as a unit.
-
-  E.g. The edge list above has the from and into ids in the first and second columns, making it easy to just get an edge list of ids without the additional metadata.
-
-- When at all possible, you should include sample LOAD statements in the comments for your script. This makes it easy to use the output of your script
-
-- Parameterize as much as possible. All paths should be parameterized.
-
-- Parameters should be in all caps, e.g. $NODE.
-
-- Parameters should have defaults if at all possible. When you define the default, also include a comment describing the parameter.

--- a/data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Filters the page metadata table, leaving only pages that
- * are redirects.
- *
- * Output Format (same as page_metadata):
- * (id:int, namespace:int, title:chararray, restrictions:chararray,
- * counter:long, is_redirect:int, is_new:int, random:float, touched:int,
- * page_latest:int, len:int)
- */
-
-%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- page metdata for all pages in Wikipedia
-%default REDIRECTS_OUT '/data/results/wikipedia/full/redirect_page_metadata' -- place to store page metdata for redirects
-
-page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray,
-    restrictions:chararray, counter:long, is_redirect:int, is_new:int, random:float,
-    touched:int, page_latest:int, len:int);
-
-redirects = FILTER page_metadata BY (is_redirect == 1);
-STORE redirects INTO '$REDIRECTS_OUT';

--- a/data/examples/munging/wikipedia/subuniverse/sub_articles.pig
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * This script filters the articles table, leaving only the articles
- * in the specified subuniverse.
- *
- * Output format:
- * page_id:int, title:chararray, namespace:int, rev_date:int, rev_time:int,
- * rev_epoch_time:long, rev_dow:int, article_text:chararray
- */
-
-%default ARTICLES '/data/results/wikipedia/full/articles' -- all articles in the wikipedia corpus
-%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
-%default SUB_ARTICLES_OUT '/data/results/wikipedia/mini/articles' -- where output will be stored
-
-articles = LOAD '$ARTICLES' AS (page_id:int, title:chararray, namespace:int,
-    rev_date:int, rev_time:int, rev_epoch_time:long, rev_dow:int, article_text:chararray);
-sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-sub_articles_unfiltered = JOIN articles BY id, sub_nodes BY node_id;
-sub_articles = FOREACH sub_articles_unfiltered GENERATE
-    articles::page_id AS page_id, articles::title AS title, articles::namespace AS namespace,
-    articles::rev_date AS rev_date, articles::rev_time AS rev_time,
-    articles::rev_epoch_time AS rev_epoch_time, articles::rev_dow AS rev_dow,
-    articles::article_text AS article_text;
-STORE sub_articles INTO '$SUB_ARTICLES_OUT';

--- a/data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * This script filters the page metadata table, leaving only the pages
- * in the specified subuniverse.
- *
- * Output format (same as page_metadata):
- * id:int, namespace:int, title:chararray, restrictions:chararray, counter:long,
- * is_redirect:int, is_new:int, random:float, touched:int, page_latest:int, len:int
- */
-
-%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- metadata for all pages in the wikipedia corpus
-%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
-%default SUB_PAGE_METADATA_OUT '/data/results/wikipedia/mini/page_metadata' -- where output will be stored
-
-page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray,
-    restrictions:chararray, counter:long, is_redirect:int, is_new:int, random:float,
-    touched:int, page_latest:int, len:int);
-sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-sub_page_metadata_unfiltered = JOIN page_metadata BY id, sub_nodes BY node_id;
-sub_page_metadata = FOREACH sub_page_metadata_unfiltered GENERATE
-    page_metadata::id, page_metadata::namespace, page_metadata::title,
-    page_metadata::restrictions, page_metadata::counter, page_metadata::is_redirect,
-    page_metadata::is_new, page_metadata::random, page_metadata::touched,
-    page_metadata::page_latest, page_metadata::len;
-STORE sub_page_metadata INTO '$SUB_PAGE_METADATA_OUT';

--- a/data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * This script filters the pagelinks table, leaving only the pagelinks
- * that start within supplied subuniverse.
- *
- * Output format (same as augmented_pagelinks):
- * from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
- */
-
-%default PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all edges in the pagelink graph (must be *directed*)
-%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
-%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
-
-all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
-    from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
-sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-
-sub_pagelinks_from = JOIN all_pagelinks BY from_id, sub_nodes BY node_id;
-sub_pagelinks = FOREACH sub_pagelinks_from GENERATE
-    all_pagelinks::from_id, all_pagelinks::into_id,
-    all_pagelinks::from_namespace, all_pagelinks::from_title,
-    all_pagelinks::into_namespace, all_pagelinks::into_title;
-STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';

--- a/data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * This script filters the pagelinks table, leaving only the pagelinks
- * that terminate within supplied subuniverse.
- *
- * Output format (same as augment_pagelinks):
- * node_a:int, node_b:int, a_into_b:int, b_into_a:int, is_symmetric:int
- */
-
-%default PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all edges in the pagelink graph (must be *directed*)
-%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
-%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
-
-all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
-    from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
-sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-
-sub_pagelinks_into = JOIN all_pagelinks BY into_id, sub_nodes BY node_id;
-sub_pagelinks = FOREACH sub_pagelinks_into GENERATE
-    all_pagelinks::from_id, all_pagelinks::into_id,
-    all_pagelinks::from_namespace, all_pagelinks::from_title,
-    all_pagelinks::into_namespace, all_pagelinks::into_title;
-STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';

--- a/data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * This script filters the pagelinks table, leaving only the pagelinks
- * that start and end within supplied subuniverse.
- *
- * Output format (same as augment_pagelinks):
- * from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
- */
-
-%default PAGELINKS '/data/results/wikipedia/full/undirected_pagelinks' -- all edges in the pagelink graph
-%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
-%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
-
-all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
-    from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
-sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-
-sub_pagelinks_in = JOIN all_pagelinks BY from_id, sub_nodes BY node_id;
-sub_pagelinks_unfiltered = JOIN sub_pagelinks_in BY into_id, sub_nodes BY node_id;
-sub_pagelinks = FOREACH sub_pagelinks_unfiltered GENERATE
-    sub_pagelinks_in::all_pagelinks::from_id,
-    sub_pagelinks_in::all_pagelinks::into_id,
-    sub_pagelinks_in::all_pagelinks::from_namespace,
-    sub_pagelinks_in::all_pagelinks::from_title,
-    sub_pagelinks_in::all_pagelinks::into_namespace,
-    sub_pagelinks_in::all_pagelinks::into_title;
-STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';