wukong 0.1.4 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/INSTALL.textile +89 -0
  2. data/README.textile +41 -74
  3. data/docpages/INSTALL.textile +94 -0
  4. data/{doc → docpages}/LICENSE.textile +0 -0
  5. data/{doc → docpages}/README-wulign.textile +6 -0
  6. data/docpages/UsingWukong-part1-get_ready.textile +17 -0
  7. data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
  8. data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
  9. data/docpages/_config.yml +39 -0
  10. data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
  11. data/{doc → docpages}/code/api_response_example.txt +0 -0
  12. data/{doc → docpages}/code/parser_skeleton.rb +0 -0
  13. data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
  14. data/docpages/favicon.ico +0 -0
  15. data/docpages/gem.css +16 -0
  16. data/docpages/hadoop-tips.textile +83 -0
  17. data/docpages/index.textile +90 -0
  18. data/docpages/intro.textile +8 -0
  19. data/docpages/moreinfo.textile +174 -0
  20. data/docpages/news.html +24 -0
  21. data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
  22. data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
  23. data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
  24. data/docpages/tutorial.textile +283 -0
  25. data/docpages/usage.textile +195 -0
  26. data/docpages/wutils.textile +263 -0
  27. data/wukong.gemspec +80 -50
  28. metadata +87 -54
  29. data/doc/INSTALL.textile +0 -41
  30. data/doc/README-tutorial.textile +0 -163
  31. data/doc/README-wutils.textile +0 -128
  32. data/doc/TODO.textile +0 -61
  33. data/doc/UsingWukong-part1-setup.textile +0 -2
  34. data/doc/UsingWukong-part2-scraping.textile +0 -2
  35. data/doc/hadoop-nfs.textile +0 -51
  36. data/doc/hadoop-setup.textile +0 -29
  37. data/doc/index.textile +0 -124
  38. data/doc/links.textile +0 -42
  39. data/doc/usage.textile +0 -102
  40. data/doc/utils.textile +0 -48
  41. data/examples/and_pig/sample_queries.rb +0 -128
  42. data/lib/wukong/and_pig.rb +0 -62
  43. data/lib/wukong/and_pig/README.textile +0 -12
  44. data/lib/wukong/and_pig/as.rb +0 -37
  45. data/lib/wukong/and_pig/data_types.rb +0 -30
  46. data/lib/wukong/and_pig/functions.rb +0 -50
  47. data/lib/wukong/and_pig/generate.rb +0 -85
  48. data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
  49. data/lib/wukong/and_pig/junk.rb +0 -51
  50. data/lib/wukong/and_pig/operators.rb +0 -8
  51. data/lib/wukong/and_pig/operators/compound.rb +0 -29
  52. data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
  53. data/lib/wukong/and_pig/operators/execution.rb +0 -15
  54. data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
  55. data/lib/wukong/and_pig/operators/foreach.rb +0 -98
  56. data/lib/wukong/and_pig/operators/groupies.rb +0 -212
  57. data/lib/wukong/and_pig/operators/load_store.rb +0 -65
  58. data/lib/wukong/and_pig/operators/meta.rb +0 -42
  59. data/lib/wukong/and_pig/operators/relational.rb +0 -129
  60. data/lib/wukong/and_pig/pig_struct.rb +0 -48
  61. data/lib/wukong/and_pig/pig_var.rb +0 -95
  62. data/lib/wukong/and_pig/symbol.rb +0 -29
  63. data/lib/wukong/and_pig/utils.rb +0 -0
@@ -1,129 +0,0 @@
# -*- coding: utf-8 -*-
# == RelationalOperators
#
# GROUP, COGROUP, JOIN   see groupies.rb
# CROSS                  defined below
#
# distinct
# filter
# limit
# order
# split
# union
#
# stream
# load
# store
#
module Wukong
  module AndPig
    class PigVar

      # ===========================================================================
      #
      # Options
      #

      #
      # Appends a " PARALLEL n" clause to +str+ IN PLACE when
      # options[:parallel] is set; leaves +str+ untouched otherwise.
      #
      # Returns the mutated string when the clause was added, nil otherwise.
      #
      def self.parallelize!(str, options)
        str << " PARALLEL #{options[:parallel]}" if options[:parallel]
      end

      # ===========================================================================
      #
      # DISTINCT
      #

      # Instance sugar: DISTINCT of this relation, stored under +lval+.
      def distinct(lval, options = {})
        self.class.distinct(lval, self, options)
      end

      #
      # Builds the DISTINCT statement for +rel+ and hands it to
      # simple_operation under the name +lval+.
      #
      # options[:parallel] -- optional PARALLEL clause value.
      #
      def self.distinct(lval, rel, options = {})
        # Work on a copy: parallelize! mutates its argument, and the string
        # handed back by relationize may be shared or frozen.
        cmd_str = rel.relationize.dup
        parallelize!(cmd_str, options)
        simple_operation(lval, rel, :distinct, cmd_str)
      end

      # ===========================================================================
      #
      # FILTER
      #

      #
      # FILTER this relation BY the raw expression +by_str+.
      #
      # NOTE(review): the chained operators in this file call new_in_chain with
      # (klass, command); confirm that matches the helper's current signature.
      #
      def filter(by_str)
        new_in_chain klass, "FILTER #{relation} BY #{by_str}"
      end

      # FILTER +rel+ BY +by_str+, stored under +lval+ via simple_operation.
      def self.filter(lval, rel, by_str)
        simple_operation(lval, rel, "FILTER", "#{rel.relation} BY #{by_str}")
      end

      # ===========================================================================
      #
      # LIMIT
      #

      # LIMIT this relation to +n+.
      def limit(n)
        new_in_chain klass, "LIMIT #{relation} #{n}"
      end

      # ===========================================================================
      #
      # ORDER
      #
      #   alias = ORDER alias BY { * [ASC|DESC] |
      #             field_alias [ASC|DESC] [, field_alias [ASC|DESC] …]
      #           } [PARALLEL n];
      #
      # cmd_str -- the raw text placed after BY.
      # options[:parallel] -- optional PARALLEL clause value.
      #
      # Returns whatever new_in_chain returns.
      #
      def order(cmd_str, options = {})
        statement = "ORDER #{relation} BY #{cmd_str}"
        # parallelize! is a class method, and the clause has to be part of the
        # statement before the new relation is created from it.
        self.class.parallelize!(statement, options)
        new_in_chain klass, statement
      end

      # ===========================================================================
      #
      # SPLIT
      #
      #   SPLIT alias INTO alias IF expression, alias IF expression [, alias IF expression …];
      #
      # relation_tests -- pairs of (output relation => test expression).
      #
      def split(relation_tests = {})
        split_str = relation_tests.map { |out_rel, test| "#{out_rel} IF #{test}" }.join(", ")
        new_in_chain klass, "SPLIT #{relation} INTO #{split_str}"
      end

      # ===========================================================================
      #
      # CROSS
      #
      # Crosses this relation with one or more others. A trailing options hash
      # may carry :parallel.
      #
      # Raises CrossArgumentError when no other relation is given.
      #
      def cross(*relations)
        options = relations.extract_options!
        raise CrossArgumentError unless relations.length >= 1
        relations_str = [self, *relations].map(&:relation).join(", ")
        statement = "CROSS #{relations_str}"
        self.class.parallelize!(statement, options)
        new_in_chain relations.first.klass, statement
      end

      # ===========================================================================
      #
      # UNION
      #

      #
      # UNION as method: this relation together with +relations+, stored
      # under +lval+.
      #
      def union(lval, *relations)
        # Splat the operands; passing them as one array would make the class
        # method see a single "relation" and always raise.
        self.class.union(lval, self, *relations)
      end

      #
      # UNION of two or more relations, stored under +lval+ via
      # simple_operation (which receives the first relation as its +rel+).
      #
      # Raises UnionArgumentError when fewer than two relations are given.
      #
      def self.union(lval, *relations)
        raise UnionArgumentError unless relations.length >= 2
        relations_str = relations.map(&:relation).join(", ")
        simple_operation(lval, relations.first, :union, relations_str)
      end

    end

    # Pre-built ArgumentError instances raised by PigVar#cross and PigVar.union.
    CrossArgumentError = ArgumentError.new("CROSS requires at least two relations. Heh heh: relations.")
    UnionArgumentError = ArgumentError.new("UNION requires at least two relations. Heh heh: relations.")
  end
end
@@ -1,48 +0,0 @@
module Wukong
  #
  # Mixin that lets a struct-like class describe itself in Pig terms.
  # Including it extends the including class with ClassMethods.
  #
  module PigStructMethods
    # Include hook: adds ClassMethods to +base+ as class-level methods.
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      #
      # Pig type string --
      # "(name: type, ...)" built by pairing +members+ with +mtypes+ and
      # asking each mtype for its own typify string.
      #
      # has_rsrc -- when truthy, a leading "rsrc: chararray" field is added.
      #
      def typify(has_rsrc = nil)
        field_specs = members.zip(mtypes).map { |attr, mtype| "#{attr}: #{mtype.typify}" }
        field_specs.unshift("rsrc: chararray") if has_rsrc
        "(#{field_specs.join(', ')})"
      end

      #
      # Delegates to Wukong::AndPig::PigVar.pig_load, passing this class
      # as the second argument.
      #
      def pig_load(rel, *args)
        Wukong::AndPig::PigVar.pig_load(rel, self, *args)
      end

      #
      # Returns type for a fieldspec.
      #
      # Only Symbol fieldspecs are looked up (in +members_types+); anything
      # else yields nil. Nested (Array) fieldspecs are not handled.
      #
      def field_type(field)
        members_types[field] if field.is_a?(Symbol)
      end
    end
  end
end
42
-
# Give every Struct class the Pig helpers, plus a default +mtypes+.
class Struct
  include Wukong::PigStructMethods

  class << self
    # Default member types: simply the list of member names.
    def mtypes
      members
    end
  end
end
@@ -1,95 +0,0 @@
module Wukong
  module AndPig

    #
    # Make a PigVar understand the struct it describes.
    #
    # A PigVar bundles a +klass+, the relation's +name+ and the command
    # string (+cmd+) it was built from.
    #
    class PigVar
      attr_accessor :klass, :name, :cmd

      cattr_accessor :working_dir
      self.working_dir = '.'

      def initialize(klass, name, cmd)
        self.klass = klass
        self.name  = name
        self.cmd   = cmd
      end

      # Sugar for PigVar.set: PigVar[name] = rval
      def self.[]=(name, *args)
        set(name, *args)
      end

      # Looks up whatever is registered under +name+ in PIG_SYMBOLS.
      def self.[](name)
        PIG_SYMBOLS[name]
      end

      #
      # Extract a field from an alias. The result is named
      # "<name>_<field>" and built through AS and Bag, both defined elsewhere.
      #
      def _(field)
        as_name  = [name, field].join("_").to_sym
        bag_type = Bag.new([field, field_type(field)])
        AS["#{relationize}.(#{field})", as_name, bag_type, nil, :skip_type]
      end

      #
      # Registers +rval+ under +name+ in PIG_SYMBOLS, renames it to +name+,
      # then passes it to emit_setter (defined elsewhere) and returns that
      # call's result.
      #
      def self.set(name, rval)
        PIG_SYMBOLS[name] = rval
        rval.name = name
        emit_setter(rval.relation, rval)
      end

      # This relation's name in relation form (delegates to name.relationize).
      def relation
        name.relationize
      end
      alias_method :relationize, :relation

      #
      # Create a name for a new anonymous relation: bumps the shared
      # Wukong::AndPig.anon_var_idx counter and returns :anon_<slug>_<idx>_
      #
      def self.anon(slug)
        next_idx = (Wukong::AndPig.anon_var_idx += 1)
        "anon_#{slug}_#{next_idx}_".to_sym
      end

      #
      # Create a name building off this one: strips any "anon_" prefix and
      # "_<digits>_" suffix from this name before asking for a fresh one.
      #
      def anon
        base = name.to_s.gsub(/^anon_/, '')
        base = base.gsub(/_\d+_$/, '')
        self.class.anon(base)
      end

      #
      # Builds a new PigVar for (+l_klass+, +lval+, +l_cmd+) and registers
      # it under +lval+ via PigVar.set.
      #
      def new_in_chain(lval, l_klass, l_cmd)
        self.class.set(lval, self.class.new(l_klass, lval, l_cmd))
      end

      # Delegate to klass
      def field_type(*args)
        klass.field_type(*args)
      end

      # Fields in this relation, as symbols
      def fields
        klass.members.map { |member| member.to_sym }
      end

      #
      # Side-effect free operation: emits "<OP> <relation>" and returns self.
      #
      def simple_operation(op)
        self.class.emit("#{op.to_s.upcase} #{relation}")
        self
      end

      #
      # Builds a PigVar whose command is the upcased +op+ (left-justified to
      # 8 columns) followed by +r_str+, reusing +rel+'s klass, and registers
      # it under +lval+.
      #
      def self.simple_operation(lval, rel, op, r_str)
        statement = format("%-8s %s", op.to_s.upcase, r_str)
        set(lval, new(rel.klass, lval, statement))
      end

      # Emits the upcased +op+ (same 8-column layout) followed by +r_str+.
      def self.simple_declaration(op, r_str)
        emit(format("%-8s %s", op.to_s.upcase, r_str))
      end

    end
  end
end
94
-
95
-
@@ -1,29 +0,0 @@
module Wukong
  module AndPig
    # Registry of named values; starts out empty.
    PIG_SYMBOLS = {}

    # Module-level counter, initialised to zero.
    mattr_accessor :anon_var_idx
    self.anon_var_idx = 0
  end
end
8
-
9
-
Symbol.class_eval do
  #
  # Assignment sugar: <tt>:lval << rhs</tt> calls
  # Wukong::AndPig::PigVar.new_relation(self, pig_var), where +rhs+ is either
  # a PigVar or a Symbol registered in Wukong::AndPig::PIG_SYMBOLS.
  #
  # Raises RuntimeError for any other right-hand side, including a Symbol
  # that has nothing registered.
  #
  def <<(relation)
    pig_var =
      if relation.is_a?(Wukong::AndPig::PigVar)
        relation
      elsif relation.is_a?(Symbol)
        Wukong::AndPig::PIG_SYMBOLS[relation]
      end
    raise "Don't know how to pigify RHS #{relation.inspect}" unless pig_var
    Wukong::AndPig::PigVar.new_relation(self, pig_var)
  end

  #
  # Forwards unknown methods to the value registered for this symbol in
  # PIG_SYMBOLS, when there is one and it responds to the method. The
  # caller's block is forwarded too. Everything else falls through to the
  # normal NoMethodError.
  #
  def method_missing(method, *args, &block)
    pig_var = Wukong::AndPig::PIG_SYMBOLS[self]
    if pig_var && pig_var.respond_to?(method)
      pig_var.send(method, *args, &block)
    else
      super
    end
  end

  #
  # Keeps respond_to? in step with method_missing: true exactly for the
  # methods that would be forwarded.
  #
  def respond_to_missing?(method, include_private = false)
    pig_var = Wukong::AndPig::PIG_SYMBOLS[self]
    (pig_var && pig_var.respond_to?(method)) ? true : super
  end
end
File without changes