zillabyte 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/ruby/README.md +2 -0
- data/ruby/lib/zillabyte.rb +20 -0
- data/ruby/lib/zillabyte/common/progress.rb +17 -0
- data/ruby/lib/zillabyte/harness.rb +16 -0
- data/ruby/lib/zillabyte/harness/aggregate.rb +29 -0
- data/ruby/lib/zillabyte/harness/counter.rb +11 -0
- data/ruby/lib/zillabyte/harness/each.rb +29 -0
- data/ruby/lib/zillabyte/harness/groupby.rb +9 -0
- data/ruby/lib/zillabyte/harness/helper.rb +326 -0
- data/ruby/lib/zillabyte/harness/live_delegator.rb +369 -0
- data/ruby/lib/zillabyte/harness/simple_function.rb +131 -0
- data/ruby/lib/zillabyte/harness/simple_spout.rb +90 -0
- data/ruby/lib/zillabyte/harness/sink.rb +23 -0
- data/ruby/lib/zillabyte/harness/spout.rb +48 -0
- data/ruby/lib/zillabyte/harness/topology.rb +132 -0
- data/ruby/lib/zillabyte/harness/tuple.rb +32 -0
- data/ruby/lib/zillabyte/version.rb +3 -0
- metadata +103 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OTlmOGNkMGVlNzY4MTdjMThlODE3NDk3ZTUwMjEzNjIyMzBlOWZkMQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MDYwN2YwYmMyN2M2Yjg2Y2E3YzIxMWNlYjE0N2Y5ZDQ4OWNkNjczZg==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
MDFjMDk3NGU2NmUyYTUyNGViMTAwYWI1MjBiMzdkMzk1MThjYmIxMjM5NzE2
|
10
|
+
NzMwMGRiMjUyYzM2YzJhNjJhY2ZlZGE2N2ZmMDM5N2VlZTdhMDNmMmZlMDQx
|
11
|
+
OTI0MGM1MmUxNzVjOTM2OWM5ZWU2Y2NjYWJlNjQxNjA4NGVmZTM=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZDcyOTM1YjkzZjBjYmNjZDY1MzJjOTVjZTJjMTYyNDEyZDQzMGEzYmJiZGFi
|
14
|
+
ZTAyYjM1NjAyOTdjNjU1NTg1MzJjY2JmNDA3YTZjYTZmNTk3ZGJhNzEyN2U3
|
15
|
+
NjM5MTYyMTJiOGI1YThlN2MzNWZkNWJhYTVkMTU5YjMwMzkwZWY=
|
data/ruby/lib/zillabyte.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require "zillabyte/harness"
|
2
|
+
require "zillabyte/common/progress"
|
3
|
+
|
4
|
+
# Top-level entry points of the library. Each method forwards to the
# matching builder under Zillabyte::Harness.
module Zillabyte
  class << self
    # Builds a topology through Zillabyte::Harness::Topology.build, keeps a
    # reference to it in the module-level @topology and returns it.
    #
    # name - value handed to Topology.build (defaults to nil).
    def new(name = nil)
      @topology = Zillabyte::Harness::Topology.build(name)
    end

    # Forwards all arguments and the block to
    # Zillabyte::Harness::SimpleFunction.build and returns its result.
    def simple_function(*args, &block)
      Zillabyte::Harness::SimpleFunction.build(*args, &block)
    end

    # Forwards all arguments and the block to
    # Zillabyte::Harness::SimpleSpout.build and returns its result.
    def simple_spout(*args, &block)
      Zillabyte::Harness::SimpleSpout.build(*args, &block)
    end
  end
end
|
data/ruby/lib/zillabyte/harness/aggregate.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Holds the settings of one "aggregate" operation: a name, the emitted
# streams and the three callbacks (start / aggregate / complete).
# The plain-named methods below store their argument in the matching
# underscore-prefixed attribute.
class Zillabyte::Harness::Aggregate
  attr_accessor :_name, :_type, :_emits, :_start, :_aggregate, :_complete

  # Marks the operation as type 'aggregate' and gives it a default name of
  # "aggregate_" followed by whatever Zillabyte::Harness::Counter.get returns.
  def initialize
    @_type = 'aggregate'
    @_name = 'aggregate_' + Zillabyte::Harness::Counter.get
  end

  # Replaces the default name with v.
  def name(v)
    self._name = v
  end

  # Records the emits declaration v.
  def emits(v)
    self._emits = v
  end

  # Stores the given block as the start callback.
  def start(&block)
    self._start = block
  end

  # Stores the given block as the aggregate callback.
  def aggregate(&block)
    self._aggregate = block
  end

  # Stores the given block as the complete callback.
  def complete(&block)
    self._complete = block
  end
end
|
data/ruby/lib/zillabyte/harness/each.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Holds the settings of one "each" operation: a name, the emitted streams,
# the consumed stream and the prepare / execute callbacks.
# The plain-named methods below store their argument in the matching
# underscore-prefixed attribute.
class Zillabyte::Harness::Each
  attr_accessor :_name, :_type, :_emits, :_consumes, :_prepare, :_execute

  # Marks the operation as type 'each' and gives it a default name of
  # "each_" followed by whatever Zillabyte::Harness::Counter.get returns.
  def initialize
    @_type = 'each'
    @_name = 'each_' + Zillabyte::Harness::Counter.get
  end

  # Replaces the default name with v.
  def name(v)
    self._name = v
  end

  # Records the emits declaration v.
  def emits(v)
    self._emits = v
  end

  # Records the name of the consumed stream v.
  def consumes(v)
    self._consumes = v
  end

  # Stores the given block as the prepare callback.
  def prepare(&block)
    self._prepare = block
  end

  # Stores the given block as the execute callback.
  def execute(&block)
    self._execute = block
  end
end
|
data/ruby/lib/zillabyte/harness/helper.rb
ADDED
@@ -0,0 +1,326 @@
|
|
1
|
+
# Command-line parsing and validation helpers used by the harness.
#
# Every check_* method reports a problem by calling Helper.print_error,
# which prints the message and terminates the process with exit status 1.
# The long usage texts appended to the messages are kept in the class
# variables defined at the bottom of the class.
class Zillabyte::Harness::Helper

  # Parses the flags found in Helper.argv and returns them as a Hash.
  #
  # Recognised flags:
  #   --execute_live   sets options[:command] = :execute
  #   --info           sets options[:command] = :info
  #   --name NAME      sets options[:name]
  #   --pipe PIPE      sets options[:pipe]
  #   --file FNAME     sets options[:file]
  #
  # The hash always starts with the String key "name" set to the last
  # path component of the current working directory.
  #
  # NOTE(review): the default lives under the String key "name" while
  # --name writes the Symbol key :name. Callers are not visible from this
  # file, so the keys are left untouched - confirm which one is read.
  def self.opt_parser()
    options = {
      "name" => Dir.pwd.split("/")[-1]
    }
    OptionParser.new do |opts|
      opts.on("--execute_live") do |v|
        options[:command] = :execute
      end
      opts.on("--info") do |v|
        options[:command] = :info
      end
      opts.on("--name NAME") do |v|
        options[:name] = v
      end
      opts.on("--pipe PIPE") do |v|
        options[:pipe] = v
      end
      opts.on("--file FNAME") do |v|
        options[:file] = v
      end
    end.parse(self.argv)
    options
  end

  # Writes hash to file as one line of JSON followed by a newline.
  # file only needs to respond to #write.
  def self.write_hash_to_file(hash, file)
    file.write(hash.to_json + "\n")
  end

  # Prints msg with puts and terminates the process with exit status 1.
  def self.print_error(msg)
    puts msg
    exit(1)
  end

  # Validates the name of an operation and records it in names.
  #
  # operation - String label of the operation being declared.
  # name      - the candidate name; must be a non-empty String.
  # names     - Hash of name => operation seen so far (mutated).
  #
  # A name may be reused only when either the earlier or the current
  # operation is "new" or "sink"; any other reuse is reported as an error.
  # Returns operation (the value stored in names[name]).
  def self.check_name(operation, name, names)
    prefix = "Error in \"#{operation}\" at \"name\": \n\t "

    if !name.instance_of?(String) || name == ""
      print_error("#{prefix}\"Name\" must be a non-empty STRING at #{name}.")
    end

    previous = names[name]
    if previous && previous != "new" && previous != "sink" && operation != "new" && operation != "sink"
      print_error("#{prefix}The \"name\" \"#{name}\" was previously defined in a #{previous}!")
    end

    names[name] = operation
  end

  # Validates an "emits" declaration and registers its streams.
  #
  # operation - String label; "simple_function" and "simple_spout" switch
  #             to the relation format ([name, [ {field => :type}, ... ]]),
  #             anything else uses the stream format ([name, [field, ...]]).
  # emits     - the declaration to validate.
  # streams   - Hash of stream name => field list seen so far (mutated).
  #
  # Returns true when more than one stream/relation is emitted, false
  # otherwise.
  def self.check_emits(operation, emits, streams)
    prefix = "Error in \"#{operation}\" at \"emits\": \n\t "
    simple = (operation == "simple_function" || operation == "simple_spout")
    if simple
      help = @@_print_check_simple_function_emits
      noun = "relation"
    else
      help = @@_print_check_emits
      noun = "stream"
    end

    if !emits.instance_of?(Array)
      print_error("#{prefix}\"Emits\" must be an ARRAY at #{emits}. #{help}")
    end
    n_emits = emits.length
    if n_emits == 0
      print_error("#{prefix}Must emit at least one #{noun}, \"emits\" cannot be an empty array. #{help}")
    end

    current_op_streams = {}
    emits.each do |e|
      if !e.instance_of?(Array) || e.length != 2
        print_error("#{prefix}Invalid format for \"emits\" in #{e}. #{help}")
      end
      if !e[0].instance_of?(String) || e[0] == ""
        print_error("#{prefix}\"Emits\" #{noun} name must be a non-empty STRING in #{e}. #{help}")
      end
      if current_op_streams[e[0]]
        print_error("#{prefix}The #{noun} \"#{e[0]}\" is listed multiple times in the same \"emits\". #{help}")
      end
      current_op_streams[e[0]] = 1
      if streams[e[0]] && streams[e[0]] != e[1]
        print_error("#{prefix}The #{noun} name \"#{e[0]}\" was previously defined with a different set of fields! #{help}")
      end
      streams[e[0]] = e[1]

      # The type of the field list is validated BEFORE its length is read,
      # so that nil or a non-collection value produces the formatted error
      # instead of a NoMethodError on #length.
      if simple
        check_simple_function_emits(e)
      else
        if !e[1].instance_of?(Array)
          print_error("#{prefix}Field names must be an ARRAY of STRINGS in stream #{e[0]}. #{help}")
        end
        e[1].each do |f|
          if !f.instance_of?(String) || f == ""
            print_error("#{prefix}Field names must be non-empty STRINGS in stream #{e[0]}. #{help}")
          end
        end
      end

      if e[1].length == 0
        print_error("#{prefix}Must be at least one output field to #{noun} \"#{e[0]}\". #{help}")
      end
    end

    n_emits > 1
  end

  # Validates the "consumes" setting of an "each" or "sink" node.
  #
  # h       - node responding to _type and _consumes (and, for sinks, the
  #           attributes read by check_sink_consumes).
  # streams - Hash of stream name => field list.
  #
  # The setting must be a non-empty String naming a key of streams. For a
  # sink the columns are additionally compared with the stream's fields.
  def self.check_consumes(h, streams)
    # For any other node type both stay nil and interpolate as "".
    if h._type == "each"
      prefix = "Error in \"each\" at \"consumes\": \n\t "
      help = @@_print_check_each_consumes
    elsif h._type == "sink"
      prefix = "Error in \"sink\" at \"consumes\": \n\t "
      help = @@_print_check_sink
    end

    consumes = h._consumes
    if !consumes
      print_error("#{prefix}\"Consumes\" must be specified since a preceding \"each\" or \"spout\" emitted multiple streams. #{help}")
    end
    if !consumes.instance_of?(String) || consumes == ""
      print_error("#{prefix}\"Consumes\" must be a non-empty STRING at #{consumes}. #{help}")
    end
    if !streams[consumes]
      print_error("#{prefix}The stream \"#{consumes}\" specified in \"consumes\" does not exist! #{help}")
    end

    if h._type == "sink"
      check_sink_consumes(h, streams)
    end
  end

  # Validates a sink declaration.
  #
  # sink  - node responding to _name and _columns.
  # nodes - previously declared nodes; each responds to _type, and sinks
  #         among them to _name and _columns.
  #
  # Checks the relation name, requires at least one column, validates every
  # column and rejects a relation name that an earlier sink already used
  # with a different column list.
  def self.check_sink(sink, nodes)
    prefix = "Error in \"sink\": \n\t "
    help = @@_print_check_sink

    name = sink._name
    columns = sink._columns
    if !name
      print_error("#{prefix}Relation name must be specified! #{help}")
    end
    # A fresh hash is passed, so only the format of the name is checked here.
    check_name("sink", sink._name, {})

    # A missing (nil) column list is reported like an empty one.
    if columns.nil? || columns.length == 0
      print_error("#{prefix}Must be at least one output field to relation \"#{name}\". #{help}")
    end
    check_sink_columns(sink)

    nodes.each do |s|
      next if s._type != "sink"
      if s._name == name && s._columns != columns
        print_error("#{prefix}The relation \"#{name}\" has already been specified and contains a different set of fields/types. #{help}")
      end
    end
  end

  # Runs check_sink_column_format on every column of sink. Each column is
  # read as a Hash whose first key is the column name and whose value for
  # that key is the column type.
  def self.check_sink_columns(sink)
    name = sink._name
    columns = sink._columns
    columns.each do |col|
      cname = col.keys()[0]
      ctype = col[cname]
      check_sink_column_format("sink", cname, ctype, name)
    end
  end

  # Validates one column/field definition.
  #
  # operation     - "sink" or "simple_function"; selects the error prefix
  #                 and usage text (both stay empty for other values).
  # cname         - field name; must be a non-empty String.
  # ctype         - field type; must be one of the Symbols :string,
  #                 :integer, :float, :double or :boolean.
  # relation_name - relation name used in the error messages.
  def self.check_sink_column_format(operation, cname, ctype, relation_name)
    if operation == "sink"
      prefix = "Error in \"sink\" at \"column\": \n\t "
      help = @@_print_check_sink
    elsif operation == "simple_function"
      prefix = "Error in \"simple_function\" at \"emits\": \n\t "
      help = @@_print_check_simple_function_emits
    end

    if !cname.instance_of?(String) || cname == ""
      print_error("#{prefix}Field names must be non-empty STRINGS in relation \"#{relation_name}\". #{help}")
    end
    if !ctype.instance_of?(Symbol)
      print_error("#{prefix}Field data types must be SYMBOLS in relation \"#{relation_name}\". #{help}")
    end
    if ![:string, :integer, :float, :double, :boolean].include?(ctype)
      print_error("#{prefix}Invalid field data type at \"#{ctype}\" in relation \"#{relation_name}\". #{help}")
    end
  end

  # Compares the columns of a sink with the fields of the stream it
  # consumes: the counts must match and every column name must be one of
  # the stream's fields.
  #
  # sink    - node responding to _name, _columns and _consumes.
  # streams - Hash of stream name => field list; streams[sink._consumes]
  #           is read without a nil check here (check_consumes verifies
  #           the key before calling this method).
  def self.check_sink_consumes(sink, streams)
    prefix = "Error in \"sink\" at \"consumes\": \n\t "
    help = @@_print_check_sink

    name = sink._name
    columns = sink._columns
    consumes = sink._consumes

    stream_fields = streams[consumes]
    if stream_fields.length != columns.length
      print_error("#{prefix}Number of columns in \"sink\" differs from number of fields in the consumed stream at relation \"#{name}\". #{help}")
    end
    columns.each do |col|
      col_name = col.keys()[0]
      if !stream_fields.include?(col_name)
        print_error("#{prefix}The column \"#{col_name}\", is not emitted by the stream \"#{consumes}\". #{help}")
      end
    end
  end

  # Validates one [relation_name, columns] pair of a simple_function style
  # "emits": columns must be an Array of single-entry Hashes, each mapping
  # a field name to a type accepted by check_sink_column_format.
  def self.check_simple_function_emits(emits)
    prefix = "Error in \"simple_function\" at \"emits\": \n\t "
    help = @@_print_check_simple_function_emits

    name = emits[0]
    columns = emits[1]
    if !columns.instance_of?(Array)
      print_error("#{prefix}Field names must be an ARRAY of HASHES in relation \"#{name}\". #{help}")
    end
    columns.each do |col|
      if !col.instance_of?(Hash)
        print_error("#{prefix}Fields names must be listed in HASH format in relation \"#{name}\". #{help}")
      end
      colkeys = col.keys()
      if colkeys.length != 1
        print_error("#{prefix}Each field must be a separate HASH with {field_name : data_type} in relation \"#{name}\". #{help}")
      end
      colkey = colkeys[0]
      colval = col[colkey]
      check_sink_column_format("simple_function", colkey, colval, name)
    end
  end

  # Test helper: the argument list parsed by opt_parser. Returns the value
  # set through argv= or, when none was set, the process-wide ARGV.
  def self.argv()
    @_argv || ARGV
  end

  # Test helper: overrides the argument list returned by argv.
  def self.argv=(v)
    @_argv = v
  end

  # Usage text appended to errors about stream-style "emits".
  @@_print_check_emits = "\n
\"Emits\" Syntax:
- \"Emits\" must be a non-empty ARRAY.
- Each element of \"emits\" must be an ARRAY of length = 2.
* The first element should be the unique stream name defined as a non-empty STRING.
* The second element should be an ARRAY of field names for that stream.
e.g.
\t emits = [ [ \"stream_1\", [ \"field_11\", \"field_12\", ... ] ],
\t [ \"stream_2\", [ \"field_21\", \"field_22\", ... ] ] ] .
- Stream and field names must all be non-empty STRINGS."

  # Usage text appended to errors about relation-style "emits".
  @@_print_check_simple_function_emits = "\n
\"Emits\" Syntax:
- \"Emits\" must be a non-empty ARRAY.
- Each element of \"emits\" must be an ARRAY of length = 2.
* The first element should be the unique relation name defined as a non-empty STRING.
* The second element should be an ARRAY of HASHES with field names and data types for that relation. e.g.
\t emits = [ [ \"relation_1\", [ {\"field_11\" => \"type_11\"}, {\"field_12\" => \"type_12\"}, ... ] ],
\t [ \"relation_2\", [ {\"field_21\" => \"type_21\"}, {\"field_22\" => \"type_22\"}, ... ] ] ] .
- Relation and field names must all be non-empty STRINGS.
- Field types must be SYMBOLS. The following types are allowed :string, :integer, :float, :double, and :boolean."

  # Usage text appended to errors about sinks.
  @@_print_check_sink = "\n
\"Sink\" Syntax:
- Sinks must be specified using the following syntax:
Single stream:
\t flow.sink do |h|
\t \t h.name \"name_of_relation\"
\t \t h.columns \"field_1\" :type_1
\t \t h.columns \"field_2\" :type_2 ...
\t end
Multiple streams:
\t flow.sink do |h|
\t \t h.name \"relation_name\"
\t \t h.consumes \"stream_consumed\"
\t \t h.columns \"field_1\" :type_1
\t \t h.columns \"field_2\" :type_2 ...
\t end
- \"Sink\" relation \"name\" must be specified as a non-empty STRING!
- Field names must be non-empty STRINGS.
- Field types must be SYMBOLS. The following types are allowed :string, :integer, :float, :double, and :boolean.
- If there are multiple streams, \"consumes\" must be specified for each sink as a non-empty STRING!
* \"Consumes\" is the name of a stream emitted by an \"each\" or a \"spout\" which the \"sink\" should save as a table.
* The columns specified in \"sink\" must match the fields emitted by the stream."

  # Usage text appended to errors about the "consumes" of an "each".
  @@_print_check_each_consumes = "\n
\"Each\" Syntax for multiple streams:
\t flow.each do |h|
\t \t h.name \"name\", => optional
\t \t h.emits emits,
\t \t h.consumes \"consumed_stream\"
\t \t h.prepare ...
\t \t h.execute ...
\t end
- If there are multiple streams, \"consumes\" must be specified as a non-empty STRING!
* \"Consumes\" is the name of a stream emitted by a preceding \"each\" or \"spout\" which the current \"each\" operates on."

end
|