zillabyte 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ruby/lib/zillabyte/harness/each.rb +50 -0
- data/ruby/lib/zillabyte/harness/filter.rb +35 -0
- data/ruby/lib/zillabyte/harness/group_by.rb +39 -0
- data/ruby/lib/zillabyte/harness/helper.rb +9 -2
- data/ruby/lib/zillabyte/harness/join.rb +28 -0
- data/ruby/lib/zillabyte/harness/sink.rb +23 -0
- data/ruby/lib/zillabyte/harness/source.rb +57 -0
- data/ruby/lib/zillabyte/harness/stream.rb +1 -2
- data/ruby/lib/zillabyte/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a43d9f7b3f9e80e5258eb7aa7eadd5e737da2098
|
4
|
+
data.tar.gz: 9cd1867ebe50c7ed22659a79c52be31023e75648
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0cf5964779b966dc2eb8ff59479cc994e14d76d56bb42738b51f2d2564609caecc5f552947daad3cea0f153d209c9efeae9f942f9cc734b8364e0a8114869f56
|
7
|
+
data.tar.gz: 0429f42873893ec3a0035a9c8abe63a75be5c6e2d961596445c1e8c783683f954857e1ed43f9579260fa5d02cde4a37ce2618db19c6d6b71e1556b1945ec309c
|
@@ -1,3 +1,53 @@
|
|
1
|
+
# OPERATION Each
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# The each block can be thought of as a map operation that runs across multiple machines.
|
4
|
+
# The rows of the web_pages dataset are distributed across our system and processed.
|
5
|
+
# The result of the each is another stream object that will contain the emitted rows.
|
6
|
+
# The input argument tuple contains a single row, that can be accessed like a hash object.
|
7
|
+
#
|
8
|
+
#
|
9
|
+
# LANGUAGE_SYNTAX
|
10
|
+
# Simplified syntax:
|
11
|
+
#
|
12
|
+
# stream.each do |tuple|
|
13
|
+
# |=block=|
|
14
|
+
# end
|
15
|
+
# - This is equivalent to just specifying an "execute" block below.
|
16
|
+
# - Note that this syntax only works for a single output stream.
|
17
|
+
#
|
18
|
+
# Custom each:
|
19
|
+
# stream.each do
|
20
|
+
# name "name" # the name of the operation
|
21
|
+
# emits "stream_1", "stream_2" # optional for single output stream
|
22
|
+
# output_format :replace # :replace or :merge, optional, defaults to :replace
|
23
|
+
# prepare |=block=| # optional if no initialization needed
|
24
|
+
# execute |=block=|
|
25
|
+
# end
|
26
|
+
#
|
27
|
+
# - The allowed output formats are :replace and :merge.
|
28
|
+
# * :replace - discards the input tuple values and only emits the specified values. This is the default.
|
29
|
+
# * :merge - re-emits the input tuple values along with the specified values.
|
30
|
+
# - The "prepare" and "execute" blocks can be in do...end format or {} format.
|
31
|
+
# * the "prepare" block is where any setup is done to prepare for tuple processing in the "execute" block.
|
32
|
+
# * the "execute" block is where the tuples are actually processed. It must take in a single argument (the "tuple").
|
33
|
+
#
|
34
|
+
# EXAMPLE
|
35
|
+
#
|
36
|
+
# # This is a simple example which emits the url of a tuple
|
37
|
+
# s = s.each do |tuple|
|
38
|
+
# emit :url => tuple["url"]
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
#
|
42
|
+
#
|
43
|
+
# # Eaches can also be used for conditional emitting
|
44
|
+
# stream = result_stream.each{ |tuple|
|
45
|
+
#
|
46
|
+
# if tuple['html'].include? "hello_world"
|
47
|
+
# emit :url => tuple['url']
|
48
|
+
# end
|
49
|
+
#
|
50
|
+
# }
|
1
51
|
class Zillabyte::Harness::Each
|
2
52
|
attr_accessor :_app, :_node, :_options
|
3
53
|
|
@@ -1,3 +1,38 @@
|
|
1
|
+
# OPERATION Filter
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# The filter is a conditional each. Conditional expressions can be used to determine if a
|
4
|
+
# tuple received by the operation is emitted.
|
5
|
+
#
|
6
|
+
# LANGUAGE_SYNTAX
|
7
|
+
# Simplified syntax:
|
8
|
+
# stream.filter do |tuple|
|
9
|
+
# |=block=|
|
10
|
+
# end
|
11
|
+
# - The condition for keeping the tuple should be specified in the block.
|
12
|
+
# - This is equivalent to just specifying a "keep" block below.
|
13
|
+
# Custom filter:
|
14
|
+
# stream.filter do
|
15
|
+
# name "name" \t\t\t => optional
|
16
|
+
# emits "stream_1", "stream_2", ... \t\t => optional for single output stream
|
17
|
+
# prepare |=block=| \t\t\t\t => optional if no initialization needed
|
18
|
+
# keep |=block=|
|
19
|
+
# end
|
20
|
+
# - The "prepare" and "keep" blocks can be in do...end format or {} format.
|
21
|
+
# * the "prepare" block is where any setup is done to prepare for tuple processing in the "keep" block.
|
22
|
+
# * tuples will pass through the filter if "keep" returns true. The "keep" block must take in a single argument (the "tuple").
|
23
|
+
#
|
24
|
+
# EXAMPLE
|
25
|
+
# # Filter for simple string inclusion
|
26
|
+
# stream = stream.filter { |tuple| tuple["url"].include? "hello world" }
|
27
|
+
#
|
28
|
+
# # Custom Filter
|
29
|
+
# stream = stream.filter do
|
30
|
+
# name "hello_world_filter"
|
31
|
+
# emits "hello_stream"
|
32
|
+
# keep do |tuple|
|
33
|
+
# return tuple["url"].include? "url"
|
34
|
+
# end
|
35
|
+
# end
|
1
36
|
class Zillabyte::Harness::Filter
|
2
37
|
attr_accessor :_app, :_node, :_options
|
3
38
|
|
@@ -1,3 +1,42 @@
|
|
1
|
+
# OPERATION Group By
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# The `group_by` function can be used to implement aggregate computations
|
4
|
+
# such as counting and summing. The following is a full example of a
|
5
|
+
# word count operation in Zillabyte.
|
6
|
+
#
|
7
|
+
# LANGUAGE_SYNTAX
|
8
|
+
# stream.group_by do
|
9
|
+
# name "name" \t\t\t\t\t => optional
|
10
|
+
# group_by "field_1", "field_2", ...
|
11
|
+
# emits "stream_1", "stream_2", ... \t\t => optional for single output stream
|
12
|
+
# begin_group |=block=|
|
13
|
+
# aggregate |=block=|
|
14
|
+
# end_group |=block=|
|
15
|
+
# end
|
16
|
+
# - The "begin_group", "aggregate" and "end_group" blocks can be in do...end format or {} format.
|
17
|
+
# * the "begin_group" block is where the initial values for the aggregation are set. It must take in a single argument (the "grouping tuple", which is emitted at the beginning of each group and contains the values of the fields specified in "group_by").
|
18
|
+
# * the "aggregate" block is where the aggregation is performed. It must take in a single argument (the "tuple").
|
19
|
+
# * the "end_group" block is where the final aggregated value is emitted.
|
20
|
+
# EXAMPLE
|
21
|
+
# # Declare the group_by, grouping on the :word field
|
22
|
+
# stream = stream.group_by(:word) do
|
23
|
+
#
|
24
|
+
# # Save the word being grouped, initialize any state
|
25
|
+
# begin_group do |g_tuple|
|
26
|
+
# @word = g_tuple[:word]
|
27
|
+
# @count = 0
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# # In this case, simply increment the counter associated with the word
|
31
|
+
# aggregate do |tuple|
|
32
|
+
# @count += 1
|
33
|
+
# end
|
34
|
+
#
|
35
|
+
# # Emit the grouped word, along with its count
|
36
|
+
# end_group do |g_tuple|
|
37
|
+
# emit :word => @word, :count => @count
|
38
|
+
# end
|
39
|
+
# end
|
1
40
|
class Zillabyte::Harness::GroupBy
|
2
41
|
attr_accessor :_app, :_node, :_fields, :_options
|
3
42
|
|
@@ -387,8 +387,9 @@ class Zillabyte::Harness::Helper
|
|
387
387
|
end
|
388
388
|
end
|
389
389
|
|
390
|
-
def self.check_loop_back(node_name, nodes)
|
391
|
-
ee = "Error in \"loop_back\": \n\t "
|
390
|
+
def self.check_loop_back(stream, node_name, nodes)
|
391
|
+
ee = "Error in \"loop_back\" to \"#{node_name}\": \n\t "
|
392
|
+
previous_node_name = stream._previous_node_name
|
392
393
|
found = false
|
393
394
|
nodes.each do |node|
|
394
395
|
if node._name == node_name
|
@@ -398,6 +399,12 @@ class Zillabyte::Harness::Helper
|
|
398
399
|
Zillabyte::Harness::Helper.print_error(msg)
|
399
400
|
end
|
400
401
|
end
|
402
|
+
if node._name == previous_node_name
|
403
|
+
if node._emits.size < 2
|
404
|
+
msg = "#{ee}The preceding operation does not emit multiple streams. Please make sure that it emits one stream for the loop back and another stream for downstream operations."
|
405
|
+
Zillabyte::Harness::Helper.print_error(msg)
|
406
|
+
end
|
407
|
+
end
|
401
408
|
end
|
402
409
|
if !found
|
403
410
|
msg = "#{ee}The specified loop-back node \"#{node_name}\" was not found in the operations preceding it."
|
@@ -1,3 +1,31 @@
|
|
1
|
+
# OPERATION Join
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# Joins allow you to combine two streams into one in a manner similar
|
4
|
+
# to traditional relational systems. Realtime tuple data from a variety
|
5
|
+
# of sources can be combined into a single stream via joins on specified
|
6
|
+
# fields. Here is an example of a `join` in Zillabyte.
|
7
|
+
# LANGUAGE_SYNTAX
|
8
|
+
# lhs_stream.join_with( rhs_stream_object, options )
|
9
|
+
# - Options should be specified as a hash. The following keys are recognized:
|
10
|
+
# Mandatory:
|
11
|
+
# * "on" -- specifies the fields to join on. The value must be a STRING or a length-2 ARRAY.
|
12
|
+
# If value = a STRING, the LH and RH join fields will both be set to this STRING.
|
13
|
+
# If value = a length-2 ARRAY, the LH join field will be set to array[0] and the RH join field will be set to array[1].
|
14
|
+
# Optional:
|
15
|
+
# * "type" -- specifies the join type. The default is :left. Options are [:inner, :outer, :left, :right]
|
16
|
+
# * "emits" -- specifies the stream to emit. A "join" may only emit a single stream.
|
17
|
+
# EXAMPLE
|
18
|
+
#
|
19
|
+
# # Left join on a shared field, in this case, an :owner_id
|
20
|
+
# joined_stream = auto_stream.join_with(repairs_stream, :on => :owner_id)
|
21
|
+
#
|
22
|
+
# # Join with budget_stream's :average on left, employee_stream's :number on right.
|
23
|
+
# joined_stream = budget_stream.join_with(
|
24
|
+
# employee_stream,
|
25
|
+
# :on => [:average, :number],
|
26
|
+
# :type => :inner
|
27
|
+
# )
|
28
|
+
#
|
1
29
|
class Zillabyte::Harness::Join
|
2
30
|
attr_accessor :_app, :_node, :_args
|
3
31
|
|
@@ -1,3 +1,26 @@
|
|
1
|
+
# OPERATION Sink
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# The `sink` is a passive operation that defines the schema of the rows that need to be saved.
|
4
|
+
# Of all the operations where a stream is consumed, only the `sink` requires a schema to be defined.
|
5
|
+
#
|
6
|
+
# LANGUAGE_SYNTAX
|
7
|
+
# stream.sink do
|
8
|
+
# name "name_of_relation"
|
9
|
+
# column "field_1", :type_1
|
10
|
+
# column "field_2", :type_2 ...
|
11
|
+
# end
|
12
|
+
# - "Sink" relation "name" must be specified as a non-empty STRING with only alphanumeric and underscore characters!
|
13
|
+
# - Field names must be non-empty STRINGS with only alphanumeric or underscore characters.
|
14
|
+
# - Field names cannot be "v[number]", "id", "confidence", "since" or "source" which are reserved Zillabyte names.
|
15
|
+
# - Field types must be SYMBOLS. The following types are allowed [:string, :integer, :float, :double, :boolean, :array, :map]
|
16
|
+
#
|
17
|
+
# EXAMPLE
|
18
|
+
# stream.sink{
|
19
|
+
# name "patent_sink"
|
20
|
+
# column "doc_number", :integer
|
21
|
+
# column "title", :string
|
22
|
+
# column "date", :string
|
23
|
+
# }
|
1
24
|
class Zillabyte::Harness::Sink
|
2
25
|
attr_accessor :_app, :_node, :_options
|
3
26
|
|
@@ -1,3 +1,60 @@
|
|
1
|
+
# OPERATION Source
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# A `source` is where the data for your app originates and is defined
|
4
|
+
# on the app object. The easiest way to stream data into a Zillabyte
|
5
|
+
# app is to use a Zillabyte dataset. A simple `source` takes in the
|
6
|
+
# name of a dataset and produces a stream object.
|
7
|
+
#
|
8
|
+
# A `source` can also use data outside of the public datasets Zillabyte
|
9
|
+
# provides. Any external data available on the web can be streamed
|
10
|
+
# into a Zillabyte app. To do so, we use the expanded syntax of a
|
11
|
+
# `source` and include the code needed to generate the `source` data
|
12
|
+
# in the `next_tuple` block. The expanded syntax also allows for
|
13
|
+
# additional customizations such as providing a `name` for the `source`
|
14
|
+
# and specifying any preparatory steps such as initializing global
|
15
|
+
# values in the `begin_cycle` block.
|
16
|
+
#
|
17
|
+
# LANGUAGE_SYNTAX
|
18
|
+
#
|
19
|
+
# Sourcing from a dataset:
|
20
|
+
# app.source("dataset_name")
|
21
|
+
# Custom source:
|
22
|
+
# app.source do
|
23
|
+
# name "name" \t\t\t\t\t => optional
|
24
|
+
# emits "stream_1", "stream_2", ... \t\t => optional for single output stream
|
25
|
+
# end_cycle_policy :null_emit OR :explicit \t => default :null_emit
|
26
|
+
# begin_cycle |=block=| \t\t\t\t => optional if no initialization needed
|
27
|
+
# next_tuple |=block=|
|
28
|
+
# end
|
29
|
+
# - The "end_cycle_policy" is used to specify when a cycle should end. Two options are available:
|
30
|
+
# * :null_emit - end the cycle when a field contains "nil" or when nothing is emitted from the "next_tuple" block.
|
31
|
+
# * :explicit - the end of a cycle is explicitly declared in the "next_tuple" block. This is done by including the "end_cycle" keyword in the "next_tuple" block, e.g. end_cycle if @queue.nil?.
|
32
|
+
# - The "begin_cycle" and "next_tuple" blocks can be in do...end format or {} format.
|
33
|
+
# * the "begin_cycle" block is where any setup is done to initialize the content and quantity of tuples emitted by the "next_tuple" block.
|
34
|
+
# * the "next_tuple" block is where the tuples are actually emitted.
|
35
|
+
# EXAMPLE
|
36
|
+
#
|
37
|
+
# # Source from the "homepages" relation
|
38
|
+
# stream = app.source("homepages")
|
39
|
+
#
|
40
|
+
# # Custom Source
|
41
|
+
# app = Zillabyte.app(name = "small_cycle")
|
42
|
+
# stream = app.source do
|
43
|
+
# DICTIONARY = %w(apple banana cherry dairy eel) unless defined? DICTIONARY
|
44
|
+
# emits "small_cycle_relation"
|
45
|
+
# end_cycle_policy :explicit
|
46
|
+
# begin_cycle do
|
47
|
+
# @count = 0
|
48
|
+
# end
|
49
|
+
#
|
50
|
+
# next_tuple do
|
51
|
+
# url = "http://#{@count}.example.com/#{@count}.html"
|
52
|
+
# html = "<html>#{DICTIONARY.sample}</html>"
|
53
|
+
# emit("small_cycle_relation", {:url => url, :html => html})
|
54
|
+
# @count += 1
|
55
|
+
# end_cycle if @count==1000
|
56
|
+
# end
|
57
|
+
# end
|
1
58
|
class Zillabyte::Harness::Source
|
2
59
|
attr_accessor :_app, :_node, :_relation
|
3
60
|
|
@@ -73,9 +73,8 @@ class Zillabyte::Harness::Stream
|
|
73
73
|
def loop_back(*args, &block)
|
74
74
|
# This is not a real operation, just telling the stream to loop back to the previous operation
|
75
75
|
loop_back_node = args[0]
|
76
|
-
Zillabyte::Harness::Helper.check_loop_back(loop_back_node, @_app._nodes)
|
76
|
+
Zillabyte::Harness::Helper.check_loop_back(self, loop_back_node, @_app._nodes)
|
77
77
|
Zillabyte::Harness::Helper.write_arc_to_file({"name" => self._name, "origin" => self._previous_node_name, "dest" => loop_back_node, "loop_back" => 1}, @_app._socket)
|
78
|
-
self
|
79
78
|
end
|
80
79
|
|
81
80
|
def sink(*args, &block)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zillabyte
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.9.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- zillabyte
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 0.9.
|
33
|
+
version: 0.9.1
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 0.9.
|
40
|
+
version: 0.9.1
|
41
41
|
description: The Official Zillabyte Gem
|
42
42
|
email:
|
43
43
|
- gem@zillabyte.com
|