zillabyte 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ruby/lib/zillabyte/harness/each.rb +50 -0
- data/ruby/lib/zillabyte/harness/filter.rb +35 -0
- data/ruby/lib/zillabyte/harness/group_by.rb +39 -0
- data/ruby/lib/zillabyte/harness/helper.rb +9 -2
- data/ruby/lib/zillabyte/harness/join.rb +28 -0
- data/ruby/lib/zillabyte/harness/sink.rb +23 -0
- data/ruby/lib/zillabyte/harness/source.rb +57 -0
- data/ruby/lib/zillabyte/harness/stream.rb +1 -2
- data/ruby/lib/zillabyte/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a43d9f7b3f9e80e5258eb7aa7eadd5e737da2098
|
4
|
+
data.tar.gz: 9cd1867ebe50c7ed22659a79c52be31023e75648
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0cf5964779b966dc2eb8ff59479cc994e14d76d56bb42738b51f2d2564609caecc5f552947daad3cea0f153d209c9efeae9f942f9cc734b8364e0a8114869f56
|
7
|
+
data.tar.gz: 0429f42873893ec3a0035a9c8abe63a75be5c6e2d961596445c1e8c783683f954857e1ed43f9579260fa5d02cde4a37ce2618db19c6d6b71e1556b1945ec309c
|
@@ -1,3 +1,53 @@
|
|
1
|
+
# OPERATION Each
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# The each block can be thought of as a map operation that runs across multiple machines.
|
4
|
+
# The rows of the web_pages dataset are distributed across our system and processed.
|
5
|
+
# The result of the each is another stream object that will contain the emitted rows.
|
6
|
+
# The input argument "tuple" contains a single row that can be accessed like a hash object.
|
7
|
+
#
|
8
|
+
#
|
9
|
+
# LANGUAGE_SYNTAX
|
10
|
+
# Simplified syntax:
|
11
|
+
#
|
12
|
+
# stream.each do |tuple|
|
13
|
+
# |=block=|
|
14
|
+
# end
|
15
|
+
# - This is equivalent to just specifying an "execute" block below.
|
16
|
+
# - Note that this syntax only works for a single output stream.
|
17
|
+
#
|
18
|
+
# Custom each:
|
19
|
+
# stream.each do
|
20
|
+
# name "name" # the name of the operation
|
21
|
+
# emits "stream_1", "stream_2" # optional for single output stream
|
22
|
+
# output_format :replace # :replace or :merge, optional, defaults to :replace
|
23
|
+
# prepare |=block=| # optional if no initialization needed
|
24
|
+
# execute |=block=|
|
25
|
+
# end
|
26
|
+
#
|
27
|
+
# - The allowed output formats are :replace and :merge.
|
28
|
+
# * :replace - discards the input tuple values and only emits the specified values. This is the default.
|
29
|
+
# * :merge - re-emits the input tuple values along with the specified values.
|
30
|
+
# - The "prepare" and "execute" blocks can be in do...end format or {} format.
|
31
|
+
# * the "prepare" block is where any setup is done to prepare for tuple processing in the "execute" block.
|
32
|
+
# * the "execute" block is where the tuples are actually processed. It must take in a single argument (the "tuple").
|
33
|
+
#
|
34
|
+
# EXAMPLE
|
35
|
+
#
|
36
|
+
# # This is a simple example which emits the url of a tuple
|
37
|
+
# s = s.each do |tuple|
|
38
|
+
# emit :url => tuple["url"]
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
#
|
42
|
+
#
|
43
|
+
# # Eaches can also be used for conditional emitting
|
44
|
+
# stream = result_stream.each{ |tuple|
|
45
|
+
#
|
46
|
+
# if tuple['html'].include? "hello_world"
|
47
|
+
# emit :url => tuple['url']
|
48
|
+
# end
|
49
|
+
#
|
50
|
+
# }
|
1
51
|
class Zillabyte::Harness::Each
|
2
52
|
attr_accessor :_app, :_node, :_options
|
3
53
|
|
@@ -1,3 +1,38 @@
|
|
1
|
+
# OPERATION Filter
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# The filter is a conditional each. Conditional expressions can be used to determine if a
|
4
|
+
# tuple received by the operation is emitted.
|
5
|
+
#
|
6
|
+
# LANGUAGE_SYNTAX
|
7
|
+
# Simplified syntax:
|
8
|
+
# stream.filter do |tuple|
|
9
|
+
# |=block=|
|
10
|
+
# end
|
11
|
+
# - The condition for keeping the tuple should be specified in the block.
|
12
|
+
# - This is equivalent to just specifying a "keep" block below.
|
13
|
+
# Custom filter:
|
14
|
+
# stream.filter do
|
15
|
+
# name "name"                          => optional
|
16
|
+
# emits "stream_1", "stream_2", ...    => optional for single output stream
|
17
|
+
# prepare |=block=|                    => optional if no initialization needed
|
18
|
+
# keep |=block=|
|
19
|
+
# end
|
20
|
+
# - The "prepare" and "keep" blocks can be in do...end format or {} format.
|
21
|
+
# * the "prepare" block is where any setup is done to prepare for tuple processing in the "keep" block.
|
22
|
+
# * tuples will pass through the filter if the "keep" block returns true. The block must take in a single argument (the "tuple").
|
23
|
+
#
|
24
|
+
# EXAMPLE
|
25
|
+
# # Filter for simple string inclusion
|
26
|
+
# stream = stream.filter { |tuple| tuple["url"].include? "hello world" }
|
27
|
+
#
|
28
|
+
# # Custom Filter
|
29
|
+
# stream = stream.filter do
|
30
|
+
# name "hello_world_filter"
|
31
|
+
# emits "hello_stream"
|
32
|
+
# keep do |tuple|
|
33
|
+
# tuple["url"].include? "url"
|
34
|
+
# end
|
35
|
+
# end
|
1
36
|
class Zillabyte::Harness::Filter
|
2
37
|
attr_accessor :_app, :_node, :_options
|
3
38
|
|
@@ -1,3 +1,42 @@
|
|
1
|
+
# OPERATION Group By
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# The `group_by` function can be used to implement aggregate computations
|
4
|
+
# such as counting and summing. The following is a full example of a
|
5
|
+
# word count operation in Zillabyte.
|
6
|
+
#
|
7
|
+
# LANGUAGE_SYNTAX
|
8
|
+
# stream.group_by do
|
9
|
+
# name "name"                          => optional
|
10
|
+
# group_by "field_1", "field_2", ...
|
11
|
+
# emits "stream_1", "stream_2", ...    => optional for single output stream
|
12
|
+
# begin_group |=block=|
|
13
|
+
# aggregate |=block=|
|
14
|
+
# end_group |=block=|
|
15
|
+
# end
|
16
|
+
# - The "begin_group", "aggregate" and "end_group" blocks can be in do...end format or {} format.
|
17
|
+
# * the "begin_group" block is where the initial values for the aggregation are set. It must take in a single argument (the "grouping tuple", which is emitted at the beginning of each group and contains the values of the fields specified in "group_by").
|
18
|
+
# * the "aggregate" block is where the aggregation is performed. It must take in a single argument (the "tuple").
|
19
|
+
# * the "end_group" block is where the final aggregated value is emitted.
|
20
|
+
# EXAMPLE
|
21
|
+
# # Declare the group_by, grouping on the :word field
|
22
|
+
# stream = stream.group_by(:word) do
|
23
|
+
#
|
24
|
+
# # Save the word being grouped, initialize any state
|
25
|
+
# begin_group do |g_tuple|
|
26
|
+
# @word = g_tuple[:word]
|
27
|
+
# @count = 0
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# # In this case, simply increment the counter associated with the word
|
31
|
+
# aggregate do |tuple|
|
32
|
+
# @count += 1
|
33
|
+
# end
|
34
|
+
#
|
35
|
+
# # Emit the grouped word, along with its count
|
36
|
+
# end_group do |g_tuple|
|
37
|
+
# emit :word => @word, :count => @count
|
38
|
+
# end
|
39
|
+
# end
|
1
40
|
class Zillabyte::Harness::GroupBy
|
2
41
|
attr_accessor :_app, :_node, :_fields, :_options
|
3
42
|
|
@@ -387,8 +387,9 @@ class Zillabyte::Harness::Helper
|
|
387
387
|
end
|
388
388
|
end
|
389
389
|
|
390
|
-
def self.check_loop_back(node_name, nodes)
|
391
|
-
ee = "Error in \"loop_back\": \n\t "
|
390
|
+
def self.check_loop_back(stream, node_name, nodes)
|
391
|
+
ee = "Error in \"loop_back\" to \"#{node_name}\": \n\t "
|
392
|
+
previous_node_name = stream._previous_node_name
|
392
393
|
found = false
|
393
394
|
nodes.each do |node|
|
394
395
|
if node._name == node_name
|
@@ -398,6 +399,12 @@ class Zillabyte::Harness::Helper
|
|
398
399
|
Zillabyte::Harness::Helper.print_error(msg)
|
399
400
|
end
|
400
401
|
end
|
402
|
+
if node._name == previous_node_name
|
403
|
+
if node._emits.size < 2
|
404
|
+
msg = "#{ee}The preceding operation does not emit multiple streams. Please make sure that it emits one stream for the loop back and another stream for downstream operations."
|
405
|
+
Zillabyte::Harness::Helper.print_error(msg)
|
406
|
+
end
|
407
|
+
end
|
401
408
|
end
|
402
409
|
if !found
|
403
410
|
msg = "#{ee}The specified loop-back node \"#{node_name}\" was not found in the operations preceding it."
|
@@ -1,3 +1,31 @@
|
|
1
|
+
# OPERATION Join
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# Joins allow you to combine two streams into one in a manner similar
|
4
|
+
# to traditional relational systems. Realtime tuple data from a variety
|
5
|
+
# of sources can be combined into a single stream via joins on specified
|
6
|
+
# fields. Here is an example of a `join` in Zillabyte.
|
7
|
+
# LANGUAGE_SYNTAX
|
8
|
+
# lhs_stream.join_with( rhs_stream_object, options )
|
9
|
+
# - Options should be specified as a hash. The following keys are recognized:
|
10
|
+
# Mandatory:
|
11
|
+
# * "on" -- specifies the fields to join on. The value must be a STRING or a length-2 ARRAY.
|
12
|
+
# If value = a STRING, the LH and RH join fields will both be set to this STRING.
|
13
|
+
# If value = a length-2 ARRAY, the LH join field will be set to array[0] and the RH join field will be set to array[1].
|
14
|
+
# Optional:
|
15
|
+
# * "type" -- specifies the join type. The default is :left. Options are [:inner, :outer, :left, :right]
|
16
|
+
# * "emits" -- specifies the stream to emit. A "join" may only emit a single stream.
|
17
|
+
# EXAMPLE
|
18
|
+
#
|
19
|
+
# # Left join on a shared field, in this case, an :owner_id
|
20
|
+
# joined_stream = auto_stream.join_with(repairs_stream, :on => :owner_id)
|
21
|
+
#
|
22
|
+
# # Join with budget_stream's :average on left, employee_stream's :number on right.
|
23
|
+
# joined_stream = budget_stream.join_with(
|
24
|
+
# employee_stream,
|
25
|
+
# :on => [:average, :number],
|
26
|
+
# :type => :inner
|
27
|
+
# )
|
28
|
+
#
|
1
29
|
class Zillabyte::Harness::Join
|
2
30
|
attr_accessor :_app, :_node, :_args
|
3
31
|
|
@@ -1,3 +1,26 @@
|
|
1
|
+
# OPERATION Sink
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# The `sink` is a passive operation that defines the schema of the rows that need to be saved.
|
4
|
+
# Of all the operations where a stream is consumed, only the `sink` requires a schema to be defined.
|
5
|
+
#
|
6
|
+
# LANGUAGE_SYNTAX
|
7
|
+
# stream.sink do
|
8
|
+
# name "name_of_relation"
|
9
|
+
# column "field_1", :type_1
|
10
|
+
# column "field_2", :type_2 ...
|
11
|
+
# end
|
12
|
+
# - "Sink" relation "name" must be specified as a non-empty STRING with only alphanumeric and underscore characters!
|
13
|
+
# - Field names must be non-empty STRINGS with only alphanumeric or underscore characters.
|
14
|
+
# - Field names cannot be "v[number]", "id", "confidence", "since" or "source" which are reserved Zillabyte names.
|
15
|
+
# - Field types must be SYMBOLS. The following types are allowed [:string, :integer, :float, :double, :boolean, :array, :map]
|
16
|
+
#
|
17
|
+
# EXAMPLE
|
18
|
+
# stream.sink{
|
19
|
+
# name "patent_sink"
|
20
|
+
# column "doc_number", :integer
|
21
|
+
# column "title", :string
|
22
|
+
# column "date", :string
|
23
|
+
# }
|
1
24
|
class Zillabyte::Harness::Sink
|
2
25
|
attr_accessor :_app, :_node, :_options
|
3
26
|
|
@@ -1,3 +1,60 @@
|
|
1
|
+
# OPERATION Source
|
2
|
+
# HIGH_LEVEL_DESCRIPTION
|
3
|
+
# A `source` is where the data for your app originates and is defined
|
4
|
+
# on the app object. The easiest way to stream data into a Zillabyte
|
5
|
+
# app is to use a Zillabyte dataset. A simple `source` takes in the
|
6
|
+
# name of a dataset and produces a stream object.
|
7
|
+
#
|
8
|
+
# A `source` can also use data outside of the public datasets Zillabyte
|
9
|
+
# provides. Any external data available on the web can be streamed
|
10
|
+
# into a Zillabyte app. To do so, we use the expanded syntax of a
|
11
|
+
# `source` and include the code needed to generate the `source` data
|
12
|
+
# in the `next_tuple` block. The expanded syntax also allows for
|
13
|
+
# additional customizations such as providing a `name` for the `source`
|
14
|
+
# and specifying any preparatory steps such as initializing global
|
15
|
+
# values in the `begin_cycle` block.
|
16
|
+
#
|
17
|
+
# LANGUAGE_SYNTAX
|
18
|
+
#
|
19
|
+
# Sourcing from a dataset:
|
20
|
+
# app.source("dataset_name")
|
21
|
+
# Custom source:
|
22
|
+
# app.source do
|
23
|
+
# name "name"                                => optional
|
24
|
+
# emits "stream_1", "stream_2", ...          => optional for single output stream
|
25
|
+
# end_cycle_policy :null_emit OR :explicit   => default :null_emit
|
26
|
+
# begin_cycle |=block=|                      => optional if no initialization needed
|
27
|
+
# next_tuple |=block=|
|
28
|
+
# end
|
29
|
+
# - The "end_cycle_policy" is used to specify when a cycle should end. Two options are available:
|
30
|
+
# * :null_emit - end the cycle when a field contains "nil" or when nothing is emitted from the "next_tuple" block.
|
31
|
+
# * :explicit - the end of a cycle is explicitly declared in the "next_tuple" block. This is done by including the "end_cycle" keyword in the "next_tuple" block, e.g. end_cycle if @queue.nil?.
|
32
|
+
# - The "begin_cycle" and "next_tuple" blocks can be in do...end format or {} format.
|
33
|
+
# * the "begin_cycle" block is where any setup is done to initialize the content and quantity of tuples emitted by the "next_tuple" block.
|
34
|
+
# * the "next_tuple" block is where the tuples are actually emitted.
|
35
|
+
# EXAMPLE
|
36
|
+
#
|
37
|
+
# # Source from the "homepages" relation
|
38
|
+
# stream = app.source("homepages")
|
39
|
+
#
|
40
|
+
# # Custom Source
|
41
|
+
# app = Zillabyte.app(name = "small_cycle")
|
42
|
+
# stream = app.source do
|
43
|
+
# DICTIONARY = %w(apple banana cherry dairy eel) unless defined? DICTIONARY
|
44
|
+
# emits "small_cycle_relation"
|
45
|
+
# end_cycle_policy :explicit
|
46
|
+
# begin_cycle do
|
47
|
+
# @count = 0
|
48
|
+
# end
|
49
|
+
#
|
50
|
+
# next_tuple do
|
51
|
+
# url = "http://#{@count}.example.com/#{@count}.html"
|
52
|
+
# html = "<html>#{DICTIONARY.sample}</html>"
|
53
|
+
# emit("small_cycle_relation", {:url => url, :html => html})
|
54
|
+
# @count += 1
|
55
|
+
# end_cycle if @count==1000
|
56
|
+
# end
|
57
|
+
# end
|
1
58
|
class Zillabyte::Harness::Source
|
2
59
|
attr_accessor :_app, :_node, :_relation
|
3
60
|
|
@@ -73,9 +73,8 @@ class Zillabyte::Harness::Stream
|
|
73
73
|
def loop_back(*args, &block)
|
74
74
|
# This is not a real operation, just telling the stream to loop back to the previous operation
|
75
75
|
loop_back_node = args[0]
|
76
|
-
Zillabyte::Harness::Helper.check_loop_back(loop_back_node, @_app._nodes)
|
76
|
+
Zillabyte::Harness::Helper.check_loop_back(self, loop_back_node, @_app._nodes)
|
77
77
|
Zillabyte::Harness::Helper.write_arc_to_file({"name" => self._name, "origin" => self._previous_node_name, "dest" => loop_back_node, "loop_back" => 1}, @_app._socket)
|
78
|
-
self
|
79
78
|
end
|
80
79
|
|
81
80
|
def sink(*args, &block)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zillabyte
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.9.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- zillabyte
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 0.9.
|
33
|
+
version: 0.9.1
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 0.9.
|
40
|
+
version: 0.9.1
|
41
41
|
description: The Official Zillabyte Gem
|
42
42
|
email:
|
43
43
|
- gem@zillabyte.com
|