tros 1.7.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/.travis.yml +13 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +18 -0
- data/README.md +18 -0
- data/Rakefile +25 -0
- data/lib/tros.rb +39 -0
- data/lib/tros/data_file.rb +342 -0
- data/lib/tros/io.rb +610 -0
- data/lib/tros/ipc.rb +550 -0
- data/lib/tros/protocol.rb +161 -0
- data/lib/tros/schema.rb +405 -0
- data/lib/tros/version.rb +3 -0
- data/test/datafile_test.rb +193 -0
- data/test/fixtures/schemas/org/apache/avro/data/Json.avsc +15 -0
- data/test/fixtures/schemas/org/apache/avro/ipc/HandshakeRequest.avsc +11 -0
- data/test/fixtures/schemas/org/apache/avro/ipc/HandshakeResponse.avsc +15 -0
- data/test/fixtures/schemas/org/apache/avro/ipc/trace/avroTrace.avdl +68 -0
- data/test/fixtures/schemas/org/apache/avro/ipc/trace/avroTrace.avpr +82 -0
- data/test/fixtures/schemas/org/apache/avro/mapred/tether/InputProtocol.avpr +64 -0
- data/test/fixtures/schemas/org/apache/avro/mapred/tether/OutputProtocol.avpr +82 -0
- data/test/helpers/random_data.rb +90 -0
- data/test/io_test.rb +419 -0
- data/test/protocol_test.rb +195 -0
- data/test/sample_ipc_client.rb +85 -0
- data/test/sample_ipc_http_client.rb +84 -0
- data/test/sample_ipc_http_server.rb +79 -0
- data/test/sample_ipc_server.rb +92 -0
- data/test/schema_test.rb +135 -0
- data/test/socket_transport_test.rb +40 -0
- data/test/test_helper.rb +26 -0
- data/test/tool.rb +144 -0
- data/tros.gemspec +32 -0
- metadata +137 -0
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one or more
|
2
|
+
# contributor license agreements. See the NOTICE file distributed with
|
3
|
+
# this work for additional information regarding copyright ownership.
|
4
|
+
# The ASF licenses this file to You under the Apache License, Version 2.0
|
5
|
+
# (the "License"); you may not use this file except in compliance with
|
6
|
+
# the License. You may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
source 'https://rubygems.org'
|
17
|
+
gemspec
|
data/Gemfile.lock
ADDED
data/README.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Tros [![Build Status](https://travis-ci.org/wvanbergen/tros.svg?branch=master)](https://travis-ci.org/wvanbergen/tros)
|
2
|
+
|
3
|
+
This is a cleanup fork of the Avro gem.
|
4
|
+
|
5
|
+
Reasons:
|
6
|
+
- Get rid of yajl/multi_json dependency
|
7
|
+
- Drop support for Ruby 1.8
|
8
|
+
- Add proper unicode support.
|
9
|
+
- Not being tied to the Apache Avro project's release schedule.
|
10
|
+
- Public CI.
|
11
|
+
|
12
|
+
## Usage
|
13
|
+
|
14
|
+
For now, the API is the same as the Avro API. Just replace `Avro` with `Tros`.
|
15
|
+
|
16
|
+
## Tros?
|
17
|
+
|
18
|
+
The name Tros probably only makes sense to Dutch people.
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'bundler/gem_tasks'
|
18
|
+
require 'rake/testtask'
|
19
|
+
|
20
|
+
# `rake test` runs every test/*_test.rb file with lib/ and test/ on the load path.
Rake::TestTask.new('test') do |test_task|
  test_task.libs.push('lib', 'test')
  test_task.test_files = FileList['test/*_test.rb']
end

# Bare `rake` runs the test suite.
task default: :test
|
data/lib/tros.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'json'
|
18
|
+
require 'set'
|
19
|
+
require 'digest/md5'
|
20
|
+
require 'net/http'
|
21
|
+
require 'stringio'
|
22
|
+
require 'zlib'
|
23
|
+
|
24
|
+
module Tros
  # Base class for every error raised by this library.
  class TrosError < StandardError
  end

  # Raised when a datum does not match the schema it was checked against.
  class TrosTypeError < Tros::TrosError
    # schm  - schema the datum was checked against (used in the default message).
    # datum - the offending value (used in the default message).
    # msg   - optional message; when given it replaces the default message.
    def initialize(schm=nil, datum=nil, msg=nil)
      super(msg || "Not a #{schm}: #{datum}")
    end
  end
end
|
34
|
+
|
35
|
+
require 'tros/schema'
|
36
|
+
require 'tros/io'
|
37
|
+
require 'tros/data_file'
|
38
|
+
require 'tros/protocol'
|
39
|
+
require 'tros/ipc'
|
@@ -0,0 +1,342 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'openssl'
|
18
|
+
|
19
|
+
module Tros
|
20
|
+
module DataFile
|
21
|
+
# Container format version, embedded as the last byte of MAGIC.
VERSION = 1
# Byte sequence every data file starts with: "Obj" followed by VERSION.
MAGIC = ("Obj" + [VERSION].pack('c')).freeze
MAGIC_SIZE = MAGIC.bytesize
# Length in bytes of the sync marker read and written around blocks.
SYNC_SIZE = 16
# Buffered byte count at which Writer#<< writes out the current block.
SYNC_INTERVAL = 4000 * SYNC_SIZE
# Schema of the header metadata: a map with bytes values.
META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
VALID_ENCODINGS = ['binary'].freeze # not used yet

# Raised for malformed data files and invalid DataFile arguments.
class DataFileError < TrosError; end
|
30
|
+
|
31
|
+
# Opens the data file at +file_path+ and wraps it in a Reader or Writer.
#
# file_path - path of the file to open.
# mode      - 'r' to read (default) or 'w' to write; anything else raises.
# schema    - schema source handed to Tros::Schema.parse when given.
#             Required for 'w'; for 'r' it is passed on to open_reader.
# codec     - codec specification passed on to open_writer ('w' only).
#
# With a block, yields the Reader/Writer and closes it afterwards, even if
# the block raises. Without a block the caller must close the result.
#
# Returns the Reader or Writer (already closed when a block was given).
# Raises DataFileError for an unsupported mode or a missing writer schema.
def self.open(file_path, mode='r', schema=nil, codec=nil)
  file = nil
  io = nil
  schema = Tros::Schema.parse(schema) if schema

  case mode
  when 'w'
    raise DataFileError, "Writing an Tros file requires a schema." unless schema
    file = File.open(file_path, 'wb')
    io = open_writer(file, schema, codec)
  when 'r'
    file = File.open(file_path, 'rb')
    io = open_reader(file, schema)
  else
    raise DataFileError, "Only modes 'r' and 'w' allowed. You gave #{mode.inspect}."
  end

  yield io if block_given?
  io
ensure
  if io
    io.close if block_given?
  elsif file && !file.closed?
    # Building the Reader/Writer failed after the file was opened;
    # release the handle instead of leaking it.
    file.close
  end
end
|
50
|
+
|
51
|
+
# Returns the codec registry, a Hash keyed by codec name (String).
# The registry is created on first access, so this never returns nil even
# when no codec has been registered yet.
def self.codecs
  @codecs ||= {}
end
|
54
|
+
|
55
|
+
# Adds a codec to the registry under its codec_name (converted to a String).
#
# codec - a codec object, or a Class, which is instantiated with no arguments
#         unless the class object itself responds to codec_name.
#
# Returns the registered codec object.
def self.register_codec(codec)
  @codecs ||= {}
  needs_instance = codec.is_a?(Class) && !codec.respond_to?(:codec_name)
  instance = needs_instance ? codec.new : codec
  @codecs[instance.codec_name.to_s] = instance
end
|
60
|
+
|
61
|
+
# Resolves a codec specification to a codec object.
#
# codec - nil (treated as 'null'), an object responding to both compress and
#         decompress (returned as is), a Class (instantiated with no
#         arguments), or the name of a registered codec (String or Symbol).
#
# Returns the codec object.
# Raises DataFileError when the name is not registered.
def self.get_codec(codec)
  codec ||= 'null'
  return codec if codec.respond_to?(:compress) && codec.respond_to?(:decompress)
  return codec.new if codec.is_a?(Class)

  # Tolerate a registry that was never initialised: an unknown name must
  # surface as DataFileError, not as NoMethodError on nil.
  registry = @codecs || {}
  registry.fetch(codec.to_s) do
    raise DataFileError, "Unknown codec: #{codec.inspect}"
  end
end
|
73
|
+
|
74
|
+
class << self
  private

  # Wraps an open file handle in a Writer driven by a DatumWriter for +schema+.
  def open_writer(file, schema, codec=nil)
    datum_writer = Tros::IO::DatumWriter.new(schema)
    Tros::DataFile::Writer.new(file, datum_writer, schema, codec)
  end

  # Wraps an open file handle in a Reader; +schema+ is handed to the
  # DatumReader as its second argument.
  def open_reader(file, schema)
    Tros::DataFile::Reader.new(file, Tros::IO::DatumReader.new(nil, schema))
  end
end
|
86
|
+
|
87
|
+
# Writes datums to a data file in blocks, each followed by the file's sync
# marker.
class Writer
  # Returns SYNC_SIZE random bytes for use as a file's sync marker.
  def self.generate_sync_marker
    OpenSSL::Random.random_bytes(SYNC_SIZE)
  end

  attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta, :codec
  attr_accessor :block_count

  # writer         - IO-like destination; this class calls write, tell, flush
  #                  and close on it (and seek when appending).
  # datum_writer   - serializer; this class calls write, write_data and
  #                  writers_schema= on it.
  # writers_schema - schema for a new file. When nil, the existing header is
  #                  read from +writer+ and new blocks are appended.
  # codec          - codec specification for a new file, resolved through
  #                  DataFile.get_codec; ignored when appending.
  def initialize(writer, datum_writer, writers_schema=nil, codec=nil)
    @writer = writer
    @encoder = IO::BinaryEncoder.new(@writer)
    @datum_writer = datum_writer
    # Datums are serialized into this in-memory buffer until a block is written.
    @buffer_writer = StringIO.new('', 'w')
    @buffer_encoder = IO::BinaryEncoder.new(@buffer_writer)
    @block_count = 0

    @meta = {}

    if writers_schema
      @sync_marker = Writer.generate_sync_marker
      @codec = DataFile.get_codec(codec)
      meta['tros.codec'] = @codec.codec_name.to_s
      meta['tros.schema'] = writers_schema.to_s
      datum_writer.writers_schema = writers_schema
      write_header
    else
      # open writer for reading to collect metadata
      dfr = Reader.new(writer, Tros::IO::DatumReader.new)

      # FIXME(jmhodges): collect arbitrary metadata
      # Reuse the existing file's sync marker, codec and schema.
      @sync_marker = dfr.sync_marker
      meta['tros.codec'] = dfr.meta['tros.codec']
      @codec = DataFile.get_codec(meta['tros.codec'])

      schema_from_file = dfr.meta['tros.schema']
      meta['tros.schema'] = schema_from_file
      datum_writer.writers_schema = Schema.parse(schema_from_file)

      # seek to the end of the file and prepare for writing
      writer.seek(0, 2)
    end
  end

  # Append a datum to the file. The datum is buffered; the block is written
  # out once the buffer reaches SYNC_INTERVAL bytes.
  def <<(datum)
    datum_writer.write(datum, buffer_encoder)
    self.block_count += 1

    write_block if buffer_writer.tell >= SYNC_INTERVAL
  end

  # Return the current position as a value that may be passed to
  # DataFileReader.seek(long). Forces the end of the current block,
  # emitting a synchronization marker.
  def sync
    write_block
    writer.tell
  end

  # Write out any buffered block, then flush the underlying writer.
  def flush
    write_block
    writer.flush
  end

  # Flush pending data and close the underlying writer. The writer is closed
  # even when the final flush raises, so the handle is never left open; the
  # flush error still propagates.
  def close
    flushed = false
    flush
    flushed = true
    writer.close
  ensure
    writer.close unless flushed
  end

  private

  # Write the file header: magic bytes, metadata map, sync marker.
  def write_header
    writer.write(MAGIC)
    datum_writer.write_data(META_SCHEMA, meta, encoder)
    writer.write(sync_marker)
  end

  # Write the buffered datums as one block and reset the buffer.
  # Does nothing when no datum has been buffered.
  #
  # TODO(jmhodges): make a schema for blocks and use datum_writer
  # TODO(jmhodges): do we really need the number of items in the block?
  def write_block
    return unless block_count > 0

    # write number of items in block and block size in bytes
    encoder.write_long(block_count)
    to_write = codec.compress(buffer_writer.string)
    encoder.write_long(to_write.bytesize)

    # write block contents, then the sync marker
    writer.write(to_write)
    writer.write(sync_marker)

    # reset buffer
    buffer_writer.truncate(0)
    buffer_writer.rewind
    self.block_count = 0
  end
end
|
199
|
+
|
200
|
+
# Read files written by DataFileWriter
|
201
|
+
# Reads the datums stored in a data file produced by Writer.
class Reader
  include ::Enumerable

  # The reader and binary decoder for the raw file stream
  attr_reader :reader, :decoder

  # The binary decoder for the contents of a block (after codec decompression)
  attr_reader :block_decoder

  # NOTE: file_length is exposed but never assigned inside this class.
  attr_reader :datum_reader, :sync_marker, :meta, :file_length, :codec
  attr_accessor :block_count # records remaining in current block

  # reader       - seekable IO-like source; this class calls read, seek,
  #                eof? and close on it.
  # datum_reader - deserializer; this class calls read, read_data and
  #                writers_schema= on it.
  #
  # Reads the header immediately, so construction raises DataFileError when
  # the magic bytes are missing or wrong.
  def initialize(reader, datum_reader)
    @reader = reader
    @decoder = IO::BinaryDecoder.new(reader)
    @datum_reader = datum_reader

    # read the header: magic, meta, sync
    read_header

    @codec = DataFile.get_codec(meta['tros.codec'])

    # get ready to read
    @block_count = 0
    datum_reader.writers_schema = Schema.parse meta['tros.schema']
  end

  # Iterates through each datum in this file, starting at the current read
  # position. Returns an Enumerator when called without a block.
  # TODO(jmhodges): handle block of length zero
  def each
    return enum_for(:each) unless block_given?

    loop do
      if block_count == 0
        break if eof?
        # A sync marker may be the last thing in the file.
        break if skip_sync && eof?
        read_block_header
      end

      datum = datum_reader.read(block_decoder)
      self.block_count -= 1
      yield(datum)
    end
  end

  def eof?; reader.eof?; end

  def close
    reader.close
  end

  private

  # Reads magic bytes, metadata and sync marker from the start of the file.
  # Raises DataFileError when the magic bytes are missing, short or wrong.
  def read_header
    # seek to the beginning of the file to get magic block
    reader.seek(0, 0)

    # read returns nil at end of file, so an empty file must be treated as
    # "shorter than the magic block" rather than dereferenced.
    magic_in_file = reader.read(MAGIC_SIZE)
    if magic_in_file.nil? || magic_in_file.bytesize < MAGIC_SIZE
      msg = 'Not an Tros data file: shorter than the Tros magic block'
      raise DataFileError, msg
    elsif magic_in_file != MAGIC
      msg = "Not an Tros data file: #{magic_in_file.inspect} doesn't match #{MAGIC.inspect}"
      raise DataFileError, msg
    end

    # read metadata
    @meta = datum_reader.read_data(META_SCHEMA,
                                   META_SCHEMA,
                                   decoder)
    # read sync marker
    @sync_marker = reader.read(SYNC_SIZE)
  end

  # Reads the item count and byte size of the next block, then decompresses
  # the block into block_decoder.
  def read_block_header
    self.block_count = decoder.read_long
    block_bytes = decoder.read_long
    data = codec.decompress(reader.read(block_bytes))
    @block_decoder = IO::BinaryDecoder.new(StringIO.new(data))
  end

  # Reads SYNC_SIZE bytes; returns true when they equal the sync marker.
  # Otherwise seeks back to where we started and returns false.
  def skip_sync
    proposed_sync_marker = reader.read(SYNC_SIZE)
    return true if proposed_sync_marker == sync_marker

    # Rewind only over the bytes actually consumed: near the end of the
    # stream fewer than SYNC_SIZE bytes (or none) may have been read.
    reader.seek(-proposed_sync_marker.bytesize, 1) if proposed_sync_marker
    false
  end
end
|
298
|
+
|
299
|
+
|
300
|
+
# Pass-through codec: block data is returned exactly as given.
class NullCodec
  # Name under which this codec is registered.
  def codec_name
    'null'
  end

  # Returns +data+ unchanged.
  def compress(data)
    data
  end

  # Returns +data+ unchanged.
  def decompress(data)
    data
  end
end
|
305
|
+
|
306
|
+
# Codec that stores block data as a raw DEFLATE stream.
class DeflateCodec
  attr_reader :level

  # level - zlib compression level handed to Zlib::Deflate.new.
  def initialize(level=Zlib::DEFAULT_COMPRESSION)
    @level = level
  end

  def codec_name; 'deflate'; end

  # Inflates a raw DEFLATE stream and returns the decompressed bytes.
  def decompress(compressed)
    # Passing a negative number to Inflate puts it into "raw" RFC1951 mode
    # (without the RFC1950 header & checksum). See the docs for
    # inflateInit2 in http://www.zlib.net/manual.html
    #
    # The stream is created outside the begin/ensure so that a constructor
    # failure is reported as is, rather than being masked by calling close
    # on nil.
    zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    begin
      data = zstream.inflate(compressed)
      data << zstream.finish
    ensure
      zstream.close
    end
  end

  # Deflates +data+ at the configured level and returns the raw stream.
  def compress(data)
    # Created outside the begin/ensure for the same reason as in decompress:
    # an invalid level must raise the Zlib error, not NoMethodError.
    zstream = Zlib::Deflate.new(level, -Zlib::MAX_WBITS)
    begin
      compressed = zstream.deflate(data)
      compressed << zstream.finish
    ensure
      zstream.close
    end
  end
end
|
334
|
+
|
335
|
+
# Make the built-in codecs available by name.
DataFile.register_codec(NullCodec)
DataFile.register_codec(DeflateCodec)

# TODO: this constant is a snapshot taken at load time; it is not updated
# when another codec is registered later.
# Deprecated in favor of Tros::DataFile::codecs
VALID_CODECS = DataFile.codecs.keys
|
341
|
+
end
|
342
|
+
end
|