avro 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ v0.0.1 stuff
@@ -0,0 +1,19 @@
1
+ CHANGELOG
2
+ Manifest
3
+ Rakefile
4
+ avro.gemspec
5
+ interop/test_interop.rb
6
+ lib/avro.rb
7
+ lib/avro/collect_hash.rb
8
+ lib/avro/data_file.rb
9
+ lib/avro/io.rb
10
+ lib/avro/ipc.rb
11
+ lib/avro/protocol.rb
12
+ lib/avro/schema.rb
13
+ test/random_data.rb
14
+ test/sample_ipc_client.rb
15
+ test/sample_ipc_server.rb
16
+ test/test_help.rb
17
+ test/test_io.rb
18
+ test/test_protocol.rb
19
+ tmp/test.rb.avro
@@ -0,0 +1,59 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
require 'rubygems'
require 'echoe'

# Directory layout, anchored at this Rakefile so the paths below do not
# depend on the caller's working directory.
HERE = File.expand_path(File.dirname(__FILE__))
SHARE = HERE + '/../../share'
SCHEMAS = SHARE + '/test/schemas'
BUILD = HERE + '/../../build'

# File.read closes the file for us; chomp drops a trailing newline so that
# VERSION can be interpolated into the gem file name in the :dist task.
VERSION = File.read(SHARE + '/VERSION.txt').chomp

# Gem packaging tasks (manifest, gem, ...) are generated by Echoe.
Echoe.new('avro', VERSION) do |p|
  p.author = "Apache Software Foundation"
  p.email = "avro-dev@hadoop.apache.org"
  p.summary = "Apache Avro for Ruby"
  p.description = "Apache Avro is a data serialization and RPC format"
  p.url = "http://hadoop.apache.org/avro/"
  p.runtime_dependencies = %w[yajl-ruby]
end

# rake interop: run the tests under interop/
Rake::TestTask.new(:interop) do |t|
  t.pattern = 'interop/test*.rb'
end

# rake generate_interop: write two random datums that conform to the shared
# interop schema into build/interop/data/ruby.avro. The random generator is
# seeded from ENV['SEED'].
task :generate_interop do
  $:.unshift(HERE + '/lib')
  $:.unshift(HERE + '/test')
  require 'avro'
  require 'random_data'

  schema = Avro::Schema.parse(File.read(SCHEMAS + '/interop.avsc'))
  r = RandomData.new(schema, ENV['SEED'])

  target = BUILD + '/interop/data/ruby.avro'
  mkdir_p File.dirname(target)

  # Avro data files are binary, so open in binary mode.
  f = File.open(target, 'wb')
  writer = nil
  begin
    writer = Avro::DataFile::Writer.new(f, Avro::IO::DatumWriter.new(schema), schema)
    writer << r.next
    writer << r.next
  ensure
    # Writer#close also closes the underlying file; if the writer could not
    # be constructed, close the file handle directly so it does not leak.
    writer ? writer.close : f.close
  end
end

# rake dist: build the gem and copy it into the top-level dist directory.
task :dist => [:manifest, :gem] do
  mkdir_p "../../dist/ruby"
  cp "pkg/avro-#{VERSION}.gem", "../../dist/ruby"
end
@@ -0,0 +1,34 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for avro 1.3.0.
Gem::Specification.new do |s|
  s.name = %q{avro}
  s.version = "1.3.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Apache Software Foundation"]
  s.date = %q{2010-03-01}
  s.description = %q{Apache Avro is a data serialization and RPC format}
  s.email = %q{avro-dev@hadoop.apache.org}
  s.extra_rdoc_files = ["CHANGELOG", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb"]
  s.files = ["CHANGELOG", "Manifest", "Rakefile", "avro.gemspec", "interop/test_interop.rb", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb", "test/random_data.rb", "test/sample_ipc_client.rb", "test/sample_ipc_server.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "tmp/test.rb.avro"]
  s.homepage = %q{http://hadoop.apache.org/avro/}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Avro"]
  s.require_paths = ["lib"]
  # Guarded like required_rubygems_version= above, so the spec still loads
  # on a RubyGems that does not provide this setter.
  s.rubyforge_project = %q{avro} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Apache Avro for Ruby}
  s.test_files = ["test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb"]

  s.specification_version = 3 if s.respond_to? :specification_version

  # Feature-detect add_runtime_dependency instead of comparing against
  # Gem::RubyGemsVersion, a constant that current RubyGems no longer defines.
  if s.respond_to? :add_runtime_dependency
    s.add_runtime_dependency(%q<yajl-ruby>, [">= 0"])
  else
    s.add_dependency(%q<yajl-ruby>, [">= 0"])
  end
end
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env ruby
2
+ # Licensed to the Apache Software Foundation (ASF) under one
3
+ # or more contributor license agreements. See the NOTICE file
4
+ # distributed with this work for additional information
5
+ # regarding copyright ownership. The ASF licenses this file
6
+ # to you under the Apache License, Version 2.0 (the
7
+ # "License"); you may not use this file except in compliance
8
+ # with the License. You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ require 'rubygems'
19
+ require 'test/unit'
20
+ require 'avro'
21
+
22
# Defines one test per data file found in build/interop/data. Each test
# reads the file using the shared interop schema as the reader's schema and
# checks that at least one non-nil datum comes back.
class TestInterop < Test::Unit::TestCase
  HERE = File.expand_path(File.dirname(__FILE__))
  SHARE = HERE + '/../../../share'
  SCHEMAS = SHARE + '/test/schemas'

  Dir[HERE + '/../../../build/interop/data/*'].each do |fn|
    # Strip the whole ".avro" extension (dot included) so the generated
    # method is named e.g. test_read_ruby rather than "test_read_ruby.".
    define_method("test_read_#{File.basename(fn, '.avro')}") do
      projection = Avro::Schema.parse(File.read(SCHEMAS + '/interop.avsc'))

      # Avro data files are binary; open them in binary mode.
      File.open(fn, 'rb') do |f|
        r = Avro::DataFile::Reader.new(f, Avro::IO::DatumReader.new(projection))
        i = 0
        r.each do |datum|
          i += 1
          assert_not_nil datum, "nil datum from #{fn}"
        end
        assert_not_equal 0, i, "no data read in from #{fn}"
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'yajl'
18
+ require 'set'
19
+ require 'md5'
20
+
21
module Avro
  VERSION = "FIXME"

  # Base class for the errors defined in the Avro namespace.
  class AvroError < StandardError; end

  # Error for a datum that does not match a schema. Unless an explicit msg
  # is supplied, the message is built from the schema and the datum.
  class AvroTypeError < Avro::AvroError
    def initialize(schm=nil, datum=nil, msg=nil)
      super(msg || "Not a #{schm}: #{datum}")
    end
  end
end
33
+
34
+ require 'avro/collect_hash'
35
+ require 'avro/schema'
36
+ require 'avro/io'
37
+ require 'avro/data_file'
38
+ require 'avro/protocol'
39
+ require 'avro/ipc'
@@ -0,0 +1,25 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
module Enumerable
  # Builds a Hash from the [key, value] pairs the block returns for each
  # element. Elements for which the block yields a nil or false key are
  # left out; a later pair with the same key replaces an earlier one.
  def collect_hash
    inject({}) do |pairs, element|
      key, value = yield(element)
      pairs.store(key, value) if key
      pairs
    end
  end
end
@@ -0,0 +1,243 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'openssl'
18
+
19
+ module Avro
20
+ module DataFile
21
+ VERSION = 1
22
+ MAGIC = "Obj" + [VERSION].pack('c')
23
+ MAGIC_SIZE = MAGIC.size
24
+ SYNC_SIZE = 16
25
+ SYNC_INTERVAL = 1000 * SYNC_SIZE
26
+ META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
27
+ VALID_CODECS = ['null']
28
+ VALID_ENCODINGS = ['binary'] # not used yet
29
+
30
+ class DataFileError < AvroError; end
31
+
32
# Writes an Avro data file: a header (magic bytes, metadata map, sync
# marker) followed by blocks of serialized datums. Datums are buffered in
# memory and written out as a block, terminated by the sync marker, once
# the buffer reaches SYNC_INTERVAL bytes or on sync/flush/close.
class Writer
  # Returns SYNC_SIZE random bytes to use as a file's sync marker.
  def self.generate_sync_marker
    OpenSSL::Random.random_bytes(SYNC_SIZE)
  end

  attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta
  attr_accessor :block_count

  # writer::         IO-like object the file is written to
  # datum_writer::   object that serializes datums (must respond to write,
  #                  write_data and writers_schema=)
  # writers_schema:: schema for a new file. If it is not present, presume
  #                  we're appending: the existing header is read back
  #                  through writer to recover the sync marker and schema.
  def initialize(writer, datum_writer, writers_schema=nil)
    @writer = writer
    @encoder = IO::BinaryEncoder.new(@writer)
    @datum_writer = datum_writer
    # String.new gives a fresh mutable buffer (binary-encoded on Rubies that
    # track string encodings), rather than writing into a string literal.
    @buffer_writer = StringIO.new(String.new, 'w')
    @buffer_encoder = IO::BinaryEncoder.new(@buffer_writer)
    @block_count = 0

    @meta = {}

    if writers_schema
      @sync_marker = Writer.generate_sync_marker
      meta['avro.codec'] = 'null'
      meta['avro.schema'] = writers_schema.to_s
      datum_writer.writers_schema = writers_schema
      write_header
    else
      # open writer for reading to collect metadata
      dfr = Reader.new(writer, Avro::IO::DatumReader.new)

      # FIXME(jmhodges): collect arbitrary metadata
      # collect metadata
      @sync_marker = dfr.sync_marker
      meta['avro.codec'] = dfr.meta['avro.codec']

      # get schema used to write existing file
      schema_from_file = dfr.meta['avro.schema']
      meta['avro.schema'] = schema_from_file
      datum_writer.writers_schema = Schema.parse(schema_from_file)

      # seek to the end of the file (whence 2) and prepare for writing
      writer.seek(0,2)
    end
  end

  # Append a datum to the file
  def <<(datum)
    datum_writer.write(datum, buffer_encoder)
    self.block_count += 1

    # if the data to write is larger than the sync interval, write
    # the block
    if buffer_writer.tell >= SYNC_INTERVAL
      write_block
    end
  end

  # Return the current position as a value that may be passed to
  # DataFileReader.seek(long). Forces the end of the current block,
  # emitting a synchronization marker.
  def sync
    write_block
    writer.tell
  end

  # Flush the current state of the file, including metadata
  def flush
    write_block
    writer.flush
  end

  # Flush any buffered datums, then close the underlying writer.
  def close
    flush
    writer.close
  end

  private

  # Header layout: magic bytes, metadata map, sync marker.
  def write_header
    # write magic
    writer.write(MAGIC)

    # write metadata
    datum_writer.write_data(META_SCHEMA, meta, encoder)

    # write sync marker
    writer.write(sync_marker)
  end

  # Writes the buffered datums as one block (item count, size in bytes,
  # contents, sync marker) and resets the buffer. Does nothing when no
  # datums are buffered. Raises DataFileError for any codec but 'null'.
  #
  # TODO(jmhodges): make a schema for blocks and use datum_writer
  # TODO(jmhodges): do we really need the number of items in the block?
  # TODO(jmhodges): use codec when writing the block contents
  def write_block
    return unless block_count > 0

    # Check the codec before emitting anything, so that an unsupported
    # codec does not leave a half-written block header in the file.
    codec = meta['avro.codec']
    unless codec == 'null'
      raise DataFileError, "#{codec.inspect} codec is not supported"
    end

    # write number of items in block and block size in bytes
    to_write = buffer_writer.string
    encoder.write_long(block_count)
    encoder.write_long(to_write.bytesize)

    # write block contents
    writer.write(to_write)

    # write sync marker
    writer.write(sync_marker)

    # reset buffer. truncate leaves the write position where it was, so we
    # must rewind as well; otherwise the next write would pad the buffer
    # with NUL bytes up to the old position and corrupt the next block.
    buffer_writer.truncate(0)
    buffer_writer.rewind
    self.block_count = 0
  end
end
147
+
148
+ # Read files written by DataFileWriter
149
# Read files written by DataFileWriter. The header (magic, metadata, sync
# marker) is read on construction; #each then yields every datum in the
# file, block by block.
class Reader
  include ::Enumerable

  attr_reader :reader, :decoder, :datum_reader, :sync_marker, :meta, :file_length
  attr_accessor :block_count

  # reader::       seekable IO-like object positioned anywhere in the file
  # datum_reader:: object that deserializes datums (must respond to read,
  #                read_data and writers_schema=)
  #
  # Raises DataFileError if the file does not start with the Avro magic
  # bytes or names a codec that is not in VALID_CODECS.
  def initialize(reader, datum_reader)
    @reader = reader
    @decoder = IO::BinaryDecoder.new(reader)
    @datum_reader = datum_reader

    # read the header: magic, meta, sync
    read_header

    # ensure the codec is valid
    codec_from_file = meta['avro.codec']
    if codec_from_file && ! VALID_CODECS.include?(codec_from_file)
      raise DataFileError, "Unknown codec: #{codec_from_file}"
    end

    # get ready to read
    @block_count = 0
    datum_reader.writers_schema = Schema.parse meta['avro.schema']
  end

  # Iterates through each datum in this file
  def each
    loop do
      if block_count == 0
        break if eof?
        # a sync marker followed by end of file means there are no more blocks
        break if skip_sync && eof?

        block_bytes = read_block_header
        if block_count == 0
          # A block with no items: skip whatever bytes it declares and go
          # around again, so that its trailing sync marker is consumed
          # instead of being decoded as a datum.
          reader.seek(block_bytes, 1) if block_bytes > 0
          next
        end
      end

      datum = datum_reader.read(decoder)
      self.block_count -= 1
      yield(datum)
    end
  end

  def eof?; reader.eof?; end

  def close
    reader.close
  end

  private

  # Reads and validates the magic bytes, then loads the metadata map and the
  # sync marker. Raises DataFileError when the magic bytes are missing or
  # wrong.
  def read_header
    # seek to the beginning of the file to get magic block
    reader.seek(0, 0)

    # check magic number; read returns nil at end of file, so an empty
    # file is reported as "too short" rather than failing on nil
    magic_in_file = reader.read(MAGIC_SIZE)
    if magic_in_file.nil? || magic_in_file.size < MAGIC_SIZE
      msg = 'Not an Avro data file: shorter than the Avro magic block'
      raise DataFileError, msg
    elsif magic_in_file != MAGIC
      msg = "Not an Avro data file: #{magic_in_file.inspect} doesn't match #{MAGIC.inspect}"
      raise DataFileError, msg
    end

    # read metadata
    @meta = datum_reader.read_data(META_SCHEMA,
                                   META_SCHEMA,
                                   decoder)
    # read sync marker
    @sync_marker = reader.read(SYNC_SIZE)
  end

  # Reads a block header: stores the number of items in block_count and
  # returns the block's length in bytes.
  def read_block_header
    self.block_count = decoder.read_long
    decoder.read_long
  end

  # read the length of the sync marker; if it matches the sync
  # marker, return true. Otherwise, seek back to where we started
  # and return false
  def skip_sync
    proposed_sync_marker = reader.read(SYNC_SIZE)
    return true if proposed_sync_marker == sync_marker

    # Rewind by the number of bytes actually read, which is fewer than
    # SYNC_SIZE when the file ends early.
    bytes_read = proposed_sync_marker ? proposed_sync_marker.size : 0
    reader.seek(-bytes_read, 1) if bytes_read > 0
    false
  end
end
242
+ end
243
+ end