avro_turf 0.8.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -21,21 +21,53 @@ class AvroTurf
21
21
  # 1: https://github.com/confluentinc/schema-registry
22
22
  class Messaging
23
23
  MAGIC_BYTE = [0].pack("C").freeze
24
+ DecodedMessage = Struct.new(:schema_id, :writer_schema, :reader_schema, :message)
25
+ private_constant(:DecodedMessage)
24
26
 
25
27
  # Instantiate a new Messaging instance with the given configuration.
26
28
  #
27
- # registry - A schema registry object that responds to all methods in the
28
- # AvroTurf::ConfluentSchemaRegistry interface.
29
- # registry_url - The String URL of the schema registry that should be used.
30
- # schema_store - A schema store object that responds to #find(schema_name, namespace).
31
- # schemas_path - The String file system path where local schemas are stored.
32
- # namespace - The String default schema namespace.
33
- # logger - The Logger that should be used to log information (optional).
34
- def initialize(registry: nil, registry_url: nil, schema_store: nil, schemas_path: nil, namespace: nil, logger: nil)
29
+ # registry - A schema registry object that responds to all methods in the
30
+ # AvroTurf::ConfluentSchemaRegistry interface.
31
+ # registry_url - The String URL of the schema registry that should be used.
32
+ # schema_store - A schema store object that responds to #find(schema_name, namespace).
33
+ # schemas_path - The String file system path where local schemas are stored.
34
+ # namespace - The String default schema namespace.
35
+ # logger - The Logger that should be used to log information (optional).
36
+ # proxy - Forward the request via proxy (optional).
37
+ # client_cert - Name of file containing client certificate (optional).
38
+ # client_key - Name of file containing client private key to go with client_cert (optional).
39
+ # client_key_pass - Password to go with client_key (optional).
40
+ # client_cert_data - In-memory client certificate (optional).
41
+ # client_key_data - In-memory client private key to go with client_cert_data (optional).
42
+ def initialize(
43
+ registry: nil,
44
+ registry_url: nil,
45
+ schema_store: nil,
46
+ schemas_path: nil,
47
+ namespace: nil,
48
+ logger: nil,
49
+ proxy: nil,
50
+ client_cert: nil,
51
+ client_key: nil,
52
+ client_key_pass: nil,
53
+ client_cert_data: nil,
54
+ client_key_data: nil
55
+ )
35
56
  @logger = logger || Logger.new($stderr)
36
57
  @namespace = namespace
37
58
  @schema_store = schema_store || SchemaStore.new(path: schemas_path || DEFAULT_SCHEMAS_PATH)
38
- @registry = registry || CachedConfluentSchemaRegistry.new(ConfluentSchemaRegistry.new(registry_url, logger: @logger))
59
+ @registry = registry || CachedConfluentSchemaRegistry.new(
60
+ ConfluentSchemaRegistry.new(
61
+ registry_url,
62
+ logger: @logger,
63
+ proxy: proxy,
64
+ client_cert: client_cert,
65
+ client_key: client_key,
66
+ client_key_pass: client_key_pass,
67
+ client_cert_data: client_cert_data,
68
+ client_key_data: client_key_data
69
+ )
70
+ )
39
71
  @schemas_by_id = {}
40
72
  end
41
73
 
@@ -46,14 +78,24 @@ class AvroTurf
46
78
  # schema_name - The String name of the schema that should be used to encode
47
79
  # the data.
48
80
  # namespace - The namespace of the schema (optional).
81
+ # subject - The subject name the schema should be registered under in
82
+ # the schema registry (optional).
83
+ # version - The integer version of the schema that should be used to encode
84
+ # the data. Must match the schema used when encoding (optional).
85
+ # schema_id - The integer id of the schema that should be used to encode
86
+ # the data (optional).
49
87
  #
50
88
  # Returns the encoded data as a String.
51
- def encode(message, schema_name: nil, namespace: @namespace, subject: nil)
52
- schema = @schema_store.find(schema_name, namespace)
53
-
54
- # Schemas are registered under the full name of the top level Avro record
55
- # type, or `subject` if it's provided.
56
- schema_id = @registry.register(subject || schema.fullname, schema)
89
+ def encode(message, schema_name: nil, namespace: @namespace, subject: nil, version: nil, schema_id: nil)
90
+ schema_id, schema = if schema_id
91
+ fetch_schema_by_id(schema_id)
92
+ elsif subject && version
93
+ fetch_schema(subject, version)
94
+ elsif schema_name
95
+ register_schema(subject, schema_name, namespace)
96
+ else
97
+ raise ArgumentError.new('Neither schema_name nor schema_id nor subject + version provided to determine the schema.')
98
+ end
57
99
 
58
100
  stream = StringIO.new
59
101
  writer = Avro::IO::DatumWriter.new(schema)
@@ -69,6 +111,12 @@ class AvroTurf
69
111
  writer.write(message, encoder)
70
112
 
71
113
  stream.string
114
+ rescue Excon::Error::NotFound
115
+ if schema_id
116
+ raise SchemaNotFoundError.new("Schema with id: #{schema_id} is not found on registry")
117
+ else
118
+ raise SchemaNotFoundError.new("Schema with subject: `#{subject}` version: `#{version}` is not found on registry")
119
+ end
72
120
  end
73
121
 
74
122
  # Decodes data into the original message.
@@ -80,6 +128,20 @@ class AvroTurf
80
128
  #
81
129
  # Returns the decoded message.
82
130
  def decode(data, schema_name: nil, namespace: @namespace)
131
+ decode_message(data, schema_name: schema_name, namespace: namespace).message
132
+ end
133
+
134
+ # Decodes data into the original message.
135
+ #
136
+ # data - A String containing encoded data.
137
+ # schema_name - The String name of the schema that should be used to decode
138
+ # the data. Must match the schema used when encoding (optional).
139
+ # namespace - The namespace of the schema (optional).
140
+ #
141
+ # Returns a Struct with the following attributes:
142
+ # schema_id - The integer id of schema used to encode the message
143
+ # message - The decoded message
144
+ def decode_message(data, schema_name: nil, namespace: @namespace)
83
145
  readers_schema = schema_name && @schema_store.find(schema_name, namespace)
84
146
  stream = StringIO.new(data)
85
147
  decoder = Avro::IO::BinaryDecoder.new(stream)
@@ -100,7 +162,38 @@ class AvroTurf
100
162
  end
101
163
 
102
164
  reader = Avro::IO::DatumReader.new(writers_schema, readers_schema)
103
- reader.read(decoder)
165
+ message = reader.read(decoder)
166
+
167
+ DecodedMessage.new(schema_id, writers_schema, readers_schema, message)
168
+ rescue Excon::Error::NotFound
169
+ raise SchemaNotFoundError.new("Schema with id: #{schema_id} is not found on registry")
170
+ end
171
+
172
+ private
173
+
174
+ # Providing subject and version to determine the schema,
175
+ # which skips the auto registration of schema on the schema registry.
176
+ # Fetch the schema from registry with the provided subject name and version.
177
+ def fetch_schema(subject, version)
178
+ schema_data = @registry.subject_version(subject, version)
179
+ schema_id = schema_data.fetch('id')
180
+ schema = Avro::Schema.parse(schema_data.fetch('schema'))
181
+ [schema_id, schema]
182
+ end
183
+
184
+ # Fetch the schema from registry with the provided schema_id.
185
+ def fetch_schema_by_id(schema_id)
186
+ schema_json = @registry.fetch(schema_id)
187
+ schema = Avro::Schema.parse(schema_json)
188
+ [schema_id, schema]
189
+ end
190
+
191
+ # Schemas are registered under the full name of the top level Avro record
192
+ # type, or `subject` if it's provided.
193
+ def register_schema(subject, schema_name, namespace)
194
+ schema = @schema_store.find(schema_name, namespace)
195
+ schema_id = @registry.register(subject || schema.fullname, schema)
196
+ [schema_id, schema]
104
197
  end
105
198
  end
106
199
  end
@@ -3,6 +3,7 @@ class AvroTurf::SchemaStore
3
3
  def initialize(path: nil)
4
4
  @path = path or raise "Please specify a schema path"
5
5
  @schemas = Hash.new
6
+ @mutex = Mutex.new
6
7
  end
7
8
 
8
9
  # Resolves and returns a schema.
@@ -12,33 +13,17 @@ class AvroTurf::SchemaStore
12
13
  # Returns an Avro::Schema.
13
14
  def find(name, namespace = nil)
14
15
  fullname = Avro::Name.make_fullname(name, namespace)
15
-
16
+ # Optimistic non-blocking read from @schemas
17
+ # There is no point in locking the resource when the schema is already loaded
16
18
  return @schemas[fullname] if @schemas.key?(fullname)
17
19
 
18
- *namespace, schema_name = fullname.split(".")
19
- schema_path = File.join(@path, *namespace, schema_name + ".avsc")
20
- schema_json = JSON.parse(File.read(schema_path))
21
- schema = Avro::Schema.real_parse(schema_json, @schemas)
20
+ # Pessimistic blocking write to @schemas
21
+ @mutex.synchronize do
22
+ # Still need to check whether the schema is already loaded
23
+ return @schemas[fullname] if @schemas.key?(fullname)
22
24
 
23
- if schema.respond_to?(:fullname) && schema.fullname != fullname
24
- raise AvroTurf::SchemaError, "expected schema `#{schema_path}' to define type `#{fullname}'"
25
+ load_schema!(fullname, namespace)
25
26
  end
26
-
27
- schema
28
- rescue ::Avro::SchemaParseError => e
29
- # This is a hack in order to figure out exactly which type was missing. The
30
- # Avro gem ought to provide this data directly.
31
- if e.to_s =~ /"([\w\.]+)" is not a schema we know about/
32
- find($1)
33
-
34
- # Re-resolve the original schema now that the dependency has been resolved.
35
- @schemas.delete(fullname)
36
- find(fullname)
37
- else
38
- raise
39
- end
40
- rescue Errno::ENOENT, Errno::ENAMETOOLONG
41
- raise AvroTurf::SchemaNotFoundError, "could not find Avro schema at `#{schema_path}'"
42
27
  end
43
28
 
44
29
  # Loads all schema definition files in the `schemas_dir`.
@@ -57,4 +42,53 @@ class AvroTurf::SchemaStore
57
42
  end
58
43
  end
59
44
 
45
+ private
46
+
47
+ # Loads single schema
48
+ # This method is not thread-safe; do not call it outside of the mutex synchronization routine
49
+ def load_schema!(fullname, namespace = nil, local_schemas_cache = {})
50
+ *namespace, schema_name = fullname.split(".")
51
+ schema_path = File.join(@path, *namespace, schema_name + ".avsc")
52
+ schema_json = JSON.parse(File.read(schema_path))
53
+
54
+ schema = Avro::Schema.real_parse(schema_json, local_schemas_cache)
55
+
56
+ # Don't cache the parsed schema until after its fullname is validated
57
+ if schema.respond_to?(:fullname) && schema.fullname != fullname
58
+ raise AvroTurf::SchemaError, "expected schema `#{schema_path}' to define type `#{fullname}'"
59
+ end
60
+
61
+ # Cache only this new top-level schema by its fullname. It's critical
62
+ # not to make every sub-schema resolvable at the top level here because
63
+ # multiple different avsc files may define the same sub-schema, and
64
+ # if we share the @schemas cache across all parsing contexts, the Avro
65
+ # gem will raise an Avro::SchemaParseError when parsing another avsc
66
+ # file that contains a subschema with the same fullname as one
67
+ # encountered previously in a different file:
68
+ # <Avro::SchemaParseError: The name "foo.bar" is already in use.>
69
+ # Essentially, the only schemas that should be resolvable in @schemas
70
+ # are those that have their own .avsc files on disk.
71
+ @schemas[fullname] = schema
72
+
73
+ schema
74
+ rescue ::Avro::SchemaParseError => e
75
+ # This is a hack in order to figure out exactly which type was missing. The
76
+ # Avro gem ought to provide this data directly.
77
+ if e.to_s =~ /"([\w\.]+)" is not a schema we know about/
78
+ # Try to first resolve a referenced schema from disk.
79
+ # If this is successful, the Avro gem will have mutated the
80
+ # local_schemas_cache, adding all the new schemas it found.
81
+ load_schema!($1, nil, local_schemas_cache)
82
+
83
+ # Attempt to re-parse the original schema now that the dependency
84
+ # has been resolved and use the now-updated local_schemas_cache to
85
+ # pick up where we left off.
86
+ local_schemas_cache.delete(fullname)
87
+ load_schema!(fullname, nil, local_schemas_cache)
88
+ else
89
+ raise
90
+ end
91
+ rescue Errno::ENOENT, Errno::ENAMETOOLONG
92
+ raise AvroTurf::SchemaNotFoundError, "could not find Avro schema at `#{schema_path}'"
93
+ end
60
94
  end
@@ -34,10 +34,21 @@ class FakeConfluentSchemaRegistryServer < Sinatra::Base
34
34
  end
35
35
 
36
36
  post "/subjects/:subject/versions" do
37
- SCHEMAS << parse_schema
37
+ schema = parse_schema
38
+ ids_for_subject = SUBJECTS[params[:subject]]
39
+
40
+ schemas_for_subject =
41
+ SCHEMAS.select
42
+ .with_index { |_, i| ids_for_subject.include?(i) }
43
+
44
+ if schemas_for_subject.include?(schema)
45
+ schema_id = SCHEMAS.index(schema)
46
+ else
47
+ SCHEMAS << schema
48
+ schema_id = SCHEMAS.size - 1
49
+ SUBJECTS[params[:subject]] = SUBJECTS[params[:subject]] << schema_id
50
+ end
38
51
 
39
- schema_id = SCHEMAS.size - 1
40
- SUBJECTS[params[:subject]] = SUBJECTS[params[:subject]] << schema_id
41
52
  { id: schema_id }.to_json
42
53
  end
43
54
 
@@ -73,6 +84,7 @@ class FakeConfluentSchemaRegistryServer < Sinatra::Base
73
84
  {
74
85
  name: params[:subject],
75
86
  version: schema_ids.index(schema_id) + 1,
87
+ id: schema_id,
76
88
  schema: schema
77
89
  }.to_json
78
90
  end
@@ -1,3 +1,3 @@
1
1
  class AvroTurf
2
- VERSION = "0.8.1"
2
+ VERSION = "1.1.0"
3
3
  end
@@ -16,8 +16,9 @@ describe AvroTurf::CachedConfluentSchemaRegistry do
16
16
 
17
17
  describe "#fetch" do
18
18
  it "caches the result of fetch" do
19
+ # multiple calls return same result, with only one upstream call
19
20
  allow(upstream).to receive(:fetch).with(id).and_return(schema)
20
- registry.fetch(id)
21
+ expect(registry.fetch(id)).to eq(schema)
21
22
  expect(registry.fetch(id)).to eq(schema)
22
23
  expect(upstream).to have_received(:fetch).exactly(1).times
23
24
  end
@@ -27,13 +28,34 @@ describe AvroTurf::CachedConfluentSchemaRegistry do
27
28
  let(:subject_name) { "a_subject" }
28
29
 
29
30
  it "caches the result of register" do
31
+ # multiple calls return same result, with only one upstream call
30
32
  allow(upstream).to receive(:register).with(subject_name, schema).and_return(id)
31
- registry.register(subject_name, schema)
33
+ expect(registry.register(subject_name, schema)).to eq(id)
32
34
  expect(registry.register(subject_name, schema)).to eq(id)
33
35
  expect(upstream).to have_received(:register).exactly(1).times
34
36
  end
35
37
  end
36
38
 
39
+ describe '#subject_version' do
40
+ let(:subject_name) { 'a_subject' }
41
+ let(:version) { 1 }
42
+ let(:schema_with_meta) do
43
+ {
44
+ subject: subject_name,
45
+ id: 1,
46
+ version: 1,
47
+ schema: schema
48
+ }
49
+ end
50
+
51
+ it 'caches the result of subject_version' do
52
+ allow(upstream).to receive(:subject_version).with(subject_name, version).and_return(schema_with_meta)
53
+ registry.subject_version(subject_name, version)
54
+ registry.subject_version(subject_name, version)
55
+ expect(upstream).to have_received(:subject_version).exactly(1).times
56
+ end
57
+ end
58
+
37
59
  it_behaves_like "a confluent schema registry client" do
38
60
  let(:upstream) { AvroTurf::ConfluentSchemaRegistry.new(registry_url, logger: logger) }
39
61
  let(:registry) { described_class.new(upstream) }
@@ -3,7 +3,19 @@ require 'avro_turf/confluent_schema_registry'
3
3
  require 'avro_turf/test/fake_confluent_schema_registry_server'
4
4
 
5
5
  describe AvroTurf::ConfluentSchemaRegistry do
6
+ let(:client_cert) { "test client cert" }
7
+ let(:client_key) { "test client key" }
8
+ let(:client_key_pass) { "test client key password" }
9
+
6
10
  it_behaves_like "a confluent schema registry client" do
7
- let(:registry) { described_class.new(registry_url, logger: logger) }
11
+ let(:registry) {
12
+ described_class.new(
13
+ registry_url,
14
+ logger: logger,
15
+ client_cert: client_cert,
16
+ client_key: client_key,
17
+ client_key_pass: client_key_pass
18
+ )
19
+ }
8
20
  end
9
21
  end
@@ -0,0 +1,159 @@
1
+ require 'webmock/rspec'
2
+ require 'avro_turf/cached_confluent_schema_registry'
3
+ require 'avro_turf/test/fake_confluent_schema_registry_server'
4
+
5
+ describe AvroTurf::CachedConfluentSchemaRegistry do
6
+ let(:upstream) { instance_double(AvroTurf::ConfluentSchemaRegistry) }
7
+ let(:cache) { AvroTurf::DiskCache.new("spec/cache")}
8
+ let(:registry) { described_class.new(upstream, cache: cache) }
9
+ let(:id) { rand(999) }
10
+ let(:schema) do
11
+ {
12
+ type: "record",
13
+ name: "person",
14
+ fields: [{ name: "name", type: "string" }]
15
+ }.to_json
16
+ end
17
+
18
+ let(:city_id) { rand(999) }
19
+ let(:city_schema) do
20
+ {
21
+ type: "record",
22
+ name: "city",
23
+ fields: [{ name: "name", type: "string" }]
24
+ }.to_json
25
+ end
26
+
27
+ let(:subject) { 'subject' }
28
+ let(:version) { rand(999) }
29
+ let(:subject_version_schema) do
30
+ {
31
+ subject: subject,
32
+ version: version,
33
+ id: id,
34
+ schema: {
35
+ type: "record",
36
+ name: "city",
37
+ fields: { name: "name", type: "string" }
38
+ }
39
+ }.to_json
40
+ end
41
+
42
+ before do
43
+ FileUtils.mkdir_p("spec/cache")
44
+ end
45
+
46
+ describe "#fetch" do
47
+ let(:cache_before) do
48
+ {
49
+ "#{id}" => "#{schema}"
50
+ }
51
+ end
52
+ let(:cache_after) do
53
+ {
54
+ "#{id}" => "#{schema}",
55
+ "#{city_id}" => "#{city_schema}"
56
+ }
57
+ end
58
+
59
+ # setup the disk cache to avoid performing the upstream fetch
60
+ before do
61
+ store_cache("schemas_by_id.json", cache_before)
62
+ end
63
+
64
+ it "uses preloaded disk cache" do
65
+ # multiple calls return same result, with zero upstream calls
66
+ allow(upstream).to receive(:fetch).with(id).and_return(schema)
67
+ expect(registry.fetch(id)).to eq(schema)
68
+ expect(registry.fetch(id)).to eq(schema)
69
+ expect(upstream).to have_received(:fetch).exactly(0).times
70
+ expect(load_cache("schemas_by_id.json")).to eq cache_before
71
+ end
72
+
73
+ it "writes thru to disk cache" do
74
+ # multiple calls return same result, with only one upstream call
75
+ allow(upstream).to receive(:fetch).with(city_id).and_return(city_schema)
76
+ expect(registry.fetch(city_id)).to eq(city_schema)
77
+ expect(registry.fetch(city_id)).to eq(city_schema)
78
+ expect(upstream).to have_received(:fetch).exactly(1).times
79
+ expect(load_cache("schemas_by_id.json")).to eq cache_after
80
+ end
81
+ end
82
+
83
+ describe "#register" do
84
+ let(:subject_name) { "a_subject" }
85
+ let(:cache_before) do
86
+ {
87
+ "#{subject_name}#{schema}" => id
88
+ }
89
+ end
90
+
91
+ let(:city_name) { "a_city" }
92
+ let(:cache_after) do
93
+ {
94
+ "#{subject_name}#{schema}" => id,
95
+ "#{city_name}#{city_schema}" => city_id
96
+ }
97
+ end
98
+
99
+ # setup the disk cache to avoid performing the upstream register
100
+ before do
101
+ store_cache("ids_by_schema.json", cache_before)
102
+ end
103
+
104
+ it "uses preloaded disk cache" do
105
+ # multiple calls return same result, with zero upstream calls
106
+ allow(upstream).to receive(:register).with(subject_name, schema).and_return(id)
107
+ expect(registry.register(subject_name, schema)).to eq(id)
108
+ expect(registry.register(subject_name, schema)).to eq(id)
109
+ expect(upstream).to have_received(:register).exactly(0).times
110
+ expect(load_cache("ids_by_schema.json")).to eq cache_before
111
+ end
112
+
113
+ it "writes thru to disk cache" do
114
+ # multiple calls return same result, with only one upstream call
115
+ allow(upstream).to receive(:register).with(city_name, city_schema).and_return(city_id)
116
+ expect(registry.register(city_name, city_schema)).to eq(city_id)
117
+ expect(registry.register(city_name, city_schema)).to eq(city_id)
118
+ expect(upstream).to have_received(:register).exactly(1).times
119
+ expect(load_cache("ids_by_schema.json")).to eq cache_after
120
+ end
121
+ end
122
+
123
+ describe "#subject_version" do
124
+ it "writes thru to disk cache" do
125
+ # multiple calls return same result, with only one upstream call
126
+ allow(upstream).to receive(:subject_version).with(subject, version).and_return(subject_version_schema)
127
+ expect(File).not_to exist("./spec/cache/schemas_by_subject_version.json")
128
+
129
+ expect(registry.subject_version(subject, version)).to eq(subject_version_schema)
130
+
131
+ json = JSON.parse(File.read("./spec/cache/schemas_by_subject_version.json"))["#{subject}#{version}"]
132
+ expect(json).to eq(subject_version_schema)
133
+
134
+ expect(registry.subject_version(subject, version)).to eq(subject_version_schema)
135
+ expect(upstream).to have_received(:subject_version).exactly(1).times
136
+ end
137
+
138
+ it "reads from disk cache and populates mem cache" do
139
+ allow(upstream).to receive(:subject_version).with(subject, version).and_return(subject_version_schema)
140
+ key = "#{subject}#{version}"
141
+ hash = {key => subject_version_schema}
142
+ cache.send(:write_to_disk_cache, "./spec/cache/schemas_by_subject_version.json", hash)
143
+
144
+ cached_schema = cache.instance_variable_get(:@schemas_by_subject_version)
145
+ expect(cached_schema).to eq({})
146
+
147
+ expect(registry.subject_version(subject, version)).to eq(subject_version_schema)
148
+ expect(upstream).to have_received(:subject_version).exactly(0).times
149
+
150
+ cached_schema = cache.instance_variable_get(:@schemas_by_subject_version)
151
+ expect(cached_schema).to eq({key => subject_version_schema})
152
+ end
153
+ end
154
+
155
+ it_behaves_like "a confluent schema registry client" do
156
+ let(:upstream) { AvroTurf::ConfluentSchemaRegistry.new(registry_url, logger: logger) }
157
+ let(:registry) { described_class.new(upstream) }
158
+ end
159
+ end