avro_turf 0.8.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,21 +21,53 @@ class AvroTurf
21
21
  # 1: https://github.com/confluentinc/schema-registry
22
22
  class Messaging
23
23
  MAGIC_BYTE = [0].pack("C").freeze
24
+ DecodedMessage = Struct.new(:schema_id, :writer_schema, :reader_schema, :message)
25
+ private_constant(:DecodedMessage)
24
26
 
25
27
  # Instantiate a new Messaging instance with the given configuration.
26
28
  #
27
- # registry - A schema registry object that responds to all methods in the
28
- # AvroTurf::ConfluentSchemaRegistry interface.
29
- # registry_url - The String URL of the schema registry that should be used.
30
- # schema_store - A schema store object that responds to #find(schema_name, namespace).
31
- # schemas_path - The String file system path where local schemas are stored.
32
- # namespace - The String default schema namespace.
33
- # logger - The Logger that should be used to log information (optional).
34
- def initialize(registry: nil, registry_url: nil, schema_store: nil, schemas_path: nil, namespace: nil, logger: nil)
29
+ # registry - A schema registry object that responds to all methods in the
30
+ # AvroTurf::ConfluentSchemaRegistry interface.
31
+ # registry_url - The String URL of the schema registry that should be used.
32
+ # schema_store - A schema store object that responds to #find(schema_name, namespace).
33
+ # schemas_path - The String file system path where local schemas are stored.
34
+ # namespace - The String default schema namespace.
35
+ # logger - The Logger that should be used to log information (optional).
36
+ # proxy - Forward the request via proxy (optional).
37
+ # client_cert - Name of file containing client certificate (optional).
38
+ # client_key - Name of file containing client private key to go with client_cert (optional).
39
+ # client_key_pass - Password to go with client_key (optional).
40
+ # client_cert_data - In-memory client certificate (optional).
41
+ # client_key_data - In-memory client private key to go with client_cert_data (optional).
42
+ def initialize(
43
+ registry: nil,
44
+ registry_url: nil,
45
+ schema_store: nil,
46
+ schemas_path: nil,
47
+ namespace: nil,
48
+ logger: nil,
49
+ proxy: nil,
50
+ client_cert: nil,
51
+ client_key: nil,
52
+ client_key_pass: nil,
53
+ client_cert_data: nil,
54
+ client_key_data: nil
55
+ )
35
56
  @logger = logger || Logger.new($stderr)
36
57
  @namespace = namespace
37
58
  @schema_store = schema_store || SchemaStore.new(path: schemas_path || DEFAULT_SCHEMAS_PATH)
38
- @registry = registry || CachedConfluentSchemaRegistry.new(ConfluentSchemaRegistry.new(registry_url, logger: @logger))
59
+ @registry = registry || CachedConfluentSchemaRegistry.new(
60
+ ConfluentSchemaRegistry.new(
61
+ registry_url,
62
+ logger: @logger,
63
+ proxy: proxy,
64
+ client_cert: client_cert,
65
+ client_key: client_key,
66
+ client_key_pass: client_key_pass,
67
+ client_cert_data: client_cert_data,
68
+ client_key_data: client_key_data
69
+ )
70
+ )
39
71
  @schemas_by_id = {}
40
72
  end
41
73
 
@@ -46,14 +78,24 @@ class AvroTurf
46
78
  # schema_name - The String name of the schema that should be used to encode
47
79
  # the data.
48
80
  # namespace - The namespace of the schema (optional).
81
+ # subject - The subject name the schema should be registered under in
82
+ # the schema registry (optional).
83
+ # version - The integer version of the schema that should be used to encode
84
+ # the data. Only used together with subject (optional).
85
+ # schema_id - The integer id of the schema that should be used to encode
86
+ # the data (optional).
49
87
  #
50
88
  # Returns the encoded data as a String.
51
- def encode(message, schema_name: nil, namespace: @namespace, subject: nil)
52
- schema = @schema_store.find(schema_name, namespace)
53
-
54
- # Schemas are registered under the full name of the top level Avro record
55
- # type, or `subject` if it's provided.
56
- schema_id = @registry.register(subject || schema.fullname, schema)
89
+ def encode(message, schema_name: nil, namespace: @namespace, subject: nil, version: nil, schema_id: nil)
90
+ schema_id, schema = if schema_id
91
+ fetch_schema_by_id(schema_id)
92
+ elsif subject && version
93
+ fetch_schema(subject, version)
94
+ elsif schema_name
95
+ register_schema(subject, schema_name, namespace)
96
+ else
97
+ raise ArgumentError.new('Neither schema_name nor schema_id nor subject + version provided to determine the schema.')
98
+ end
57
99
 
58
100
  stream = StringIO.new
59
101
  writer = Avro::IO::DatumWriter.new(schema)
@@ -69,6 +111,12 @@ class AvroTurf
69
111
  writer.write(message, encoder)
70
112
 
71
113
  stream.string
114
+ rescue Excon::Error::NotFound
115
+ if schema_id
116
+ raise SchemaNotFoundError.new("Schema with id: #{schema_id} is not found on registry")
117
+ else
118
+ raise SchemaNotFoundError.new("Schema with subject: `#{subject}` version: `#{version}` is not found on registry")
119
+ end
72
120
  end
73
121
 
74
122
  # Decodes data into the original message.
@@ -80,6 +128,20 @@ class AvroTurf
80
128
  #
81
129
  # Returns the decoded message.
82
130
  def decode(data, schema_name: nil, namespace: @namespace)
131
+ decode_message(data, schema_name: schema_name, namespace: namespace).message
132
+ end
133
+
134
+ # Decodes data into the original message.
135
+ #
136
+ # data - A String containing encoded data.
137
+ # schema_name - The String name of the schema that should be used to decode
138
+ # the data. Must match the schema used when encoding (optional).
139
+ # namespace - The namespace of the schema (optional).
140
+ #
141
+ # Returns a Struct with the following attributes:
142
+ # schema_id - The integer id of schema used to encode the message
143
+ # message - The decoded message
144
+ def decode_message(data, schema_name: nil, namespace: @namespace)
83
145
  readers_schema = schema_name && @schema_store.find(schema_name, namespace)
84
146
  stream = StringIO.new(data)
85
147
  decoder = Avro::IO::BinaryDecoder.new(stream)
@@ -100,7 +162,38 @@ class AvroTurf
100
162
  end
101
163
 
102
164
  reader = Avro::IO::DatumReader.new(writers_schema, readers_schema)
103
- reader.read(decoder)
165
+ message = reader.read(decoder)
166
+
167
+ DecodedMessage.new(schema_id, writers_schema, readers_schema, message)
168
+ rescue Excon::Error::NotFound
169
+ raise SchemaNotFoundError.new("Schema with id: #{schema_id} is not found on registry")
170
+ end
171
+
172
+ private
173
+
174
+ # Providing subject and version to determine the schema,
175
+ # which skips the automatic registration of the schema on the schema registry.
176
+ # Fetch the schema from registry with the provided subject name and version.
177
+ def fetch_schema(subject, version)
178
+ schema_data = @registry.subject_version(subject, version)
179
+ schema_id = schema_data.fetch('id')
180
+ schema = Avro::Schema.parse(schema_data.fetch('schema'))
181
+ [schema_id, schema]
182
+ end
183
+
184
+ # Fetch the schema from registry with the provided schema_id.
185
+ def fetch_schema_by_id(schema_id)
186
+ schema_json = @registry.fetch(schema_id)
187
+ schema = Avro::Schema.parse(schema_json)
188
+ [schema_id, schema]
189
+ end
190
+
191
+ # Schemas are registered under the full name of the top level Avro record
192
+ # type, or `subject` if it's provided.
193
+ def register_schema(subject, schema_name, namespace)
194
+ schema = @schema_store.find(schema_name, namespace)
195
+ schema_id = @registry.register(subject || schema.fullname, schema)
196
+ [schema_id, schema]
104
197
  end
105
198
  end
106
199
  end
@@ -0,0 +1,18 @@
1
+ require 'avro_turf/schema_store'
2
+
3
+ class AvroTurf
4
+ # A schema store that allows you to add or remove schemas, and to access
5
+ # them externally.
6
+ class MutableSchemaStore < SchemaStore
7
+ attr_accessor :schemas
8
+
9
+ # @param schema_hash [Hash]
10
+ def add_schema(schema_hash)
11
+ name = schema_hash['name']
12
+ namespace = schema_hash['namespace']
13
+ full_name = Avro::Name.make_fullname(name, namespace)
14
+ return if @schemas.key?(full_name)
15
+ Avro::Schema.real_parse(schema_hash, @schemas)
16
+ end
17
+ end
18
+ end
@@ -1,7 +1,9 @@
1
1
  class AvroTurf::SchemaStore
2
+
2
3
  def initialize(path: nil)
3
4
  @path = path or raise "Please specify a schema path"
4
5
  @schemas = Hash.new
6
+ @mutex = Mutex.new
5
7
  end
6
8
 
7
9
  # Resolves and returns a schema.
@@ -11,48 +13,82 @@ class AvroTurf::SchemaStore
11
13
  # Returns an Avro::Schema.
12
14
  def find(name, namespace = nil)
13
15
  fullname = Avro::Name.make_fullname(name, namespace)
14
-
16
+ # Optimistic non-blocking read from @schemas
17
+ # There is no need to lock the resource when the schema is already loaded
15
18
  return @schemas[fullname] if @schemas.key?(fullname)
16
19
 
20
+ # Pessimistic blocking write to @schemas
21
+ @mutex.synchronize do
22
+ # Still need to check whether the schema is already loaded
23
+ return @schemas[fullname] if @schemas.key?(fullname)
24
+
25
+ load_schema!(fullname, namespace)
26
+ end
27
+ end
28
+
29
+ # Loads all schema definition files in the `schemas_dir`.
30
+ def load_schemas!
31
+ pattern = [@path, "**", "*.avsc"].join("/")
32
+
33
+ Dir.glob(pattern) do |schema_path|
34
+ # Remove the path prefix.
35
+ schema_path.sub!(/^\/?#{@path}\//, "")
36
+
37
+ # Replace `/` with `.` and chop off the file extension.
38
+ schema_name = File.basename(schema_path.tr("/", "."), ".avsc")
39
+
40
+ # Load and cache the schema.
41
+ find(schema_name)
42
+ end
43
+ end
44
+
45
+ private
46
+
47
+ # Loads a single schema.
48
+ # This method is not thread-safe; do not call it outside of a mutex synchronization routine
49
+ def load_schema!(fullname, namespace = nil, local_schemas_cache = {})
17
50
  *namespace, schema_name = fullname.split(".")
18
51
  schema_path = File.join(@path, *namespace, schema_name + ".avsc")
19
52
  schema_json = JSON.parse(File.read(schema_path))
20
- schema = Avro::Schema.real_parse(schema_json, @schemas)
21
53
 
54
+ schema = Avro::Schema.real_parse(schema_json, local_schemas_cache)
55
+
56
+ # Don't cache the parsed schema until after its fullname is validated
22
57
  if schema.respond_to?(:fullname) && schema.fullname != fullname
23
58
  raise AvroTurf::SchemaError, "expected schema `#{schema_path}' to define type `#{fullname}'"
24
59
  end
25
60
 
61
+ # Cache only this new top-level schema by its fullname. It's critical
62
+ # not to make every sub-schema resolvable at the top level here because
63
+ # multiple different avsc files may define the same sub-schema, and
64
+ # if we share the @schemas cache across all parsing contexts, the Avro
65
+ # gem will raise an Avro::SchemaParseError when parsing another avsc
66
+ # file that contains a subschema with the same fullname as one
67
+ # encountered previously in a different file:
68
+ # <Avro::SchemaParseError: The name "foo.bar" is already in use.>
69
+ # Essentially, the only schemas that should be resolvable in @schemas
70
+ # are those that have their own .avsc files on disk.
71
+ @schemas[fullname] = schema
72
+
26
73
  schema
27
74
  rescue ::Avro::SchemaParseError => e
28
75
  # This is a hack in order to figure out exactly which type was missing. The
29
76
  # Avro gem ought to provide this data directly.
30
77
  if e.to_s =~ /"([\w\.]+)" is not a schema we know about/
31
- find($1)
78
+ # Try to first resolve a referenced schema from disk.
79
+ # If this is successful, the Avro gem will have mutated the
80
+ # local_schemas_cache, adding all the new schemas it found.
81
+ load_schema!($1, nil, local_schemas_cache)
32
82
 
33
- # Re-resolve the original schema now that the dependency has been resolved.
34
- @schemas.delete(fullname)
35
- find(fullname)
83
+ # Attempt to re-parse the original schema now that the dependency
84
+ # has been resolved and use the now-updated local_schemas_cache to
85
+ # pick up where we left off.
86
+ local_schemas_cache.delete(fullname)
87
+ load_schema!(fullname, nil, local_schemas_cache)
36
88
  else
37
89
  raise
38
90
  end
39
91
  rescue Errno::ENOENT, Errno::ENAMETOOLONG
40
92
  raise AvroTurf::SchemaNotFoundError, "could not find Avro schema at `#{schema_path}'"
41
93
  end
42
-
43
- # Loads all schema definition files in the `schemas_dir`.
44
- def load_schemas!
45
- pattern = [@path, "**", "*.avsc"].join("/")
46
-
47
- Dir.glob(pattern) do |schema_path|
48
- # Remove the path prefix.
49
- schema_path.sub!(/^\/?#{@path}\//, "")
50
-
51
- # Replace `/` with `.` and chop off the file extension.
52
- schema_name = File.basename(schema_path.tr("/", "."), ".avsc")
53
-
54
- # Load and cache the schema.
55
- find(schema_name)
56
- end
57
- end
58
94
  end
@@ -34,10 +34,21 @@ class FakeConfluentSchemaRegistryServer < Sinatra::Base
34
34
  end
35
35
 
36
36
  post "/subjects/:subject/versions" do
37
- SCHEMAS << parse_schema
37
+ schema = parse_schema
38
+ ids_for_subject = SUBJECTS[params[:subject]]
39
+
40
+ schemas_for_subject =
41
+ SCHEMAS.select
42
+ .with_index { |_, i| ids_for_subject.include?(i) }
43
+
44
+ if schemas_for_subject.include?(schema)
45
+ schema_id = SCHEMAS.index(schema)
46
+ else
47
+ SCHEMAS << schema
48
+ schema_id = SCHEMAS.size - 1
49
+ SUBJECTS[params[:subject]] = SUBJECTS[params[:subject]] << schema_id
50
+ end
38
51
 
39
- schema_id = SCHEMAS.size - 1
40
- SUBJECTS[params[:subject]] = SUBJECTS[params[:subject]] << schema_id
41
52
  { id: schema_id }.to_json
42
53
  end
43
54
 
@@ -73,6 +84,7 @@ class FakeConfluentSchemaRegistryServer < Sinatra::Base
73
84
  {
74
85
  name: params[:subject],
75
86
  version: schema_ids.index(schema_id) + 1,
87
+ id: schema_id,
76
88
  schema: schema
77
89
  }.to_json
78
90
  end
@@ -1,3 +1,3 @@
1
1
  class AvroTurf
2
- VERSION = "0.8.0"
2
+ VERSION = "1.0.0"
3
3
  end
@@ -16,8 +16,9 @@ describe AvroTurf::CachedConfluentSchemaRegistry do
16
16
 
17
17
  describe "#fetch" do
18
18
  it "caches the result of fetch" do
19
+ # multiple calls return same result, with only one upstream call
19
20
  allow(upstream).to receive(:fetch).with(id).and_return(schema)
20
- registry.fetch(id)
21
+ expect(registry.fetch(id)).to eq(schema)
21
22
  expect(registry.fetch(id)).to eq(schema)
22
23
  expect(upstream).to have_received(:fetch).exactly(1).times
23
24
  end
@@ -27,13 +28,34 @@ describe AvroTurf::CachedConfluentSchemaRegistry do
27
28
  let(:subject_name) { "a_subject" }
28
29
 
29
30
  it "caches the result of register" do
31
+ # multiple calls return same result, with only one upstream call
30
32
  allow(upstream).to receive(:register).with(subject_name, schema).and_return(id)
31
- registry.register(subject_name, schema)
33
+ expect(registry.register(subject_name, schema)).to eq(id)
32
34
  expect(registry.register(subject_name, schema)).to eq(id)
33
35
  expect(upstream).to have_received(:register).exactly(1).times
34
36
  end
35
37
  end
36
38
 
39
+ describe '#subject_version' do
40
+ let(:subject_name) { 'a_subject' }
41
+ let(:version) { 1 }
42
+ let(:schema_with_meta) do
43
+ {
44
+ subject: subject_name,
45
+ id: 1,
46
+ version: 1,
47
+ schema: schema
48
+ }
49
+ end
50
+
51
+ it 'caches the result of subject_version' do
52
+ allow(upstream).to receive(:subject_version).with(subject_name, version).and_return(schema_with_meta)
53
+ registry.subject_version(subject_name, version)
54
+ registry.subject_version(subject_name, version)
55
+ expect(upstream).to have_received(:subject_version).exactly(1).times
56
+ end
57
+ end
58
+
37
59
  it_behaves_like "a confluent schema registry client" do
38
60
  let(:upstream) { AvroTurf::ConfluentSchemaRegistry.new(registry_url, logger: logger) }
39
61
  let(:registry) { described_class.new(upstream) }
@@ -3,7 +3,19 @@ require 'avro_turf/confluent_schema_registry'
3
3
  require 'avro_turf/test/fake_confluent_schema_registry_server'
4
4
 
5
5
  describe AvroTurf::ConfluentSchemaRegistry do
6
+ let(:client_cert) { "test client cert" }
7
+ let(:client_key) { "test client key" }
8
+ let(:client_key_pass) { "test client key password" }
9
+
6
10
  it_behaves_like "a confluent schema registry client" do
7
- let(:registry) { described_class.new(registry_url, logger: logger) }
11
+ let(:registry) {
12
+ described_class.new(
13
+ registry_url,
14
+ logger: logger,
15
+ client_cert: client_cert,
16
+ client_key: client_key,
17
+ client_key_pass: client_key_pass
18
+ )
19
+ }
8
20
  end
9
21
  end
@@ -0,0 +1,159 @@
1
+ require 'webmock/rspec'
2
+ require 'avro_turf/cached_confluent_schema_registry'
3
+ require 'avro_turf/test/fake_confluent_schema_registry_server'
4
+
5
+ describe AvroTurf::CachedConfluentSchemaRegistry do
6
+ let(:upstream) { instance_double(AvroTurf::ConfluentSchemaRegistry) }
7
+ let(:cache) { AvroTurf::DiskCache.new("spec/cache")}
8
+ let(:registry) { described_class.new(upstream, cache: cache) }
9
+ let(:id) { rand(999) }
10
+ let(:schema) do
11
+ {
12
+ type: "record",
13
+ name: "person",
14
+ fields: [{ name: "name", type: "string" }]
15
+ }.to_json
16
+ end
17
+
18
+ let(:city_id) { rand(999) }
19
+ let(:city_schema) do
20
+ {
21
+ type: "record",
22
+ name: "city",
23
+ fields: [{ name: "name", type: "string" }]
24
+ }.to_json
25
+ end
26
+
27
+ let(:subject) { 'subject' }
28
+ let(:version) { rand(999) }
29
+ let(:subject_version_schema) do
30
+ {
31
+ subject: subject,
32
+ version: version,
33
+ id: id,
34
+ schema: {
35
+ type: "record",
36
+ name: "city",
37
+ fields: { name: "name", type: "string" }
38
+ }
39
+ }.to_json
40
+ end
41
+
42
+ before do
43
+ FileUtils.mkdir_p("spec/cache")
44
+ end
45
+
46
+ describe "#fetch" do
47
+ let(:cache_before) do
48
+ {
49
+ "#{id}" => "#{schema}"
50
+ }
51
+ end
52
+ let(:cache_after) do
53
+ {
54
+ "#{id}" => "#{schema}",
55
+ "#{city_id}" => "#{city_schema}"
56
+ }
57
+ end
58
+
59
+ # setup the disk cache to avoid performing the upstream fetch
60
+ before do
61
+ store_cache("schemas_by_id.json", cache_before)
62
+ end
63
+
64
+ it "uses preloaded disk cache" do
65
+ # multiple calls return same result, with zero upstream calls
66
+ allow(upstream).to receive(:fetch).with(id).and_return(schema)
67
+ expect(registry.fetch(id)).to eq(schema)
68
+ expect(registry.fetch(id)).to eq(schema)
69
+ expect(upstream).to have_received(:fetch).exactly(0).times
70
+ expect(load_cache("schemas_by_id.json")).to eq cache_before
71
+ end
72
+
73
+ it "writes thru to disk cache" do
74
+ # multiple calls return same result, with only one upstream call
75
+ allow(upstream).to receive(:fetch).with(city_id).and_return(city_schema)
76
+ expect(registry.fetch(city_id)).to eq(city_schema)
77
+ expect(registry.fetch(city_id)).to eq(city_schema)
78
+ expect(upstream).to have_received(:fetch).exactly(1).times
79
+ expect(load_cache("schemas_by_id.json")).to eq cache_after
80
+ end
81
+ end
82
+
83
+ describe "#register" do
84
+ let(:subject_name) { "a_subject" }
85
+ let(:cache_before) do
86
+ {
87
+ "#{subject_name}#{schema}" => id
88
+ }
89
+ end
90
+
91
+ let(:city_name) { "a_city" }
92
+ let(:cache_after) do
93
+ {
94
+ "#{subject_name}#{schema}" => id,
95
+ "#{city_name}#{city_schema}" => city_id
96
+ }
97
+ end
98
+
99
+ # setup the disk cache to avoid performing the upstream register
100
+ before do
101
+ store_cache("ids_by_schema.json", cache_before)
102
+ end
103
+
104
+ it "uses preloaded disk cache" do
105
+ # multiple calls return same result, with zero upstream calls
106
+ allow(upstream).to receive(:register).with(subject_name, schema).and_return(id)
107
+ expect(registry.register(subject_name, schema)).to eq(id)
108
+ expect(registry.register(subject_name, schema)).to eq(id)
109
+ expect(upstream).to have_received(:register).exactly(0).times
110
+ expect(load_cache("ids_by_schema.json")).to eq cache_before
111
+ end
112
+
113
+ it "writes thru to disk cache" do
114
+ # multiple calls return same result, with only one upstream call
115
+ allow(upstream).to receive(:register).with(city_name, city_schema).and_return(city_id)
116
+ expect(registry.register(city_name, city_schema)).to eq(city_id)
117
+ expect(registry.register(city_name, city_schema)).to eq(city_id)
118
+ expect(upstream).to have_received(:register).exactly(1).times
119
+ expect(load_cache("ids_by_schema.json")).to eq cache_after
120
+ end
121
+ end
122
+
123
+ describe "#subject_version" do
124
+ it "writes thru to disk cache" do
125
+ # multiple calls return same result, with only one upstream call
126
+ allow(upstream).to receive(:subject_version).with(subject, version).and_return(subject_version_schema)
127
+ expect(File).not_to exist("./spec/cache/schemas_by_subject_version.json")
128
+
129
+ expect(registry.subject_version(subject, version)).to eq(subject_version_schema)
130
+
131
+ json = JSON.parse(File.read("./spec/cache/schemas_by_subject_version.json"))["#{subject}#{version}"]
132
+ expect(json).to eq(subject_version_schema)
133
+
134
+ expect(registry.subject_version(subject, version)).to eq(subject_version_schema)
135
+ expect(upstream).to have_received(:subject_version).exactly(1).times
136
+ end
137
+
138
+ it "reads from disk cache and populates mem cache" do
139
+ allow(upstream).to receive(:subject_version).with(subject, version).and_return(subject_version_schema)
140
+ key = "#{subject}#{version}"
141
+ hash = {key => subject_version_schema}
142
+ cache.send(:write_to_disk_cache, "./spec/cache/schemas_by_subject_version.json", hash)
143
+
144
+ cached_schema = cache.instance_variable_get(:@schemas_by_subject_version)
145
+ expect(cached_schema).to eq({})
146
+
147
+ expect(registry.subject_version(subject, version)).to eq(subject_version_schema)
148
+ expect(upstream).to have_received(:subject_version).exactly(0).times
149
+
150
+ cached_schema = cache.instance_variable_get(:@schemas_by_subject_version)
151
+ expect(cached_schema).to eq({key => subject_version_schema})
152
+ end
153
+ end
154
+
155
+ it_behaves_like "a confluent schema registry client" do
156
+ let(:upstream) { AvroTurf::ConfluentSchemaRegistry.new(registry_url, logger: logger) }
157
+ let(:registry) { described_class.new(upstream) }
158
+ end
159
+ end