avro_turf 0.7.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +36 -0
  3. data/.github/workflows/ruby.yml +20 -0
  4. data/CHANGELOG.md +29 -0
  5. data/Gemfile +0 -3
  6. data/README.md +54 -16
  7. data/avro_turf.gemspec +13 -2
  8. data/lib/avro_turf.rb +14 -3
  9. data/lib/avro_turf/cached_confluent_schema_registry.rb +39 -0
  10. data/lib/avro_turf/cached_schema_registry.rb +4 -24
  11. data/lib/avro_turf/confluent_schema_registry.rb +106 -0
  12. data/lib/avro_turf/disk_cache.rb +83 -0
  13. data/lib/avro_turf/in_memory_cache.rb +38 -0
  14. data/lib/avro_turf/messaging.rb +77 -9
  15. data/lib/avro_turf/mutable_schema_store.rb +18 -0
  16. data/lib/avro_turf/schema_registry.rb +4 -77
  17. data/lib/avro_turf/schema_store.rb +36 -19
  18. data/lib/avro_turf/schema_to_avro_patch.rb +11 -0
  19. data/lib/avro_turf/test/fake_confluent_schema_registry_server.rb +141 -0
  20. data/lib/avro_turf/test/fake_schema_registry_server.rb +4 -82
  21. data/lib/avro_turf/version.rb +1 -1
  22. data/spec/cached_confluent_schema_registry_spec.rb +63 -0
  23. data/spec/confluent_schema_registry_spec.rb +9 -0
  24. data/spec/disk_cached_confluent_schema_registry_spec.rb +159 -0
  25. data/spec/messaging_spec.rb +208 -19
  26. data/spec/schema_store_spec.rb +36 -0
  27. data/spec/schema_to_avro_patch_spec.rb +42 -0
  28. data/spec/spec_helper.rb +8 -0
  29. data/spec/support/{schema_registry_context.rb → confluent_schema_registry_context.rb} +72 -8
  30. data/spec/test/fake_confluent_schema_registry_server_spec.rb +40 -0
  31. metadata +49 -16
  32. data/circle.yml +0 -4
  33. data/spec/cached_schema_registry_spec.rb +0 -41
  34. data/spec/schema_registry_spec.rb +0 -9
@@ -0,0 +1,83 @@
1
+ # A cache for the CachedConfluentSchemaRegistry.
2
+ # Extends the InMemoryCache to provide a write-thru to disk for persistent cache.
3
+ class AvroTurf::DiskCache < AvroTurf::InMemoryCache
4
+
5
+ def initialize(disk_path)
6
+ super()
7
+
8
+ # load the write-thru cache on startup, if it exists
9
+ @schemas_by_id_path = File.join(disk_path, 'schemas_by_id.json')
10
+ @schemas_by_id = JSON.parse(File.read(@schemas_by_id_path)) if File.exist?(@schemas_by_id_path)
11
+
12
+ @ids_by_schema_path = File.join(disk_path, 'ids_by_schema.json')
13
+ @ids_by_schema = JSON.parse(File.read(@ids_by_schema_path)) if File.exist?(@ids_by_schema_path)
14
+
15
+ @schemas_by_subject_version_path = File.join(disk_path, 'schemas_by_subject_version.json')
16
+ @schemas_by_subject_version = {}
17
+ end
18
+
19
+ # override
20
+ # the write-thru cache (json) does not store keys in numeric format
21
+ # so, convert id to a string for caching purposes
22
+ def lookup_by_id(id)
23
+ super(id.to_s)
24
+ end
25
+
26
+ # override to include write-thru cache after storing result from upstream
27
+ def store_by_id(id, schema)
28
+ # must return the value from storing the result (i.e. do not return result from file write)
29
+ value = super(id.to_s, schema)
30
+ File.write(@schemas_by_id_path, JSON.pretty_generate(@schemas_by_id))
31
+ return value
32
+ end
33
+
34
+ # override to include write-thru cache after storing result from upstream
35
+ def store_by_schema(subject, schema, id)
36
+ # must return the value from storing the result (i.e. do not return result from file write)
37
+ value = super
38
+ File.write(@ids_by_schema_path, JSON.pretty_generate(@ids_by_schema))
39
+ return value
40
+ end
41
+
42
+ # checks instance var (in-memory cache) for schema
43
+ # checks disk cache if the schema is not in the in-memory cache
44
+ # if file exists but no in-memory cache, read from file and sync in-memory cache
45
+ # finally, if file doesn't exist return nil
46
+ def lookup_by_version(subject, version)
47
+ key = "#{subject}#{version}"
48
+ schema = @schemas_by_subject_version[key]
49
+
50
+ return schema unless schema.nil?
51
+
52
+ hash = JSON.parse(File.read(@schemas_by_subject_version_path)) if File.exist?(@schemas_by_subject_version_path)
53
+ if hash
54
+ @schemas_by_subject_version = hash
55
+ @schemas_by_subject_version[key]
56
+ end
57
+ end
58
+
59
+ # check if file exists and parse json into a hash
60
+ # if file exists take json and overwrite/insert schema at key
61
+ # if file doesn't exist create new hash
62
+ # write the new/updated hash to file
63
+ # update instance var (in-memory cache) to match
64
+ def store_by_version(subject, version, schema)
65
+ key = "#{subject}#{version}"
66
+ hash = JSON.parse(File.read(@schemas_by_subject_version_path)) if File.exist?(@schemas_by_subject_version_path)
67
+ hash = if hash
68
+ hash[key] = schema
69
+ hash
70
+ else
71
+ {key => schema}
72
+ end
73
+
74
+ write_to_disk_cache(@schemas_by_subject_version_path, hash)
75
+
76
+ @schemas_by_subject_version = hash
77
+ @schemas_by_subject_version[key]
78
+ end
79
+
80
+ private def write_to_disk_cache(path, hash)
81
+ File.write(path, JSON.pretty_generate(hash))
82
+ end
83
+ end
@@ -0,0 +1,38 @@
1
+ # A cache for the CachedConfluentSchemaRegistry.
2
+ # Simply stores the schemas and ids in in-memory hashes.
3
+ class AvroTurf::InMemoryCache
4
+
5
+ def initialize
6
+ @schemas_by_id = {}
7
+ @ids_by_schema = {}
8
+ @schema_by_subject_version = {}
9
+ end
10
+
11
+ def lookup_by_id(id)
12
+ @schemas_by_id[id]
13
+ end
14
+
15
+ def store_by_id(id, schema)
16
+ @schemas_by_id[id] = schema
17
+ end
18
+
19
+ def lookup_by_schema(subject, schema)
20
+ key = subject + schema.to_s
21
+ @ids_by_schema[key]
22
+ end
23
+
24
+ def store_by_schema(subject, schema, id)
25
+ key = subject + schema.to_s
26
+ @ids_by_schema[key] = id
27
+ end
28
+
29
+ def lookup_by_version(subject, version)
30
+ key = "#{subject}#{version}"
31
+ @schema_by_subject_version[key]
32
+ end
33
+
34
+ def store_by_version(subject, version, schema)
35
+ key = "#{subject}#{version}"
36
+ @schema_by_subject_version[key] = schema
37
+ end
38
+ end
@@ -1,6 +1,11 @@
1
1
  require 'logger'
2
2
  require 'avro_turf'
3
3
  require 'avro_turf/schema_store'
4
+ require 'avro_turf/confluent_schema_registry'
5
+ require 'avro_turf/cached_confluent_schema_registry'
6
+
7
+ # For back-compatibility require the aliases along with the Messaging API.
8
+ # These names are deprecated and will be removed in a future release.
4
9
  require 'avro_turf/schema_registry'
5
10
  require 'avro_turf/cached_schema_registry'
6
11
 
@@ -16,11 +21,13 @@ class AvroTurf
16
21
  # 1: https://github.com/confluentinc/schema-registry
17
22
  class Messaging
18
23
  MAGIC_BYTE = [0].pack("C").freeze
24
+ DecodedMessage = Struct.new(:schema_id, :writer_schema, :reader_schema, :message)
25
+ private_constant(:DecodedMessage)
19
26
 
20
27
  # Instantiate a new Messaging instance with the given configuration.
21
28
  #
22
29
  # registry - A schema registry object that responds to all methods in the
23
- # AvroTurf::SchemaRegistry interface.
30
+ # AvroTurf::ConfluentSchemaRegistry interface.
24
31
  # registry_url - The String URL of the schema registry that should be used.
25
32
  # schema_store - A schema store object that responds to #find(schema_name, namespace).
26
33
  # schemas_path - The String file system path where local schemas are stored.
@@ -30,7 +37,7 @@ class AvroTurf
30
37
  @logger = logger || Logger.new($stderr)
31
38
  @namespace = namespace
32
39
  @schema_store = schema_store || SchemaStore.new(path: schemas_path || DEFAULT_SCHEMAS_PATH)
33
- @registry = registry || CachedSchemaRegistry.new(SchemaRegistry.new(registry_url, logger: @logger))
40
+ @registry = registry || CachedConfluentSchemaRegistry.new(ConfluentSchemaRegistry.new(registry_url, logger: @logger))
34
41
  @schemas_by_id = {}
35
42
  end
36
43
 
@@ -41,14 +48,24 @@ class AvroTurf
41
48
  # schema_name - The String name of the schema that should be used to encode
42
49
  # the data.
43
50
  # namespace - The namespace of the schema (optional).
51
+ # subject - The subject name the schema should be registered under in
52
+ # the schema registry (optional).
53
+ # version - The integer version of the schema that should be used to encode
54
+ # the data. Only used together with `subject` (optional).
55
+ # schema_id - The integer id of the schema that should be used to encode
56
+ # the data (optional).
44
57
  #
45
58
  # Returns the encoded data as a String.
46
- def encode(message, schema_name: nil, namespace: @namespace)
47
- schema = @schema_store.find(schema_name, namespace)
48
-
49
- # Schemas are registered under the full name of the top level Avro record
50
- # type.
51
- schema_id = @registry.register(schema.fullname, schema)
59
+ def encode(message, schema_name: nil, namespace: @namespace, subject: nil, version: nil, schema_id: nil)
60
+ schema_id, schema = if schema_id
61
+ fetch_schema_by_id(schema_id)
62
+ elsif subject && version
63
+ fetch_schema(subject, version)
64
+ elsif schema_name
65
+ register_schema(subject, schema_name, namespace)
66
+ else
67
+ raise ArgumentError.new('Neither schema_name nor schema_id nor subject + version provided to determine the schema.')
68
+ end
52
69
 
53
70
  stream = StringIO.new
54
71
  writer = Avro::IO::DatumWriter.new(schema)
@@ -64,6 +81,12 @@ class AvroTurf
64
81
  writer.write(message, encoder)
65
82
 
66
83
  stream.string
84
+ rescue Excon::Error::NotFound
85
+ if schema_id
86
+ raise SchemaNotFoundError.new("Schema with id: #{schema_id} is not found on registry")
87
+ else
88
+ raise SchemaNotFoundError.new("Schema with subject: `#{subject}` version: `#{version}` is not found on registry")
89
+ end
67
90
  end
68
91
 
69
92
  # Decodes data into the original message.
@@ -75,6 +98,20 @@ class AvroTurf
75
98
  #
76
99
  # Returns the decoded message.
77
100
  def decode(data, schema_name: nil, namespace: @namespace)
101
+ decode_message(data, schema_name: schema_name, namespace: namespace).message
102
+ end
103
+
104
+ # Decodes data into the original message.
105
+ #
106
+ # data - A String containing encoded data.
107
+ # schema_name - The String name of the schema that should be used to decode
108
+ # the data. Must match the schema used when encoding (optional).
109
+ # namespace - The namespace of the schema (optional).
110
+ #
111
+ # Returns a Struct with the following attributes:
112
+ # schema_id - The integer id of schema used to encode the message
113
+ # message - The decoded message
114
+ def decode_message(data, schema_name: nil, namespace: @namespace)
78
115
  readers_schema = schema_name && @schema_store.find(schema_name, namespace)
79
116
  stream = StringIO.new(data)
80
117
  decoder = Avro::IO::BinaryDecoder.new(stream)
@@ -95,7 +132,38 @@ class AvroTurf
95
132
  end
96
133
 
97
134
  reader = Avro::IO::DatumReader.new(writers_schema, readers_schema)
98
- reader.read(decoder)
135
+ message = reader.read(decoder)
136
+
137
+ DecodedMessage.new(schema_id, writers_schema, readers_schema, message)
138
+ rescue Excon::Error::NotFound
139
+ raise SchemaNotFoundError.new("Schema with id: #{schema_id} is not found on registry")
140
+ end
141
+
142
+ private
143
+
144
+ # Providing subject and version to determine the schema,
145
+ # which skips the auto-registration of the schema on the schema registry.
146
+ # Fetch the schema from registry with the provided subject name and version.
147
+ def fetch_schema(subject, version)
148
+ schema_data = @registry.subject_version(subject, version)
149
+ schema_id = schema_data.fetch('id')
150
+ schema = Avro::Schema.parse(schema_data.fetch('schema'))
151
+ [schema_id, schema]
152
+ end
153
+
154
+ # Fetch the schema from registry with the provided schema_id.
155
+ def fetch_schema_by_id(schema_id)
156
+ schema_json = @registry.fetch(schema_id)
157
+ schema = Avro::Schema.parse(schema_json)
158
+ [schema_id, schema]
159
+ end
160
+
161
+ # Schemas are registered under the full name of the top level Avro record
162
+ # type, or `subject` if it's provided.
163
+ def register_schema(subject, schema_name, namespace)
164
+ schema = @schema_store.find(schema_name, namespace)
165
+ schema_id = @registry.register(subject || schema.fullname, schema)
166
+ [schema_id, schema]
99
167
  end
100
168
  end
101
169
  end
@@ -0,0 +1,18 @@
1
+ require 'avro_turf/schema_store'
2
+
3
+ class AvroTurf
4
+ # A schema store that allows you to add or remove schemas, and to access
5
+ # them externally.
6
+ class MutableSchemaStore < SchemaStore
7
+ attr_accessor :schemas
8
+
9
+ # @param schema_hash [Hash]
10
+ def add_schema(schema_hash)
11
+ name = schema_hash['name']
12
+ namespace = schema_hash['namespace']
13
+ full_name = Avro::Name.make_fullname(name, namespace)
14
+ return if @schemas.key?(full_name)
15
+ Avro::Schema.real_parse(schema_hash, @schemas)
16
+ end
17
+ end
18
+ end
@@ -1,79 +1,6 @@
1
- require 'excon'
1
+ require 'avro_turf/confluent_schema_registry'
2
2
 
3
- class AvroTurf::SchemaRegistry
4
- CONTENT_TYPE = "application/vnd.schemaregistry.v1+json".freeze
3
+ # AvroTurf::SchemaRegistry is deprecated and will be removed in a future release.
4
+ # Use AvroTurf::ConfluentSchemaRegistry instead.
5
5
 
6
- def initialize(url, logger: Logger.new($stdout))
7
- @logger = logger
8
- @connection = Excon.new(url, headers: {
9
- "Content-Type" => CONTENT_TYPE,
10
- })
11
- end
12
-
13
- def fetch(id)
14
- @logger.info "Fetching schema with id #{id}"
15
- data = get("/schemas/ids/#{id}")
16
- data.fetch("schema")
17
- end
18
-
19
- def register(subject, schema)
20
- data = post("/subjects/#{subject}/versions", body: {
21
- schema: schema.to_s
22
- }.to_json)
23
-
24
- id = data.fetch("id")
25
-
26
- @logger.info "Registered schema for subject `#{subject}`; id = #{id}"
27
-
28
- id
29
- end
30
-
31
- # List all subjects
32
- def subjects
33
- get('/subjects')
34
- end
35
-
36
- # List all versions for a subject
37
- def subject_versions(subject)
38
- get("/subjects/#{subject}/versions")
39
- end
40
-
41
- # Get a specific version for a subject
42
- def subject_version(subject, version = 'latest')
43
- get("/subjects/#{subject}/versions/#{version}")
44
- end
45
-
46
- # Check if a schema exists. Returns nil if not found.
47
- def check(subject, schema)
48
- data = post("/subjects/#{subject}",
49
- expects: [200, 404],
50
- body: { schema: schema.to_s }.to_json)
51
- data unless data.has_key?("error_code")
52
- end
53
-
54
- # Check if a schema is compatible with the stored version.
55
- # Returns true if compatible, false otherwise
56
- # http://docs.confluent.io/2.0.0/schema-registry/docs/api.html#compatibility
57
- def compatible?(subject, schema, version = 'latest')
58
- data = post("/compatibility/subjects/#{subject}/versions/#{version}",
59
- expects: [200, 404],
60
- body: { schema: schema.to_s }.to_json)
61
- data.fetch('is_compatible', false) unless data.has_key?('error_code')
62
- end
63
-
64
- private
65
-
66
- def get(path, **options)
67
- request(path, method: :get, **options)
68
- end
69
-
70
- def post(path, **options)
71
- request(path, method: :post, **options)
72
- end
73
-
74
- def request(path, **options)
75
- options = { expects: 200 }.merge!(options)
76
- response = @connection.request(path: path, **options)
77
- JSON.parse(response.body)
78
- end
79
- end
6
+ AvroTurf::SchemaRegistry = AvroTurf::ConfluentSchemaRegistry
@@ -1,7 +1,9 @@
1
1
  class AvroTurf::SchemaStore
2
+
2
3
  def initialize(path: nil)
3
4
  @path = path or raise "Please specify a schema path"
4
5
  @schemas = Hash.new
6
+ @mutex = Mutex.new
5
7
  end
6
8
 
7
9
  # Resolves and returns a schema.
@@ -11,9 +13,40 @@ class AvroTurf::SchemaStore
11
13
  # Returns an Avro::Schema.
12
14
  def find(name, namespace = nil)
13
15
  fullname = Avro::Name.make_fullname(name, namespace)
14
-
16
+ # Optimistic non-blocking read from @schemas
17
+ # No need to lock the resource when the schema is already loaded
15
18
  return @schemas[fullname] if @schemas.key?(fullname)
16
19
 
20
+ # Pessimistic blocking write to @schemas
21
+ @mutex.synchronize do
22
+ # Still need to check whether the schema is already loaded
23
+ return @schemas[fullname] if @schemas.key?(fullname)
24
+
25
+ load_schema!(fullname, namespace)
26
+ end
27
+ end
28
+
29
+ # Loads all schema definition files in the `schemas_dir`.
30
+ def load_schemas!
31
+ pattern = [@path, "**", "*.avsc"].join("/")
32
+
33
+ Dir.glob(pattern) do |schema_path|
34
+ # Remove the path prefix.
35
+ schema_path.sub!(/^\/?#{@path}\//, "")
36
+
37
+ # Replace `/` with `.` and chop off the file extension.
38
+ schema_name = File.basename(schema_path.tr("/", "."), ".avsc")
39
+
40
+ # Load and cache the schema.
41
+ find(schema_name)
42
+ end
43
+ end
44
+
45
+ private
46
+
47
+ # Loads a single schema.
48
+ # This method is not thread-safe; do not call it outside of the mutex-synchronized routine
49
+ def load_schema!(fullname, namespace = nil)
17
50
  *namespace, schema_name = fullname.split(".")
18
51
  schema_path = File.join(@path, *namespace, schema_name + ".avsc")
19
52
  schema_json = JSON.parse(File.read(schema_path))
@@ -28,31 +61,15 @@ class AvroTurf::SchemaStore
28
61
  # This is a hack in order to figure out exactly which type was missing. The
29
62
  # Avro gem ought to provide this data directly.
30
63
  if e.to_s =~ /"([\w\.]+)" is not a schema we know about/
31
- find($1)
64
+ load_schema!($1)
32
65
 
33
66
  # Re-resolve the original schema now that the dependency has been resolved.
34
67
  @schemas.delete(fullname)
35
- find(fullname)
68
+ load_schema!(fullname)
36
69
  else
37
70
  raise
38
71
  end
39
72
  rescue Errno::ENOENT, Errno::ENAMETOOLONG
40
73
  raise AvroTurf::SchemaNotFoundError, "could not find Avro schema at `#{schema_path}'"
41
74
  end
42
-
43
- # Loads all schema definition files in the `schemas_dir`.
44
- def load_schemas!
45
- pattern = [@path, "**", "*.avsc"].join("/")
46
-
47
- Dir.glob(pattern) do |schema_path|
48
- # Remove the path prefix.
49
- schema_path.sub!(/^\/?#{@path}\//, "")
50
-
51
- # Replace `/` with `.` and chop off the file extension.
52
- schema_name = File.basename(schema_path.tr("/", "."), ".avsc")
53
-
54
- # Load and cache the schema.
55
- find(schema_name)
56
- end
57
- end
58
75
  end