RubyGems - avro_turf - Versions diffs - 0.7.1 → 0.10.0 - Mend

avro_turf 0.7.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +5 -5
data/.circleci/config.yml +36 -0
data/.github/workflows/ruby.yml +20 -0
data/CHANGELOG.md +29 -0
data/Gemfile +0 -3
data/README.md +54 -16
data/avro_turf.gemspec +13 -2
data/lib/avro_turf.rb +14 -3
data/lib/avro_turf/cached_confluent_schema_registry.rb +39 -0
data/lib/avro_turf/cached_schema_registry.rb +4 -24
data/lib/avro_turf/confluent_schema_registry.rb +106 -0
data/lib/avro_turf/disk_cache.rb +83 -0
data/lib/avro_turf/in_memory_cache.rb +38 -0
data/lib/avro_turf/messaging.rb +77 -9
data/lib/avro_turf/mutable_schema_store.rb +18 -0
data/lib/avro_turf/schema_registry.rb +4 -77
data/lib/avro_turf/schema_store.rb +36 -19
data/lib/avro_turf/schema_to_avro_patch.rb +11 -0
data/lib/avro_turf/test/fake_confluent_schema_registry_server.rb +141 -0
data/lib/avro_turf/test/fake_schema_registry_server.rb +4 -82
data/lib/avro_turf/version.rb +1 -1
data/spec/cached_confluent_schema_registry_spec.rb +63 -0
data/spec/confluent_schema_registry_spec.rb +9 -0
data/spec/disk_cached_confluent_schema_registry_spec.rb +159 -0
data/spec/messaging_spec.rb +208 -19
data/spec/schema_store_spec.rb +36 -0
data/spec/schema_to_avro_patch_spec.rb +42 -0
data/spec/spec_helper.rb +8 -0
data/spec/support/{schema_registry_context.rb → confluent_schema_registry_context.rb} +72 -8
data/spec/test/fake_confluent_schema_registry_server_spec.rb +40 -0
metadata +49 -16
data/circle.yml +0 -4
data/spec/cached_schema_registry_spec.rb +0 -41
data/spec/schema_registry_spec.rb +0 -9

data/lib/avro_turf/disk_cache.rb ADDED

@@ -0,0 +1,83 @@
+# A cache for the CachedConfluentSchemaRegistry.
+# Extends the InMemoryCache to provide a write-thru to disk for persistent cache.
+class AvroTurf::DiskCache < AvroTurf::InMemoryCache
+  def initialize(disk_path)
+    super()
+    # load the write-thru cache on startup, if it exists
+    @schemas_by_id_path = File.join(disk_path, 'schemas_by_id.json')
+    @schemas_by_id = JSON.parse(File.read(@schemas_by_id_path)) if File.exist?(@schemas_by_id_path)
+    @ids_by_schema_path = File.join(disk_path, 'ids_by_schema.json')
+    @ids_by_schema = JSON.parse(File.read(@ids_by_schema_path)) if File.exist?(@ids_by_schema_path)
+    @schemas_by_subject_version_path = File.join(disk_path, 'schemas_by_subject_version.json')
+    @schemas_by_subject_version = {}
+  end
+  # override
+  # the write-thru cache (json) does not store keys in numeric format
+  # so, convert id to a string for caching purposes
+  def lookup_by_id(id)
+    super(id.to_s)
+  end
+  # override to include write-thru cache after storing result from upstream
+  def store_by_id(id, schema)
+    # must return the value from storing the result (i.e. do not return result from file write)
+    value = super(id.to_s, schema)
+    File.write(@schemas_by_id_path, JSON.pretty_generate(@schemas_by_id))
+    return value
+  end
+  # override to include write-thru cache after storing result from upstream
+  def store_by_schema(subject, schema, id)
+    # must return the value from storing the result (i.e. do not return result from file write)
+    value = super
+    File.write(@ids_by_schema_path, JSON.pretty_generate(@ids_by_schema))
+    return value
+  end
+  # checks instance var (in-memory cache) for schema
+  # checks disk cache if the in-memory cache doesn't have the schema
+  # if file exists but no in-memory cache, read from file and sync in-memory cache
+  # finally, if file doesn't exist return nil
+  def lookup_by_version(subject, version)
+    key = "#{subject}#{version}"
+    schema = @schemas_by_subject_version[key]
+    return schema unless schema.nil?
+    hash = JSON.parse(File.read(@schemas_by_subject_version_path)) if File.exist?(@schemas_by_subject_version_path)
+    if hash
+      @schemas_by_subject_version = hash
+      @schemas_by_subject_version[key]
+    end
+  end
+  # check if file exists and parse json into a hash
+  # if file exists take json and overwrite/insert schema at key
+  # if file doesn't exist create new hash
+  # write the new/updated hash to file
+  # update instance var (in-memory cache) to match
+  def store_by_version(subject, version, schema)
+    key = "#{subject}#{version}"
+    hash = JSON.parse(File.read(@schemas_by_subject_version_path)) if File.exist?(@schemas_by_subject_version_path)
+    hash = if hash
+             hash[key] = schema
+             hash
+           else
+             {key => schema}
+           end
+    write_to_disk_cache(@schemas_by_subject_version_path, hash)
+    @schemas_by_subject_version = hash
+    @schemas_by_subject_version[key]
+  end
+  private def write_to_disk_cache(path, hash)
+    File.write(path, JSON.pretty_generate(hash))
+  end
+end

data/lib/avro_turf/in_memory_cache.rb ADDED

@@ -0,0 +1,38 @@
+# A cache for the CachedConfluentSchemaRegistry.
+# Simply stores the schemas and ids in in-memory hashes.
+class AvroTurf::InMemoryCache
+  def initialize
+    @schemas_by_id = {}
+    @ids_by_schema = {}
+    @schema_by_subject_version = {}
+  end
+  def lookup_by_id(id)
+    @schemas_by_id[id]
+  end
+  def store_by_id(id, schema)
+    @schemas_by_id[id] = schema
+  end
+  def lookup_by_schema(subject, schema)
+    key = subject + schema.to_s
+    @ids_by_schema[key]
+  end
+  def store_by_schema(subject, schema, id)
+    key = subject + schema.to_s
+    @ids_by_schema[key] = id
+  end
+  def lookup_by_version(subject, version)
+    key = "#{subject}#{version}"
+    @schema_by_subject_version[key]
+  end
+  def store_by_version(subject, version, schema)
+    key = "#{subject}#{version}"
+    @schema_by_subject_version[key] = schema
+  end
+end

data/lib/avro_turf/messaging.rb CHANGED

@@ -1,6 +1,11 @@
 require 'logger'
 require 'avro_turf'
 require 'avro_turf/schema_store'
+require 'avro_turf/confluent_schema_registry'
+require 'avro_turf/cached_confluent_schema_registry'
+# For back-compatibility require the aliases along with the Messaging API.
+# These names are deprecated and will be removed in a future release.
 require 'avro_turf/schema_registry'
 require 'avro_turf/cached_schema_registry'
@@ -16,11 +21,13 @@ class AvroTurf
   # 1: https://github.com/confluentinc/schema-registry
   class Messaging
     MAGIC_BYTE = [0].pack("C").freeze
+    DecodedMessage = Struct.new(:schema_id, :writer_schema, :reader_schema, :message)
+    private_constant(:DecodedMessage)
     # Instantiate a new Messaging instance with the given configuration.
     #
     # registry     - A schema registry object that responds to all methods in the
-    #                AvroTurf::SchemaRegistry interface.
+    #                AvroTurf::ConfluentSchemaRegistry interface.
     # registry_url - The String URL of the schema registry that should be used.
     # schema_store - A schema store object that responds to #find(schema_name, namespace).
     # schemas_path - The String file system path where local schemas are stored.
@@ -30,7 +37,7 @@ class AvroTurf
       @logger = logger || Logger.new($stderr)
       @namespace = namespace
       @schema_store = schema_store || SchemaStore.new(path: schemas_path || DEFAULT_SCHEMAS_PATH)
-      @registry = registry || CachedSchemaRegistry.new(SchemaRegistry.new(registry_url, logger: @logger))
+      @registry = registry || CachedConfluentSchemaRegistry.new(ConfluentSchemaRegistry.new(registry_url, logger: @logger))
       @schemas_by_id = {}
     end
@@ -41,14 +48,24 @@ class AvroTurf
     # schema_name - The String name of the schema that should be used to encode
     #               the data.
     # namespace   - The namespace of the schema (optional).
+    # subject     - The subject name the schema should be registered under in
+    #               the schema registry (optional).
+    # version     - The integer version of the schema that should be used to encode
+    #               the data. Only used together with subject (optional).
+    # schema_id   - The integer id of the schema that should be used to encode
+    #               the data (optional).
     #
     # Returns the encoded data as a String.
-    def encode(message, schema_name: nil, namespace: @namespace)
-      schema = @schema_store.find(schema_name, namespace)
-      # Schemas are registered under the full name of the top level Avro record
-      # type.
-      schema_id = @registry.register(schema.fullname, schema)
+    def encode(message, schema_name: nil, namespace: @namespace, subject: nil, version: nil, schema_id: nil)
+      schema_id, schema = if schema_id
+        fetch_schema_by_id(schema_id)
+      elsif subject && version
+        fetch_schema(subject, version)
+      elsif schema_name
+        register_schema(subject, schema_name, namespace)
+      else
+        raise ArgumentError.new('Neither schema_name nor schema_id nor subject + version provided to determine the schema.')
+      end
       stream = StringIO.new
       writer = Avro::IO::DatumWriter.new(schema)
@@ -64,6 +81,12 @@ class AvroTurf
       writer.write(message, encoder)
       stream.string
+    rescue Excon::Error::NotFound
+      if schema_id
+        raise SchemaNotFoundError.new("Schema with id: #{schema_id} is not found on registry")
+      else
+        raise SchemaNotFoundError.new("Schema with subject: `#{subject}` version: `#{version}` is not found on registry")
+      end
     end
     # Decodes data into the original message.
@@ -75,6 +98,20 @@ class AvroTurf
     #
     # Returns the decoded message.
     def decode(data, schema_name: nil, namespace: @namespace)
+      decode_message(data, schema_name: schema_name, namespace: namespace).message
+    end
+    # Decodes data into the original message.
+    #
+    # data        - A String containing encoded data.
+    # schema_name - The String name of the schema that should be used to decode
+    #               the data. Must match the schema used when encoding (optional).
+    # namespace   - The namespace of the schema (optional).
+    #
+    # Returns a Struct whose attributes include:
+    #   schema_id  - The integer id of schema used to encode the message
+    #   message    - The decoded message
+    def decode_message(data, schema_name: nil, namespace: @namespace)
       readers_schema = schema_name && @schema_store.find(schema_name, namespace)
       stream = StringIO.new(data)
       decoder = Avro::IO::BinaryDecoder.new(stream)
@@ -95,7 +132,38 @@ class AvroTurf
       end
       reader = Avro::IO::DatumReader.new(writers_schema, readers_schema)
-      reader.read(decoder)
+      message = reader.read(decoder)
+      DecodedMessage.new(schema_id, writers_schema, readers_schema, message)
+    rescue Excon::Error::NotFound
+      raise SchemaNotFoundError.new("Schema with id: #{schema_id} is not found on registry")
+    end
+    private
+    # Providing subject and version to determine the schema,
+    # which skips the auto registration of schema on the schema registry.
+    # Fetch the schema from registry with the provided subject name and version.
+    def fetch_schema(subject, version)
+      schema_data = @registry.subject_version(subject, version)
+      schema_id = schema_data.fetch('id')
+      schema = Avro::Schema.parse(schema_data.fetch('schema'))
+      [schema_id, schema]
+    end
+    # Fetch the schema from registry with the provided schema_id.
+    def fetch_schema_by_id(schema_id)
+      schema_json = @registry.fetch(schema_id)
+      schema = Avro::Schema.parse(schema_json)
+      [schema_id, schema]
+    end
+    # Schemas are registered under the full name of the top level Avro record
+    # type, or `subject` if it's provided.
+    def register_schema(subject, schema_name, namespace)
+      schema = @schema_store.find(schema_name, namespace)
+      schema_id = @registry.register(subject || schema.fullname, schema)
+      [schema_id, schema]
     end
   end
 end

data/lib/avro_turf/mutable_schema_store.rb ADDED

@@ -0,0 +1,18 @@
+require 'avro_turf/schema_store'
+class AvroTurf
+  # A schema store that allows you to add or remove schemas, and to access
+  # them externally.
+  class MutableSchemaStore < SchemaStore
+    attr_accessor :schemas
+    # @param schema_hash [Hash]
+    def add_schema(schema_hash)
+      name = schema_hash['name']
+      namespace = schema_hash['namespace']
+      full_name = Avro::Name.make_fullname(name, namespace)
+      return if @schemas.key?(full_name)
+      Avro::Schema.real_parse(schema_hash, @schemas)
+    end
+  end
+end

data/lib/avro_turf/schema_registry.rb CHANGED

@@ -1,79 +1,6 @@
-require 'excon'
+require 'avro_turf/confluent_schema_registry'
-class AvroTurf::SchemaRegistry
-  CONTENT_TYPE = "application/vnd.schemaregistry.v1+json".freeze
+# AvroTurf::SchemaRegistry is deprecated and will be removed in a future release.
+# Use AvroTurf::ConfluentSchemaRegistry instead.
-  def initialize(url, logger: Logger.new($stdout))
-    @logger = logger
-    @connection = Excon.new(url, headers: {
-      "Content-Type" => CONTENT_TYPE,
-    })
-  end
-  def fetch(id)
-    @logger.info "Fetching schema with id #{id}"
-    data = get("/schemas/ids/#{id}")
-    data.fetch("schema")
-  end
-  def register(subject, schema)
-    data = post("/subjects/#{subject}/versions", body: {
-      schema: schema.to_s
-    }.to_json)
-    id = data.fetch("id")
-    @logger.info "Registered schema for subject `#{subject}`; id = #{id}"
-    id
-  end
-  # List all subjects
-  def subjects
-    get('/subjects')
-  end
-  # List all versions for a subject
-  def subject_versions(subject)
-    get("/subjects/#{subject}/versions")
-  end
-  # Get a specific version for a subject
-  def subject_version(subject, version = 'latest')
-    get("/subjects/#{subject}/versions/#{version}")
-  end
-  # Check if a schema exists. Returns nil if not found.
-  def check(subject, schema)
-    data = post("/subjects/#{subject}",
-                expects: [200, 404],
-                body: { schema: schema.to_s }.to_json)
-    data unless data.has_key?("error_code")
-  end
-  # Check if a schema is compatible with the stored version.
-  # Returns true if compatible, false otherwise
-  # http://docs.confluent.io/2.0.0/schema-registry/docs/api.html#compatibility
-  def compatible?(subject, schema, version = 'latest')
-    data = post("/compatibility/subjects/#{subject}/versions/#{version}",
-                expects: [200, 404],
-                body: { schema: schema.to_s }.to_json)
-    data.fetch('is_compatible', false) unless data.has_key?('error_code')
-  end
-  private
-  def get(path, **options)
-    request(path, method: :get, **options)
-  end
-  def post(path, **options)
-    request(path, method: :post, **options)
-  end
-  def request(path, **options)
-    options = { expects: 200 }.merge!(options)
-    response = @connection.request(path: path, **options)
-    JSON.parse(response.body)
-  end
-end
+AvroTurf::SchemaRegistry = AvroTurf::ConfluentSchemaRegistry

data/lib/avro_turf/schema_store.rb CHANGED

@@ -1,7 +1,9 @@
 class AvroTurf::SchemaStore
   def initialize(path: nil)
     @path = path or raise "Please specify a schema path"
     @schemas = Hash.new
+    @mutex = Mutex.new
   end
   # Resolves and returns a schema.
@@ -11,9 +13,40 @@ class AvroTurf::SchemaStore
   # Returns an Avro::Schema.
   def find(name, namespace = nil)
     fullname = Avro::Name.make_fullname(name, namespace)
+    # Optimistic non-blocking read from @schemas
+    # There is no sense in locking the resource when the schema is already loaded
     return @schemas[fullname] if @schemas.key?(fullname)
+    # Pessimistic blocking write to @schemas
+    @mutex.synchronize do
+      # Still need to check whether the schema is already loaded
+      return @schemas[fullname] if @schemas.key?(fullname)
+      load_schema!(fullname, namespace)
+    end
+  end
+  # Loads all schema definition files in the `schemas_dir`.
+  def load_schemas!
+    pattern = [@path, "**", "*.avsc"].join("/")
+    Dir.glob(pattern) do |schema_path|
+      # Remove the path prefix.
+      schema_path.sub!(/^\/?#{@path}\//, "")
+      # Replace `/` with `.` and chop off the file extension.
+      schema_name = File.basename(schema_path.tr("/", "."), ".avsc")
+      # Load and cache the schema.
+      find(schema_name)
+    end
+  end
+  private
+  # Loads a single schema.
+  # This method is not thread-safe; do not call it outside of a mutex synchronization routine
+  def load_schema!(fullname, namespace = nil)
     *namespace, schema_name = fullname.split(".")
     schema_path = File.join(@path, *namespace, schema_name + ".avsc")
     schema_json = JSON.parse(File.read(schema_path))
@@ -28,31 +61,15 @@ class AvroTurf::SchemaStore
     # This is a hack in order to figure out exactly which type was missing. The
     # Avro gem ought to provide this data directly.
     if e.to_s =~ /"([\w\.]+)" is not a schema we know about/
-      find($1)
+      load_schema!($1)
       # Re-resolve the original schema now that the dependency has been resolved.
       @schemas.delete(fullname)
-      find(fullname)
+      load_schema!(fullname)
     else
       raise
     end
   rescue Errno::ENOENT, Errno::ENAMETOOLONG
     raise AvroTurf::SchemaNotFoundError, "could not find Avro schema at `#{schema_path}'"
   end
-  # Loads all schema definition files in the `schemas_dir`.
-  def load_schemas!
-    pattern = [@path, "**", "*.avsc"].join("/")
-    Dir.glob(pattern) do |schema_path|
-      # Remove the path prefix.
-      schema_path.sub!(/^\/?#{@path}\//, "")
-      # Replace `/` with `.` and chop off the file extension.
-      schema_name = File.basename(schema_path.tr("/", "."), ".avsc")
-      # Load and cache the schema.
-      find(schema_name)
-    end
-  end
 end