PyPI - avrokit - Versions diffs - 0.0.1__py3-none-any.whl - Mend

avrokit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

avrokit/__init__.py +111 -0
avrokit/asyncio/__init__.py +11 -0
avrokit/asyncio/reader.py +48 -0
avrokit/asyncio/writer.py +48 -0
avrokit/io/__init__.py +35 -0
avrokit/io/compact.py +36 -0
avrokit/io/reader.py +84 -0
avrokit/io/schema.py +166 -0
avrokit/io/writer.py +199 -0
avrokit/py.typed +0 -0
avrokit/tools/__init__.py +31 -0
avrokit/tools/__main__.py +81 -0
avrokit/tools/base.py +30 -0
avrokit/tools/cat.py +103 -0
avrokit/tools/concat.py +174 -0
avrokit/tools/count.py +74 -0
avrokit/tools/filesort.py +141 -0
avrokit/tools/fromparquet.py +207 -0
avrokit/tools/getmeta.py +32 -0
avrokit/tools/getschema.py +32 -0
avrokit/tools/httpserver.py +269 -0
avrokit/tools/partition.py +107 -0
avrokit/tools/repair.py +237 -0
avrokit/tools/stats.py +66 -0
avrokit/tools/tojson.py +33 -0
avrokit/tools/toparquet.py +153 -0
avrokit/url/__init__.py +10 -0
avrokit/url/base.py +98 -0
avrokit/url/factory.py +34 -0
avrokit/url/file.py +108 -0
avrokit/url/google.py +168 -0
avrokit/url/http.py +122 -0
avrokit/url/s3.py +141 -0
avrokit/url/utils.py +60 -0
avrokit-0.0.1.dist-info/METADATA +485 -0
avrokit-0.0.1.dist-info/RECORD +39 -0
avrokit-0.0.1.dist-info/WHEEL +4 -0
avrokit-0.0.1.dist-info/entry_points.txt +3 -0
avrokit-0.0.1.dist-info/licenses/LICENSE +201 -0

avrokit/__init__.py ADDED Viewed

@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+"""
+Python utilities for working with Avro data files.
+Basic Usage Examples
+--------------------
+Reading Avro Files:
+    >>> from avrokit import avro_reader, avro_records, parse_url
+    >>>
+    >>> # Read all records from an Avro file
+    >>> url = parse_url('file:///path/to/data.avro', mode='rb')
+    >>> for record in avro_records(url):
+    ...     print(record)
+    >>>
+    >>> # Use the reader context manager directly
+    >>> with avro_reader(url) as reader:
+    ...     schema = reader.datum_reader.writers_schema
+    ...     for record in reader:
+    ...         print(record)
+Writing Avro Files:
+    >>> from avrokit import avro_writer, avro_schema, parse_url
+    >>>
+    >>> # Define your schema
+    >>> schema = avro_schema({
+    ...     'type': 'record',
+    ...     'name': 'User',
+    ...     'fields': [
+    ...         {'name': 'name', 'type': 'string'},
+    ...         {'name': 'age', 'type': 'int'}
+    ...     ]
+    ... })
+    >>>
+    >>> # Write records to a new file
+    >>> url = parse_url('file:///path/to/output.avro', mode='wb')
+    >>> with avro_writer(url, schema) as writer:
+    ...     writer.append({'name': 'Alice', 'age': 30})
+    ...     writer.append({'name': 'Bob', 'age': 25})
+    >>>
+    >>> # Append to an existing file
+    >>> url = parse_url('file:///path/to/output.avro', mode='ab')
+    >>> with avro_writer(url) as writer:
+    ...     writer.append({'name': 'Charlie', 'age': 35})
+Working with Remote Files (GCS, S3):
+    >>> # Google Cloud Storage
+    >>> url = parse_url('gs://bucket/path/to/data.avro', mode='rb')
+    >>> for record in avro_records(url):
+    ...     print(record)
+    >>>
+    >>> # Amazon S3
+    >>> url = parse_url('s3://bucket/path/to/data.avro', mode='rb')
+    >>> for record in avro_records(url):
+    ...     print(record)
+Reading Multiple Files:
+    >>> from avrokit import PartitionedAvroReader
+    >>>
+    >>> # Read from multiple files or glob patterns
+    >>> url = parse_url('file:///path/to/data-*.avro', mode='rb')
+    >>> with PartitionedAvroReader(url) as reader:
+    ...     for record in reader:
+    ...         print(record)
+Writing Partitioned Files:
+    >>> from avrokit import PartitionedAvroWriter
+    >>>
+    >>> url = parse_url('file:///path/to/output/', mode='wb')
+    >>> with PartitionedAvroWriter(url, schema) as writer:
+    ...     for i in range(100):
+    ...         writer.append({'name': f'User{i}', 'age': i})
+    ...         if i % 10 == 0:
+    ...             writer.roll()  # Create a new partition file
+"""
+from .url import URL, parse_url, create_url_mapping
+from .io import (
+    PartitionedAvroReader,
+    PartitionedAvroWriter,
+    TimePartitionedAvroWriter,
+    add_avro_schema_fields,
+    avro_reader,
+    avro_schema,
+    avro_writer,
+    avro_records,
+    compact_avro_data,
+    validate_avro_schema_evolution,
+)
+from .asyncio import DeferredAvroWriter, BlockingQueueAvroReader
+__all__ = [
+    "BlockingQueueAvroReader",
+    "DeferredAvroWriter",
+    "PartitionedAvroReader",
+    "PartitionedAvroWriter",
+    "TimePartitionedAvroWriter",
+    "URL",
+    "add_avro_schema_fields",
+    "avro_reader",
+    "avro_schema",
+    "avro_writer",
+    "avro_records",
+    "compact_avro_data",
+    "create_url_mapping",
+    "parse_url",
+    "validate_avro_schema_evolution",
+]

avrokit/asyncio/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+# SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+from .writer import DeferredAvroWriter
+from .reader import BlockingQueueAvroReader
+__all__ = [
+    "DeferredAvroWriter",
+    "BlockingQueueAvroReader",
+]

avrokit/asyncio/reader.py ADDED Viewed

@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+import threading
+import logging
+import queue
+from typing import Iterable
+logger = logging.getLogger(__name__)
+class BlockingQueueAvroReader:
+    """
+    Drains an iterable on a background thread and publishes each item on a queue.
+    Call :meth:`start` to begin reading, take items from :attr:`queue`, and use
+    :meth:`empty` to detect that the worker has finished and every item was taken.
+    :param data: The iterable to drain.
+    :param daemon: Whether the reader thread is created as a daemon thread.
+    """
+    def __init__(self, data: Iterable[object], daemon: bool = True) -> None:
+        self.data = data
+        self._reader_queue: queue.Queue = queue.Queue()
+        # Set once the worker has finished (or will never run); read by empty().
+        self._reader_thread_done = threading.Event()
+        # Set by stop() to ask the worker to quit before the iterable is exhausted.
+        # Kept separate from the "done" event so empty() cannot report exhaustion
+        # while the worker is still producing items.
+        self._reader_stop_requested = threading.Event()
+        self._reader_thread = threading.Thread(
+            target=self._reader_worker, daemon=daemon
+        )
+    def _reader_worker(self) -> None:
+        """Thread target: copy items from ``data`` to the queue until done or stopped."""
+        try:
+            for datum in self.data:
+                self._reader_queue.put(datum, block=True)
+                # Checked after the put so an item already pulled is never lost.
+                if self._reader_stop_requested.is_set():
+                    break
+        except Exception:
+            # logger.exception records the message together with the traceback.
+            logger.exception("Error in reader thread")
+        finally:
+            self._reader_thread_done.set()
+    @property
+    def queue(self) -> queue.Queue:
+        """The queue that receives the items read from ``data``."""
+        return self._reader_queue
+    def empty(self) -> bool:
+        """Return True when the queue is empty and the worker has finished."""
+        return self._reader_queue.empty() and self._reader_thread_done.is_set()
+    def start(self) -> None:
+        """Start the background reader thread."""
+        self._reader_thread.start()
+    def stop(self) -> None:
+        """
+        Ask the worker to stop and wait for it to exit.
+        Items already queued stay available on :attr:`queue`. Calling this before
+        :meth:`start` simply marks the reader as finished.
+        """
+        self._reader_stop_requested.set()
+        if self._reader_thread.is_alive():
+            self._reader_thread.join()
+        # Covers the never-started case; a worker that ran has already set this.
+        self._reader_thread_done.set()

avrokit/asyncio/writer.py ADDED Viewed

@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+import threading
+import logging
+import queue
+from avrokit.io.writer import Appendable
+logger = logging.getLogger(__name__)
+class DeferredAvroWriter:
+    """
+    Accepts writes and appends them asynchronously in a separate thread.
+    ``None`` is reserved as an internal wake-up marker and is never forwarded to the
+    wrapped writer. Every other datum, including falsy ones such as ``{}``, ``0``,
+    ``""`` and ``False``, is appended.
+    :param writer: The object whose ``append`` method receives each datum.
+    :param daemon: Whether the writer thread is created as a daemon thread.
+    """
+    def __init__(self, writer: "Appendable", daemon: bool = True) -> None:
+        self.writer = writer
+        self._writer_queue: queue.Queue = queue.Queue()
+        self._writer_thread_done = threading.Event()
+        self._writer_thread = threading.Thread(
+            target=self._writer_worker, daemon=daemon
+        )
+    def _writer_worker(self):
+        """Thread target: append queued data until stopped and the queue is drained."""
+        while not self._writer_thread_done.is_set() or not self._writer_queue.empty():
+            try:
+                datum = self._writer_queue.get(timeout=1)
+            except queue.Empty:
+                continue
+            # Only None is skipped; a truthiness test would drop valid falsy records.
+            if datum is None:
+                continue
+            try:
+                self.writer.append(datum)
+            except Exception:
+                # Best effort: log the failure and keep draining the queue.
+                logger.exception("Error in writer thread")
+    def start(self) -> None:
+        """Start the background writer thread."""
+        self._writer_thread.start()
+    def stop(self) -> None:
+        """Signal the worker to finish, let it drain the queue, and wait for it."""
+        self._writer_thread_done.set()
+        # Wake a worker blocked in get() so it exits without waiting for the timeout.
+        self._writer_queue.put(None)
+        self._writer_thread.join()
+    def append(
+        self, datum: object, block: bool = True, timeout: float | None = None
+    ) -> None:
+        """
+        Queue ``datum`` to be appended by the worker thread.
+        :param datum: The datum to write.
+        :param block: Passed to ``queue.Queue.put``.
+        :param timeout: Passed to ``queue.Queue.put``.
+        """
+        self._writer_queue.put(datum, block=block, timeout=timeout)

avrokit/io/__init__.py ADDED Viewed

@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+from .schema import (
+    add_avro_schema_fields,
+    avro_schema,
+    read_avro_schema,
+    read_avro_schema_from_first_nonempty_file,
+    validate_avro_schema_evolution,
+)
+from .reader import avro_reader, PartitionedAvroReader, avro_records
+from .writer import (
+    avro_writer,
+    Appendable,
+    PartitionedAvroWriter,
+    TimePartitionedAvroWriter,
+)
+from .compact import compact_avro_data
+__all__ = [
+    "Appendable",
+    "PartitionedAvroReader",
+    "PartitionedAvroWriter",
+    "TimePartitionedAvroWriter",
+    "add_avro_schema_fields",
+    "avro_reader",
+    "avro_schema",
+    "avro_writer",
+    "avro_records",
+    "compact_avro_data",
+    "read_avro_schema",
+    "read_avro_schema_from_first_nonempty_file",
+    "validate_avro_schema_evolution",
+]

avrokit/io/compact.py ADDED Viewed

@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+from avrokit.io.reader import avro_reader
+from avrokit.url.utils import flatten_urls
+from ..url import URL
+from .writer import avro_writer
+from .schema import read_avro_schema_from_first_nonempty_file
+from typing import Union, Sequence
+def compact_avro_data(
+    src: Union[URL, Sequence[URL]],
+    dst: URL,
+    expand_src: bool = True,
+) -> None:
+    """
+    Merge the records of one or more Avro sources into a single destination file.
+    The writer schema is taken from the first non-empty source.
+    :param src: Source URL(s) to read Avro data from.
+    :param dst: Destination URL that receives the compacted Avro data.
+    :param expand_src: Forwarded to ``flatten_urls`` as its ``expand`` argument.
+    :raises ValueError: If no source URLs are found or none of them yields a schema.
+    """
+    inputs = flatten_urls(src, expand=expand_src)
+    if not inputs:
+        raise ValueError("No source URLs found to compact.")
+    writer_schema = read_avro_schema_from_first_nonempty_file(inputs)
+    if writer_schema is None:
+        raise ValueError("No Avro schema found in source URLs.")
+    # TODO Codec for avro_writer compression
+    output = dst.with_mode("wb")
+    with avro_writer(output, writer_schema) as sink:
+        for source in inputs:
+            readable = source.with_mode("rb")
+            with avro_reader(readable) as records:
+                for datum in records:
+                    sink.append(datum)

avrokit/io/reader.py ADDED Viewed

@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+from ..url import URL
+from avro.datafile import DataFileReader
+from avro.io import DatumReader
+from contextlib import contextmanager
+from typing import Generator, Iterator, Sequence, IO, Any, Self, Union, cast
+@contextmanager
+def avro_reader(url: URL) -> Generator[DataFileReader, None, None]:
+    """
+    Context manager yielding an Avro DataFileReader over the given URL.
+    The URL is entered as a context manager to obtain the stream, and both the
+    reader and the URL are exited when the block ends.
+    :param url: The URL of the Avro file to read.
+    :return: A DataFileReader object.
+    """
+    with url as stream:
+        with DataFileReader(stream, DatumReader()) as datafile:
+            yield datafile
+def avro_records(url: URL) -> Generator[dict[str, Any], None, None]:
+    """
+    Lazily yield every record of the Avro file at the given URL.
+    :param url: The URL of the Avro file to read.
+    :return: A generator over the file's records.
+    """
+    with avro_reader(url) as datafile:
+        yield from (cast(dict[str, Any], datum) for datum in datafile)
+class PartitionedAvroReader:
+    """
+    Iterates over the records of several Avro files as one continuous stream.
+    Use it as a context manager (or call :meth:`open` / :meth:`close`): the given
+    URLs are expanded on open and the resulting files are read one after another.
+    :param urls: A single URL or a sequence of URLs to read from.
+    """
+    def __init__(self, urls: Union[URL, Sequence[URL]]):
+        self.urls = [urls] if isinstance(urls, URL) else urls
+        self.expanded_urls: list[URL] = []
+        self.current_index = 0
+        self.current_url: URL | None = None
+        self.current_url_stream: IO[Any] | None = None
+        self.current_reader: DataFileReader | None = None
+    def _close_current(self) -> None:
+        """Close the current reader, if any, and forget it together with its stream."""
+        if self.current_reader is not None:
+            self.current_reader.close()
+        # Drop the references so a closed reader is never read from again.
+        self.current_reader = None
+        self.current_url_stream = None
+    def _open_reader(self) -> DataFileReader:
+        """
+        Open a reader for the URL at ``current_index``, closing the previous one.
+        :raises StopIteration: If there is no further URL to read.
+        """
+        self._close_current()
+        if self.current_index >= len(self.expanded_urls):
+            raise StopIteration
+        self.current_url = self.expanded_urls[self.current_index]
+        if not self.current_url:
+            raise StopIteration
+        stream = self.current_url.open()
+        try:
+            reader = DataFileReader(stream, DatumReader())
+        except Exception:
+            # Do not leak the stream when the file cannot be parsed as Avro.
+            stream.close()
+            raise
+        self.current_url_stream = stream
+        self.current_reader = reader
+        return reader
+    def open(self) -> Self:
+        """Expand the configured URLs and open the first file, if there is one."""
+        self.expanded_urls = [
+            expanded_url for url in self.urls for expanded_url in url.expand()
+        ]
+        if self.expanded_urls:
+            self._open_reader()
+        return self
+    def close(self) -> None:
+        """Close the current reader and rewind to the first partition."""
+        self._close_current()
+        self.current_index = 0
+    def __enter__(self) -> Self:
+        return self.open()
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+    def __iter__(self) -> Iterator[object]:
+        return self
+    def __next__(self) -> object:
+        """
+        Return the next record, moving on to the next file when one is exhausted.
+        :raises StopIteration: When every file has been consumed; repeated calls
+            keep raising it.
+        """
+        while True:
+            if self.current_reader is not None:
+                try:
+                    return next(self.current_reader)
+                except StopIteration:
+                    # Current file is exhausted; fall through to the next one.
+                    pass
+            self.current_index += 1
+            self._open_reader()

avrokit/io/schema.py ADDED Viewed

@@ -0,0 +1,166 @@
+# SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+import json
+from typing import Sequence, Union, Any
+from avro.schema import Field, RecordSchema, Schema, UnionSchema, parse, EnumSchema
+from avro.io import DatumReader
+from avro.datafile import DataFileReader
+from ..url import URL
+def read_avro_schema(url: URL) -> Schema:
+    """
+    Load the schema stored in the Avro file at the given URL.
+    :param url: The URL of the Avro file.
+    :return: The result of parsing the file reader's ``schema`` value.
+    """
+    with url as stream:
+        with DataFileReader(stream, DatumReader()) as datafile:
+            return parse(datafile.schema)
+def read_avro_schema_from_first_nonempty_file(urls: Sequence[URL]) -> Schema | None:
+    """
+    Return the schema of the first URL that exists and has a size above zero.
+    :param urls: Candidate URLs, checked in order.
+    :return: The schema of the first match, or None when no URL qualifies.
+    """
+    # Lazy scan: URLs after the first match are never probed.
+    candidates = (
+        candidate for candidate in urls if candidate.exists() and candidate.size() > 0
+    )
+    first = next(candidates, None)
+    if first is None:
+        return None
+    return read_avro_schema(first)
+def avro_schema(schema: Union[str, dict]) -> Schema:
+    """
+    Builds an Avro Schema object from a JSON string or a dictionary.
+    A string is parsed as-is; a dictionary is serialized to JSON first.
+    :param schema: The schema as a JSON string or as a dictionary.
+    :return: An Avro Schema object.
+    """
+    schema_json = schema if isinstance(schema, str) else json.dumps(schema)
+    return parse(schema_json)
+def add_avro_schema_fields(schema: Schema, fields: Sequence[dict[str, Any]]) -> Schema:
+    """
+    Returns a new schema made of the given record schema plus extra fields.
+    :param schema: The record schema to extend.
+    :param fields: Field definitions (as dictionaries) to append to the schema's fields.
+    :raises ValueError: If the schema's JSON form is not a dictionary with a list of
+        fields, i.e. it is not a record schema.
+    :return: A newly parsed schema containing the original and the added fields.
+    """
+    schema_dict = schema.to_json()
+    # A dict-shaped non-record schema has no "fields" list; report it with the same
+    # ValueError as any other non-record input instead of a bare KeyError.
+    if not isinstance(schema_dict, dict) or not isinstance(
+        schema_dict.get("fields"), list
+    ):
+        raise ValueError("Schema is not a valid Avro record schema.")
+    schema_dict["fields"].extend(fields)
+    return avro_schema(schema_dict)
+def flatten_avro_schema_fields(
+    schema: Schema, path: list[str] | None = None
+) -> dict[str, Field]:
+    """
+    Maps every leaf field of a record schema to a dot-notation key.
+    Nested record fields are expanded recursively and do not get an entry of their
+    own. Union fields get an entry, and each record member of the union is expanded
+    under ``<field>.__union__.<index>``.
+    :param schema: The Avro schema to flatten.
+    :param path: Name components of the enclosing fields (used for recursion).
+    :return: A dictionary with dot-notation keys and Field objects as values.
+    """
+    flattened = {}
+    # Anything other than a record with a field list has nothing to flatten.
+    if not (isinstance(schema, RecordSchema) and isinstance(schema.fields, list)):
+        return flattened
+    prefix = [] if path is None else list(path)
+    for field in schema.fields:
+        qualified = [*prefix, field.name]
+        field_type = field.type
+        if isinstance(field_type, RecordSchema):
+            # Nested record: only its own fields are recorded.
+            flattened.update(flatten_avro_schema_fields(field_type, qualified))
+            continue
+        # Plain and union fields are both recorded under their dotted name.
+        flattened[".".join(qualified)] = field
+        if not isinstance(field_type, UnionSchema):
+            continue
+        # Record members of a union are expanded so their evolution is tracked too.
+        for position, member in enumerate(field_type.schemas):
+            if isinstance(member, RecordSchema):
+                member_path = [*qualified, "__union__", str(position)]
+                flattened.update(flatten_avro_schema_fields(member, member_path))
+    return flattened
+def validate_avro_schema_evolution(schema_a: Schema, schema_b: Schema) -> None:
+    """
+    Validates the evolution of two Avro schemas.
+    Allowed operations:
+    - Adding a new field with a default value.
+    - Removing a field with a default value.
+    - Adding or changing a default value on an existing field.
+    - Making a field optional (i.e. a two-member union of null and the old type).
+    - Adding symbols to an enum type.
+    - Adding a new type to a union type.
+    Note: This is focused on *forward compatibility* i.e. that all data can be read using the most
+    recent schema. New data may not be readable with the old schema, but old data can be safely
+    up-converted to the new schema.
+    See: https://docs.oracle.com/cd/E26161_02/html/GettingStartedGuide/schemaevolution.html
+    :param schema_a: The original schema.
+    :param schema_b: The evolved schema.
+    :raises ValueError: If the schemas are not compatible.
+    :return: None. An invalid evolution is reported by raising.
+    """
+    # Flatten the schemas with names in dot-notation
+    schema_a_fields = flatten_avro_schema_fields(schema_a)
+    schema_b_fields = flatten_avro_schema_fields(schema_b)
+    # Check for new fields in schema_b
+    for name, field in schema_b_fields.items():
+        # If the field is not in schema_a, it must be a new field with a default value
+        if name not in schema_a_fields:
+            if "default" not in field.props:
+                raise ValueError(f"Field {name} is missing a default value.")
+            continue
+        # If the field is in both schemas, check for changes
+        old_field = schema_a_fields[name]
+        # Default value cannot be removed, just changed
+        if "default" not in field.props and "default" in old_field.props:
+            raise ValueError(f"Field {name} default value cannot be removed.")
+        if field.type != old_field.type:
+            # If it is an enum, the new enum must be a superset of the old enum
+            if isinstance(field.type, EnumSchema) and isinstance(
+                old_field.type, EnumSchema
+            ):
+                old_enum = set(old_field.type.symbols)
+                new_enum = set(field.type.symbols)
+                if not old_enum.issubset(new_enum):
+                    raise ValueError(
+                        f"Field {name} enum has changed from {old_enum} to {new_enum}."
+                    )
+            # If the type is a union, it must be a superset of the old type
+            elif isinstance(field.type, UnionSchema) and isinstance(
+                old_field.type, UnionSchema
+            ):
+                old_union = set(old_field.type.schemas)
+                new_union = set(field.type.schemas)
+                if not old_union.issubset(new_union):
+                    raise ValueError(
+                        f"Field {name} union has changed from {old_union} to {new_union}."
+                    )
+            # If the type has changed, it must be a union with null
+            elif not isinstance(field.type, UnionSchema):
+                raise ValueError(
+                    f"Field {name} type has changed from {old_field.type} to {field.type}."
+                )
+            # Null is a primitive schema identified by its type name, not a record
+            # named "null". The old type must also remain a member of the union,
+            # otherwise existing data could no longer be read.
+            elif (
+                len(field.type.schemas) != 2
+                or not any(s.type == "null" for s in field.type.schemas)
+                or old_field.type not in field.type.schemas
+            ):
+                raise ValueError(
+                    f"Field {name} type has changed from {old_field.type} to {field.type}."
+                )
+    # Check for removed fields in schema_b
+    for name, field in schema_a_fields.items():
+        # If the field is not in schema_b, it must be a removed field with a default value
+        if name not in schema_b_fields and "default" not in field.props:
+            raise ValueError(f"Field {name} is missing a default value.")