avrokit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avrokit/__init__.py ADDED
@@ -0,0 +1,111 @@
1
+ # SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Python utilities for working with Avro data files.
7
+
8
+ Basic Usage Examples
9
+ --------------------
10
+
11
+ Reading Avro Files:
12
+ >>> from avrokit import avro_reader, avro_records, parse_url
13
+ >>>
14
+ >>> # Read all records from an Avro file
15
+ >>> url = parse_url('file:///path/to/data.avro', mode='rb')
16
+ >>> for record in avro_records(url):
17
+ ... print(record)
18
+ >>>
19
+ >>> # Use the reader context manager directly
20
+ >>> with avro_reader(url) as reader:
21
+ ... schema = reader.datum_reader.writers_schema
22
+ ... for record in reader:
23
+ ... print(record)
24
+
25
+ Writing Avro Files:
26
+ >>> from avrokit import avro_writer, avro_schema, parse_url
27
+ >>>
28
+ >>> # Define your schema
29
+ >>> schema = avro_schema({
30
+ ... 'type': 'record',
31
+ ... 'name': 'User',
32
+ ... 'fields': [
33
+ ... {'name': 'name', 'type': 'string'},
34
+ ... {'name': 'age', 'type': 'int'}
35
+ ... ]
36
+ ... })
37
+ >>>
38
+ >>> # Write records to a new file
39
+ >>> url = parse_url('file:///path/to/output.avro', mode='wb')
40
+ >>> with avro_writer(url, schema) as writer:
41
+ ... writer.append({'name': 'Alice', 'age': 30})
42
+ ... writer.append({'name': 'Bob', 'age': 25})
43
+ >>>
44
+ >>> # Append to an existing file
45
+ >>> url = parse_url('file:///path/to/output.avro', mode='ab')
46
+ >>> with avro_writer(url) as writer:
47
+ ... writer.append({'name': 'Charlie', 'age': 35})
48
+
49
+ Working with Remote Files (GCS, S3):
50
+ >>> # Google Cloud Storage
51
+ >>> url = parse_url('gs://bucket/path/to/data.avro', mode='rb')
52
+ >>> for record in avro_records(url):
53
+ ... print(record)
54
+ >>>
55
+ >>> # Amazon S3
56
+ >>> url = parse_url('s3://bucket/path/to/data.avro', mode='rb')
57
+ >>> for record in avro_records(url):
58
+ ... print(record)
59
+
60
+ Reading Multiple Files:
61
+ >>> from avrokit import PartitionedAvroReader
62
+ >>>
63
+ >>> # Read from multiple files or glob patterns
64
+ >>> url = parse_url('file:///path/to/data-*.avro', mode='rb')
65
+ >>> with PartitionedAvroReader(url) as reader:
66
+ ... for record in reader:
67
+ ... print(record)
68
+
69
+ Writing Partitioned Files:
70
+ >>> from avrokit import PartitionedAvroWriter
71
+ >>>
72
+ >>> url = parse_url('file:///path/to/output/', mode='wb')
73
+ >>> with PartitionedAvroWriter(url, schema) as writer:
74
+ ... for i in range(100):
75
+ ... writer.append({'name': f'User{i}', 'age': i})
76
+ ... if i % 10 == 0:
77
+ ... writer.roll() # Create a new partition file
78
+ """
79
+
80
+ from .url import URL, parse_url, create_url_mapping
81
+ from .io import (
82
+ PartitionedAvroReader,
83
+ PartitionedAvroWriter,
84
+ TimePartitionedAvroWriter,
85
+ add_avro_schema_fields,
86
+ avro_reader,
87
+ avro_schema,
88
+ avro_writer,
89
+ avro_records,
90
+ compact_avro_data,
91
+ validate_avro_schema_evolution,
92
+ )
93
+ from .asyncio import DeferredAvroWriter, BlockingQueueAvroReader
94
+
95
+ __all__ = [
96
+ "BlockingQueueAvroReader",
97
+ "DeferredAvroWriter",
98
+ "PartitionedAvroReader",
99
+ "PartitionedAvroWriter",
100
+ "TimePartitionedAvroWriter",
101
+ "URL",
102
+ "add_avro_schema_fields",
103
+ "avro_reader",
104
+ "avro_schema",
105
+ "avro_writer",
106
+ "avro_records",
107
+ "compact_avro_data",
108
+ "create_url_mapping",
109
+ "parse_url",
110
+ "validate_avro_schema_evolution",
111
+ ]
@@ -0,0 +1,11 @@
1
+ # SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from .writer import DeferredAvroWriter
6
+ from .reader import BlockingQueueAvroReader
7
+
8
+ __all__ = [
9
+ "DeferredAvroWriter",
10
+ "BlockingQueueAvroReader",
11
+ ]
@@ -0,0 +1,48 @@
1
+ # SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import threading
6
+ import logging
7
+ import queue
8
+ from typing import Iterable
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class BlockingQueueAvroReader:
14
+ """
15
+ Reads
16
+ """
17
+
18
+ def __init__(self, data: Iterable[object], daemon: bool = True) -> None:
19
+ self.data = data
20
+ self._reader_queue: queue.Queue = queue.Queue()
21
+ self._reader_thread_done = threading.Event()
22
+ self._reader_thread = threading.Thread(
23
+ target=self._reader_worker, daemon=daemon
24
+ )
25
+
26
+ def _reader_worker(self) -> None:
27
+ try:
28
+ for datum in self.data:
29
+ self._reader_queue.put(datum, block=True)
30
+ except Exception as e:
31
+ logger.error("Error in reader thread")
32
+ logger.exception(e)
33
+ finally:
34
+ self._reader_thread_done.set()
35
+
36
+ @property
37
+ def queue(self) -> queue.Queue:
38
+ return self._reader_queue
39
+
40
+ def empty(self) -> bool:
41
+ return self._reader_queue.empty() and self._reader_thread_done.is_set()
42
+
43
+ def start(self) -> None:
44
+ self._reader_thread.start()
45
+
46
+ def stop(self) -> None:
47
+ self._reader_thread_done.set()
48
+ self._reader_thread.join()
@@ -0,0 +1,48 @@
1
+ # SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import threading
6
+ import logging
7
+ import queue
8
+ from avrokit.io.writer import Appendable
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class DeferredAvroWriter:
14
+ """
15
+ Accepts writes and appends asynchronously in a separate thread.
16
+ """
17
+
18
+ def __init__(self, writer: Appendable, daemon: bool = True) -> None:
19
+ self.writer = writer
20
+ self._writer_queue: queue.Queue = queue.Queue()
21
+ self._writer_thread_done = threading.Event()
22
+ self._writer_thread = threading.Thread(
23
+ target=self._writer_worker, daemon=daemon
24
+ )
25
+
26
+ def _writer_worker(self):
27
+ while not self._writer_thread_done.is_set() or not self._writer_queue.empty():
28
+ try:
29
+ datum = self._writer_queue.get(timeout=1)
30
+ if datum:
31
+ self.writer.append(datum)
32
+ except queue.Empty:
33
+ continue
34
+ except Exception as e:
35
+ logger.error("Error in writer thread")
36
+ logger.exception(e)
37
+
38
+ def start(self) -> None:
39
+ self._writer_thread.start()
40
+
41
+ def stop(self) -> None:
42
+ self._writer_thread_done.set()
43
+ self._writer_thread.join()
44
+
45
+ def append(
46
+ self, datum: object, block: bool = True, timeout: int | None = None
47
+ ) -> None:
48
+ self._writer_queue.put(datum, block=block, timeout=timeout)
avrokit/io/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ # SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from .schema import (
6
+ add_avro_schema_fields,
7
+ avro_schema,
8
+ read_avro_schema,
9
+ read_avro_schema_from_first_nonempty_file,
10
+ validate_avro_schema_evolution,
11
+ )
12
+ from .reader import avro_reader, PartitionedAvroReader, avro_records
13
+ from .writer import (
14
+ avro_writer,
15
+ Appendable,
16
+ PartitionedAvroWriter,
17
+ TimePartitionedAvroWriter,
18
+ )
19
+ from .compact import compact_avro_data
20
+
21
+ __all__ = [
22
+ "Appendable",
23
+ "PartitionedAvroReader",
24
+ "PartitionedAvroWriter",
25
+ "TimePartitionedAvroWriter",
26
+ "add_avro_schema_fields",
27
+ "avro_reader",
28
+ "avro_schema",
29
+ "avro_writer",
30
+ "avro_records",
31
+ "compact_avro_data",
32
+ "read_avro_schema",
33
+ "read_avro_schema_from_first_nonempty_file",
34
+ "validate_avro_schema_evolution",
35
+ ]
avrokit/io/compact.py ADDED
@@ -0,0 +1,36 @@
1
+ # SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from avrokit.io.reader import avro_reader
6
+ from avrokit.url.utils import flatten_urls
7
+ from ..url import URL
8
+ from .writer import avro_writer
9
+ from .schema import read_avro_schema_from_first_nonempty_file
10
+ from typing import Union, Sequence
11
+
12
+
13
+ def compact_avro_data(
14
+ src: Union[URL, Sequence[URL]],
15
+ dst: URL,
16
+ expand_src: bool = True,
17
+ ) -> None:
18
+ """
19
+ Compact Avro data from source URLs into a single destination URL.
20
+
21
+ :param src: Source URL(s) to read Avro data from.
22
+ :param dst: Destination URL to write the compacted Avro data to.
23
+ :param expand_src: Whether to expand source URLs if they are partitioned.
24
+ """
25
+ src_urls = flatten_urls(src, expand=expand_src)
26
+ if not src_urls:
27
+ raise ValueError("No source URLs found to compact.")
28
+ schema = read_avro_schema_from_first_nonempty_file(src_urls)
29
+ if schema is None:
30
+ raise ValueError("No Avro schema found in source URLs.")
31
+ # TODO Codec for avro_writer compression
32
+ with avro_writer(dst.with_mode("wb"), schema) as writer:
33
+ for url in src_urls:
34
+ with avro_reader(url.with_mode("rb")) as reader:
35
+ for record in reader:
36
+ writer.append(record)
avrokit/io/reader.py ADDED
@@ -0,0 +1,84 @@
1
+ # SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from __future__ import annotations
6
+ from ..url import URL
7
+ from avro.datafile import DataFileReader
8
+ from avro.io import DatumReader
9
+ from contextlib import contextmanager
10
+ from typing import Generator, Iterator, Sequence, IO, Any, Self, Union, cast
11
+
12
+
13
+ @contextmanager
14
+ def avro_reader(url: URL) -> Generator[DataFileReader, None, None]:
15
+ """
16
+ Opens an Avro DataFileReader for the given URL.
17
+
18
+ :param url: The URL of the Avro file to read.
19
+ :return: A DataFileReader object.
20
+ """
21
+ with url as f, DataFileReader(f, DatumReader()) as reader:
22
+ yield reader
23
+
24
+
25
+ def avro_records(url: URL) -> Generator[dict[str, Any], None, None]:
26
+ with avro_reader(url) as reader:
27
+ for record in reader:
28
+ yield cast(dict[str, Any], record)
29
+
30
+
31
+ class PartitionedAvroReader:
32
+ def __init__(self, urls: Union[URL, Sequence[URL]]):
33
+ self.urls = [urls] if isinstance(urls, URL) else urls
34
+ self.expanded_urls: list[URL] = []
35
+ self.current_index = 0
36
+ self.current_url: URL | None = None
37
+ self.current_url_stream: IO[Any] | None = None
38
+ self.current_reader: DataFileReader | None = None
39
+
40
+ def _open_reader(self) -> DataFileReader:
41
+ if self.current_reader:
42
+ self.current_reader.close()
43
+ if self.current_index >= len(self.expanded_urls):
44
+ raise StopIteration
45
+ self.current_url = self.expanded_urls[self.current_index]
46
+ if not self.current_url:
47
+ raise StopIteration
48
+ self.current_url_stream = self.current_url.open()
49
+ self.current_reader = DataFileReader(self.current_url_stream, DatumReader())
50
+ return self.current_reader
51
+
52
+ def open(self) -> Self:
53
+ self.expanded_urls = [
54
+ expanded_url for url in self.urls for expanded_url in url.expand()
55
+ ]
56
+ if len(self.expanded_urls) != 0:
57
+ self._open_reader()
58
+ return self
59
+
60
+ def close(self) -> None:
61
+ if self.current_reader:
62
+ self.current_reader.close()
63
+ self.current_reader = None
64
+ self.current_url_stream = None
65
+ self.current_index = 0
66
+
67
+ def __enter__(self) -> Self:
68
+ return self.open()
69
+
70
+ def __exit__(self, exc_type, exc_value, traceback):
71
+ self.close()
72
+
73
+ def __iter__(self) -> Iterator[object]:
74
+ return self
75
+
76
+ def __next__(self) -> object:
77
+ while True:
78
+ try:
79
+ if not self.current_reader:
80
+ raise StopIteration
81
+ return next(self.current_reader)
82
+ except StopIteration:
83
+ self.current_index += 1
84
+ self._open_reader()
avrokit/io/schema.py ADDED
@@ -0,0 +1,166 @@
1
+ # SPDX-FileCopyrightText: 2026 Greg Brandt <brandt.greg@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import json
6
+ from typing import Sequence, Union, Any
7
+ from avro.schema import Field, RecordSchema, Schema, UnionSchema, parse, EnumSchema
8
+ from avro.io import DatumReader
9
+ from avro.datafile import DataFileReader
10
+ from ..url import URL
11
+
12
+
13
+ def read_avro_schema(url: URL) -> Schema:
14
+ """
15
+ Reads the Avro schema from a file at the given URL.
16
+ """
17
+ with url as f, DataFileReader(f, DatumReader()) as reader:
18
+ return parse(reader.schema)
19
+
20
+
21
+ def read_avro_schema_from_first_nonempty_file(urls: Sequence[URL]) -> Schema | None:
22
+ """
23
+ Reads the Avro schema from the first non-empty file in a sequence of URLs.
24
+ """
25
+ for url in urls:
26
+ if url.exists() and url.size() > 0:
27
+ return read_avro_schema(url)
28
+ return None
29
+
30
+
31
+ def avro_schema(schema: Union[str, dict]) -> Schema:
32
+ """
33
+ Converts a dictionary schema to an Avro Schema object.
34
+
35
+ :param schema: The dictionary schema to convert.
36
+ :return: An Avro Schema object.
37
+ """
38
+ if isinstance(schema, str):
39
+ return parse(schema)
40
+ else:
41
+ return parse(json.dumps(schema))
42
+
43
+
44
+ def add_avro_schema_fields(schema: Schema, fields: Sequence[dict[str, Any]]) -> Schema:
45
+ """
46
+ Adds fields to an Avro schema.
47
+ """
48
+ schema_dict = schema.to_json()
49
+ if not isinstance(schema_dict, dict):
50
+ raise ValueError("Schema is not a valid Avro record schema.")
51
+ schema_dict["fields"].extend(fields)
52
+ return avro_schema(schema_dict)
53
+
54
+
55
+ def flatten_avro_schema_fields(
56
+ schema: Schema, path: list[str] | None = None
57
+ ) -> dict[str, Field]:
58
+ """
59
+ Flattens the Avro schema fields into a dictionary with dot-notation keys.
60
+
61
+ :param schema: The Avro schema to flatten.
62
+ :param path: The current path in the schema (used for recursion).
63
+ :return: A dictionary with dot-notation keys and Field objects as values.
64
+ """
65
+ acc = {}
66
+ if isinstance(schema, RecordSchema) and isinstance(schema.fields, list):
67
+ for field in schema.fields:
68
+ # Resolve the field name
69
+ name = []
70
+ if path is not None:
71
+ name.extend(path)
72
+ name.append(field.name)
73
+ if isinstance(field.type, RecordSchema):
74
+ # Recursively flatten the schema if it's a record
75
+ acc.update(flatten_avro_schema_fields(field.type, name))
76
+ elif isinstance(field.type, UnionSchema):
77
+ # Add the union schema to the accumulator itself
78
+ acc[".".join(name)] = field
79
+ # Look for any record schemas in the union
80
+ for i, union_schema in enumerate(field.type.schemas):
81
+ if isinstance(union_schema, RecordSchema):
82
+ # If the union schema is a record, consider its evolution
83
+ union_name = [*name, "__union__", str(i)]
84
+ acc.update(flatten_avro_schema_fields(union_schema, union_name))
85
+ else:
86
+ # Otherwise, just add the field to the accumulator
87
+ acc[".".join(name)] = field
88
+ return acc
89
+
90
+
91
+ def validate_avro_schema_evolution(schema_a: Schema, schema_b: Schema):
92
+ """
93
+ Validates the evolution of two Avro schemas.
94
+
95
+ Allowed operations:
96
+ - Adding a new field with a default value.
97
+ - Removing a field with a default value.
98
+ - Adding or changing a default value on an existing field.
99
+ - Making a field optional (i.e. union with null, default null).
100
+ - Adding symbols to an enum type.
101
+ - Adding a new type to a union type.
102
+
103
+ Note: This is focused on *forward compatibility* i.e. that all data can be read using the most
104
+ recent schema. New data may not be readable with the old schema, but old data can be safely
105
+ up-converted to the new schema.
106
+
107
+ See: https://docs.oracle.com/cd/E26161_02/html/GettingStartedGuide/schemaevolution.html
108
+
109
+ :param a: The original schema.
110
+ :param b: The evolved schema.
111
+ :raises ValueError: If the schemas are not compatible.
112
+ :return: True if the evolution is valid, False otherwise.
113
+ """
114
+ # Flatten the schemas with names in dot-notation
115
+ schema_a_fields = flatten_avro_schema_fields(schema_a)
116
+ schema_b_fields = flatten_avro_schema_fields(schema_b)
117
+ # Check for new fields in schema_b
118
+ for name, field in schema_b_fields.items():
119
+ # If the field is not in schema_a, it must be a new field with a default value
120
+ if name not in schema_a_fields:
121
+ if "default" not in field.props:
122
+ raise ValueError(f"Field {name} is missing a default value.")
123
+ continue
124
+ # If the field is in both schemas, check for changes
125
+ old_field = schema_a_fields[name]
126
+ # Default value cannot be removed, just changed
127
+ if "default" not in field.props and "default" in old_field.props:
128
+ raise ValueError(f"Field {name} default value cannot be removed.")
129
+ if field.type != old_field.type:
130
+ # If it is an enum, the new enum must be a superset of the old enum
131
+ if isinstance(field.type, EnumSchema) and isinstance(
132
+ old_field.type, EnumSchema
133
+ ):
134
+ old_enum = set(old_field.type.symbols)
135
+ new_enum = set(field.type.symbols)
136
+ if not old_enum.issubset(new_enum):
137
+ raise ValueError(
138
+ f"Field {name} enum has changed from {old_enum} to {new_enum}."
139
+ )
140
+ # If the type is a union, it must be a superset of the old type
141
+ elif isinstance(field.type, UnionSchema) and isinstance(
142
+ old_field.type, UnionSchema
143
+ ):
144
+ old_union = set(old_field.type.schemas)
145
+ new_union = set(field.type.schemas)
146
+ if not old_union.issubset(new_union):
147
+ raise ValueError(
148
+ f"Field {name} union has changed from {old_union} to {new_union}."
149
+ )
150
+ # If the type has changed, it must be a union with null
151
+ elif not isinstance(field.type, UnionSchema):
152
+ raise ValueError(
153
+ f"Field {name} type has changed from {old_field.type} to {field.type}."
154
+ )
155
+ elif len(field.type.schemas) != 2 or not any(
156
+ isinstance(s, RecordSchema) and s.name == "null"
157
+ for s in field.type.schemas
158
+ ):
159
+ raise ValueError(
160
+ f"Field {name} type has changed from {old_field.type} to {field.type}."
161
+ )
162
+ # Check for removed fields in schema_b
163
+ for name, field in schema_a_fields.items():
164
+ # If the field is not in schema_b, it must be a removed field with a default value
165
+ if name not in schema_b_fields and "default" not in field.props:
166
+ raise ValueError(f"Field {name} is missing a default value.")