mad-prefect 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
+ Metadata-Version: 2.1
+ Name: mad-prefect
+ Version: 1.0.0
+ Summary:
+ Author: MAIT DEV Pty Ltd
+ Author-email: maitland@mait.dev
+ Requires-Python: >=3.11,<3.12
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Dist: duckdb (>=0.9,<1.2)
+ Requires-Dist: fsspec (>=2024.9.0,<2025.0.0)
+ Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
+ Requires-Dist: pandas (>=2.1.1,<3.0.0)
+ Requires-Dist: prefect (>=2.18,<3)
+ Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
+ Requires-Dist: sshfs (>=2024.6.0,<2025.0.0)
+ Description-Content-Type: text/markdown
+
+ # Introduction
+ TODO: Give a short introduction of your project. Let this section explain the objectives or the motivation behind this project.
+
+ # Getting Started
+ TODO: Guide users through getting your code up and running on their own system. In this section you can talk about:
+ 1. Installation process
+ 2. Software dependencies
+ 3. Latest releases
+ 4. API references
+
+ # Build and Test
+ TODO: Describe and show how to build your code and run the tests.
+
+ # Contribute
+ TODO: Explain how other users and developers can contribute to make your code better.
+
+ If you want to learn more about creating good readme files, refer to the following [guidelines](https://docs.microsoft.com/en-us/azure/devops/repos/git/create-a-readme?view=azure-devops). You can also seek inspiration from the readme files below:
+ - [ASP.NET Core](https://github.com/aspnet/Home)
+ - [Visual Studio Code](https://github.com/Microsoft/vscode)
+ - [Chakra Core](https://github.com/Microsoft/ChakraCore)
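The metadata above pins Python to >=3.11,<3.12 and caps every dependency below its next breaking release (DuckDB below 1.2). A minimal install sketch, assuming this sdist is published to PyPI under the same name and version:

```python
# Install sketch (assumption: the package is available on PyPI as "mad-prefect").
# Run from an interpreter that satisfies the Requires-Python pin above.
import subprocess
import sys

assert (3, 11) <= sys.version_info[:2] < (3, 12), "mad-prefect 1.0.0 requires Python >=3.11,<3.12"
subprocess.check_call([sys.executable, "-m", "pip", "install", "mad-prefect==1.0.0"])
```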
@@ -0,0 +1,20 @@
+ # Introduction
+ TODO: Give a short introduction of your project. Let this section explain the objectives or the motivation behind this project.
+
+ # Getting Started
+ TODO: Guide users through getting your code up and running on their own system. In this section you can talk about:
+ 1. Installation process
+ 2. Software dependencies
+ 3. Latest releases
+ 4. API references
+
+ # Build and Test
+ TODO: Describe and show how to build your code and run the tests.
+
+ # Contribute
+ TODO: Explain how other users and developers can contribute to make your code better.
+
+ If you want to learn more about creating good readme files, refer to the following [guidelines](https://docs.microsoft.com/en-us/azure/devops/repos/git/create-a-readme?view=azure-devops). You can also seek inspiration from the readme files below:
+ - [ASP.NET Core](https://github.com/aspnet/Home)
+ - [Visual Studio Code](https://github.com/Microsoft/vscode)
+ - [Chakra Core](https://github.com/Microsoft/ChakraCore)
@@ -0,0 +1,35 @@
+ import datetime
+ import os
+ from typing import Callable, Literal
+
+ from mad_prefect.data_assets.options import ReadJsonOptions
+
+ ASSET_METADATA_LOCATION = os.getenv("ASSET_METADATA_LOCATION", ".asset_metadata")
+ ARTIFACT_FILE_TYPES = Literal["parquet", "json"]
+
+
+ def asset(
+     path: str,
+     artifacts_dir: str = "",
+     name: str | None = None,
+     snapshot_artifacts: bool = False,
+     artifact_filetype: ARTIFACT_FILE_TYPES = "json",
+     read_json_options: ReadJsonOptions | None = None,
+     cache_expiration: datetime.timedelta | None = None,
+ ):
+     # Prevent a circular reference as it references the env variable
+     from mad_prefect.data_assets.data_asset import DataAsset
+
+     def decorator(fn: Callable):
+         return DataAsset(
+             fn,
+             path,
+             artifacts_dir,
+             name,
+             snapshot_artifacts,
+             artifact_filetype,
+             read_json_options,
+             cache_expiration,
+         )
+
+     return decorator
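The decorator only defers to `DataAsset` (defined elsewhere in the package), so the diff does not show a call site. A minimal usage sketch, assuming the decorated async generator's yielded batches flow through the artifact machinery shown below; the endpoint URL, paths, and field names are hypothetical:

```python
import httpx

from mad_prefect.data_assets import asset


# Hypothetical asset: fetch one page of records and let mad-prefect persist them.
@asset(
    path="bronze/orgs/orgs.parquet",          # hypothetical final asset path
    artifacts_dir="bronze/orgs/_artifacts",   # hypothetical fragment directory
    artifact_filetype="json",
    snapshot_artifacts=False,
)
async def orgs():
    async with httpx.AsyncClient() as client:
        # httpx.Response objects are unpacked via .json() by DataArtifact downstream
        yield await client.get("https://api.example.com/orgs")


# Based on how DataArtifact awaits assets below, a DataAsset is called and awaited:
# result = await orgs()
```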
@@ -0,0 +1,222 @@
+ import json
+ import os
+ from typing import BinaryIO, Sequence, cast
+ import duckdb
+ import httpx
+ import jsonlines
+ import pandas as pd
+ from mad_prefect.data_assets import ARTIFACT_FILE_TYPES
+ from mad_prefect.data_assets.options import ReadJsonOptions
+ from mad_prefect.data_assets.utils import yield_data_batches
+ from mad_prefect.duckdb import register_mad_protocol
+ from mad_prefect.filesystems import get_fs
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ from mad_prefect.json.mad_json_encoder import MADJSONEncoder
+
+
+ class DataArtifact:
+     def __init__(
+         self,
+         path: str,
+         data: object | None = None,
+         read_json_options: ReadJsonOptions | None = None,
+     ):
+         self.path = path
+         filetype = os.path.splitext(self.path)[1].lstrip(".")
+
+         if filetype not in ["json", "parquet"]:
+             raise ValueError(f"Unsupported file type: {filetype}")
+
+         self.filetype: ARTIFACT_FILE_TYPES = cast(ARTIFACT_FILE_TYPES, filetype)
+         self.data = data
+         self.read_json_options = read_json_options or ReadJsonOptions()
+         self.persisted = False
+
+     async def persist(self):
+         # If we've already persisted this artifact this session, don't do anything
+         if self.persisted:
+             return True
+
+         if not self._truthy(self.data):
+             return False
+
+         await register_mad_protocol()
+         duckdb.query("SET temp_directory = './.tmp/duckdb/'")
+
+         if self.filetype == "json":
+             await self._persist_json()
+         elif self.filetype == "parquet":
+             await self._persist_parquet()
+         else:
+             raise ValueError(f"Unsupported file format {self.filetype}")
+
+         self.persisted = await self.exists()
+         return self.persisted
+
+     async def _open(self):
+         fs = await get_fs()
+         return cast(BinaryIO, await fs.open(self.path, "wb", True))
+
+     async def _persist_json(self):
+         entities = self._yield_entities_to_persist()
+         file: BinaryIO | None = None
+         writer: jsonlines.Writer | None = None
+
+         try:
+             next_entity = await anext(entities)
+
+             while self._truthy(next_entity):
+                 # Lazily open the file and writer once the first entity arrives
+                 if not file or not writer:
+                     file = await self._open()
+                     writer = jsonlines.Writer(
+                         file,
+                         dumps=lambda obj: json.dumps(obj, cls=MADJSONEncoder),  # type: ignore
+                     )
+
+                 table_or_batch: pa.RecordBatch | pa.Table = (
+                     next_entity
+                     if isinstance(next_entity, (pa.Table, pa.RecordBatch))
+                     else None
+                 )
+
+                 if table_or_batch:
+                     next_entity = table_or_batch.to_pylist()
+
+                 if not isinstance(next_entity, Sequence):
+                     next_entity = [next_entity]
+
+                 writer.write_all(next_entity)
+                 next_entity = await anext(entities)
+         except StopAsyncIteration as e:
+             pass
+         except Exception as e:
+             raise
+         finally:
+             if writer:
+                 writer.close()
+
+             if file:
+                 file.close()
+
+     async def _persist_parquet(self):
+         def __sanitize_data(data):
+             """
+             Recursively go through the data and replace any empty dictionaries
+             with None or a dummy value to avoid Parquet serialization errors.
+             """
+             if isinstance(data, dict):
+                 if not data:  # it's an empty dict
+                     return None  # or return {'dummy_field': None} to keep the key with a dummy field
+                 else:
+                     return {key: __sanitize_data(value) for key, value in data.items()}
+             elif isinstance(data, list):
+                 return [__sanitize_data(item) for item in data]
+             else:
+                 return data
+
+         entities = self._yield_entities_to_persist()
+         file: BinaryIO | None = None
+         writer: pq.ParquetWriter | None = None
+
+         try:
+             next_entity = await anext(entities)
+
+             while self._truthy(next_entity):
+                 b = __sanitize_data(next_entity)
+                 table_or_batch: pa.RecordBatch | pa.Table = (
+                     b
+                     if isinstance(b, (pa.Table, pa.RecordBatch))
+                     else pa.RecordBatch.from_pylist(b)
+                 )
+
+                 # Use the first entity to determine the file's schema
+                 if not file or not writer:
+                     file = await self._open()
+                     writer = pq.ParquetWriter(file, table_or_batch.schema)
+                 else:
+                     # If schema has evolved, adjust the current RecordBatch
+                     if not table_or_batch.schema.equals(writer.schema):
+                         unified_schema = pa.unify_schemas(
+                             [writer.schema, table_or_batch.schema],
+                             promote_options="permissive",
+                         )
+
+                         # Align the RecordBatch with the unified schema
+                         table_or_batch = table_or_batch.cast(unified_schema)
+
+                         # Manually adjust the schema of the writer if needed
+                         writer.schema = unified_schema
+
+                 writer.write(table_or_batch)
+                 next_entity = await anext(entities)
+         except StopAsyncIteration as e:
+             pass
+         except Exception as e:
+             raise
+         finally:
+             if writer:
+                 writer.close()
+
+             if file:
+                 file.close()
+
+     async def _yield_entities_to_persist(self):
+         from mad_prefect.data_assets.data_asset import DataAsset
+
+         async for batch_data in yield_data_batches(self.data):
+             # If the data is an asset, execute it to get the result artifact
+             if isinstance(batch_data, DataAsset):
+                 batch_data = await batch_data()
+
+             # If the entity is a DataArtifact, query it into a DuckDBPyRelation so it can be handled
+             if isinstance(batch_data, DataArtifact):
+                 # An artifact may not exist for example when there were no results
+                 if not await batch_data.exists():
+                     continue
+
+                 batch_data = await batch_data.query()
+
+             if isinstance(batch_data, pd.DataFrame):
+                 batch_data = duckdb.from_df(batch_data)
+
+             if isinstance(batch_data, (duckdb.DuckDBPyRelation)):
+                 # Convert duckdb into batches of arrow tables
+                 reader = batch_data.fetch_arrow_reader(1000)
+
+                 while True:
+                     try:
+                         # this will yield a pyarrow RecordBatch
+                         batch = reader.read_next_batch()
+                         yield batch
+                     except StopIteration as stop:
+                         break
+
+             elif isinstance(batch_data, httpx.Response):
+                 yield batch_data.json()
+             else:
+                 yield batch_data
+
+     async def query(self, query_str: str | None = None):
+         from mad_prefect.data_assets.data_artifact_query import DataArtifactQuery
+
+         artifact_query = DataArtifactQuery([self], self.read_json_options)
+         return await artifact_query.query(query_str)
+
+     async def exists(self):
+         fs = await get_fs()
+         return fs.exists(self.path)
+
+     def _truthy(self, data):
+         if isinstance(data, pd.DataFrame):
+             if data.empty:
+                 return False
+         # duckdb hangs with the not self.data check, so make sure self.data isn't
+         # a duckdb pyrelation before checking self.data
+         elif isinstance(data, duckdb.DuckDBPyRelation):
+             pass
+         elif not data:
+             return False
+
+         return True
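As a minimal end-to-end sketch of the class above, an artifact built from plain records is persisted as JSON lines and read back through DuckDB. This assumes mad_prefect.filesystems is configured (via its environment variables, which are not shown in this diff) so that get_fs() can write to the path; the path and records are illustrative:

```python
import asyncio

from mad_prefect.data_assets.data_artifact import DataArtifact


async def main():
    # The ".json" extension selects the jsonlines writer in _persist_json.
    artifact = DataArtifact(
        "bronze/orgs/fragment=0.json",  # hypothetical path on the configured filesystem
        data=[{"id": 1, "name": "Acme"}, {"id": 2, "name": "Globex"}],
    )

    # persist() returns False without writing when the data is "falsy" (see _truthy).
    if await artifact.persist():
        # query() builds a DuckDB relation over the persisted file via DataArtifactQuery;
        # the optional argument is appended after "FROM artifact_query" (FROM-first syntax).
        relation = await artifact.query("WHERE id = 1")
        print(relation)


asyncio.run(main())
```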
@@ -0,0 +1,69 @@
+ import httpx
+ from mad_prefect.data_assets import ARTIFACT_FILE_TYPES
+ from mad_prefect.data_assets.options import ReadJsonOptions
+ from mad_prefect.data_assets.utils import yield_data_batches
+ from mad_prefect.data_assets.data_artifact import DataArtifact
+ from mad_prefect.data_assets.data_artifact_query import DataArtifactQuery
+
+
+ class DataArtifactCollector:
+
+     def __init__(
+         self,
+         collector: object,
+         dir: str,
+         filetype: ARTIFACT_FILE_TYPES = "json",
+         artifacts: list[DataArtifact] | None = None,
+         read_json_options: ReadJsonOptions | None = None,
+     ):
+         self.collector = collector
+         self.dir = dir
+         self.filetype = filetype
+         self.artifacts = artifacts or []
+         self.read_json_options = read_json_options or ReadJsonOptions()
+
+     async def collect(self):
+         fragment_num = 0
+
+         async for fragment in yield_data_batches(self.collector):
+             # If the output isn't a DataArtifact manually set the params & base_path
+             # and initialize the output as a DataArtifact
+             if isinstance(fragment, DataArtifact):
+                 fragment_artifact = fragment
+             else:
+                 params = (
+                     dict(fragment.request.url.params)
+                     if isinstance(fragment, httpx.Response)
+                     and fragment.request.url.params
+                     else None
+                 )
+
+                 path = self._build_artifact_path(self.dir, params, fragment_num)
+                 fragment_artifact = DataArtifact(path, fragment, self.read_json_options)
+
+             if await fragment_artifact.persist():
+                 self.artifacts.append(fragment_artifact)
+                 fragment_num += 1
+
+         artifact_query = DataArtifactQuery(
+             artifacts=self.artifacts,
+             read_json_options=self.read_json_options,
+         )
+
+         return await artifact_query.query()
+
+     def _build_artifact_path(
+         self,
+         base_path: str,
+         params: dict | None = None,
+         fragment_number: int | None = None,
+     ):
+         filetype = self.filetype
+         base_path = base_path.rstrip("/")
+
+         if params is None:
+             return f"{base_path}/fragment={fragment_number}.{filetype}"
+
+         params_path = "/".join(f"{key}={value}" for key, value in params.items())
+
+         return f"{base_path}/{params_path}.{filetype}"
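The collector lays fragments out as hive-style key=value partitions, matching the hive_partitioning = TRUE option used for parquet reads in DataArtifactQuery. A small illustration of the paths _build_artifact_path produces, with hypothetical directory and parameter values:

```python
from mad_prefect.data_assets.data_artifact_collector import DataArtifactCollector

# The collector argument is only iterated inside collect(), so None is fine for this sketch.
collector = DataArtifactCollector(collector=None, dir="raw/orgs", filetype="json")

# Without query params, the fragment counter names the file.
print(collector._build_artifact_path("raw/orgs", None, 0))
# -> raw/orgs/fragment=0.json

# Query params from an httpx.Response become hive-style key=value segments.
print(collector._build_artifact_path("raw/orgs", {"page": 2, "limit": 100}))
# -> raw/orgs/page=2/limit=100.json
```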
@@ -0,0 +1,143 @@
+ from typing import cast
+ import duckdb
+ from mad_prefect.data_assets import ARTIFACT_FILE_TYPES
+ from mad_prefect.data_assets.options import ReadJsonOptions
+ from mad_prefect.duckdb import register_mad_protocol
+ from mad_prefect.data_assets.data_artifact import DataArtifact
+
+
+ class DataArtifactQuery:
+
+     def __init__(
+         self,
+         artifacts: list[DataArtifact] | None = None,
+         read_json_options: ReadJsonOptions | None = None,
+     ):
+         self.artifacts = artifacts or []
+         self.read_json_options = read_json_options or ReadJsonOptions()
+
+     async def query(self, query_str: str | None = None):
+         await register_mad_protocol()
+
+         # Get the globs for any artifacts which exist
+         existing_artifacts = [a for a in self.artifacts if await a.exists()]
+         globs = [f"mad://{a.path.strip('/')}" for a in existing_artifacts]
+
+         if not globs:
+             return
+
+         # Ensure each artifact is of the same filetype
+         filetypes = set([a.filetype for a in existing_artifacts])
+
+         if not filetypes or len(filetypes) > 1:
+             raise ValueError("Cannot query artifacts of different filetypes")
+
+         # Get the base query
+         filetype: ARTIFACT_FILE_TYPES = cast(ARTIFACT_FILE_TYPES, filetypes.pop())
+
+         if filetype == "json":
+             artifact_query = self._create_query_json(globs)
+         elif filetype == "parquet":
+             artifact_query = self._create_query_parquet(globs)
+         else:
+             raise ValueError(f"Unsupported file format {filetype}")
+
+         # Apply any additional query on top
+         if query_str:
+             return duckdb.query(f"FROM artifact_query {query_str}")
+
+         return artifact_query
+
+     def _create_query_json(self, globs: list[str]):
+         # Prepare the globs string
+         globs_str = ", ".join(f"'{g}'" for g in globs)
+         globs_formatted = f"[{globs_str}]"
+
+         # Build the base options dict without 'columns'
+         base_options = self.read_json_options.model_dump(
+             exclude={"columns"},
+             exclude_none=True,
+         )
+         options_str = self._format_options_dict(base_options)
+
+         # Build the base query string without 'columns'
+         base_query = (
+             f"SELECT * FROM read_json({globs_formatted}, {options_str})"
+             if options_str
+             else f"SELECT * FROM read_json({globs_formatted})"
+         )
+
+         # Process columns after building the base query
+         if self.read_json_options.columns:
+             updated_columns = self._process_columns(
+                 base_query, self.read_json_options.columns
+             )
+
+             # Include 'columns' in options
+             options_with_columns = base_options.copy()
+             options_with_columns["columns"] = updated_columns
+             options_str_with_columns = self._format_options_dict(options_with_columns)
+
+             # Rebuild the query with 'columns'
+             final_query = f"SELECT * FROM read_json({globs_formatted}, {options_str_with_columns})"
+         else:
+             final_query = base_query
+
+         # Execute the query
+         artifact_query = duckdb.query(final_query)
+         return artifact_query
+
+     def _process_columns(
+         self,
+         base_query: str,
+         columns: dict[str, str],
+     ) -> dict[str, str]:
+         # Describe the base query to get the schema
+         schema_info = duckdb.query(f"DESCRIBE {base_query}").fetchall()
+         schema_columns = {row[0]: row[1] for row in schema_info}
+
+         # Update column types based on provided columns
+         updated_columns = {}
+         for col_name, col_type in schema_columns.items():
+             if col_name in columns:
+                 # Use the provided type
+                 updated_columns[col_name] = columns[col_name]
+             else:
+                 # Use the existing type from the schema
+                 updated_columns[col_name] = col_type
+
+         return updated_columns
+
+     def _create_query_parquet(self, globs: list[str]):
+         # Prepare the globs string
+         globs_str = ", ".join(f"'{g}'" for g in globs)
+         globs_formatted = f"[{globs_str}]"
+
+         # Include only relevant options
+         options_dict = {"hive_partitioning": True, "union_by_name": True}
+         options_str = self._format_options_dict(options_dict)
+
+         # Build the query string
+         artifact_base_query = (
+             f"SELECT * FROM read_parquet({globs_formatted}, {options_str})"
+         )
+
+         # Execute the query
+         artifact_query = duckdb.query(artifact_base_query)
+         return artifact_query
+
+     def _format_options_dict(self, options_dict: dict) -> str:
+         def format_value(key, value):
+             if isinstance(value, bool):
+                 return "TRUE" if value else "FALSE"
+             elif isinstance(value, str):
+                 return f"'{value}'"
+             elif isinstance(value, dict):
+                 return f"{value}"
+             else:
+                 return str(value)
+
+         options_str = ", ".join(
+             f"{key} = {format_value(key, value)}" for key, value in options_dict.items()
+         )
+         return options_str
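To make the generated SQL concrete, here is roughly what _format_options_dict produces for a hand-written options dict. ReadJsonOptions and its model_dump output are defined elsewhere in the package, so the keys below are only illustrative:

```python
from mad_prefect.data_assets.data_artifact_query import DataArtifactQuery

query = DataArtifactQuery()

# Booleans render as TRUE/FALSE, strings are quoted, and dicts are interpolated with
# Python's repr (which is how the rebuilt "columns" mapping reaches read_json).
options = {"format": "array", "union_by_name": True, "columns": {"id": "BIGINT"}}
print(query._format_options_dict(options))
# -> format = 'array', union_by_name = TRUE, columns = {'id': 'BIGINT'}
```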