mad-prefect 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
+ Metadata-Version: 2.1
+ Name: mad-prefect
+ Version: 1.0.0
+ Summary:
+ Author: MAIT DEV Pty Ltd
+ Author-email: maitland@mait.dev
+ Requires-Python: >=3.11,<3.12
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Dist: duckdb (>=0.9,<1.2)
+ Requires-Dist: fsspec (>=2024.9.0,<2025.0.0)
+ Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
+ Requires-Dist: pandas (>=2.1.1,<3.0.0)
+ Requires-Dist: prefect (>=2.18,<3)
+ Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
+ Requires-Dist: sshfs (>=2024.6.0,<2025.0.0)
+ Description-Content-Type: text/markdown
+
+ # Introduction
+ TODO: Give a short introduction of your project. Let this section explain the objectives or the motivation behind this project.
+
+ # Getting Started
+ TODO: Guide users through getting your code up and running on their own system. In this section you can talk about:
+ 1. Installation process
+ 2. Software dependencies
+ 3. Latest releases
+ 4. API references
+
+ # Build and Test
+ TODO: Describe and show how to build your code and run the tests.
+
+ # Contribute
+ TODO: Explain how other users and developers can contribute to make your code better.
+
+ If you want to learn more about creating good readme files, refer to the following [guidelines](https://docs.microsoft.com/en-us/azure/devops/repos/git/create-a-readme?view=azure-devops). You can also seek inspiration from the readme files below:
+ - [ASP.NET Core](https://github.com/aspnet/Home)
+ - [Visual Studio Code](https://github.com/Microsoft/vscode)
+ - [Chakra Core](https://github.com/Microsoft/ChakraCore)
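The metadata above pins Python to >=3.11,<3.12 and caps every dependency below its next breaking release (DuckDB below 1.2). A minimal install sketch, assuming this sdist is published to PyPI under the same name and version:

```python
# Install sketch (assumption: the package is available on PyPI as "mad-prefect").
# Run from an interpreter that satisfies the Requires-Python pin above.
import subprocess
import sys

assert (3, 11) <= sys.version_info[:2] < (3, 12), "mad-prefect 1.0.0 requires Python >=3.11,<3.12"
subprocess.check_call([sys.executable, "-m", "pip", "install", "mad-prefect==1.0.0"])
```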
@@ -0,0 +1,20 @@
+ # Introduction
+ TODO: Give a short introduction of your project. Let this section explain the objectives or the motivation behind this project.
+
+ # Getting Started
+ TODO: Guide users through getting your code up and running on their own system. In this section you can talk about:
+ 1. Installation process
+ 2. Software dependencies
+ 3. Latest releases
+ 4. API references
+
+ # Build and Test
+ TODO: Describe and show how to build your code and run the tests.
+
+ # Contribute
+ TODO: Explain how other users and developers can contribute to make your code better.
+
+ If you want to learn more about creating good readme files, refer to the following [guidelines](https://docs.microsoft.com/en-us/azure/devops/repos/git/create-a-readme?view=azure-devops). You can also seek inspiration from the readme files below:
+ - [ASP.NET Core](https://github.com/aspnet/Home)
+ - [Visual Studio Code](https://github.com/Microsoft/vscode)
+ - [Chakra Core](https://github.com/Microsoft/ChakraCore)
@@ -0,0 +1,35 @@
+ import datetime
+ import os
+ from typing import Callable, Literal
+
+ from mad_prefect.data_assets.options import ReadJsonOptions
+
+ ASSET_METADATA_LOCATION = os.getenv("ASSET_METADATA_LOCATION", ".asset_metadata")
+ ARTIFACT_FILE_TYPES = Literal["parquet", "json"]
+
+
+ def asset(
+     path: str,
+     artifacts_dir: str = "",
+     name: str | None = None,
+     snapshot_artifacts: bool = False,
+     artifact_filetype: ARTIFACT_FILE_TYPES = "json",
+     read_json_options: ReadJsonOptions | None = None,
+     cache_expiration: datetime.timedelta | None = None,
+ ):
+     # Prevent a circular reference as it references the env variable
+     from mad_prefect.data_assets.data_asset import DataAsset
+
+     def decorator(fn: Callable):
+         return DataAsset(
+             fn,
+             path,
+             artifacts_dir,
+             name,
+             snapshot_artifacts,
+             artifact_filetype,
+             read_json_options,
+             cache_expiration,
+         )
+
+     return decorator
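The decorator only defers to `DataAsset` (defined elsewhere in the package), so the diff does not show a call site. A minimal usage sketch, assuming the decorated async generator's yielded batches flow through the artifact machinery shown below; the endpoint URL, paths, and field names are hypothetical:

```python
import httpx

from mad_prefect.data_assets import asset


# Hypothetical asset: fetch one page of records and let mad-prefect persist them.
@asset(
    path="bronze/orgs/orgs.parquet",          # hypothetical final asset path
    artifacts_dir="bronze/orgs/_artifacts",   # hypothetical fragment directory
    artifact_filetype="json",
    snapshot_artifacts=False,
)
async def orgs():
    async with httpx.AsyncClient() as client:
        # httpx.Response objects are unpacked via .json() by DataArtifact downstream
        yield await client.get("https://api.example.com/orgs")


# Based on how DataArtifact awaits assets below, a DataAsset is called and awaited:
# result = await orgs()
```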
@@ -0,0 +1,222 @@
+ import json
+ import os
+ from typing import BinaryIO, Sequence, cast
+ import duckdb
+ import httpx
+ import jsonlines
+ import pandas as pd
+ from mad_prefect.data_assets import ARTIFACT_FILE_TYPES
+ from mad_prefect.data_assets.options import ReadJsonOptions
+ from mad_prefect.data_assets.utils import yield_data_batches
+ from mad_prefect.duckdb import register_mad_protocol
+ from mad_prefect.filesystems import get_fs
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ from mad_prefect.json.mad_json_encoder import MADJSONEncoder
+
+
+ class DataArtifact:
+     def __init__(
+         self,
+         path: str,
+         data: object | None = None,
+         read_json_options: ReadJsonOptions | None = None,
+     ):
+         self.path = path
+         filetype = os.path.splitext(self.path)[1].lstrip(".")
+
+         if filetype not in ["json", "parquet"]:
+             raise ValueError(f"Unsupported file type: {filetype}")
+
+         self.filetype: ARTIFACT_FILE_TYPES = cast(ARTIFACT_FILE_TYPES, filetype)
+         self.data = data
+         self.read_json_options = read_json_options or ReadJsonOptions()
+         self.persisted = False
+
+     async def persist(self):
+         # If we've already persisted this artifact this session, don't do anything
+         if self.persisted:
+             return True
+
+         if not self._truthy(self.data):
+             return False
+
+         await register_mad_protocol()
+         duckdb.query("SET temp_directory = './.tmp/duckdb/'")
+
+         if self.filetype == "json":
+             await self._persist_json()
+         elif self.filetype == "parquet":
+             await self._persist_parquet()
+         else:
+             raise ValueError(f"Unsupported file format {self.filetype}")
+
+         self.persisted = await self.exists()
+         return self.persisted
+
+     async def _open(self):
+         fs = await get_fs()
+         return cast(BinaryIO, await fs.open(self.path, "wb", True))
+
+     async def _persist_json(self):
+         entities = self._yield_entities_to_persist()
+         file: BinaryIO | None = None
+         writer: jsonlines.Writer | None = None
+
+         try:
+             next_entity = await anext(entities)
+
+             while self._truthy(next_entity):
+                 # Lazily open the file and writer once the first entity arrives
+                 if not file or not writer:
+                     file = await self._open()
+                     writer = jsonlines.Writer(
+                         file,
+                         dumps=lambda obj: json.dumps(obj, cls=MADJSONEncoder),  # type: ignore
+                     )
+
+                 table_or_batch: pa.RecordBatch | pa.Table = (
+                     next_entity
+                     if isinstance(next_entity, (pa.Table, pa.RecordBatch))
+                     else None
+                 )
+
+                 if table_or_batch:
+                     next_entity = table_or_batch.to_pylist()
+
+                 if not isinstance(next_entity, Sequence):
+                     next_entity = [next_entity]
+
+                 writer.write_all(next_entity)
+                 next_entity = await anext(entities)
+         except StopAsyncIteration as e:
+             pass
+         except Exception as e:
+             raise
+         finally:
+             if writer:
+                 writer.close()
+
+             if file:
+                 file.close()
+
+     async def _persist_parquet(self):
+         def __sanitize_data(data):
+             """
+             Recursively go through the data and replace any empty dictionaries
+             with None or a dummy value to avoid Parquet serialization errors.
+             """
+             if isinstance(data, dict):
+                 if not data:  # it's an empty dict
+                     return None  # or return {'dummy_field': None} to keep the key with a dummy field
+                 else:
+                     return {key: __sanitize_data(value) for key, value in data.items()}
+             elif isinstance(data, list):
+                 return [__sanitize_data(item) for item in data]
+             else:
+                 return data
+
+         entities = self._yield_entities_to_persist()
+         file: BinaryIO | None = None
+         writer: pq.ParquetWriter | None = None
+
+         try:
+             next_entity = await anext(entities)
+
+             while self._truthy(next_entity):
+                 b = __sanitize_data(next_entity)
+                 table_or_batch: pa.RecordBatch | pa.Table = (
+                     b
+                     if isinstance(b, (pa.Table, pa.RecordBatch))
+                     else pa.RecordBatch.from_pylist(b)
+                 )
+
+                 # Use the first entity to determine the file's schema
+                 if not file or not writer:
+                     file = await self._open()
+                     writer = pq.ParquetWriter(file, table_or_batch.schema)
+                 else:
+                     # If schema has evolved, adjust the current RecordBatch
+                     if not table_or_batch.schema.equals(writer.schema):
+                         unified_schema = pa.unify_schemas(
+                             [writer.schema, table_or_batch.schema],
+                             promote_options="permissive",
+                         )
+
+                         # Align the RecordBatch with the unified schema
+                         table_or_batch = table_or_batch.cast(unified_schema)
+
+                         # Manually adjust the schema of the writer if needed
+                         writer.schema = unified_schema
+
+                 writer.write(table_or_batch)
+                 next_entity = await anext(entities)
+         except StopAsyncIteration as e:
+             pass
+         except Exception as e:
+             raise
+         finally:
+             if writer:
+                 writer.close()
+
+             if file:
+                 file.close()
+
+     async def _yield_entities_to_persist(self):
+         from mad_prefect.data_assets.data_asset import DataAsset
+
+         async for batch_data in yield_data_batches(self.data):
+             # If the data is an asset, execute it to get the result artifact
+             if isinstance(batch_data, DataAsset):
+                 batch_data = await batch_data()
+
+             # If the entity is a DataArtifact, query it into a DuckDBPyRelation so it can be handled
+             if isinstance(batch_data, DataArtifact):
+                 # An artifact may not exist for example when there were no results
+                 if not await batch_data.exists():
+                     continue
+
+                 batch_data = await batch_data.query()
+
+             if isinstance(batch_data, pd.DataFrame):
+                 batch_data = duckdb.from_df(batch_data)
+
+             if isinstance(batch_data, (duckdb.DuckDBPyRelation)):
+                 # Convert duckdb into batches of arrow tables
+                 reader = batch_data.fetch_arrow_reader(1000)
+
+                 while True:
+                     try:
+                         # this will yield a pyarrow RecordBatch
+                         batch = reader.read_next_batch()
+                         yield batch
+                     except StopIteration as stop:
+                         break
+
+             elif isinstance(batch_data, httpx.Response):
+                 yield batch_data.json()
+             else:
+                 yield batch_data
+
+     async def query(self, query_str: str | None = None):
+         from mad_prefect.data_assets.data_artifact_query import DataArtifactQuery
+
+         artifact_query = DataArtifactQuery([self], self.read_json_options)
+         return await artifact_query.query(query_str)
+
+     async def exists(self):
+         fs = await get_fs()
+         return fs.exists(self.path)
+
+     def _truthy(self, data):
+         if isinstance(data, pd.DataFrame):
+             if data.empty:
+                 return False
+         # duckdb hangs with the not self.data check, so make sure self.data isn't
+         # a duckdb pyrelation before checking self.data
+         elif isinstance(data, duckdb.DuckDBPyRelation):
+             pass
+         elif not data:
+             return False
+
+         return True
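As a minimal end-to-end sketch of the class above, an artifact built from plain records is persisted as JSON lines and read back through DuckDB. This assumes mad_prefect.filesystems is configured (via its environment variables, which are not shown in this diff) so that get_fs() can write to the path; the path and records are illustrative:

```python
import asyncio

from mad_prefect.data_assets.data_artifact import DataArtifact


async def main():
    # The ".json" extension selects the jsonlines writer in _persist_json.
    artifact = DataArtifact(
        "bronze/orgs/fragment=0.json",  # hypothetical path on the configured filesystem
        data=[{"id": 1, "name": "Acme"}, {"id": 2, "name": "Globex"}],
    )

    # persist() returns False without writing when the data is "falsy" (see _truthy).
    if await artifact.persist():
        # query() builds a DuckDB relation over the persisted file via DataArtifactQuery;
        # the optional argument is appended after "FROM artifact_query" (FROM-first syntax).
        relation = await artifact.query("WHERE id = 1")
        print(relation)


asyncio.run(main())
```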
@@ -0,0 +1,69 @@
+ import httpx
+ from mad_prefect.data_assets import ARTIFACT_FILE_TYPES
+ from mad_prefect.data_assets.options import ReadJsonOptions
+ from mad_prefect.data_assets.utils import yield_data_batches
+ from mad_prefect.data_assets.data_artifact import DataArtifact
+ from mad_prefect.data_assets.data_artifact_query import DataArtifactQuery
+
+
+ class DataArtifactCollector:
+
+     def __init__(
+         self,
+         collector: object,
+         dir: str,
+         filetype: ARTIFACT_FILE_TYPES = "json",
+         artifacts: list[DataArtifact] | None = None,
+         read_json_options: ReadJsonOptions | None = None,
+     ):
+         self.collector = collector
+         self.dir = dir
+         self.filetype = filetype
+         self.artifacts = artifacts or []
+         self.read_json_options = read_json_options or ReadJsonOptions()
+
+     async def collect(self):
+         fragment_num = 0
+
+         async for fragment in yield_data_batches(self.collector):
+             # If the output isn't a DataArtifact manually set the params & base_path
+             # and initialize the output as a DataArtifact
+             if isinstance(fragment, DataArtifact):
+                 fragment_artifact = fragment
+             else:
+                 params = (
+                     dict(fragment.request.url.params)
+                     if isinstance(fragment, httpx.Response)
+                     and fragment.request.url.params
+                     else None
+                 )
+
+                 path = self._build_artifact_path(self.dir, params, fragment_num)
+                 fragment_artifact = DataArtifact(path, fragment, self.read_json_options)
+
+             if await fragment_artifact.persist():
+                 self.artifacts.append(fragment_artifact)
+                 fragment_num += 1
+
+         artifact_query = DataArtifactQuery(
+             artifacts=self.artifacts,
+             read_json_options=self.read_json_options,
+         )
+
+         return await artifact_query.query()
+
+     def _build_artifact_path(
+         self,
+         base_path: str,
+         params: dict | None = None,
+         fragment_number: int | None = None,
+     ):
+         filetype = self.filetype
+         base_path = base_path.rstrip("/")
+
+         if params is None:
+             return f"{base_path}/fragment={fragment_number}.{filetype}"
+
+         params_path = "/".join(f"{key}={value}" for key, value in params.items())
+
+         return f"{base_path}/{params_path}.{filetype}"
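The collector lays fragments out as hive-style key=value partitions, matching the hive_partitioning = TRUE option used for parquet reads in DataArtifactQuery. A small illustration of the paths _build_artifact_path produces, with hypothetical directory and parameter values:

```python
from mad_prefect.data_assets.data_artifact_collector import DataArtifactCollector

# The collector argument is only iterated inside collect(), so None is fine for this sketch.
collector = DataArtifactCollector(collector=None, dir="raw/orgs", filetype="json")

# Without query params, the fragment counter names the file.
print(collector._build_artifact_path("raw/orgs", None, 0))
# -> raw/orgs/fragment=0.json

# Query params from an httpx.Response become hive-style key=value segments.
print(collector._build_artifact_path("raw/orgs", {"page": 2, "limit": 100}))
# -> raw/orgs/page=2/limit=100.json
```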
@@ -0,0 +1,143 @@
+ from typing import cast
+ import duckdb
+ from mad_prefect.data_assets import ARTIFACT_FILE_TYPES
+ from mad_prefect.data_assets.options import ReadJsonOptions
+ from mad_prefect.duckdb import register_mad_protocol
+ from mad_prefect.data_assets.data_artifact import DataArtifact
+
+
+ class DataArtifactQuery:
+
+     def __init__(
+         self,
+         artifacts: list[DataArtifact] | None = None,
+         read_json_options: ReadJsonOptions | None = None,
+     ):
+         self.artifacts = artifacts or []
+         self.read_json_options = read_json_options or ReadJsonOptions()
+
+     async def query(self, query_str: str | None = None):
+         await register_mad_protocol()
+
+         # Get the globs for any artifacts which exist
+         existing_artifacts = [a for a in self.artifacts if await a.exists()]
+         globs = [f"mad://{a.path.strip('/')}" for a in existing_artifacts]
+
+         if not globs:
+             return
+
+         # Ensure each artifact is of the same filetype
+         filetypes = set([a.filetype for a in existing_artifacts])
+
+         if not filetypes or len(filetypes) > 1:
+             raise ValueError("Cannot query artifacts of different filetypes")
+
+         # Get the base query
+         filetype: ARTIFACT_FILE_TYPES = cast(ARTIFACT_FILE_TYPES, filetypes.pop())
+
+         if filetype == "json":
+             artifact_query = self._create_query_json(globs)
+         elif filetype == "parquet":
+             artifact_query = self._create_query_parquet(globs)
+         else:
+             raise ValueError(f"Unsupported file format {filetype}")
+
+         # Apply any additional query on top
+         if query_str:
+             return duckdb.query(f"FROM artifact_query {query_str}")
+
+         return artifact_query
+
+     def _create_query_json(self, globs: list[str]):
+         # Prepare the globs string
+         globs_str = ", ".join(f"'{g}'" for g in globs)
+         globs_formatted = f"[{globs_str}]"
+
+         # Build the base options dict without 'columns'
+         base_options = self.read_json_options.model_dump(
+             exclude={"columns"},
+             exclude_none=True,
+         )
+         options_str = self._format_options_dict(base_options)
+
+         # Build the base query string without 'columns'
+         base_query = (
+             f"SELECT * FROM read_json({globs_formatted}, {options_str})"
+             if options_str
+             else f"SELECT * FROM read_json({globs_formatted})"
+         )
+
+         # Process columns after building the base query
+         if self.read_json_options.columns:
+             updated_columns = self._process_columns(
+                 base_query, self.read_json_options.columns
+             )
+
+             # Include 'columns' in options
+             options_with_columns = base_options.copy()
+             options_with_columns["columns"] = updated_columns
+             options_str_with_columns = self._format_options_dict(options_with_columns)
+
+             # Rebuild the query with 'columns'
+             final_query = f"SELECT * FROM read_json({globs_formatted}, {options_str_with_columns})"
+         else:
+             final_query = base_query
+
+         # Execute the query
+         artifact_query = duckdb.query(final_query)
+         return artifact_query
+
+     def _process_columns(
+         self,
+         base_query: str,
+         columns: dict[str, str],
+     ) -> dict[str, str]:
+         # Describe the base query to get the schema
+         schema_info = duckdb.query(f"DESCRIBE {base_query}").fetchall()
+         schema_columns = {row[0]: row[1] for row in schema_info}
+
+         # Update column types based on provided columns
+         updated_columns = {}
+         for col_name, col_type in schema_columns.items():
+             if col_name in columns:
+                 # Use the provided type
+                 updated_columns[col_name] = columns[col_name]
+             else:
+                 # Use the existing type from the schema
+                 updated_columns[col_name] = col_type
+
+         return updated_columns
+
+     def _create_query_parquet(self, globs: list[str]):
+         # Prepare the globs string
+         globs_str = ", ".join(f"'{g}'" for g in globs)
+         globs_formatted = f"[{globs_str}]"
+
+         # Include only relevant options
+         options_dict = {"hive_partitioning": True, "union_by_name": True}
+         options_str = self._format_options_dict(options_dict)
+
+         # Build the query string
+         artifact_base_query = (
+             f"SELECT * FROM read_parquet({globs_formatted}, {options_str})"
+         )
+
+         # Execute the query
+         artifact_query = duckdb.query(artifact_base_query)
+         return artifact_query
+
+     def _format_options_dict(self, options_dict: dict) -> str:
+         def format_value(key, value):
+             if isinstance(value, bool):
+                 return "TRUE" if value else "FALSE"
+             elif isinstance(value, str):
+                 return f"'{value}'"
+             elif isinstance(value, dict):
+                 return f"{value}"
+             else:
+                 return str(value)
+
+         options_str = ", ".join(
+             f"{key} = {format_value(key, value)}" for key, value in options_dict.items()
+         )
+         return options_str
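To make the generated SQL concrete, here is roughly what _format_options_dict produces for a hand-written options dict. ReadJsonOptions and its model_dump output are defined elsewhere in the package, so the keys below are only illustrative:

```python
from mad_prefect.data_assets.data_artifact_query import DataArtifactQuery

query = DataArtifactQuery()

# Booleans render as TRUE/FALSE, strings are quoted, and dicts are interpolated with
# Python's repr (which is how the rebuilt "columns" mapping reaches read_json).
options = {"format": "array", "union_by_name": True, "columns": {"id": "BIGINT"}}
print(query._format_options_dict(options))
# -> format = 'array', union_by_name = TRUE, columns = {'id': 'BIGINT'}
```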