misata 0.2.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +77 -2
- misata/cache.py +258 -0
- misata/constraints.py +307 -0
- misata/context.py +259 -0
- misata/exceptions.py +277 -0
- misata/generators/__init__.py +29 -0
- misata/generators/base.py +586 -0
- misata/profiles.py +332 -0
- misata/smart_values.py +171 -2
- misata/streaming.py +228 -0
- {misata-0.2.0b0.dist-info → misata-0.3.0b0.dist-info}/METADATA +1 -1
- {misata-0.2.0b0.dist-info → misata-0.3.0b0.dist-info}/RECORD +16 -8
- {misata-0.2.0b0.dist-info → misata-0.3.0b0.dist-info}/WHEEL +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.0b0.dist-info}/entry_points.txt +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.0b0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.0b0.dist-info}/top_level.txt +0 -0
misata/streaming.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Streaming export utilities for Misata.
|
|
3
|
+
|
|
4
|
+
Provides streaming CSV/Parquet export to handle large datasets
|
|
5
|
+
without loading everything into memory.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import csv
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Union
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
from misata.exceptions import ExportError, FileWriteError
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class StreamingExporter:
    """Export data in streaming fashion to handle large datasets.

    Instead of building a full DataFrame and then exporting, this writes
    batches directly to files as they are generated.

    CSV batches are appended to ``<output_dir>/<table_name>.csv``.
    Parquet batches are written as row groups to
    ``<output_dir>/<table_name>.parquet`` through a writer that stays open
    until :meth:`finalize` is called; a Parquet file is only complete (its
    footer written) after ``finalize()``.

    The exporter can also be used as a context manager, in which case
    ``finalize()`` is called automatically on exit.

    Example:
        exporter = StreamingExporter(output_dir="./data")

        for table_name, batch_df in simulator.generate_all():
            exporter.write_batch(table_name, batch_df)

        exporter.finalize()
    """

    def __init__(
        self,
        output_dir: str,
        format: str = "csv",
        progress_callback: Optional[Callable[[str, int], None]] = None,
    ):
        """Initialize the exporter.

        Args:
            output_dir: Directory to write files to (created if missing)
            format: Export format ('csv' or 'parquet'), case-insensitive
            progress_callback: Optional callback(table_name, rows_written),
                invoked after every batch with the cumulative row count
        """
        self.output_dir = Path(output_dir)
        self.format = format.lower()
        self.progress_callback = progress_callback

        # Open per-table writers (Parquet); closed by finalize().
        self._file_handles: Dict[str, Any] = {}
        self._csv_writers: Dict[str, Any] = {}
        self._rows_written: Dict[str, int] = {}
        self._headers_written: Dict[str, bool] = {}

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def __enter__(self) -> "StreamingExporter":
        """Return the exporter itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Finalize the export when leaving a ``with`` block."""
        self.finalize()

    def write_batch(self, table_name: str, df: pd.DataFrame) -> int:
        """Write a batch of data to the appropriate file.

        Args:
            table_name: Name of the table
            df: Batch DataFrame to write

        Returns:
            Number of rows written

        Raises:
            ExportError: If the configured format is not supported
            FileWriteError: If writing the batch fails
        """
        if self.format == "csv":
            return self._write_csv_batch(table_name, df)
        elif self.format == "parquet":
            return self._write_parquet_batch(table_name, df)
        else:
            raise ExportError(f"Unsupported format: {self.format}")

    def _write_csv_batch(self, table_name: str, df: pd.DataFrame) -> int:
        """Write a batch to CSV file.

        The first batch for a table truncates the file and writes the header;
        later batches are appended. The DataFrame index is not written.
        """
        file_path = self.output_dir / f"{table_name}.csv"
        first_batch = table_name not in self._headers_written

        try:
            mode = 'w' if first_batch else 'a'
            with open(file_path, mode, newline='', encoding='utf-8') as f:
                writer = csv.writer(f)

                if first_batch:
                    writer.writerow(df.columns.tolist())
                    self._headers_written[table_name] = True
                    self._rows_written[table_name] = 0

                # itertuples() keeps each column's own dtype. iterrows()
                # coerces every row to a single dtype, which turned integer
                # columns into floats ("1" -> "1.0") next to float columns.
                writer.writerows(df.itertuples(index=False, name=None))

            rows = len(df)
            self._rows_written[table_name] = self._rows_written.get(table_name, 0) + rows

            if self.progress_callback:
                self.progress_callback(table_name, self._rows_written[table_name])

            return rows

        except Exception as e:
            raise FileWriteError(f"Failed to write CSV: {e}", path=str(file_path)) from e

    def _write_parquet_batch(self, table_name: str, df: pd.DataFrame) -> int:
        """Write a batch to a Parquet file as a new row group.

        A ``ParquetWriter`` is opened on the first batch of each table and
        reused for later batches, so earlier data is never re-read or
        rewritten. Later batches are converted using the schema of the first
        batch so that all row groups agree. The file is complete only once
        ``finalize()`` has closed the writer.
        """
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq
        except ImportError as e:
            raise ExportError(
                "PyArrow required for Parquet export",
                details={"suggestion": "pip install pyarrow"}
            ) from e

        file_path = self.output_dir / f"{table_name}.parquet"

        try:
            writer = self._file_handles.get(table_name)
            table = None

            if writer is None:
                previous = None
                if table_name in self._rows_written and file_path.exists():
                    # This exporter already wrote and finalized the table:
                    # carry the earlier rows over once so that writing after
                    # finalize() keeps appending instead of losing data.
                    previous = pq.read_table(file_path)
                    schema = previous.schema
                else:
                    table = pa.Table.from_pandas(df)
                    schema = table.schema

                writer = pq.ParquetWriter(str(file_path), schema)
                self._file_handles[table_name] = writer

                if previous is not None:
                    writer.write_table(previous)

            if table is None:
                table = pa.Table.from_pandas(df, schema=writer.schema)

            writer.write_table(table)

            rows = len(df)
            self._rows_written[table_name] = self._rows_written.get(table_name, 0) + rows

            if self.progress_callback:
                self.progress_callback(table_name, self._rows_written[table_name])

            return rows

        except Exception as e:
            raise FileWriteError(f"Failed to write Parquet: {e}", path=str(file_path)) from e

    def finalize(self) -> Dict[str, int]:
        """Finalize all exports and return summary.

        Closes every open writer. All writers are closed even if one of them
        fails; the first failure is then reported.

        Returns:
            Dict mapping table names to row counts

        Raises:
            FileWriteError: If closing a writer fails (the corresponding
                file may be incomplete)
        """
        failures = []
        for table_name, handle in self._file_handles.items():
            try:
                handle.close()
            except Exception as e:
                failures.append((table_name, e))

        self._file_handles.clear()
        self._csv_writers.clear()

        if failures:
            table_name, error = failures[0]
            raise FileWriteError(
                f"Failed to finalize export for table '{table_name}': {error}",
                path=str(self.get_file_paths().get(table_name, self.output_dir)),
            ) from error

        return self._rows_written.copy()

    def get_file_paths(self) -> Dict[str, Path]:
        """Get paths to all exported files, keyed by table name."""
        ext = ".csv" if self.format == "csv" else ".parquet"
        return {
            table: self.output_dir / f"{table}{ext}"
            for table in self._rows_written
        }

    def get_stats(self) -> Dict[str, Any]:
        """Get export statistics.

        Returns:
            Dict with the output directory, the format, per-table row counts
            ("tables") and the overall row count ("total_rows")
        """
        return {
            "output_dir": str(self.output_dir),
            "format": self.format,
            "tables": self._rows_written.copy(),
            "total_rows": sum(self._rows_written.values()),
        }
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def stream_to_csv(
    generator: Generator[tuple, None, None],
    output_dir: str,
    progress_callback: Optional[Callable[[str, int], None]] = None,
) -> Dict[str, int]:
    """Stream data from a generator directly to CSV files.

    Each yielded batch is written through a ``StreamingExporter`` configured
    for CSV. ``finalize()`` is always called, even when the generator or a
    write raises, so anything the exporter holds open is released before the
    error propagates.

    Args:
        generator: Iterator yielding (table_name, batch_df) tuples
        output_dir: Directory to write files to
        progress_callback: Optional callback(table_name, rows_written)

    Returns:
        Dict mapping table names to final row counts
    """
    exporter = StreamingExporter(
        output_dir=output_dir,
        format="csv",
        progress_callback=progress_callback,
    )

    try:
        for table_name, batch_df in generator:
            exporter.write_batch(table_name, batch_df)
    finally:
        summary = exporter.finalize()

    return summary
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def stream_to_parquet(
    generator: Generator[tuple, None, None],
    output_dir: str,
    progress_callback: Optional[Callable[[str, int], None]] = None,
) -> Dict[str, int]:
    """Stream data from a generator directly to Parquet files.

    Each yielded batch is written through a ``StreamingExporter`` configured
    for Parquet. ``finalize()`` is always called, even when the generator or
    a write raises, so anything the exporter holds open is released before
    the error propagates.

    Args:
        generator: Iterator yielding (table_name, batch_df) tuples
        output_dir: Directory to write files to
        progress_callback: Optional callback(table_name, rows_written)

    Returns:
        Dict mapping table names to final row counts
    """
    exporter = StreamingExporter(
        output_dir=output_dir,
        format="parquet",
        progress_callback=progress_callback,
    )

    try:
        for table_name, batch_df in generator:
            exporter.write_batch(table_name, batch_df)
    finally:
        summary = exporter.finalize()

    return summary
|
|
@@ -1,29 +1,37 @@
|
|
|
1
|
-
misata/__init__.py,sha256=
|
|
1
|
+
misata/__init__.py,sha256=Vra5zMkd5Y6HTzhGRc76jTv10Z0yuhw33MDUoLpACrE,3144
|
|
2
2
|
misata/api.py,sha256=Wq2H3iJzocNTsCzb9vhYJxDyag3Yiucvb-GVF0tdKhI,14999
|
|
3
3
|
misata/audit.py,sha256=4eUCHT2STptemfakWeNODbVuBRhyD8Q32LlB2eufvuw,12291
|
|
4
4
|
misata/benchmark.py,sha256=Y1-tuKegJyAlTneROQpPo276qnfmMmupGDbVDs9k5J8,12358
|
|
5
|
+
misata/cache.py,sha256=fuLk7cQ7hOEmlqEWmm-O516L26btZ6zFO8FdrqFCRLg,7087
|
|
5
6
|
misata/cli.py,sha256=a7YijZCUYrkCYGVYJ2nZSL9J3JfFqbXQQOad6bhy7zM,22642
|
|
6
7
|
misata/codegen.py,sha256=m7ykTtLgITvaqzVB1cVhs1b9Puo2X4uyzngZ85wi6J0,5791
|
|
8
|
+
misata/constraints.py,sha256=8jUKlA2VyVomDnl2zz0RDkUqxEkxlwUnBbHTXr_SA5g,9937
|
|
9
|
+
misata/context.py,sha256=tjYrU67wjII07Pl3MKV_uCMl_s55DIOQZCouxAryyzE,8509
|
|
7
10
|
misata/curve_fitting.py,sha256=gLj4BkIxNWKkfo3QKZFI_aq60bsXlI53K5yZX4hc9EU,4126
|
|
8
11
|
misata/customization.py,sha256=pw-BEsPKN091hyOrQWWQoRhTrlmQ9_PXXopm2FZSEvs,8551
|
|
12
|
+
misata/exceptions.py,sha256=C3IGMk8xAy9AmRVWeSAnLHHui7drv6rzgzvOmr6gh50,8335
|
|
9
13
|
misata/feedback.py,sha256=HBEsoKi_vdRqwRzMoVFVj_cjfzQ5SUAaGz40s1HMD50,13313
|
|
10
14
|
misata/formulas.py,sha256=KOTq5YN_19vv1ERd92bdzKot9yo9rrrwjOuWO13nFCg,11210
|
|
11
15
|
misata/generators.py,sha256=NrMF12i6CB7K6fUsqcqurmZBBQ382ZhVnYB9oMBIZCE,8844
|
|
12
16
|
misata/hybrid.py,sha256=5oopAdfOLWUYzdRWlc0plVeVEVg7Nu1CVGNNCDSjQt8,13104
|
|
13
17
|
misata/llm_parser.py,sha256=2SVozbKtb0kaPaR4ERz9FtIIxK5jQVaYJ8L_xC6gU10,20662
|
|
14
18
|
misata/noise.py,sha256=UO7MokzQ5Y5Vj7JaayDUG0JwCLnpHtnpQTcJ4UHWibo,10460
|
|
19
|
+
misata/profiles.py,sha256=0djys8wWvH8VP74KmGn6cGLuOb64h9Hk0g0bkXOfxP4,9578
|
|
15
20
|
misata/quality.py,sha256=VSntJfMnF1tVWJ05fvbVJOMcAPEB7QtuEg18k6aEwhA,11685
|
|
16
21
|
misata/schema.py,sha256=zMYDPCgPfcy_STgANiS-Ow3dUETpW3Ayo02G88jmBe0,8954
|
|
17
22
|
misata/semantic.py,sha256=0fauGWJ75wlbHVqT0hohYTN4m_nscdaMaVAIfkhTZXk,7087
|
|
18
23
|
misata/simulator.py,sha256=nq9KxOS-4oUMNu7a2Ten0TQyhT2u_rTo2ImmvdkMRbU,34037
|
|
19
|
-
misata/smart_values.py,sha256=
|
|
24
|
+
misata/smart_values.py,sha256=8-TYBK5cVBst9tfGuQXXetOLSqgns_NKnIl14rpVrbk,35870
|
|
20
25
|
misata/story_parser.py,sha256=7N7so3KWisl2UxkOtENQwP-4hN2cs9vTKsPHVRZB2Mc,15964
|
|
26
|
+
misata/streaming.py,sha256=qbEnoFRfn9a7H_gWlq5C3TwbNUnP5U98OPo1EdU_cQ0,7578
|
|
21
27
|
misata/validation.py,sha256=5yJSN7jecVNLJ8ss6y7l2U4hF1Ljn27Q6Xs9N1iDPcw,10791
|
|
28
|
+
misata/generators/__init__.py,sha256=V4I_1IucuywRJZH3cLxKvBd2Ib7kE0WIJ7tq8y4lkx8,568
|
|
29
|
+
misata/generators/base.py,sha256=iON9iAONMEQdbq2Fdric3V3bWn3caD1ITC16DTCK0Og,21329
|
|
22
30
|
misata/templates/__init__.py,sha256=0RcZz9d4bmCqLAr77h0gpMfHncqAPeZCguqsuGCz7rE,25245
|
|
23
31
|
misata/templates/library.py,sha256=eMex18ZKlzQqIkGFgs1uy9QGs7PmUN_VVL4txKvxynM,20930
|
|
24
|
-
misata-0.
|
|
25
|
-
misata-0.
|
|
26
|
-
misata-0.
|
|
27
|
-
misata-0.
|
|
28
|
-
misata-0.
|
|
29
|
-
misata-0.
|
|
32
|
+
misata-0.3.0b0.dist-info/licenses/LICENSE,sha256=oagkechmfr9iT214N871zCm7TnB0KTfPjAUWxHsYJ4I,1071
|
|
33
|
+
misata-0.3.0b0.dist-info/METADATA,sha256=Wxpa2V0Sum-CFOpNnmRd27eEDfyT9CKIy-4nGZnrCys,8114
|
|
34
|
+
misata-0.3.0b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
35
|
+
misata-0.3.0b0.dist-info/entry_points.txt,sha256=k3SDuju7VnqB4AcY0Vufw-j1tWU3Ay612G3DGqoNs0U,43
|
|
36
|
+
misata-0.3.0b0.dist-info/top_level.txt,sha256=dpwR99XWKUAXqNg7WiNLu_XYd7WYGmZpJzrfQXbAZFs,7
|
|
37
|
+
misata-0.3.0b0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|