misata 0.2.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/streaming.py ADDED
@@ -0,0 +1,228 @@
1
+ """
2
+ Streaming export utilities for Misata.
3
+
4
+ Provides streaming CSV/Parquet export to handle large datasets
5
+ without loading everything into memory.
6
+ """
7
+
8
+ import csv
9
+ import os
10
+ from pathlib import Path
11
+ from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Union
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+ from misata.exceptions import ExportError, FileWriteError
17
+
18
+
19
class StreamingExporter:
    """Export data in streaming fashion to handle large datasets.

    Instead of building a full DataFrame and then exporting, this writes
    batches directly to files as they are generated.

    CSV batches are appended to ``<output_dir>/<table>.csv``. Parquet batches
    are written through one open ``pyarrow.parquet.ParquetWriter`` per table,
    so a Parquet file is only complete (footer written) once ``finalize()``
    has closed its writer. Always call ``finalize()`` when done.

    Example:
        exporter = StreamingExporter(output_dir="./data")

        for table_name, batch_df in simulator.generate_all():
            exporter.write_batch(table_name, batch_df)

        exporter.finalize()
    """

    def __init__(
        self,
        output_dir: str,
        format: str = "csv",
        progress_callback: Optional[Callable[[str, int], None]] = None,
    ):
        """Initialize the exporter.

        Args:
            output_dir: Directory to write files to (created if missing)
            format: Export format ('csv' or 'parquet'), case-insensitive
            progress_callback: Optional callback(table_name, rows_written),
                called after every batch with the cumulative row count
        """
        self.output_dir = Path(output_dir)
        self.format = format.lower()
        self.progress_callback = progress_callback

        # Open Parquet writers keyed by table name; closed by finalize().
        self._file_handles: Dict[str, Any] = {}
        # Kept for backward compatibility; CSV files are reopened per batch.
        self._csv_writers: Dict[str, csv.writer] = {}
        # Cumulative rows written per table by this exporter instance.
        self._rows_written: Dict[str, int] = {}
        # Tables whose CSV header has already been written.
        self._headers_written: Dict[str, bool] = {}

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def write_batch(self, table_name: str, df: pd.DataFrame) -> int:
        """Write a batch of data to the appropriate file.

        Args:
            table_name: Name of the table
            df: Batch DataFrame to write

        Returns:
            Number of rows written

        Raises:
            ExportError: If the configured format is not supported
            FileWriteError: If writing the batch fails
        """
        if self.format == "csv":
            return self._write_csv_batch(table_name, df)
        elif self.format == "parquet":
            return self._write_parquet_batch(table_name, df)
        else:
            raise ExportError(f"Unsupported format: {self.format}")

    def _record_rows(self, table_name: str, rows: int) -> int:
        """Add ``rows`` to the table's running total and notify the callback.

        Returns ``rows`` unchanged so writers can ``return`` the call directly.
        """
        total = self._rows_written.get(table_name, 0) + rows
        self._rows_written[table_name] = total

        if self.progress_callback:
            self.progress_callback(table_name, total)

        return rows

    def _write_csv_batch(self, table_name: str, df: pd.DataFrame) -> int:
        """Write a batch to CSV file.

        The first batch for a table truncates the file and writes the header;
        later batches are appended.

        Raises:
            FileWriteError: If anything fails while writing the batch
        """
        file_path = self.output_dir / f"{table_name}.csv"
        first_batch = table_name not in self._headers_written

        try:
            # One open per batch: truncate on the first batch, append afterwards.
            mode = "w" if first_batch else "a"
            with open(file_path, mode, newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                if first_batch:
                    writer.writerow(df.columns.tolist())
                # itertuples keeps each column's own dtype. iterrows would
                # turn every row into a Series and upcast mixed int/float
                # rows, writing integer ids as "1.0".
                writer.writerows(df.itertuples(index=False, name=None))

            if first_batch:
                # Only mark the header as written once the batch succeeded,
                # so a failed first batch is retried from a clean file.
                self._headers_written[table_name] = True

            return self._record_rows(table_name, len(df))

        except Exception as e:
            raise FileWriteError(
                f"Failed to write CSV: {e}", path=str(file_path)
            ) from e

    def _write_parquet_batch(self, table_name: str, df: pd.DataFrame) -> int:
        """Write a batch to Parquet file through a per-table writer.

        The first batch for a table creates (or overwrites) the file and opens
        a ``ParquetWriter``; later batches are written through that writer
        instead of re-reading and rewriting the whole file.

        Raises:
            ExportError: If PyArrow is not installed
            FileWriteError: If anything fails while writing the batch
        """
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq
        except ImportError as e:
            raise ExportError(
                "PyArrow required for Parquet export",
                details={"suggestion": "pip install pyarrow"}
            ) from e

        file_path = self.output_dir / f"{table_name}.parquet"

        try:
            table = pa.Table.from_pandas(df)
            writer = self._file_handles.get(table_name)

            if writer is None:
                carried = None
                if table_name in self._rows_written and file_path.exists():
                    # This exporter already wrote the table and was finalized:
                    # carry those rows over so resuming still appends. A file
                    # left by an earlier run is not in _rows_written and is
                    # overwritten rather than silently appended to.
                    carried = pq.read_table(file_path)

                schema = table.schema if carried is None else carried.schema
                writer = pq.ParquetWriter(str(file_path), schema)
                self._file_handles[table_name] = writer

                if carried is not None:
                    writer.write_table(carried)

            writer.write_table(table)

            return self._record_rows(table_name, len(df))

        except Exception as e:
            raise FileWriteError(
                f"Failed to write Parquet: {e}", path=str(file_path)
            ) from e

    def finalize(self) -> Dict[str, int]:
        """Finalize all exports and return summary.

        Closes every open writer; Parquet files are not complete until this
        has run. All writers are closed even if one of them fails.

        Returns:
            Dict mapping table names to row counts

        Raises:
            FileWriteError: If closing a writer fails (first failure reported)
        """
        failures = []
        for table_name, handle in self._file_handles.items():
            try:
                handle.close()
            except Exception as e:
                # Keep closing the remaining writers before reporting.
                failures.append((table_name, e))

        self._file_handles.clear()
        self._csv_writers.clear()

        if failures:
            table_name, error = failures[0]
            ext = ".csv" if self.format == "csv" else ".parquet"
            raise FileWriteError(
                f"Failed to finalize export for {table_name}: {error}",
                path=str(self.output_dir / f"{table_name}{ext}"),
            ) from error

        return self._rows_written.copy()

    def get_file_paths(self) -> Dict[str, Path]:
        """Get paths to all exported files, keyed by table name."""
        ext = ".csv" if self.format == "csv" else ".parquet"
        return {
            table: self.output_dir / f"{table}{ext}"
            for table in self._rows_written
        }

    def get_stats(self) -> Dict[str, Any]:
        """Get export statistics.

        Returns:
            Dict with the output directory, format, per-table row counts
            and the total number of rows written
        """
        return {
            "output_dir": str(self.output_dir),
            "format": self.format,
            "tables": self._rows_written.copy(),
            "total_rows": sum(self._rows_written.values()),
        }
175
+
176
+
177
def stream_to_csv(
    generator: Iterator[tuple],
    output_dir: str,
    progress_callback: Optional[Callable[[str, int], None]] = None,
) -> Dict[str, int]:
    """Stream data from a generator directly to CSV files.

    Args:
        generator: Iterator yielding (table_name, batch_df) tuples
        output_dir: Directory to write files to
        progress_callback: Optional callback(table_name, rows_written)

    Returns:
        Dict mapping table names to final row counts
    """
    exporter = StreamingExporter(
        output_dir=output_dir,
        format="csv",
        progress_callback=progress_callback,
    )

    try:
        for table_name, batch_df in generator:
            exporter.write_batch(table_name, batch_df)
    finally:
        # Finalize even when the generator or a write raises, so the
        # exporter always gets the chance to release what it holds.
        summary = exporter.finalize()

    return summary
202
+
203
+
204
def stream_to_parquet(
    generator: Iterator[tuple],
    output_dir: str,
    progress_callback: Optional[Callable[[str, int], None]] = None,
) -> Dict[str, int]:
    """Stream data from a generator directly to Parquet files.

    Args:
        generator: Iterator yielding (table_name, batch_df) tuples
        output_dir: Directory to write files to
        progress_callback: Optional callback(table_name, rows_written)

    Returns:
        Dict mapping table names to final row counts
    """
    exporter = StreamingExporter(
        output_dir=output_dir,
        format="parquet",
        progress_callback=progress_callback,
    )

    try:
        for table_name, batch_df in generator:
            exporter.write_batch(table_name, batch_df)
    finally:
        # Finalize even when the generator or a write raises, so the
        # exporter always gets the chance to release what it holds.
        summary = exporter.finalize()

    return summary
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: misata
3
- Version: 0.2.0b0
3
+ Version: 0.3.0b0
4
4
  Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
5
5
  Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
6
6
  License: MIT
@@ -1,29 +1,37 @@
1
- misata/__init__.py,sha256=rTXPG74KAXwnJD9TchtkbOr3kVPaC6W6CrhTKEeSSyk,1404
1
+ misata/__init__.py,sha256=Vra5zMkd5Y6HTzhGRc76jTv10Z0yuhw33MDUoLpACrE,3144
2
2
  misata/api.py,sha256=Wq2H3iJzocNTsCzb9vhYJxDyag3Yiucvb-GVF0tdKhI,14999
3
3
  misata/audit.py,sha256=4eUCHT2STptemfakWeNODbVuBRhyD8Q32LlB2eufvuw,12291
4
4
  misata/benchmark.py,sha256=Y1-tuKegJyAlTneROQpPo276qnfmMmupGDbVDs9k5J8,12358
5
+ misata/cache.py,sha256=fuLk7cQ7hOEmlqEWmm-O516L26btZ6zFO8FdrqFCRLg,7087
5
6
  misata/cli.py,sha256=a7YijZCUYrkCYGVYJ2nZSL9J3JfFqbXQQOad6bhy7zM,22642
6
7
  misata/codegen.py,sha256=m7ykTtLgITvaqzVB1cVhs1b9Puo2X4uyzngZ85wi6J0,5791
8
+ misata/constraints.py,sha256=8jUKlA2VyVomDnl2zz0RDkUqxEkxlwUnBbHTXr_SA5g,9937
9
+ misata/context.py,sha256=tjYrU67wjII07Pl3MKV_uCMl_s55DIOQZCouxAryyzE,8509
7
10
  misata/curve_fitting.py,sha256=gLj4BkIxNWKkfo3QKZFI_aq60bsXlI53K5yZX4hc9EU,4126
8
11
  misata/customization.py,sha256=pw-BEsPKN091hyOrQWWQoRhTrlmQ9_PXXopm2FZSEvs,8551
12
+ misata/exceptions.py,sha256=C3IGMk8xAy9AmRVWeSAnLHHui7drv6rzgzvOmr6gh50,8335
9
13
  misata/feedback.py,sha256=HBEsoKi_vdRqwRzMoVFVj_cjfzQ5SUAaGz40s1HMD50,13313
10
14
  misata/formulas.py,sha256=KOTq5YN_19vv1ERd92bdzKot9yo9rrrwjOuWO13nFCg,11210
11
15
  misata/generators.py,sha256=NrMF12i6CB7K6fUsqcqurmZBBQ382ZhVnYB9oMBIZCE,8844
12
16
  misata/hybrid.py,sha256=5oopAdfOLWUYzdRWlc0plVeVEVg7Nu1CVGNNCDSjQt8,13104
13
17
  misata/llm_parser.py,sha256=2SVozbKtb0kaPaR4ERz9FtIIxK5jQVaYJ8L_xC6gU10,20662
14
18
  misata/noise.py,sha256=UO7MokzQ5Y5Vj7JaayDUG0JwCLnpHtnpQTcJ4UHWibo,10460
19
+ misata/profiles.py,sha256=0djys8wWvH8VP74KmGn6cGLuOb64h9Hk0g0bkXOfxP4,9578
15
20
  misata/quality.py,sha256=VSntJfMnF1tVWJ05fvbVJOMcAPEB7QtuEg18k6aEwhA,11685
16
21
  misata/schema.py,sha256=zMYDPCgPfcy_STgANiS-Ow3dUETpW3Ayo02G88jmBe0,8954
17
22
  misata/semantic.py,sha256=0fauGWJ75wlbHVqT0hohYTN4m_nscdaMaVAIfkhTZXk,7087
18
23
  misata/simulator.py,sha256=nq9KxOS-4oUMNu7a2Ten0TQyhT2u_rTo2ImmvdkMRbU,34037
19
- misata/smart_values.py,sha256=_jVE3kqqSnFqfKchFDwlsuzFPmyJhtgmpJr4O-wpXrA,28274
24
+ misata/smart_values.py,sha256=8-TYBK5cVBst9tfGuQXXetOLSqgns_NKnIl14rpVrbk,35870
20
25
  misata/story_parser.py,sha256=7N7so3KWisl2UxkOtENQwP-4hN2cs9vTKsPHVRZB2Mc,15964
26
+ misata/streaming.py,sha256=qbEnoFRfn9a7H_gWlq5C3TwbNUnP5U98OPo1EdU_cQ0,7578
21
27
  misata/validation.py,sha256=5yJSN7jecVNLJ8ss6y7l2U4hF1Ljn27Q6Xs9N1iDPcw,10791
28
+ misata/generators/__init__.py,sha256=V4I_1IucuywRJZH3cLxKvBd2Ib7kE0WIJ7tq8y4lkx8,568
29
+ misata/generators/base.py,sha256=iON9iAONMEQdbq2Fdric3V3bWn3caD1ITC16DTCK0Og,21329
22
30
  misata/templates/__init__.py,sha256=0RcZz9d4bmCqLAr77h0gpMfHncqAPeZCguqsuGCz7rE,25245
23
31
  misata/templates/library.py,sha256=eMex18ZKlzQqIkGFgs1uy9QGs7PmUN_VVL4txKvxynM,20930
24
- misata-0.2.0b0.dist-info/licenses/LICENSE,sha256=oagkechmfr9iT214N871zCm7TnB0KTfPjAUWxHsYJ4I,1071
25
- misata-0.2.0b0.dist-info/METADATA,sha256=t5yL_ZD7DNiH7TuCfaF4ZNT-5wYTiIaUSj8puQ4TROw,8114
26
- misata-0.2.0b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
27
- misata-0.2.0b0.dist-info/entry_points.txt,sha256=k3SDuju7VnqB4AcY0Vufw-j1tWU3Ay612G3DGqoNs0U,43
28
- misata-0.2.0b0.dist-info/top_level.txt,sha256=dpwR99XWKUAXqNg7WiNLu_XYd7WYGmZpJzrfQXbAZFs,7
29
- misata-0.2.0b0.dist-info/RECORD,,
32
+ misata-0.3.0b0.dist-info/licenses/LICENSE,sha256=oagkechmfr9iT214N871zCm7TnB0KTfPjAUWxHsYJ4I,1071
33
+ misata-0.3.0b0.dist-info/METADATA,sha256=Wxpa2V0Sum-CFOpNnmRd27eEDfyT9CKIy-4nGZnrCys,8114
34
+ misata-0.3.0b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
35
+ misata-0.3.0b0.dist-info/entry_points.txt,sha256=k3SDuju7VnqB4AcY0Vufw-j1tWU3Ay612G3DGqoNs0U,43
36
+ misata-0.3.0b0.dist-info/top_level.txt,sha256=dpwR99XWKUAXqNg7WiNLu_XYd7WYGmZpJzrfQXbAZFs,7
37
+ misata-0.3.0b0.dist-info/RECORD,,