mostlyai-mock 0.1.18__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.0}/PKG-INFO +2 -2
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.0}/README.md +1 -1
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.0}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.0}/mostlyai/mock/core.py +57 -9
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.0}/pyproject.toml +1 -1
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.0}/.gitignore +0 -0
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.0}/LICENSE +0 -0
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.0}/mostlyai/mock/mcp_server.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mostlyai-mock
|
3
|
-
Version: 0.1.18
|
3
|
+
Version: 0.2.0
|
4
4
|
Summary: Synthetic Mock Data
|
5
5
|
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
6
|
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
@@ -47,7 +47,7 @@ Use LLMs to generate any Tabular Data towards your needs. Create from scratch, e
|
|
47
47
|
* A light-weight python client for prompting LLMs for mixed-type tabular data.
|
48
48
|
* Select from a wide range of LLM endpoints and LLM models.
|
49
49
|
* Supports single-table as well as multi-table scenarios.
|
50
|
-
* Supports variety of data types: `string`, `
|
50
|
+
* Supports variety of data types: `string`, `integer`, `float`, `category`, `boolean`, `date`, and `datetime`.
|
51
51
|
* Specify context, distributions and rules via dataset-, table- or column-level prompts.
|
52
52
|
* Create from scratch or enrich existing datasets with new columns and/or rows.
|
53
53
|
* Tailor the diversity and realism of your generated data via temperature and top_p.
|
@@ -9,7 +9,7 @@ Use LLMs to generate any Tabular Data towards your needs. Create from scratch, e
|
|
9
9
|
* A light-weight python client for prompting LLMs for mixed-type tabular data.
|
10
10
|
* Select from a wide range of LLM endpoints and LLM models.
|
11
11
|
* Supports single-table as well as multi-table scenarios.
|
12
|
-
* Supports variety of data types: `string`, `
|
12
|
+
* Supports variety of data types: `string`, `integer`, `float`, `category`, `boolean`, `date`, and `datetime`.
|
13
13
|
* Specify context, distributions and rules via dataset-, table- or column-level prompts.
|
14
14
|
* Create from scratch or enrich existing datasets with new columns and/or rows.
|
15
15
|
* Tailor the diversity and realism of your generated data via temperature and top_p.
|
@@ -18,8 +18,9 @@ import asyncio
|
|
18
18
|
import concurrent.futures
|
19
19
|
import json
|
20
20
|
import math
|
21
|
+
import time
|
21
22
|
from collections import deque
|
22
|
-
from collections.abc import AsyncGenerator, Callable
|
23
|
+
from collections.abc import AsyncGenerator, Awaitable, Callable
|
23
24
|
from enum import Enum
|
24
25
|
from io import StringIO
|
25
26
|
from typing import Any, Literal
|
@@ -29,7 +30,6 @@ import litellm
|
|
29
30
|
import pandas as pd
|
30
31
|
import tenacity
|
31
32
|
from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
|
32
|
-
from tqdm.asyncio import tqdm
|
33
33
|
|
34
34
|
litellm.suppress_debug_info = True
|
35
35
|
|
@@ -249,8 +249,26 @@ async def _sample_table(
|
|
249
249
|
n_workers: int,
|
250
250
|
llm_config: LLMConfig,
|
251
251
|
config: MockConfig,
|
252
|
-
progress_callback: Callable | None = None,
|
252
|
+
progress_callback: Callable[..., Awaitable[None]] | None = None,
|
253
253
|
) -> pd.DataFrame:
|
254
|
+
# provide a default progress callback if none is provided
|
255
|
+
if progress_callback is None:
|
256
|
+
|
257
|
+
async def default_progress_callback(**kwargs):
|
258
|
+
percentage = (kwargs["progress"] / kwargs["total"]) * 100 if kwargs["total"] > 0 else 0
|
259
|
+
rows_per_second = kwargs["rows"] / kwargs["elapsed_time"] if kwargs["elapsed_time"] > 0 else 0
|
260
|
+
message = (
|
261
|
+
f"Generating table `{kwargs['table']}`".ljust(40)
|
262
|
+
+ f": {percentage:3.0f}%, {kwargs['rows']} rows, {kwargs['elapsed_time']:.0f}s, {rows_per_second:.1f} rows/s"
|
263
|
+
)
|
264
|
+
is_final = kwargs["progress"] >= kwargs["total"]
|
265
|
+
if is_final:
|
266
|
+
print(f"\r{message}") # final update with newline
|
267
|
+
else:
|
268
|
+
print(f"\r{message}", end="", flush=True) # in-progress update
|
269
|
+
|
270
|
+
progress_callback = default_progress_callback
|
271
|
+
|
254
272
|
table_rows_generator = _create_table_rows_generator(
|
255
273
|
name=name,
|
256
274
|
prompt=prompt,
|
@@ -265,7 +283,6 @@ async def _sample_table(
|
|
265
283
|
llm_config=llm_config,
|
266
284
|
progress_callback=progress_callback,
|
267
285
|
)
|
268
|
-
table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
|
269
286
|
table_df = await _convert_table_rows_generator_to_df(
|
270
287
|
table_rows_generator=table_rows_generator,
|
271
288
|
columns=columns,
|
@@ -805,7 +822,7 @@ async def _create_table_rows_generator(
|
|
805
822
|
non_context_size: int | None,
|
806
823
|
n_workers: int,
|
807
824
|
llm_config: LLMConfig,
|
808
|
-
progress_callback: Callable | None = None,
|
825
|
+
progress_callback: Callable[..., Awaitable[None]] | None = None,
|
809
826
|
) -> AsyncGenerator[dict]:
|
810
827
|
batch_size = 20 # generate 20 root table rows at a time
|
811
828
|
|
@@ -927,6 +944,7 @@ async def _create_table_rows_generator(
|
|
927
944
|
|
928
945
|
n_completed_batches = 0
|
929
946
|
n_yielded_sequences = 0
|
947
|
+
table_start_time = time.time()
|
930
948
|
while n_yielded_sequences < sample_size:
|
931
949
|
if n_completed_batches >= n_total_batches:
|
932
950
|
assert context_data is None, "n_total_batches is fixed for linked tables"
|
@@ -959,10 +977,13 @@ async def _create_table_rows_generator(
|
|
959
977
|
break
|
960
978
|
n_completed_batches += 1
|
961
979
|
if progress_callback:
|
980
|
+
elapsed_time = time.time() - table_start_time
|
962
981
|
await progress_callback(
|
982
|
+
table=name,
|
963
983
|
progress=n_completed_batches,
|
964
984
|
total=n_total_batches,
|
965
|
-
|
985
|
+
rows=n_yielded_sequences,
|
986
|
+
elapsed_time=round(elapsed_time, 2),
|
966
987
|
)
|
967
988
|
result_queue.task_done()
|
968
989
|
|
@@ -1232,7 +1253,7 @@ async def _sample_common(
|
|
1232
1253
|
top_p: float = 0.95,
|
1233
1254
|
n_workers: int = 10,
|
1234
1255
|
return_type: Literal["auto", "dict"] = "auto",
|
1235
|
-
progress_callback: Callable | None = None,
|
1256
|
+
progress_callback: Callable[..., Awaitable[None]] | None = None,
|
1236
1257
|
):
|
1237
1258
|
tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
|
1238
1259
|
config = MockConfig(tables)
|
@@ -1290,6 +1311,7 @@ def sample(
|
|
1290
1311
|
top_p: float = 0.95,
|
1291
1312
|
n_workers: int = 10,
|
1292
1313
|
return_type: Literal["auto", "dict"] = "auto",
|
1314
|
+
progress_callback: Callable[..., Awaitable[None]] | None = None,
|
1293
1315
|
) -> pd.DataFrame | dict[str, pd.DataFrame]:
|
1294
1316
|
"""
|
1295
1317
|
Generate synthetic data from scratch or enrich existing data with new columns.
|
@@ -1305,6 +1327,7 @@ def sample(
|
|
1305
1327
|
Args:
|
1306
1328
|
tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
|
1307
1329
|
Note: Avoid using double quotes (`"`) and other special characters in column names.
|
1330
|
+
Available dtypes: `string`, `integer`, `float`, `category`, `boolean`, `date`, `datetime`.
|
1308
1331
|
sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
|
1309
1332
|
If a single integer is provided, the same number of rows will be generated for each subject table.
|
1310
1333
|
If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
|
@@ -1329,6 +1352,11 @@ def sample(
|
|
1329
1352
|
n_workers (int): The number of concurrent workers making the LLM calls. Default is 10. The value is clamped to the range [1, 10].
|
1330
1353
|
If n_workers is 1, the generation of batches becomes sequential and certain features for better data consistency are enabled.
|
1331
1354
|
return_type (Literal["auto", "dict"]): The format of the returned data. Default is "auto".
|
1355
|
+
progress_callback (Callable | None): Optional callback function to track progress during data generation.
|
1356
|
+
If not provided, a default progress callback will display progress messages in the format:
|
1357
|
+
"Generating table `table_name`: X%, Y rows, Zs, W.X rows/s"
|
1358
|
+
The callback receives keyword arguments including: table, progress, total,
|
1359
|
+
rows, and elapsed_time. Default is None.
|
1332
1360
|
|
1333
1361
|
Returns:
|
1334
1362
|
- pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
|
@@ -1506,6 +1534,26 @@ def sample(
|
|
1506
1534
|
df_customers = data["customers"]
|
1507
1535
|
df_orders = data["orders"]
|
1508
1536
|
```
|
1537
|
+
|
1538
|
+
Example of using a custom progress callback to provide progress in JSON format:
|
1539
|
+
```python
|
1540
|
+
from mostlyai import mock
|
1541
|
+
import asyncio
|
1542
|
+
import json
|
1543
|
+
|
1544
|
+
async def custom_progress_callback(**kwargs):
|
1545
|
+
msg = f"\r{json.dumps(kwargs)}"
|
1546
|
+
if kwargs["progress"] < kwargs["total"]:
|
1547
|
+
print(msg, end="", flush=True)
|
1548
|
+
else:
|
1549
|
+
print(msg)
|
1550
|
+
|
1551
|
+
df = mock.sample(
|
1552
|
+
tables=tables,
|
1553
|
+
sample_size=10,
|
1554
|
+
progress_callback=custom_progress_callback
|
1555
|
+
)
|
1556
|
+
```
|
1509
1557
|
"""
|
1510
1558
|
|
1511
1559
|
def sample_common_sync(*args, **kwargs) -> pd.DataFrame | dict[str, pd.DataFrame]:
|
@@ -1523,7 +1571,7 @@ def sample(
|
|
1523
1571
|
top_p=top_p,
|
1524
1572
|
n_workers=n_workers,
|
1525
1573
|
return_type=return_type,
|
1526
|
-
progress_callback=
|
1574
|
+
progress_callback=progress_callback,
|
1527
1575
|
)
|
1528
1576
|
return future.result()
|
1529
1577
|
|
@@ -1539,7 +1587,7 @@ async def _asample(
|
|
1539
1587
|
top_p: float = 0.95,
|
1540
1588
|
n_workers: int = 10,
|
1541
1589
|
return_type: Literal["auto", "dict"] = "auto",
|
1542
|
-
progress_callback: Callable | None = None,
|
1590
|
+
progress_callback: Callable[..., Awaitable[None]] | None = None,
|
1543
1591
|
) -> pd.DataFrame | dict[str, pd.DataFrame]:
|
1544
1592
|
return await _sample_common(
|
1545
1593
|
tables=tables,
|
File without changes
|
File without changes
|
File without changes
|