mostlyai-mock 0.1.18__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/PKG-INFO +2 -2
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/README.md +1 -1
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/mostlyai/mock/core.py +63 -9
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/pyproject.toml +1 -1
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/.gitignore +0 -0
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/LICENSE +0 -0
- {mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/mostlyai/mock/mcp_server.py +0 -0
{mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.18
+Version: 0.2.1
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -47,7 +47,7 @@ Use LLMs to generate any Tabular Data towards your needs. Create from scratch, e
 * A light-weight python client for prompting LLMs for mixed-type tabular data.
 * Select from a wide range of LLM endpoints and LLM models.
 * Supports single-table as well as multi-table scenarios.
-* Supports variety of data types: `string`, `
+* Supports variety of data types: `string`, `integer`, `float`, `category`, `boolean`, `date`, and `datetime`.
 * Specify context, distributions and rules via dataset-, table- or column-level prompts.
 * Create from scratch or enrich existing datasets with new columns and/or rows.
 * Tailor the diversity and realism of your generated data via temperature and top_p.
{mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/README.md

@@ -9,7 +9,7 @@ Use LLMs to generate any Tabular Data towards your needs. Create from scratch, e
 * A light-weight python client for prompting LLMs for mixed-type tabular data.
 * Select from a wide range of LLM endpoints and LLM models.
 * Supports single-table as well as multi-table scenarios.
-* Supports variety of data types: `string`, `
+* Supports variety of data types: `string`, `integer`, `float`, `category`, `boolean`, `date`, and `datetime`.
 * Specify context, distributions and rules via dataset-, table- or column-level prompts.
 * Create from scratch or enrich existing datasets with new columns and/or rows.
 * Tailor the diversity and realism of your generated data via temperature and top_p.
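For orientation, a minimal sketch of how these dtypes appear in a `tables` specification passed to `mock.sample`; the table, column names, and prompts are illustrative, and the `values` list for the `category` column follows the format used in the project README:

```python
from mostlyai import mock

# illustrative single-table spec exercising the supported dtypes
tables = {
    "guests": {
        "prompt": "Guests of an Alpine ski hotel in Austria",
        "columns": {
            "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
            "age": {"prompt": "age in years", "dtype": "integer"},
            "total_spend": {"prompt": "total spend in EUR", "dtype": "float"},
            "room_type": {"dtype": "category", "values": ["single", "double", "suite"]},
            "is_vip": {"dtype": "boolean"},
            "checkin_date": {"dtype": "date"},
            "checkin_ts": {"dtype": "datetime"},
        },
    }
}
df = mock.sample(tables=tables, sample_size=10)
```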
{mostlyai_mock-0.1.18 → mostlyai_mock-0.2.1}/mostlyai/mock/core.py

@@ -18,8 +18,9 @@ import asyncio
 import concurrent.futures
 import json
 import math
+import time
 from collections import deque
-from collections.abc import AsyncGenerator, Callable
+from collections.abc import AsyncGenerator, Awaitable, Callable
 from enum import Enum
 from io import StringIO
 from typing import Any, Literal
@@ -29,7 +30,6 @@ import litellm
 import pandas as pd
 import tenacity
 from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
-from tqdm.asyncio import tqdm
 
 litellm.suppress_debug_info = True
 
@@ -249,8 +249,26 @@ async def _sample_table(
     n_workers: int,
     llm_config: LLMConfig,
     config: MockConfig,
-    progress_callback: Callable | None = None,
+    progress_callback: Callable[..., Awaitable[None]] | None = None,
 ) -> pd.DataFrame:
+    # provide a default progress callback if none is provided
+    if progress_callback is None:
+
+        async def default_progress_callback(**kwargs):
+            percentage = (kwargs["progress"] / kwargs["total"]) * 100 if kwargs["total"] > 0 else 0
+            rows_per_second = kwargs["rows"] / kwargs["elapsed_time"] if kwargs["elapsed_time"] > 0 else 0
+            message = (
+                f"Generating table `{kwargs['table']}`".ljust(40)
+                + f": {percentage:3.0f}%, {kwargs['rows']} rows, {kwargs['elapsed_time']:.0f}s, {rows_per_second:.1f} rows/s"
+            )
+            is_final = kwargs["progress"] >= kwargs["total"]
+            if is_final:
+                print(f"\r{message}")  # final update with newline
+            else:
+                print(f"\r{message}", end="", flush=True)  # in-progress update
+
+        progress_callback = default_progress_callback
+
     table_rows_generator = _create_table_rows_generator(
         name=name,
         prompt=prompt,
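The new default callback pins down a keyword contract (`table`, `progress`, `total`, `rows`, `elapsed_time`) for any callback matching `Callable[..., Awaitable[None]]`. A standalone sketch of driving such a callback; the values are made up, not from a real generation run:

```python
import asyncio

# hypothetical stand-in for a user-supplied progress callback
async def demo_callback(**kwargs) -> None:
    pct = (kwargs["progress"] / kwargs["total"]) * 100 if kwargs["total"] > 0 else 0
    print(f"{kwargs['table']}: {pct:3.0f}%, {kwargs['rows']} rows, {kwargs['elapsed_time']}s")

async def main() -> None:
    # simulate the events a generation run would emit
    await demo_callback(table="customers", progress=0, total=3, rows=0, elapsed_time=0)
    await demo_callback(table="customers", progress=2, total=3, rows=40, elapsed_time=3.1)
    await demo_callback(table="customers", progress=3, total=3, rows=60, elapsed_time=4.7)

asyncio.run(main())
```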
@@ -265,7 +283,6 @@ async def _sample_table(
         llm_config=llm_config,
         progress_callback=progress_callback,
     )
-    table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
     table_df = await _convert_table_rows_generator_to_df(
         table_rows_generator=table_rows_generator,
         columns=columns,
@@ -805,7 +822,7 @@ async def _create_table_rows_generator(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
-    progress_callback: Callable | None = None,
+    progress_callback: Callable[..., Awaitable[None]] | None = None,
 ) -> AsyncGenerator[dict]:
     batch_size = 20  # generate 20 root table rows at a time
 
@@ -863,6 +880,10 @@ async def _create_table_rows_generator(
     # +2 because LLM may not always count the rows correctly
     batch_sizes[-1] = sample_size - sum(batch_sizes[:-1]) + 2
 
+    # emit initial progress message right away
+    if progress_callback:
+        await progress_callback(table=name, progress=0, total=n_total_batches, rows=0, elapsed_time=0)
+
     # initialize queues for async communication
     batch_queue = asyncio.PriorityQueue()
     result_queue = asyncio.Queue()
@@ -927,6 +948,8 @@ async def _create_table_rows_generator(
 
     n_completed_batches = 0
     n_yielded_sequences = 0
+    n_generated_rows = 0
+    table_start_time = time.time()
     while n_yielded_sequences < sample_size:
         if n_completed_batches >= n_total_batches:
             assert context_data is None, "n_total_batches is fixed for linked tables"
@@ -950,6 +973,7 @@ async def _create_table_rows_generator(
         rows = result
         for row_idx, row in enumerate(rows):
             yield (batch_idx, row)
+            n_generated_rows += 1
             if context_batches is None or row_idx == len(rows) - 1:
                 # in case of flat table, each row is considered a single sequence
                 # in case of linked table, all rows are considered a single sequence
@@ -959,10 +983,13 @@ async def _create_table_rows_generator(
                     break
         n_completed_batches += 1
         if progress_callback:
+            elapsed_time = time.time() - table_start_time
             await progress_callback(
+                table=name,
                 progress=n_completed_batches,
                 total=n_total_batches,
-
+                rows=n_generated_rows,
+                elapsed_time=round(elapsed_time, 2),
             )
         result_queue.task_done()
 
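Together with the initial emission added above, a callback thus observes one event at `progress=0` and then one per completed batch. A sketch (not part of the package) of a non-printing callback that collects these events, e.g. to forward them to a logger or a UI:

```python
# collect progress events instead of printing them
events: list[dict] = []

async def collecting_callback(**kwargs) -> None:
    # each event carries: table, progress, total, rows, elapsed_time
    events.append(dict(kwargs))

# would be passed as `progress_callback=collecting_callback` to mock.sample(...)
```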
@@ -1232,7 +1259,7 @@ async def _sample_common(
     top_p: float = 0.95,
     n_workers: int = 10,
     return_type: Literal["auto", "dict"] = "auto",
-    progress_callback: Callable | None = None,
+    progress_callback: Callable[..., Awaitable[None]] | None = None,
 ):
     tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
     config = MockConfig(tables)
@@ -1290,6 +1317,7 @@ def sample(
     top_p: float = 0.95,
     n_workers: int = 10,
     return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable[..., Awaitable[None]] | None = None,
 ) -> pd.DataFrame | dict[str, pd.DataFrame]:
     """
     Generate synthetic data from scratch or enrich existing data with new columns.
@@ -1305,6 +1333,7 @@ def sample(
     Args:
         tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
             Note: Avoid using double quotes (`"`) and other special characters in column names.
+            Available dtypes: `string`, `integer`, `float`, `category`, `boolean`, `date`, `datetime`.
         sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
             If a single integer is provided, the same number of rows will be generated for each subject table.
             If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
@@ -1329,6 +1358,11 @@ def sample(
         n_workers (int): The number of concurrent workers making the LLM calls. Default is 10. The value is clamped to the range [1, 10].
             If n_workers is 1, the generation of batches becomes sequential and certain features for better data consistency are enabled.
         return_type (Literal["auto", "dict"]): The format of the returned data. Default is "auto".
+        progress_callback (Callable | None): Optional callback function to track progress during data generation.
+            If not provided, a default progress callback will display progress messages in the format:
+            "Generating table `table_name`: X%, Y rows, Zs, W.X rows/s"
+            The callback receives keyword arguments including: table, progress, total,
+            rows, and elapsed_time. Default is None.
 
     Returns:
         - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
@@ -1506,6 +1540,26 @@ def sample(
     df_customers = data["customers"]
     df_orders = data["orders"]
     ```
+
+    Example of using a custom progress callback to provide progress in JSON format:
+    ```python
+    from mostlyai import mock
+    import asyncio
+    import json
+
+    async def custom_progress_callback(**kwargs):
+        msg = f"\r{json.dumps(kwargs)}"
+        if kwargs["progress"] < kwargs["total"]:
+            print(msg, end="", flush=True)
+        else:
+            print(msg)
+
+    df = mock.sample(
+        tables=tables,
+        sample_size=10,
+        progress_callback=custom_progress_callback
+    )
+    ```
     """
 
     def sample_common_sync(*args, **kwargs) -> pd.DataFrame | dict[str, pd.DataFrame]:
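Since this release drops the built-in `tqdm` bar in favor of the callback hook, a user who prefers a bar can approximate the old behavior on top of it. A sketch, assuming `tqdm` is installed; `tqdm_callback` is a hypothetical helper, not part of the package:

```python
from tqdm import tqdm

# one bar per table, keyed by table name (illustrative helper)
bars: dict[str, tqdm] = {}

async def tqdm_callback(**kwargs) -> None:
    bar = bars.get(kwargs["table"])
    if bar is None:
        bar = bars[kwargs["table"]] = tqdm(total=kwargs["total"], desc=f"Generating table `{kwargs['table']}`")
    bar.n = kwargs["progress"]  # jump the bar to the reported batch count
    bar.refresh()
    if kwargs["progress"] >= kwargs["total"]:
        bar.close()
```

Passing `progress_callback=tqdm_callback` to `mock.sample(...)` then renders one bar per table, roughly restoring the 0.1.x behavior without the hard dependency.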
@@ -1523,7 +1577,7 @@ def sample(
                 top_p=top_p,
                 n_workers=n_workers,
                 return_type=return_type,
-                progress_callback=
+                progress_callback=progress_callback,
             )
         return future.result()
 
@@ -1539,7 +1593,7 @@ async def _asample(
     top_p: float = 0.95,
     n_workers: int = 10,
     return_type: Literal["auto", "dict"] = "auto",
-    progress_callback: Callable | None = None,
+    progress_callback: Callable[..., Awaitable[None]] | None = None,
 ) -> pd.DataFrame | dict[str, pd.DataFrame]:
     return await _sample_common(
         tables=tables,