mostlyai-mock 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +108 -39
- mostlyai/mock/mcp_server.py +9 -7
- {mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.14.dist-info}/METADATA +2 -2
- mostlyai_mock-0.1.14.dist-info/RECORD +8 -0
- mostlyai_mock-0.1.13.dist-info/RECORD +0 -8
- {mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.14.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.14.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.14.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
```diff
@@ -19,7 +19,7 @@ import concurrent.futures
 import json
 import math
 from collections import deque
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, Callable
 from enum import Enum
 from io import StringIO
 from typing import Any, Literal
```
```diff
@@ -248,6 +248,7 @@ async def _sample_table(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> pd.DataFrame:
     table_rows_generator = _create_table_rows_generator(
         name=name,
```
```diff
@@ -261,16 +262,13 @@ async def _sample_table(
         non_context_size=non_context_size,
         n_workers=n_workers,
         llm_config=llm_config,
+        progress_callback=progress_callback,
     )
     table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
     table_df = await _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
     return table_df


-def _sample_table_sync(*args, **kwargs) -> pd.DataFrame:
-    return asyncio.run(_sample_table(*args, **kwargs))
-
-
 def _create_system_prompt(llm_output_format: LLMOutputFormat) -> str:
     return f"""
 You are a specialized data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
```
```diff
@@ -766,6 +764,7 @@ async def _create_table_rows_generator(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> AsyncGenerator[dict]:
     batch_size = 20  # generate 20 root table rows at a time

```
```diff
@@ -807,6 +806,13 @@ async def _create_table_rows_generator(
         assert non_context_table_name in data
         non_context_data[non_context_table_name] = data[non_context_table_name]

+    # calculate ideal batch size that spreads the workload evenly across workers
+    ideal_batch_size = max(math.ceil(sample_size / n_workers), 5)
+    if ideal_batch_size < batch_size:
+        # never increase batch_size beyond initial value
+        # this is especially important for sequential tables, where batch_size is currently assumed to be 1 everywhere
+        batch_size = ideal_batch_size
+
     # calculate batch_sizes
     assert sample_size is not None, "sample_size should have been filled by this point"
     n_total_batches = len(context_batches) if context_batches is not None else math.ceil(sample_size / batch_size)
```
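For intuition, the added batch-size logic behaves as follows (a standalone sketch with illustrative numbers, mirroring the lines above):

```python
import math

# Illustrative numbers: 40 rows requested across 10 workers.
sample_size, n_workers, batch_size = 40, 10, 20

# Spread the workload evenly, but keep batches at a minimum size of 5.
ideal_batch_size = max(math.ceil(sample_size / n_workers), 5)  # -> 5
if ideal_batch_size < batch_size:
    batch_size = ideal_batch_size  # 20 -> 5, never increased beyond 20

# Before: ceil(40 / 20) = 2 batches, so only 2 of 10 workers had work.
# After:  ceil(40 / 5)  = 8 batches, keeping most workers busy.
print(batch_size, math.ceil(sample_size / batch_size))  # 5 8
```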
```diff
@@ -911,6 +917,12 @@
             if n_yielded_sequences >= sample_size:
                 break
         n_completed_batches += 1
+        if progress_callback:
+            await progress_callback(
+                progress=n_completed_batches,
+                total=n_total_batches,
+                message=f"Generating rows for table `{name}`: {n_completed_batches}/{n_total_batches}",
+            )
         result_queue.task_done()

     # gracefully shutdown workers
```
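Since the callback is awaited with the keyword arguments `progress`, `total`, and `message`, any async callable with a compatible signature can be plugged in; a minimal sketch (the print sink is illustrative):

```python
# Minimal async callback matching the keyword arguments used above.
async def log_progress(*, progress: int, total: int, message: str | None = None) -> None:
    print(message or f"{progress}/{total}")
```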
```diff
@@ -1101,6 +1113,54 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
     return execution_plan


+async def _sample_common(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+):
+    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
+    config = MockConfig(tables)
+
+    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
+
+    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
+    primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
+
+    n_workers = max(min(n_workers, 10), 1)
+
+    execution_plan: list[str] = _build_execution_plan(config)
+
+    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+
+    for table_name in execution_plan:
+        table_config = config.root[table_name]
+        df = await _sample_table(
+            name=table_name,
+            prompt=table_config.prompt,
+            columns=table_config.columns,
+            foreign_keys=table_config.foreign_keys,
+            primary_keys=primary_keys,
+            data=data,
+            sample_size=sample_size.get(table_name),
+            previous_rows_size=10,  # present 10 previously generated rows to the LLM
+            non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
+            n_workers=n_workers,
+            llm_config=llm_config,
+            progress_callback=progress_callback,
+        )
+        data[table_name] = df
+
+    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
+
+
 def sample(
     *,
     tables: dict[str, dict],
```
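The last line of `_sample_common` picks the return shape: a single-table result with `return_type="auto"` unwraps to the bare DataFrame, everything else stays a dict keyed by table name. A standalone sketch of that dispatch (the `_dispatch` helper is hypothetical, not part of the package):

```python
import pandas as pd

# Hypothetical helper reproducing the final line of `_sample_common`.
def _dispatch(data: dict[str, pd.DataFrame], return_type: str = "auto"):
    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data

print(type(_dispatch({"guests": pd.DataFrame({"name": ["Ada"]})})))  # -> DataFrame
print(type(_dispatch({"a": pd.DataFrame(), "b": pd.DataFrame()})))   # -> dict
```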
```diff
@@ -1329,42 +1389,51 @@ def sample(
     ```
     """

-
-
+    def sample_common_sync(*args, **kwargs) -> pd.DataFrame | dict[str, pd.DataFrame]:
+        return asyncio.run(_sample_common(*args, **kwargs))

-
-
-
-
-
-
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(
+            sample_common_sync,
+            tables=tables,
+            sample_size=sample_size,
+            existing_data=existing_data,
+            model=model,
+            api_key=api_key,
+            temperature=temperature,
+            top_p=top_p,
+            n_workers=n_workers,
+            return_type=return_type,
+            progress_callback=None,
+        )
+        return future.result()

-    execution_plan: list[str] = _build_execution_plan(config)

-
+async def _asample(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+) -> pd.DataFrame | dict[str, pd.DataFrame]:
+    return await _sample_common(
+        tables=tables,
+        sample_size=sample_size,
+        existing_data=existing_data,
+        model=model,
+        api_key=api_key,
+        temperature=temperature,
+        top_p=top_p,
+        n_workers=n_workers,
+        return_type=return_type,
+        progress_callback=progress_callback,
+    )

-    # synchronous `sample` function makes independent calls to asynchronous `_sample_table` function
-    # in order to avoid conflicts with potentially existing event loop (e.g. in Jupyter environment),
-    # a new thread is spawned for each call to `_sample_table`
-    # NOTE: initialize executor only once, doing that inside the loop might lead to deadlocks
-    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-        for table_name in execution_plan:
-            table_config = config.root[table_name]
-            future = executor.submit(
-                _sample_table_sync,
-                name=table_name,
-                prompt=table_config.prompt,
-                columns=table_config.columns,
-                foreign_keys=table_config.foreign_keys,
-                primary_keys=primary_keys,
-                data=data,
-                sample_size=sample_size.get(table_name),
-                previous_rows_size=10,  # present 10 previously generated rows to the LLM
-                non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
-                n_workers=n_workers,
-                llm_config=llm_config,
-            )
-            df = future.result()
-            data[table_name] = df

-
+_asample.__doc__ = sample.__doc__
```
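The deleted comments explain the motivation that still applies to the new shape: `asyncio.run` fails when an event loop is already running (e.g. in a Jupyter environment), so `sample` drives the coroutine on a dedicated thread, which gets its own fresh loop. A self-contained sketch of the pattern:

```python
import asyncio
import concurrent.futures

async def _work() -> str:
    await asyncio.sleep(0.01)
    return "done"

def run_sync() -> str:
    # A fresh thread gets its own event loop, so asyncio.run works here
    # even when the calling thread is already inside a running loop.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(lambda: asyncio.run(_work())).result()

print(run_sync())  # -> done
```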
mostlyai/mock/mcp_server.py
CHANGED
```diff
@@ -16,16 +16,16 @@ import os
 import tempfile

 import pandas as pd
-from fastmcp import FastMCP
+from fastmcp import Context, FastMCP

-from mostlyai import
+from mostlyai.mock.core import _asample

 SAMPLE_MOCK_TOOL_DESCRIPTION = f"""
 Synthetic Mock Data.

 Use LLMs to generate any Tabular Data towards your needs. Create from scratch, expand existing datasets, or enrich tables with new columns.

-This tool is a proxy to the `mostlyai.mock.
+This tool is a proxy to the `mostlyai.mock.core._asample` function, but returns a dictionary of paths to the generated CSV files.

 Present the result nicely to the user, in Markdown format. Example:

```
```diff
@@ -33,8 +33,8 @@ Mock data can be found under the following paths:
 - `/tmp/tmpl41bwa6n/players.csv`
 - `/tmp/tmpl41bwa6n/seasons.csv`

-== mostlyai.mock.
-{
+== mostlyai.mock.core._asample docstring ==
+{_asample.__doc__}
 """

 mcp = FastMCP(name="MostlyAI Mock MCP Server")
```
```diff
@@ -51,7 +51,8 @@ def _store_locally(data: dict[str, pd.DataFrame]) -> dict[str, str]:


 @mcp.tool(description=SAMPLE_MOCK_TOOL_DESCRIPTION)
-def mock_data(
+async def mock_data(
+    ctx: Context,
     *,
     tables: dict[str, dict],
     sample_size: int,
```
```diff
@@ -60,7 +61,7 @@ def mock_data(
     temperature: float = 1.0,
     top_p: float = 0.95,
 ) -> dict[str, str]:
-    data =
+    data = await _asample(
         tables=tables,
         sample_size=sample_size,
         model=model,
```
```diff
@@ -68,6 +69,7 @@
         temperature=temperature,
         top_p=top_p,
         return_type="dict",
+        progress_callback=ctx.report_progress,
     )
     locations = _store_locally(data)
     return locations
```
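Because `mock_data` forwards `ctx.report_progress` as the `progress_callback`, the same hook can be any async callable when `_asample` is used directly; a hedged usage sketch (the table definition and callback are illustrative, and configured LLM credentials are assumed):

```python
import asyncio

from mostlyai.mock.core import _asample

# Illustrative progress sink with the keyword signature the library awaits.
async def on_progress(*, progress: int, total: int, message: str | None = None) -> None:
    print(message or f"{progress}/{total}")

async def main() -> None:
    df = await _asample(
        tables={"guests": {"prompt": "Hotel guests", "columns": {"name": {"prompt": "full name", "dtype": "string"}}}},
        sample_size=4,
        progress_callback=on_progress,
    )
    print(df)

# asyncio.run(main())  # requires LLM credentials, e.g. OPENAI_API_KEY
```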
{mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.14.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.13
+Version: 0.1.14
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -229,7 +229,7 @@ tables = {
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1", n_workers=1)
 print(df)
 # employee_id name boss_id role
 # 0 B0-1 Patricia Lee <NA> President
```
mostlyai_mock-0.1.14.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=MLHwi5g6_lAEd8cDEISbVdRWmorOVAQ6IoMm8BsRpqg,715
+mostlyai/mock/core.py,sha256=JdWHix-Pp0s---b_Z3f2ui7J7LSl4_r_gPP0z8UHKY8,61663
+mostlyai/mock/mcp_server.py,sha256=0Vn1jmrdNAvUZSviaaU7Lhn7L7iHFyd8kGFigM0-4s0,2367
+mostlyai_mock-0.1.14.dist-info/METADATA,sha256=PHiUTSEvevYTPVvsMGT-kilTDwaEgIEL0T8Vr56PSiY,14123
+mostlyai_mock-0.1.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.14.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.14.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.14.dist-info/RECORD,,
```
mostlyai_mock-0.1.13.dist-info/RECORD
REMOVED
```diff
@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=r4GBSmJmB1SGtviYtZwY5b3GBzhK_mt8czzk-py4flo,715
-mostlyai/mock/core.py,sha256=nu0PSX3Xt8l6_95cIrJ7Wt0SbJvfrLD3t0CFIidOLcM,59573
-mostlyai/mock/mcp_server.py,sha256=MrVUrIsAZsFzjK1suwNl1fxS1ES-wpc-YSM8cS8Fqcw,2259
-mostlyai_mock-0.1.13.dist-info/METADATA,sha256=un3lLINiMi8HkVcmsIr64U-OQQiqT5LsgiGam1aNTj4,14110
-mostlyai_mock-0.1.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.13.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.13.dist-info/RECORD,,
```
{mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.14.dist-info}/WHEEL
File without changes
{mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.14.dist-info}/entry_points.txt
File without changes
{mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.14.dist-info}/licenses/LICENSE
File without changes