mostlyai-mock 0.1.12.tar.gz → 0.1.14.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.12
+Version: 0.1.14
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -229,7 +229,7 @@ tables = {
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1", n_workers=1)
 print(df)
 # employee_id name boss_id role
 # 0 B0-1 Patricia Lee <NA> President
@@ -194,7 +194,7 @@ tables = {
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1", n_workers=1)
 print(df)
 # employee_id name boss_id role
 # 0 B0-1 Patricia Lee <NA> President
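Note: both copies of the README example (the long description in PKG-INFO above and README.md here) now pass n_workers=1, which disables concurrent LLM batch requests for the self-referencing employees example. A minimal usage sketch assuming the public mock.sample API shown in the README and an OpenAI API key in the environment; the guests table below is illustrative, not taken from this diff:

    # hedged sketch: same call pattern as the updated README example
    from mostlyai import mock

    tables = {
        "guests": {
            "prompt": "Guests of an Alpine ski hotel in Austria",
            "columns": {
                "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
                "nationality": {"prompt": "2-letter code for the nationality", "dtype": "category"},
            },
        }
    }
    # n_workers=1 turns off concurrent batch generation; the library clamps the value to the 1..10 range
    df = mock.sample(tables=tables, sample_size=5, model="openai/gpt-4.1", n_workers=1)
    print(df)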
@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 
 __all__ = ["sample"]
-__version__ = "0.1.12"  # Do not set this manually. Use poetry version [params].
+__version__ = "0.1.14"  # Do not set this manually. Use poetry version [params].
@@ -19,7 +19,7 @@ import concurrent.futures
 import json
 import math
 from collections import deque
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, Callable
 from enum import Enum
 from io import StringIO
 from typing import Any, Literal
@@ -248,6 +248,7 @@ async def _sample_table(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> pd.DataFrame:
     table_rows_generator = _create_table_rows_generator(
         name=name,
@@ -261,16 +262,13 @@ async def _sample_table(
         non_context_size=non_context_size,
         n_workers=n_workers,
         llm_config=llm_config,
+        progress_callback=progress_callback,
     )
     table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
     table_df = await _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
     return table_df
 
 
-def _sample_table_sync(*args, **kwargs) -> pd.DataFrame:
-    return asyncio.run(_sample_table(*args, **kwargs))
-
-
 def _create_system_prompt(llm_output_format: LLMOutputFormat) -> str:
     return f"""
 You are a specialized data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
@@ -766,6 +764,7 @@ async def _create_table_rows_generator(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> AsyncGenerator[dict]:
     batch_size = 20  # generate 20 root table rows at a time
 
@@ -807,6 +806,13 @@
             assert non_context_table_name in data
             non_context_data[non_context_table_name] = data[non_context_table_name]
 
+    # calculate ideal batch size that spreads the workload evenly across workers
+    ideal_batch_size = max(math.ceil(sample_size / n_workers), 5)
+    if ideal_batch_size < batch_size:
+        # never increase batch_size beyond initial value
+        # this is especially important for sequential tables, where batch_size is currently assumed to be 1 everywhere
+        batch_size = ideal_batch_size
+
     # calculate batch_sizes
     assert sample_size is not None, "sample_size should have been filled by this point"
     n_total_batches = len(context_batches) if context_batches is not None else math.ceil(sample_size / batch_size)
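Note: this added block in the core module shrinks the batch size so the requested sample_size is spread roughly evenly across the available workers, with a floor of 5 rows per batch, and it never grows beyond the initial batch_size of 20. A standalone sketch of the same arithmetic (helper name is hypothetical):

    import math

    def effective_batch_size(sample_size: int, n_workers: int, batch_size: int = 20) -> int:
        # mirrors the diff: ideal size per worker, floored at 5, never above the initial batch_size
        ideal_batch_size = max(math.ceil(sample_size / n_workers), 5)
        return min(batch_size, ideal_batch_size)

    print(effective_batch_size(200, 10))  # 20 -> enough rows to keep full batches of 20
    print(effective_batch_size(100, 10))  # 10 -> 100 rows split into 10 batches of 10
    print(effective_batch_size(40, 10))   # 5  -> ceil(40/10)=4 is floored to 5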
@@ -911,6 +917,12 @@
                 if n_yielded_sequences >= sample_size:
                     break
         n_completed_batches += 1
+        if progress_callback:
+            await progress_callback(
+                progress=n_completed_batches,
+                total=n_total_batches,
+                message=f"Generating rows for table `{name}`: {n_completed_batches}/{n_total_batches}",
+            )
         result_queue.task_done()
 
     # gracefully shutdown workers
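Note: the row generator now reports per-batch progress through the optional progress_callback, which is awaited with the keyword arguments progress, total, and message. Any awaitable accepting those names should be compatible; a hypothetical example:

    async def log_progress(*, progress: int, total: int, message: str | None = None) -> None:
        # prints e.g. "Generating rows for table `employees`: 3/5"
        print(message or f"{progress}/{total}")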
@@ -924,9 +936,9 @@ def _align_series_dtypes_with_column_config(series: pd.Series, column_config: Co
     series = series.copy()
     if column_config.dtype in [DType.DATE, DType.DATETIME]:
 
-        def harmonize_datetime(x):
+        def harmonize_datetime(x: Any):
             try:
-                return dateutil.parser.parse(x)
+                return dateutil.parser.parse(str(x))
             except Exception:
                 return pd.NaT
 
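Note: dateutil.parser.parse expects a string, so wrapping the value in str(...) lets non-string values returned by the LLM (numbers, timestamps, None) be coerced before parsing, while anything unparseable still falls back to pd.NaT. A small standalone illustration, assuming python-dateutil and pandas are installed:

    import dateutil.parser
    import pandas as pd

    def harmonize_datetime(x):
        try:
            return dateutil.parser.parse(str(x))  # str() lets non-string values through
        except Exception:
            return pd.NaT  # anything unparseable becomes NaT

    print(harmonize_datetime("2024-05-01"))  # 2024-05-01 00:00:00
    print(harmonize_datetime(20240501))      # parsed from the string "20240501"
    print(harmonize_datetime(None))          # NaT, since "None" is not a date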
@@ -1101,6 +1113,54 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
     return execution_plan
 
 
+async def _sample_common(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+):
+    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
+    config = MockConfig(tables)
+
+    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
+
+    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
+    primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
+
+    n_workers = max(min(n_workers, 10), 1)
+
+    execution_plan: list[str] = _build_execution_plan(config)
+
+    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+
+    for table_name in execution_plan:
+        table_config = config.root[table_name]
+        df = await _sample_table(
+            name=table_name,
+            prompt=table_config.prompt,
+            columns=table_config.columns,
+            foreign_keys=table_config.foreign_keys,
+            primary_keys=primary_keys,
+            data=data,
+            sample_size=sample_size.get(table_name),
+            previous_rows_size=10,  # present 10 previously generated rows to the LLM
+            non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
+            n_workers=n_workers,
+            llm_config=llm_config,
+            progress_callback=progress_callback,
+        )
+        data[table_name] = df
+
+    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
+
+
 def sample(
     *,
     tables: dict[str, dict],
@@ -1329,42 +1389,51 @@ def sample(
     ```
     """
 
-    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
-    config = MockConfig(tables)
+    def sample_common_sync(*args, **kwargs) -> pd.DataFrame | dict[str, pd.DataFrame]:
+        return asyncio.run(_sample_common(*args, **kwargs))
 
-    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
-
-    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
-    primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
-
-    n_workers = max(min(n_workers, 10), 1)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(
+            sample_common_sync,
+            tables=tables,
+            sample_size=sample_size,
+            existing_data=existing_data,
+            model=model,
+            api_key=api_key,
+            temperature=temperature,
+            top_p=top_p,
+            n_workers=n_workers,
+            return_type=return_type,
+            progress_callback=None,
+        )
+        return future.result()
 
-    execution_plan: list[str] = _build_execution_plan(config)
 
-    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+async def _asample(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+) -> pd.DataFrame | dict[str, pd.DataFrame]:
+    return await _sample_common(
+        tables=tables,
+        sample_size=sample_size,
+        existing_data=existing_data,
+        model=model,
+        api_key=api_key,
+        temperature=temperature,
+        top_p=top_p,
+        n_workers=n_workers,
+        return_type=return_type,
+        progress_callback=progress_callback,
+    )
 
-    # synchronous `sample` function makes independent calls to asynchronous `_sample_table` function
-    # in order to avoid conflicts with potentially existing event loop (e.g. in Jupyter environment),
-    # a new thread is spawned for each call to `_sample_table`
-    # NOTE: initialize executor only once, doing that inside the loop might lead to deadlocks
-    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-        for table_name in execution_plan:
-            table_config = config.root[table_name]
-            future = executor.submit(
-                _sample_table_sync,
-                name=table_name,
-                prompt=table_config.prompt,
-                columns=table_config.columns,
-                foreign_keys=table_config.foreign_keys,
-                primary_keys=primary_keys,
-                data=data,
-                sample_size=sample_size.get(table_name),
-                previous_rows_size=10,  # present 10 previously generated rows to the LLM
-                non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
-                n_workers=n_workers,
-                llm_config=llm_config,
-            )
-            df = future.result()
-            data[table_name] = df
 
-    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
+_asample.__doc__ = sample.__doc__
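Note: the orchestration loop that previously lived inside sample has moved into the new _sample_common coroutine. The public sample function now runs it via asyncio.run on a single worker thread, which avoids clashing with an already running event loop (e.g. in Jupyter), while the new module-private _asample awaits it directly and forwards a progress_callback. A hedged sketch of driving the async entry point; since _asample is private, the supported public API remains mostlyai.mock.sample:

    # illustrative only: calling the private async entry point from async code
    import asyncio
    from mostlyai.mock.core import _asample

    async def on_progress(*, progress: int, total: int, message: str | None = None) -> None:
        print(message or f"{progress}/{total}")

    async def main():
        tables = {
            "colors": {
                "prompt": "Primary colors",
                "columns": {"name": {"prompt": "color name", "dtype": "string"}},
            }
        }
        df = await _asample(tables=tables, sample_size=3, progress_callback=on_progress)
        print(df)

    asyncio.run(main())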
@@ -16,16 +16,16 @@ import os
 import tempfile
 
 import pandas as pd
-from fastmcp import FastMCP
+from fastmcp import Context, FastMCP
 
-from mostlyai import mock
+from mostlyai.mock.core import _asample
 
 SAMPLE_MOCK_TOOL_DESCRIPTION = f"""
 Synthetic Mock Data.
 
 Use LLMs to generate any Tabular Data towards your needs. Create from scratch, expand existing datasets, or enrich tables with new columns.
 
-This tool is a proxy to the `mostlyai.mock.sample` function, but returns a dictionary of paths to the generated CSV files.
+This tool is a proxy to the `mostlyai.mock.core._asample` function, but returns a dictionary of paths to the generated CSV files.
 
 Present the result nicely to the user, in Markdown format. Example:
 
@@ -33,8 +33,8 @@ Mock data can be found under the following paths:
 - `/tmp/tmpl41bwa6n/players.csv`
 - `/tmp/tmpl41bwa6n/seasons.csv`
 
-== mostlyai.mock.sample DocString ==
-{mock.sample.__doc__}
+== mostlyai.mock.core._asample docstring ==
+{_asample.__doc__}
 """
 
 mcp = FastMCP(name="MostlyAI Mock MCP Server")
@@ -51,7 +51,8 @@ def _store_locally(data: dict[str, pd.DataFrame]) -> dict[str, str]:
 
 
 @mcp.tool(description=SAMPLE_MOCK_TOOL_DESCRIPTION)
-def mock_data(
+async def mock_data(
+    ctx: Context,
     *,
     tables: dict[str, dict],
     sample_size: int,
@@ -60,7 +61,7 @@ def mock_data(
     temperature: float = 1.0,
     top_p: float = 0.95,
 ) -> dict[str, str]:
-    data = mock.sample(
+    data = await _asample(
         tables=tables,
         sample_size=sample_size,
         model=model,
@@ -68,6 +69,7 @@ def mock_data(
         temperature=temperature,
         top_p=top_p,
         return_type="dict",
+        progress_callback=ctx.report_progress,
     )
     locations = _store_locally(data)
     return locations
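Note: the MCP tool becomes a coroutine that receives a FastMCP Context and passes ctx.report_progress as the progress callback, so row-generation progress is forwarded to the MCP client instead of only appearing in the server-side tqdm bar. A minimal, hypothetical FastMCP tool showing the same pattern in isolation (assuming Context.report_progress accepts the progress/total/message keywords, as this diff implies):

    from fastmcp import Context, FastMCP

    demo = FastMCP(name="progress-demo")

    @demo.tool()
    async def count_to(ctx: Context, n: int = 3) -> str:
        for i in range(1, n + 1):
            await ctx.report_progress(progress=i, total=n, message=f"step {i}/{n}")
        return "done"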
@@ -1,6 +1,6 @@
 [project]
 name = "mostlyai-mock"
-version = "0.1.12"
+version = "0.1.14"
 description = "Synthetic Mock Data"
 authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
 requires-python = ">=3.10"