mostlyai-mock 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 
 __all__ = ["sample"]
-__version__ = "0.1.13"  # Do not set this manually. Use poetry version [params].
+__version__ = "0.1.15"  # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -19,7 +19,7 @@ import concurrent.futures
 import json
 import math
 from collections import deque
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, Callable
 from enum import Enum
 from io import StringIO
 from typing import Any, Literal
@@ -40,10 +40,10 @@ class LLMOutputFormat(str, Enum):
 
 
 class LLMConfig(BaseModel):
-    model: str = "openai/gpt-4.1-nano"
-    api_key: str | None = None
-    temperature: float = 1.0
-    top_p: float = 0.95
+    model: str
+    api_key: str | None
+    temperature: float
+    top_p: float
 
 
 class MockConfig(RootModel[dict[str, "TableConfig"]]):
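Note: with the defaults removed, `LLMConfig` can no longer be instantiated empty; in Pydantic v2 an un-defaulted field (even `str | None`) is required, so callers must now pass every field explicitly, as the new `_sample_common` does further below. A minimal sketch of the new contract (the constructor values here are illustrative, taken from the defaults that moved to the public entry points):

```python
from pydantic import BaseModel

class LLMConfig(BaseModel):
    model: str
    api_key: str | None
    temperature: float
    top_p: float

# All four fields are now required at construction time;
# the defaults live on sample()/_asample() instead.
llm_config = LLMConfig(model="openai/gpt-4.1-nano", api_key=None, temperature=1.0, top_p=0.95)
```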
@@ -248,6 +248,7 @@ async def _sample_table(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> pd.DataFrame:
     table_rows_generator = _create_table_rows_generator(
         name=name,
@@ -261,19 +262,15 @@ async def _sample_table(
         non_context_size=non_context_size,
         n_workers=n_workers,
         llm_config=llm_config,
+        progress_callback=progress_callback,
     )
     table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
     table_df = await _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
     return table_df
 
 
-def _sample_table_sync(*args, **kwargs) -> pd.DataFrame:
-    return asyncio.run(_sample_table(*args, **kwargs))
-
-
 def _create_system_prompt(llm_output_format: LLMOutputFormat) -> str:
-    return f"""
-You are a specialized data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
+    return f"""You are a specialized data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
 
 Your task is to:
 
@@ -291,8 +288,7 @@ appropriate content. For dates and timestamps, ensure logical chronology. Always
 across tables.
 
 When enriching existing data, carefully analyze the patterns and relationships in the existing columns \
-to generate compatible and realistic values for the missing columns.
-"""
+to generate compatible and realistic values for the missing columns."""
 
 
 def _create_table_prompt(
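Note: both prompt edits strip whitespace that the old triple-quoted strings leaked into the system prompt sent to the LLM. A minimal illustration (the variable names are ours, not the package's):

```python
old = f"""
You are a specialized data generator."""
new = f"""You are a specialized data generator."""

assert old.startswith("\n")      # old prompt began with a stray newline
assert not new.startswith("\n")  # new prompt starts at the first word
```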
@@ -717,7 +713,7 @@ async def _worker(
             if do_repeat_task:
                 # allow 10 retries across all workers before propagating the exception to the orchestrator
                 await retry_queue.put(1)
-                if retry_queue.qsize() < 10:
+                if retry_queue.qsize() <= 10:
                     # put task back to the front of the batch queue
                     await batch_queue.put((batch_idx, task))
                 else:
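Note: this is an off-by-one fix. Each retry first enqueues a token, so with `<` the exception propagated as soon as the 10th token arrived, i.e. after only 9 requeues; `<=` makes the behavior match the "10 retries" comment. A standalone sketch of the counting (the queue and names are illustrative, not the package's):

```python
import asyncio

async def main() -> None:
    retry_queue: asyncio.Queue[int] = asyncio.Queue()
    retries = 0
    while True:
        await retry_queue.put(1)       # record one retry attempt
        if retry_queue.qsize() <= 10:  # old code used `<`, stopping after 9
            retries += 1               # the task would be re-enqueued here
        else:
            break                      # budget exhausted: propagate the error
    print(retries)  # 10 with `<=`; the old `<` allowed only 9

asyncio.run(main())
```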
@@ -766,6 +762,7 @@ async def _create_table_rows_generator(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> AsyncGenerator[dict]:
     batch_size = 20  # generate 20 root table rows at a time
 
@@ -807,6 +804,13 @@
             assert non_context_table_name in data
             non_context_data[non_context_table_name] = data[non_context_table_name]
 
+    # calculate ideal batch size that spreads the workload evenly across workers
+    ideal_batch_size = max(math.ceil(sample_size / n_workers), 5)
+    if ideal_batch_size < batch_size:
+        # never increase batch_size beyond initial value
+        # this is especially important for sequential tables, where batch_size is currently assumed to be 1 everywhere
+        batch_size = ideal_batch_size
+
     # calculate batch_sizes
     assert sample_size is not None, "sample_size should have been filled by this point"
     n_total_batches = len(context_batches) if context_batches is not None else math.ceil(sample_size / batch_size)
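Note: the sizing logic is equivalent to `min(max(ceil(sample_size / n_workers), 5), batch_size)`. A worked example under the values visible in this diff (`batch_size = 20`, `n_workers` capped at 10):

```python
import math

batch_size, n_workers = 20, 10
for sample_size in (8, 37, 400):
    ideal = max(math.ceil(sample_size / n_workers), 5)
    effective = min(ideal, batch_size)  # never increase beyond the initial 20
    print(sample_size, effective)
# 8   -> 5   (ceil(8/10) = 1, raised to the floor of 5)
# 37  -> 5   (ceil(37/10) = 4, raised to the floor of 5)
# 400 -> 20  (ceil(400/10) = 40, capped at the initial 20)
```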
@@ -911,6 +915,12 @@
                 if n_yielded_sequences >= sample_size:
                     break
             n_completed_batches += 1
+            if progress_callback:
+                await progress_callback(
+                    progress=n_completed_batches,
+                    total=n_total_batches,
+                    message=f"Generating rows for table `{name}`: {n_completed_batches}/{n_total_batches}",
+                )
             result_queue.task_done()
 
     # gracefully shutdown workers
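Note: the callback is awaited with keyword arguments only, so any async callable accepting `progress`, `total`, and `message` keywords is compatible. A minimal sketch of such a callback (the name is ours, not part of the package):

```python
async def log_progress(*, progress: int, total: int, message: str | None = None) -> None:
    # matches the keyword-only call site above
    print(f"[{progress}/{total}] {message or ''}")
```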
@@ -1101,6 +1111,54 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
     return execution_plan
 
 
+async def _sample_common(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+):
+    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
+    config = MockConfig(tables)
+
+    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
+
+    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
+    primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
+
+    n_workers = max(min(n_workers, 10), 1)
+
+    execution_plan: list[str] = _build_execution_plan(config)
+
+    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+
+    for table_name in execution_plan:
+        table_config = config.root[table_name]
+        df = await _sample_table(
+            name=table_name,
+            prompt=table_config.prompt,
+            columns=table_config.columns,
+            foreign_keys=table_config.foreign_keys,
+            primary_keys=primary_keys,
+            data=data,
+            sample_size=sample_size.get(table_name),
+            previous_rows_size=10,  # present 10 previously generated rows to the LLM
+            non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
+            n_workers=n_workers,
+            llm_config=llm_config,
+            progress_callback=progress_callback,
+        )
+        data[table_name] = df
+
+    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
+
+
 def sample(
     *,
     tables: dict[str, dict],
@@ -1121,11 +1179,12 @@ def sample(
     or the enrichment of existing datasets with new, context-aware columns.
 
     It is particularly useful for quickly simulating production-like datasets for testing or prototyping purposes.
-    It is advised to limit mocking to small datasets for performance reasons (rows * cols < 100).
+    It is advised to limit mocking to small datasets for performance reasons (rows * cols < 1000).
     It might take a couple of minutes for bigger datasets.
 
     Args:
         tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
+            Note: Avoid using double quotes (`"`) and other special characters in column names.
         sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
             If a single integer is provided, the same number of rows will be generated for each subject table.
             If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
@@ -1329,42 +1388,51 @@
     ```
     """
 
-    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
-    config = MockConfig(tables)
+    def sample_common_sync(*args, **kwargs) -> pd.DataFrame | dict[str, pd.DataFrame]:
+        return asyncio.run(_sample_common(*args, **kwargs))
 
-    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
-
-    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
-    primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
-
-    n_workers = max(min(n_workers, 10), 1)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(
+            sample_common_sync,
+            tables=tables,
+            sample_size=sample_size,
+            existing_data=existing_data,
+            model=model,
+            api_key=api_key,
+            temperature=temperature,
+            top_p=top_p,
+            n_workers=n_workers,
+            return_type=return_type,
+            progress_callback=None,
+        )
+        return future.result()
 
-    execution_plan: list[str] = _build_execution_plan(config)
 
-    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+async def _asample(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+) -> pd.DataFrame | dict[str, pd.DataFrame]:
+    return await _sample_common(
+        tables=tables,
+        sample_size=sample_size,
+        existing_data=existing_data,
+        model=model,
+        api_key=api_key,
+        temperature=temperature,
+        top_p=top_p,
+        n_workers=n_workers,
+        return_type=return_type,
+        progress_callback=progress_callback,
+    )
 
-    # synchronous `sample` function makes independent calls to asynchronous `_sample_table` function
-    # in order to avoid conflicts with potentially existing event loop (e.g. in Jupyter environment),
-    # a new thread is spawned for each call to `_sample_table`
-    # NOTE: initialize executor only once, doing that inside the loop might lead to deadlocks
-    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-        for table_name in execution_plan:
-            table_config = config.root[table_name]
-            future = executor.submit(
-                _sample_table_sync,
-                name=table_name,
-                prompt=table_config.prompt,
-                columns=table_config.columns,
-                foreign_keys=table_config.foreign_keys,
-                primary_keys=primary_keys,
-                data=data,
-                sample_size=sample_size.get(table_name),
-                previous_rows_size=10,  # present 10 previously generated rows to the LLM
-                non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
-                n_workers=n_workers,
-                llm_config=llm_config,
-            )
-            df = future.result()
-            data[table_name] = df
 
-    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
+_asample.__doc__ = sample.__doc__
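Note: the refactor consolidates the orchestration loop into the async `_sample_common` and keeps two thin entry points: the synchronous `sample` still pushes `asyncio.run` onto a dedicated thread (so it remains usable where an event loop is already running, e.g. Jupyter, and now spawns one thread per call instead of one per table), while `_asample` can be awaited directly. A hedged usage sketch (table spec abbreviated from the README examples; an API key is required to actually run it):

```python
from mostlyai import mock
from mostlyai.mock.core import _asample  # private helper; signature taken from this diff

tables = {
    "guests": {
        "prompt": "Guests of a hotel",
        "columns": {"name": {"prompt": "first and last name", "dtype": "string"}},
    }
}

df = mock.sample(tables=tables, sample_size=4)  # safe inside notebooks: runs on its own thread

async def generate():
    # from async code, await the async variant and optionally pass a progress callback
    return await _asample(tables=tables, sample_size=4)
```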
mostlyai/mock/mcp_server.py CHANGED
@@ -16,16 +16,16 @@ import os
 import tempfile
 
 import pandas as pd
-from fastmcp import FastMCP
+from fastmcp import Context, FastMCP
 
-from mostlyai import mock
+from mostlyai.mock.core import _asample
 
 SAMPLE_MOCK_TOOL_DESCRIPTION = f"""
 Synthetic Mock Data.
 
 Use LLMs to generate any Tabular Data towards your needs. Create from scratch, expand existing datasets, or enrich tables with new columns.
 
-This tool is a proxy to the `mostlyai.mock.sample` function, but returns a dictionary of paths to the generated CSV files.
+This tool is a proxy to the `mostlyai.mock.core._asample` function, but returns a dictionary of paths to the generated CSV files.
 
 Present the result nicely to the user, in Markdown format. Example:
 
@@ -33,8 +33,8 @@ Mock data can be found under the following paths:
 - `/tmp/tmpl41bwa6n/players.csv`
 - `/tmp/tmpl41bwa6n/seasons.csv`
 
-== mostlyai.mock.sample DocString ==
-{mock.sample.__doc__}
+== mostlyai.mock.core._asample docstring ==
+{_asample.__doc__}
 """
 
 mcp = FastMCP(name="MostlyAI Mock MCP Server")
@@ -51,7 +51,8 @@ def _store_locally(data: dict[str, pd.DataFrame]) -> dict[str, str]:
 
 
 @mcp.tool(description=SAMPLE_MOCK_TOOL_DESCRIPTION)
-def mock_data(
+async def mock_data(
+    ctx: Context,
     *,
     tables: dict[str, dict],
     sample_size: int,
@@ -60,7 +61,7 @@ def mock_data(
     temperature: float = 1.0,
     top_p: float = 0.95,
 ) -> dict[str, str]:
-    data = mock.sample(
+    data = await _asample(
         tables=tables,
         sample_size=sample_size,
         model=model,
@@ -68,6 +69,7 @@ def mock_data(
         temperature=temperature,
         top_p=top_p,
         return_type="dict",
+        progress_callback=ctx.report_progress,
     )
     locations = _store_locally(data)
     return locations
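Note: `ctx.report_progress` is FastMCP's progress-reporting coroutine; passing it as `progress_callback` means connected MCP clients receive one progress notification per completed batch, via the keyword-only `progress`/`total`/`message` call shown in the `_create_table_rows_generator` hunk above.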
{mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.15.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.13
+Version: 0.1.15
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -24,13 +24,16 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
-Requires-Dist: fastmcp<3.0.0,>=2.0.0
 Requires-Dist: litellm>=1.67.0
 Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: tenacity>=9.1.2
+Provides-Extra: litellm-proxy
+Requires-Dist: litellm[proxy]>=1.67.0; extra == 'litellm-proxy'
+Provides-Extra: mcp
+Requires-Dist: fastmcp<3.0.0,>=2.0.0; extra == 'mcp'
 Description-Content-Type: text/markdown
 
 # Synthetic Mock Data 🔮
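Note: `fastmcp` is no longer an unconditional dependency; running the MCP server now requires the new `mcp` extra (e.g. `pip install "mostlyai-mock[mcp]"`), and the `litellm-proxy` extra similarly gates litellm's optional proxy support.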
@@ -229,7 +232,7 @@
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1", n_workers=1)
 print(df)
 #   employee_id          name  boss_id       role
 # 0        B0-1  Patricia Lee     <NA>  President
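Note: the self-referencing `boss_id` example now passes `n_workers=1`, presumably so rows are generated strictly in order and each new employee can reference a boss that already exists.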
@@ -285,18 +288,18 @@ This repo comes with MCP Server. It can be easily consumed by any MCP Client by
 
 ```json
 {
   "mcpServers": {
     "mostlyai-mock-mcp": {
       "command": "uvx",
-      "args": ["--from", "mostlyai-mock", "mcp-server"],
+      "args": ["--from", "mostlyai-mock[mcp]", "mcp-server"],
       "env": {
         "OPENAI_API_KEY": "PROVIDE YOUR KEY",
         "GEMINI_API_KEY": "PROVIDE YOUR KEY",
         "GROQ_API_KEY": "PROVIDE YOUR KEY",
         "ANTHROPIC_API_KEY": "PROVIDE YOUR KEY"
       }
     }
   }
 }
 ```
 
@@ -306,5 +309,5 @@ For example:
 
 Troubleshooting:
 1. If the MCP Client fails to detect the MCP Server, provide the absolute path in the `command` field, for example: `/Users/johnsmith/.local/bin/uvx`
-2. To debug MCP Server issues, you can use MCP Inspector by running: `npx @modelcontextprotocol/inspector -- uvx --from mostlyai-mock mcp-server`
-3. In order to develop locally, modify the configuration by replacing `"command": "uv"` (or use the full path to `uv` if needed) and `"args": ["--directory", "/Users/johnsmith/mostlyai-mock", "run", "mcp-server"]`
+2. To debug MCP Server issues, you can use MCP Inspector by running: `npx @modelcontextprotocol/inspector -- uvx --from mostlyai-mock[mcp] mcp-server`
+3. In order to develop locally, modify the configuration by replacing `"command": "uv"` (or use the full path to `uv` if needed) and `"args": ["--directory", "/Users/johnsmith/mostlyai-mock", "run", "--extra", "mcp", "mcp-server"]`
{mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.15.dist-info}/RECORD RENAMED
@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=uv2DLnOleN6BNMfAMleXJCPcOZvM_tMTRy5njUIKDag,715
+mostlyai/mock/core.py,sha256=JDJ9nVpRR2WochxumSdQS96zak0OV1frkJOwlQsPVBw,61715
+mostlyai/mock/mcp_server.py,sha256=0Vn1jmrdNAvUZSviaaU7Lhn7L7iHFyd8kGFigM0-4s0,2367
+mostlyai_mock-0.1.15.dist-info/METADATA,sha256=OG3NRdCcH2qycRQ5HrzyLtJwtl74lRw5Py1JtqfB2YI,14305
+mostlyai_mock-0.1.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.15.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.15.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.15.dist-info/RECORD,,
@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=r4GBSmJmB1SGtviYtZwY5b3GBzhK_mt8czzk-py4flo,715
-mostlyai/mock/core.py,sha256=nu0PSX3Xt8l6_95cIrJ7Wt0SbJvfrLD3t0CFIidOLcM,59573
-mostlyai/mock/mcp_server.py,sha256=MrVUrIsAZsFzjK1suwNl1fxS1ES-wpc-YSM8cS8Fqcw,2259
-mostlyai_mock-0.1.13.dist-info/METADATA,sha256=un3lLINiMi8HkVcmsIr64U-OQQiqT5LsgiGam1aNTj4,14110
-mostlyai_mock-0.1.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.13.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.13.dist-info/RECORD,,