mostlyai-mock 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +37 -20
- mostlyai/mock/mcp_server.py +1 -1
- {mostlyai_mock-0.1.14.dist-info → mostlyai_mock-0.1.16.dist-info}/METADATA +23 -20
- mostlyai_mock-0.1.16.dist-info/RECORD +8 -0
- mostlyai_mock-0.1.14.dist-info/RECORD +0 -8
- {mostlyai_mock-0.1.14.dist-info → mostlyai_mock-0.1.16.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.1.14.dist-info → mostlyai_mock-0.1.16.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.1.14.dist-info → mostlyai_mock-0.1.16.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
@@ -40,10 +40,10 @@ class LLMOutputFormat(str, Enum):
 
 
 class LLMConfig(BaseModel):
-    model: str
-    api_key: str | None
-    temperature: float
-    top_p: float
+    model: str
+    api_key: str | None
+    temperature: float
+    top_p: float
 
 
 class MockConfig(RootModel[dict[str, "TableConfig"]]):

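For context, these four fields are the knobs the generator later hands to LiteLLM. A minimal sketch of how such a config plugs into completion kwargs, under stated assumptions: the plumbing names here are mine, not the package's, and the default values mirror the `_sample_common`/`_asample` signatures shown further down in this diff.

```python
from pydantic import BaseModel

class LLMConfig(BaseModel):
    model: str
    api_key: str | None
    temperature: float
    top_p: float

# Defaults as they appear in the _sample_common/_asample signatures below.
config = LLMConfig(model="openai/gpt-5-nano", api_key=None, temperature=1.0, top_p=0.95)

# Hypothetical plumbing: the config fields become LiteLLM completion kwargs.
litellm_kwargs = {
    "model": config.model,
    "api_key": config.api_key,
    "temperature": config.temperature,
    "top_p": config.top_p,
}
```
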
@@ -270,8 +270,7 @@ async def _sample_table(
 
 
 def _create_system_prompt(llm_output_format: LLMOutputFormat) -> str:
-    return f"""
-You are a specialized data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
+    return f"""You are a specialized data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
 
 Your task is to:
 
@@ -289,8 +288,7 @@ appropriate content. For dates and timestamps, ensure logical chronology. Always
 across tables.
 
 When enriching existing data, carefully analyze the patterns and relationships in the existing columns \
-to generate compatible and realistic values for the missing columns.
-"""
+to generate compatible and realistic values for the missing columns."""
 
 
 def _create_table_prompt(

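Both prompt hunks are whitespace fixes: moving the text onto the same line as the opening `f"""`, and the closing `"""` onto the last text line, strips a leading and a trailing newline from the rendered system prompt. A quick self-contained illustration of the difference:

```python
# Triple-quoted string whose quotes sit on their own lines.
before = """
You are a specialized data generator.
"""
# Same text, with the quotes hugging the content (as in the new code).
after = """You are a specialized data generator."""

assert before == "\nYou are a specialized data generator.\n"
assert after == "You are a specialized data generator."
assert before.strip() == after  # only the surrounding newlines differ
```
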
@@ -647,6 +645,24 @@ async def _worker(
             "stream": True,
         }
 
+        # support for openai reasoning models
+        model_only = llm_config.model.split("/")[-1] if "/" in llm_config.model else llm_config.model
+        reasoning_effort = (
+            "low"
+            if (model_only.startswith("o") and (model_only[1:].isdigit() or model_only[1:].split("-")[0].isdigit()))
+            else "minimal"
+            if (
+                model_only.startswith("gpt-")
+                and model_only.split("-")[1].isdigit()
+                and int(model_only.split("-")[1]) >= 5
+            )
+            else None
+        )
+
+        if reasoning_effort:
+            litellm_kwargs.pop("top_p")
+            litellm_kwargs["reasoning_effort"] = reasoning_effort
+
         # construct messages
         system_prompt = _create_system_prompt(llm_output_format)
         user_prompt = _create_table_prompt(

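Restated as a standalone function for clarity (the function name and asserts are mine; the logic mirrors the added lines): o-series models such as `o3` or `o1-mini` get `reasoning_effort="low"`, `gpt-5`-family models get `"minimal"`, and everything else is left alone. When an effort level is set, the worker also drops `top_p`, which these reasoning models typically reject.

```python
def reasoning_effort_for(model: str) -> str | None:
    # Strip a provider prefix like "openai/".
    name = model.split("/")[-1] if "/" in model else model
    # o-series: "o3", "o1-mini", "o4-mini", ...
    if name.startswith("o") and (name[1:].isdigit() or name[1:].split("-")[0].isdigit()):
        return "low"
    # gpt-N with N >= 5: "gpt-5", "gpt-5-nano", ... ("gpt-4.1" fails the isdigit check)
    if name.startswith("gpt-") and name.split("-")[1].isdigit() and int(name.split("-")[1]) >= 5:
        return "minimal"
    return None

assert reasoning_effort_for("openai/o3-mini") == "low"
assert reasoning_effort_for("openai/gpt-5-nano") == "minimal"
assert reasoning_effort_for("openai/gpt-4.1") is None
assert reasoning_effort_for("gemini/gemini-2.0-flash") is None
```
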
@@ -715,7 +731,7 @@ async def _worker(
         if do_repeat_task:
             # allow 10 retries across all workers before propagating the exception to the orchestrator
             await retry_queue.put(1)
-            if retry_queue.qsize()
+            if retry_queue.qsize() <= 10:
                 # put task back to the front of the batch queue
                 await batch_queue.put((batch_idx, task))
             else:

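The pattern here is a global retry budget shared across workers, using an `asyncio.Queue` as a counter: every failure puts one token, and once more than 10 tokens have accumulated the exception propagates to the orchestrator instead of being retried. A simplified sketch with assumed names (the real worker has more context around this branch):

```python
import asyncio

async def handle_failure(
    retry_queue: asyncio.Queue,  # shared by all workers: one token per failed attempt
    batch_queue: asyncio.Queue,  # pending work items
    batch_idx: int,
    task: object,
    exc: Exception,
) -> None:
    await retry_queue.put(1)  # record this failure
    if retry_queue.qsize() <= 10:  # global budget: at most 10 retries in total
        await batch_queue.put((batch_idx, task))  # re-queue the task for another attempt
    else:
        raise exc  # budget exhausted: let the orchestrator see the error
```
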
@@ -1118,7 +1134,7 @@ async def _sample_common(
     tables: dict[str, dict],
     sample_size: int | dict[str, int] = 4,
     existing_data: dict[str, pd.DataFrame] | None = None,
-    model: str = "openai/gpt-
+    model: str = "openai/gpt-5-nano",
     api_key: str | None = None,
     temperature: float = 1.0,
     top_p: float = 0.95,

@@ -1166,7 +1182,7 @@ def sample(
     tables: dict[str, dict],
     sample_size: int | dict[str, int] = 4,
     existing_data: dict[str, pd.DataFrame] | None = None,
-    model: str = "openai/gpt-
+    model: str = "openai/gpt-5-nano",
    api_key: str | None = None,
    temperature: float = 1.0,
    top_p: float = 0.95,

@@ -1181,11 +1197,12 @@ def sample(
     or the enrichment of existing datasets with new, context-aware columns.
 
     It is particularly useful for quickly simulating production-like datasets for testing or prototyping purposes.
-    It is advised to limit mocking to small datasets for performance reasons (rows * cols <
+    It is advised to limit mocking to small datasets for performance reasons (rows * cols < 1000).
     It might take a couple of minutes for bigger datasets.
 
     Args:
         tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
+            Note: Avoid using double quotes (`"`) and other special characters in column names.
         sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
             If a single integer is provided, the same number of rows will be generated for each subject table.
             If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.

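The two `sample_size` forms described in that docstring, in use. This is a hedged sketch: the table spec is shortened, and the exact column-spec keys follow the package's README examples rather than anything shown in this hunk; running it requires an `OPENAI_API_KEY`.

```python
from mostlyai import mock

tables = {
    "guests": {
        "prompt": "Guests of a hotel",
        "columns": {
            "name": {"prompt": "first and last name", "dtype": "string"},
        },
    }
}

df_a = mock.sample(tables=tables, sample_size=4)              # one size for every subject table
df_b = mock.sample(tables=tables, sample_size={"guests": 8})  # per-table sizes via a dict
```
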
@@ -1195,9 +1212,9 @@ def sample(
             Default is None.
         model (str): The LiteLLM chat completion model to be used.
             Examples include:
-            - `openai/gpt-
-            - `openai/gpt-
-            - `openai/gpt-
+            - `openai/gpt-5-nano` (default; fast, and smart)
+            - `openai/gpt-5-mini` (slower, but smarter)
+            - `openai/gpt-5` (slowest, but smartest)
             - `gemini/gemini-2.0-flash`
             - `gemini/gemini-2.5-flash-preview-04-17`
             - 'groq/gemma2-9b-it`

@@ -1235,7 +1252,7 @@ def sample(
             },
         }
     }
-    df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-
+    df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5-nano")
    ```
 
    Example of generating mock data for multiple tables (with PK/FK relationships):

@@ -1298,7 +1315,7 @@ def sample(
             ],
         },
     }
-    data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-
+    data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-5")
     df_customers = data["customers"]
     df_warehouses = data["warehouses"]
     df_orders = data["orders"]

@@ -1327,7 +1344,7 @@ def sample(
     enriched_df = mock.sample(
         tables=tables,
         existing_data={"patients": existing_df},
-        model="openai/gpt-
+        model="openai/gpt-5-nano"
     )
     enriched_df
     ```

@@ -1382,7 +1399,7 @@ def sample(
             "customers": existing_customers,
             "orders": existing_orders,
         },
-        model="openai/gpt-
+        model="openai/gpt-5-nano"
     )
     df_customers = data["customers"]
     df_orders = data["orders"]

@@ -1414,7 +1431,7 @@ async def _asample(
     tables: dict[str, dict],
     sample_size: int | dict[str, int] = 4,
     existing_data: dict[str, pd.DataFrame] | None = None,
-    model: str = "openai/gpt-
+    model: str = "openai/gpt-5-nano",
     api_key: str | None = None,
     temperature: float = 1.0,
     top_p: float = 0.95,

mostlyai/mock/mcp_server.py
CHANGED
{mostlyai_mock-0.1.14.dist-info → mostlyai_mock-0.1.16.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.14
+Version: 0.1.16
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock

@@ -24,13 +24,16 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
-Requires-Dist: fastmcp<3.0.0,>=2.0.0
 Requires-Dist: litellm>=1.67.0
 Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: tenacity>=9.1.2
+Provides-Extra: litellm-proxy
+Requires-Dist: litellm[proxy]>=1.67.0; extra == 'litellm-proxy'
+Provides-Extra: mcp
+Requires-Dist: fastmcp<3.0.0,>=2.0.0; extra == 'mcp'
 Description-Content-Type: text/markdown
 
 # Synthetic Mock Data 🔮

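The practical effect of this hunk: `fastmcp` is no longer a hard dependency. MCP users now opt in with `pip install "mostlyai-mock[mcp]"`, and the new `litellm-proxy` extra likewise pulls in `litellm[proxy]` only on request.
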
@@ -92,7 +95,7 @@ tables = {
 df = mock.sample(
     tables=tables,  # provide table and column definitions
     sample_size=10,  # generate 10 records
-    model="openai/gpt-
+    model="openai/gpt-5-nano",  # select the LLM model (optional)
 )
 print(df)
 # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number

@@ -173,7 +176,7 @@ tables = {
 data = mock.sample(
     tables=tables,
     sample_size=2,
-    model="openai/gpt-
+    model="openai/gpt-5",
     n_workers=1,
 )
 print(data["customers"])

@@ -229,7 +232,7 @@ tables = {
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5", n_workers=1)
 print(df)
 # employee_id name boss_id role
 # 0 B0-1 Patricia Lee <NA> President

@@ -270,7 +273,7 @@ existing_guests = pd.DataFrame({
 df = mock.sample(
     tables=tables,
     existing_data={"guests": existing_guests},
-    model="openai/gpt-
+    model="openai/gpt-5-nano"
 )
 print(df)
 # guest_id name nationality gender age room_number is_vip

@@ -285,18 +288,18 @@ This repo comes with MCP Server. It can be easily consumed by any MCP Client by
 
 ```json
 {
-
-
-
-
-
-
-
-
-
-
-
-
+  "mcpServers": {
+    "mostlyai-mock-mcp": {
+      "command": "uvx",
+      "args": ["--from", "mostlyai-mock[mcp]", "mcp-server"],
+      "env": {
+        "OPENAI_API_KEY": "PROVIDE YOUR KEY",
+        "GEMINI_API_KEY": "PROVIDE YOUR KEY",
+        "GROQ_API_KEY": "PROVIDE YOUR KEY",
+        "ANTHROPIC_API_KEY": "PROVIDE YOUR KEY"
+      }
+    }
+  }
 }
 ```
 

@@ -306,5 +309,5 @@ For example:
 
 Troubleshooting:
 1. If the MCP Client fails to detect the MCP Server, provide the absolute path in the `command` field, for example: `/Users/johnsmith/.local/bin/uvx`
-2. To debug MCP Server issues, you can use MCP Inspector by running: `npx @modelcontextprotocol/inspector -- uvx --from mostlyai-mock mcp-server`
-3. In order to develop locally, modify the configuration by replacing `"command": "uv"` (or use the full path to `uv` if needed) and `"args": ["--directory", "/Users/johnsmith/mostlyai-mock", "run", "mcp-server"]`
+2. To debug MCP Server issues, you can use MCP Inspector by running: `npx @modelcontextprotocol/inspector -- uvx --from mostlyai-mock[mcp] mcp-server`
+3. In order to develop locally, modify the configuration by replacing `"command": "uv"` (or use the full path to `uv` if needed) and `"args": ["--directory", "/Users/johnsmith/mostlyai-mock", "run", "--extra", "mcp", "mcp-server"]`

mostlyai_mock-0.1.16.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=XEezyGjkXQBReW_ORi83H2WEVhLolDDLbGjxA2g2yEs,715
+mostlyai/mock/core.py,sha256=FTF0BfJowxNHm_L0RpTk6BhS1mXzvjELP-3Z96aFVMQ,62454
+mostlyai/mock/mcp_server.py,sha256=uDLg0SeMPV2VZhXviM-F769W0xlmhGwlmQiQhY0Q-Ik,2365
+mostlyai_mock-0.1.16.dist-info/METADATA,sha256=CT6lcz2cAq5W-u3VjQLr_Dg8VbuEtU-JlvsXg5OsKTk,14297
+mostlyai_mock-0.1.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.16.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.16.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.16.dist-info/RECORD,,

mostlyai_mock-0.1.14.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=MLHwi5g6_lAEd8cDEISbVdRWmorOVAQ6IoMm8BsRpqg,715
-mostlyai/mock/core.py,sha256=JdWHix-Pp0s---b_Z3f2ui7J7LSl4_r_gPP0z8UHKY8,61663
-mostlyai/mock/mcp_server.py,sha256=0Vn1jmrdNAvUZSviaaU7Lhn7L7iHFyd8kGFigM0-4s0,2367
-mostlyai_mock-0.1.14.dist-info/METADATA,sha256=PHiUTSEvevYTPVvsMGT-kilTDwaEgIEL0T8Vr56PSiY,14123
-mostlyai_mock-0.1.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.14.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.14.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.14.dist-info/RECORD,,

{mostlyai_mock-0.1.14.dist-info → mostlyai_mock-0.1.16.dist-info}/WHEEL
File without changes

{mostlyai_mock-0.1.14.dist-info → mostlyai_mock-0.1.16.dist-info}/entry_points.txt
File without changes

{mostlyai_mock-0.1.14.dist-info → mostlyai_mock-0.1.16.dist-info}/licenses/LICENSE
File without changes