mostlyai-mock 0.0.11__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +16 -13
- mostlyai/mock/mcp_server.py +6 -16
- {mostlyai_mock-0.0.11.dist-info → mostlyai_mock-0.1.0.dist-info}/METADATA +2 -2
- mostlyai_mock-0.1.0.dist-info/RECORD +8 -0
- mostlyai_mock-0.0.11.dist-info/RECORD +0 -8
- {mostlyai_mock-0.0.11.dist-info → mostlyai_mock-0.1.0.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.0.11.dist-info → mostlyai_mock-0.1.0.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.0.11.dist-info → mostlyai_mock-0.1.0.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
@@ -25,7 +25,7 @@ import pandas as pd
|
|
25
25
|
from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
|
26
26
|
from tqdm import tqdm
|
27
27
|
|
28
|
-
SYSTEM_PROMPT =
|
28
|
+
SYSTEM_PROMPT = """
|
29
29
|
You are a specialized synthetic data generator designed to create
|
30
30
|
highly realistic, contextually appropriate data based on schema definitions. Your task is to:
|
31
31
|
|
@@ -264,8 +264,8 @@ def _create_table_prompt(
|
|
264
264
|
|
265
265
|
prompt += f"## Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
|
266
266
|
|
267
|
-
prompt +=
|
268
|
-
prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"
|
267
|
+
prompt += "## Context Table Data:\n\n"
|
268
|
+
prompt += f"{context_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
|
269
269
|
|
270
270
|
# add non-context table names, primary keys and data
|
271
271
|
if foreign_keys and len(foreign_keys) > 1:
|
@@ -278,8 +278,10 @@ def _create_table_prompt(
|
|
278
278
|
|
279
279
|
prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
|
280
280
|
|
281
|
-
prompt +=
|
282
|
-
prompt +=
|
281
|
+
prompt += "## Non-Context Table Data:\n\n"
|
282
|
+
prompt += (
|
283
|
+
f"{non_context_data[fk.referenced_table].to_json(orient='records', date_format='iso', indent=2)}\n\n"
|
284
|
+
)
|
283
285
|
|
284
286
|
# add instructions
|
285
287
|
prompt += "\n## Instructions:\n\n"
|
@@ -302,8 +304,8 @@ def _create_table_prompt(
|
|
302
304
|
"Don't copy previous rows in the output. "
|
303
305
|
"Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
|
304
306
|
)
|
305
|
-
prompt +=
|
306
|
-
prompt +=
|
307
|
+
prompt += "Do not use code to generate the data.\n\n"
|
308
|
+
prompt += "Return the full data as a JSON string.\n"
|
307
309
|
|
308
310
|
return prompt
|
309
311
|
|
@@ -384,12 +386,12 @@ def _create_table_rows_generator(
|
|
384
386
|
for i in range(0, len(data), batch_size):
|
385
387
|
yield data.iloc[i : i + batch_size]
|
386
388
|
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
389
|
+
if not llm_config.model.startswith("litellm_proxy/"):
|
390
|
+
# ensure model supports response_format and json schema (this check does not work with litellm_proxy)
|
391
|
+
supported_params = litellm.get_supported_openai_params(model=llm_config.model) or []
|
392
|
+
assert "response_format" in supported_params and litellm.supports_response_schema(llm_config.model), (
|
393
|
+
"The model does not support structured output / JSON mode."
|
394
|
+
)
|
393
395
|
|
394
396
|
# derive context data (if first foreign key is present) and harmonize sample size accordingly
|
395
397
|
context_data: pd.DataFrame | None = None
|
@@ -398,6 +400,7 @@ def _create_table_rows_generator(
|
|
398
400
|
assert generated_data is not None
|
399
401
|
assert context_table_name in generated_data
|
400
402
|
context_data = generated_data[context_table_name]
|
403
|
+
batch_size = 1 # generate one sequence at a time
|
401
404
|
sample_size = len(context_data)
|
402
405
|
|
403
406
|
# derive non-context data (if more than one foreign key is present)
|
mostlyai/mock/mcp_server.py
CHANGED
@@ -2,13 +2,14 @@ import os
|
|
2
2
|
import tempfile
|
3
3
|
|
4
4
|
import pandas as pd
|
5
|
-
from fastmcp import Context, FastMCP
|
5
|
+
from fastmcp import FastMCP
|
6
6
|
|
7
7
|
from mostlyai import mock
|
8
8
|
|
9
9
|
SAMPLE_MOCK_TOOL_DESCRIPTION = f"""
|
10
|
-
|
11
|
-
|
10
|
+
Generate mock data by prompting an LLM.
|
11
|
+
|
12
|
+
This tool is a proxy to the `mostlyai.mock.sample` function, but returns a dictionary of paths to the generated CSV files.
|
12
13
|
|
13
14
|
Present the result nicely to the user, in Markdown format. Example:
|
14
15
|
|
@@ -16,10 +17,7 @@ Mock data can be found under the following paths:
|
|
16
17
|
- `/tmp/tmpl41bwa6n/players.csv`
|
17
18
|
- `/tmp/tmpl41bwa6n/seasons.csv`
|
18
19
|
|
19
|
-
|
20
|
-
What comes after the `=============================` is the documentation of the `mostlyai.mock.sample` function.
|
21
|
-
|
22
|
-
=============================
|
20
|
+
== mostlyai.mock.sample DocString ==
|
23
21
|
{mock.sample.__doc__}
|
24
22
|
"""
|
25
23
|
|
@@ -37,7 +35,7 @@ def _store_locally(data: dict[str, pd.DataFrame]) -> dict[str, str]:
|
|
37
35
|
|
38
36
|
|
39
37
|
@mcp.tool(description=SAMPLE_MOCK_TOOL_DESCRIPTION)
|
40
|
-
def sample_mock_data(
|
38
|
+
def mock_data(
|
41
39
|
*,
|
42
40
|
tables: dict[str, dict],
|
43
41
|
sample_size: int,
|
@@ -45,14 +43,7 @@ def sample_mock_data(
|
|
45
43
|
api_key: str | None = None,
|
46
44
|
temperature: float = 1.0,
|
47
45
|
top_p: float = 0.95,
|
48
|
-
ctx: Context,
|
49
46
|
) -> dict[str, str]:
|
50
|
-
# Notes:
|
51
|
-
# 1. Returning DataFrames directly results in converting them into truncated string.
|
52
|
-
# 2. The logs / progress bars are not propagated to the MCP Client. There is a dedicated API to do that (e.g. `ctx.info(...)`)
|
53
|
-
# 3. MCP Server inherits only selected environment variables (PATH, USER...); one way to pass LLM keys is through client configuration (`mcpServers->env`)
|
54
|
-
# 4. Some MCP Clients, e.g. Cursor, do not like Unions or Optionals in type hints
|
55
|
-
ctx.info(f"Generating mock data for `{len(tables)}` tables")
|
56
47
|
data = mock.sample(
|
57
48
|
tables=tables,
|
58
49
|
sample_size=sample_size,
|
@@ -62,7 +53,6 @@ def sample_mock_data(
|
|
62
53
|
top_p=top_p,
|
63
54
|
return_type="dict",
|
64
55
|
)
|
65
|
-
ctx.info(f"Generated mock data for `{len(tables)}` tables")
|
66
56
|
locations = _store_locally(data)
|
67
57
|
return locations
|
68
58
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mostlyai-mock
|
3
|
-
Version: 0.0.11
|
3
|
+
Version: 0.1.0
|
4
4
|
Summary: Synthetic Mock Data
|
5
5
|
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
6
|
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
@@ -268,4 +268,4 @@ For example:
|
|
268
268
|
Troubleshooting:
|
269
269
|
1. If the MCP Client fails to detect the MCP Server, provide the absolute path in the `command` field, for example: `/Users/johnsmith/.local/bin/uvx`
|
270
270
|
2. To debug MCP Server issues, you can use MCP Inspector by running: `npx @modelcontextprotocol/inspector -- uvx --from mostlyai-mock mcp-server`
|
271
|
-
3. In order to develop locally, modify the configuration by replacing `"command": "uv"` (or use the full path to `uv` if needed) and `"args": ["--
|
271
|
+
3. In order to develop locally, modify the configuration by replacing `"command": "uv"` (or use the full path to `uv` if needed) and `"args": ["--directory", "/Users/johnsmith/mostlyai-mock", "run", "mcp-server"]`
|
@@ -0,0 +1,8 @@
|
|
1
|
+
mostlyai/mock/__init__.py,sha256=ozv-XYJlW9RYYfDXVWDKJxMxqEif6MOC0HXCY3nX0eA,714
|
2
|
+
mostlyai/mock/core.py,sha256=g00md01vP-xyI6FKDUB-PCCzOmtpeN8Bm8jXQEwclJA,30130
|
3
|
+
mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
|
4
|
+
mostlyai_mock-0.1.0.dist-info/METADATA,sha256=rbHAHWTMQDrcJjio0DT8_sstVaVa7B0NGPMqACFoXUY,12708
|
5
|
+
mostlyai_mock-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
+
mostlyai_mock-0.1.0.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
+
mostlyai_mock-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
+
mostlyai_mock-0.1.0.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
mostlyai/mock/__init__.py,sha256=fDLeoKXZggBnsgC_lhC7-O_DltAbc-Wea0QL0xgsjtY,715
|
2
|
-
mostlyai/mock/core.py,sha256=p5VAsRppzAc4P8FqKEunfQ3cPjImUU2cEc6yqHJVhMg,29884
|
3
|
-
mostlyai/mock/mcp_server.py,sha256=F4O49tK2NJV-N53gxbJRNk24-lx5b2YgaUaojQhNAqQ,2318
|
4
|
-
mostlyai_mock-0.0.11.dist-info/METADATA,sha256=tApgQujSIyaH_jutE-_m3-3L8sqm1nXrSrxmzOW-d6M,12680
|
5
|
-
mostlyai_mock-0.0.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
-
mostlyai_mock-0.0.11.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
-
mostlyai_mock-0.0.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
-
mostlyai_mock-0.0.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|