mostlyai-mock 0.0.7__tar.gz → 0.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/PKG-INFO +48 -8
- {mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/README.md +46 -7
- {mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/mostlyai/mock/core.py +136 -144
- mostlyai_mock-0.0.9/mostlyai/mock/mcp_server.py +85 -0
- {mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/pyproject.toml +5 -1
- {mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/.gitignore +0 -0
- {mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/LICENSE +0 -0
{mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.0.7
+Version: 0.0.9
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -24,6 +24,7 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
+Requires-Dist: fastmcp<3.0.0,>=2.0.0
 Requires-Dist: litellm>=1.67.0
 Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
@@ -72,7 +73,7 @@ from mostlyai import mock
 
 tables = {
     "guests": {
-        "
+        "prompt": "Guests of an Alpine ski hotel in Austria",
         "columns": {
             "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
             "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -112,7 +113,7 @@ from mostlyai import mock
 
 tables = {
     "customers": {
-        "
+        "prompt": "Customers of a hardware store",
         "columns": {
             "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
             "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
@@ -120,7 +121,7 @@ tables = {
         "primary_key": "customer_id",
     },
    "warehouses": {
-        "
+        "prompt": "Warehouses of a hardware store",
         "columns": {
             "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
             "name": {"prompt": "the name of the warehouse", "dtype": "string"},
@@ -128,7 +129,7 @@ tables = {
         "primary_key": "warehouse_id",
     },
     "orders": {
-        "
+        "prompt": "Orders of a Customer",
         "columns": {
             "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
             "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
@@ -141,7 +142,7 @@ tables = {
             {
                 "column": "customer_id",
                 "referenced_table": "customers",
-                "
+                "prompt": "each customer has anywhere between 2 and 3 orders",
             },
             {
                 "column": "warehouse_id",
@@ -150,7 +151,7 @@ tables = {
         ],
     },
     "items": {
-        "
+        "prompt": "Items in an Order",
         "columns": {
             "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
             "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -161,7 +162,7 @@ tables = {
             {
                 "column": "order_id",
                 "referenced_table": "orders",
-                "
+                "prompt": "each order has between 1 and 2 items",
             }
         ],
     },
@@ -199,3 +200,42 @@ print(data["items"])
 # 8  ITM-84311  ORD-11385  Ergonomic Task Chair, Black Mesh  359.25
 # 9  ITM-84312  ORD-11385  Standard Delivery Service  48.5
 ```
+
+6. Create your first self-referencing synthetic table
+
+```python
+from mostlyai import mock
+
+tables = {
+    "employees": {
+        "prompt": "Employees of a company",
+        "columns": {
+            "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
+            "name": {"prompt": "first name and last name of the president", "dtype": "string"},
+            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
+            "role": {"prompt": "the role of the employee", "dtype": "string"},
+        },
+        "primary_key": "employee_id",
+        "foreign_keys": [
+            {
+                "column": "boss_id",
+                "referenced_table": "employees",
+                "prompt": "each boss has at most 3 employees",
+            },
+        ],
+    }
+}
+df = sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+print(df)
+#    employee_id             name  boss_id                      role
+# 0            1  Sandra Phillips     <NA>                 President
+# 1            2      Marcus Tran        1   Chief Financial Officer
+# 2            3    Ava Whittaker        1  Chief Technology Officer
+# 3            4    Sophie Martin        1  Chief Operations Officer
+# 4            5      Chad Nelson        2           Finance Manager
+# 5            6     Ethan Glover        2         Senior Accountant
+# 6            7   Kimberly Ortiz        2         Junior Accountant
+# 7            8     Lucas Romero        3                IT Manager
+# 8            9      Priya Desai        3    Lead Software Engineer
+# 9           10    Felix Bennett        3    Senior Systems Analyst
+```
````
{mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/README.md

````diff
@@ -39,7 +39,7 @@ from mostlyai import mock
 
 tables = {
     "guests": {
-        "
+        "prompt": "Guests of an Alpine ski hotel in Austria",
         "columns": {
             "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
             "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -79,7 +79,7 @@ from mostlyai import mock
 
 tables = {
     "customers": {
-        "
+        "prompt": "Customers of a hardware store",
         "columns": {
             "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
             "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
@@ -87,7 +87,7 @@ tables = {
         "primary_key": "customer_id",
     },
     "warehouses": {
-        "
+        "prompt": "Warehouses of a hardware store",
         "columns": {
             "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
             "name": {"prompt": "the name of the warehouse", "dtype": "string"},
@@ -95,7 +95,7 @@ tables = {
         "primary_key": "warehouse_id",
     },
     "orders": {
-        "
+        "prompt": "Orders of a Customer",
         "columns": {
             "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
             "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
@@ -108,7 +108,7 @@ tables = {
             {
                 "column": "customer_id",
                 "referenced_table": "customers",
-                "
+                "prompt": "each customer has anywhere between 2 and 3 orders",
             },
             {
                 "column": "warehouse_id",
@@ -117,7 +117,7 @@ tables = {
         ],
     },
     "items": {
-        "
+        "prompt": "Items in an Order",
         "columns": {
             "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
             "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -128,7 +128,7 @@ tables = {
             {
                 "column": "order_id",
                 "referenced_table": "orders",
-                "
+                "prompt": "each order has between 1 and 2 items",
             }
         ],
     },
@@ -166,3 +166,42 @@ print(data["items"])
 # 8  ITM-84311  ORD-11385  Ergonomic Task Chair, Black Mesh  359.25
 # 9  ITM-84312  ORD-11385  Standard Delivery Service  48.5
 ```
+
+6. Create your first self-referencing synthetic table
+
+```python
+from mostlyai import mock
+
+tables = {
+    "employees": {
+        "prompt": "Employees of a company",
+        "columns": {
+            "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
+            "name": {"prompt": "first name and last name of the president", "dtype": "string"},
+            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
+            "role": {"prompt": "the role of the employee", "dtype": "string"},
+        },
+        "primary_key": "employee_id",
+        "foreign_keys": [
+            {
+                "column": "boss_id",
+                "referenced_table": "employees",
+                "prompt": "each boss has at most 3 employees",
+            },
+        ],
+    }
+}
+df = sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+print(df)
+#    employee_id             name  boss_id                      role
+# 0            1  Sandra Phillips     <NA>                 President
+# 1            2      Marcus Tran        1   Chief Financial Officer
+# 2            3    Ava Whittaker        1  Chief Technology Officer
+# 3            4    Sophie Martin        1  Chief Operations Officer
+# 4            5      Chad Nelson        2           Finance Manager
+# 5            6     Ethan Glover        2         Senior Accountant
+# 6            7   Kimberly Ortiz        2         Junior Accountant
+# 7            8     Lucas Romero        3                IT Manager
+# 8            9      Priya Desai        3    Lead Software Engineer
+# 9           10    Felix Bennett        3    Senior Systems Analyst
+```
````
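One observation on the example added in this release: the snippet imports `from mostlyai import mock` but then calls `sample(...)` without the module prefix, so it does not run as written. A minimal corrected sketch, assuming the bare call refers to the public `mock.sample` entry point used by every other example in the README:

```python
# Minimal runnable sketch of the new self-referencing example; assumption:
# the bare `sample(...)` call in the released README refers to `mock.sample`.
from mostlyai import mock

tables = {
    "employees": {
        "prompt": "Employees of a company",
        "columns": {
            "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
            # the released example says "president" here, presumably a leftover;
            # "employee" matches the table's intent
            "name": {"prompt": "first name and last name of the employee", "dtype": "string"},
            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
            "role": {"prompt": "the role of the employee", "dtype": "string"},
        },
        "primary_key": "employee_id",
        "foreign_keys": [
            {
                "column": "boss_id",
                "referenced_table": "employees",
                "prompt": "each boss has at most 3 employees",
            },
        ],
    }
}

df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
print(df)
```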
{mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/mostlyai/mock/core.py

```diff
@@ -44,8 +44,10 @@ across tables.
 
 
 class LLMConfig(BaseModel):
-    model: str
+    model: str = "openai/gpt-4.1-nano"
     api_key: str | None = None
+    temperature: float = 1.0
+    top_p: float = 0.95
 
 
 class MockConfig(RootModel[dict[str, "TableConfig"]]):
```
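With this change every `LLMConfig` field has a default, and the sampling parameters travel inside the config instead of as separate function arguments (see the `_sample_table` and `_create_table_rows_generator` hunks below). A small sketch of the resulting behavior, assuming the internal class is imported from `mostlyai.mock.core` where this diff defines it:

```python
# Sketch of the new defaults on the internal LLMConfig (mostlyai/mock/core.py).
from mostlyai.mock.core import LLMConfig

cfg = LLMConfig()  # now valid: every field has a default
assert cfg.model == "openai/gpt-4.1-nano"
assert cfg.temperature == 1.0 and cfg.top_p == 0.95

# overriding the generation parameters in one place
cfg = LLMConfig(model="openai/gpt-4.1", temperature=0.7, top_p=0.9)
```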
```diff
@@ -100,10 +102,8 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
         if table_name in path:
             cycle_start = path.index(table_name)
             cycle = path[cycle_start:] + [table_name]
-
-
-            msg += " Self-referencing tables are not yet supported."
-            raise ValueError(msg)
+            if len(cycle) > 2:  # len(cycle) == 2 means self-referencing table, which is allowed
+                raise ValueError(f"Circular dependency detected: {' -> '.join(cycle)}.")
         if table_name in visited:
             return
         visited.add(table_name)
```
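This reworked validator is what enables the release's headline feature: a cycle of length two, i.e. a table referencing itself, now passes validation, while genuine multi-table cycles are still rejected. A sketch of both cases, assuming `MockConfig` is importable from `mostlyai.mock.core` and that the trimmed column specs below satisfy `ColumnConfig`:

```python
from mostlyai.mock.core import MockConfig

# allowed since 0.0.9: a self-reference (cycle of length two)
self_ref = {
    "employees": {
        "prompt": "Employees of a company",
        "columns": {
            "employee_id": {"prompt": "unique id", "dtype": "integer"},
            "boss_id": {"prompt": "id of the boss", "dtype": "integer"},
        },
        "primary_key": "employee_id",
        "foreign_keys": [{"column": "boss_id", "referenced_table": "employees"}],
    }
}
MockConfig(self_ref)  # validates

# still rejected: a genuine two-table cycle (a -> b -> a)
cycle = {
    "a": {
        "columns": {"a_id": {"prompt": "id", "dtype": "integer"}, "b_id": {"prompt": "ref", "dtype": "integer"}},
        "primary_key": "a_id",
        "foreign_keys": [{"column": "b_id", "referenced_table": "b"}],
    },
    "b": {
        "columns": {"b_id": {"prompt": "id", "dtype": "integer"}, "a_id": {"prompt": "ref", "dtype": "integer"}},
        "primary_key": "b_id",
        "foreign_keys": [{"column": "a_id", "referenced_table": "a"}],
    },
}
try:
    MockConfig(cycle)
except ValueError as exc:  # pydantic's ValidationError subclasses ValueError
    print(exc)  # message contains "Circular dependency detected: a -> b -> a."
```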
```diff
@@ -119,7 +119,7 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
 
 
 class TableConfig(BaseModel):
-
+    prompt: str = ""
     columns: dict[str, ColumnConfig] = Field(..., min_items=1)
     primary_key: str | None = None
     foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
@@ -187,83 +187,78 @@ class DType(str, Enum):
 class ForeignKeyConfig(BaseModel):
     column: str
     referenced_table: str
-
+    prompt: str | None = None
 
 
 def _sample_table(
     *,
-
-
+    name: str,
+    prompt: str,
+    columns: dict[str, ColumnConfig],
+    foreign_keys: list[ForeignKeyConfig] | None,
     primary_keys: dict[str, str] | None,
-    sample_size: int | None,
     generated_data: dict[str, pd.DataFrame] | None,
-
-    top_p: float,
+    sample_size: int,
     batch_size: int,
     previous_rows_size: int,
     non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> pd.DataFrame:
     table_rows_generator = _create_table_rows_generator(
-
-
+        name=name,
+        prompt=prompt,
+        columns=columns,
         primary_keys=primary_keys,
-
+        foreign_keys=foreign_keys,
         generated_data=generated_data,
-
-        top_p=top_p,
+        sample_size=sample_size,
         batch_size=batch_size,
         previous_rows_size=previous_rows_size,
         non_context_size=non_context_size,
         llm_config=llm_config,
     )
-    table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{
-    table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator,
+    table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
+    table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
     return table_df
 
 
 def _create_table_prompt(
     *,
-
-
+    name: str,
+    prompt: str,
     columns: dict[str, ColumnConfig],
     primary_keys: dict[str, str] | None,
     batch_size: int | None,
     foreign_keys: list[ForeignKeyConfig] | None,
     context_data: pd.DataFrame | None,
-    non_context_data: dict[str, pd.DataFrame],
-    previous_rows: list[dict],
+    non_context_data: dict[str, pd.DataFrame] | None,
+    previous_rows: list[dict] | None,
 ) -> str:
-
-
-        assert context_data is None
-    else:
-        assert foreign_keys is not None
-        assert context_data is not None
-        assert primary_keys is not None
-
-    # add description
-    prompt = f"# {table_description}\n\n"
+    # add table prompt
+    prompt = f"# {prompt}\n\n"
 
     # define table
-    prompt += f"## Table: {
+    prompt += f"## Table: {name}\n\n"
+
+    prompt += f"## Table Primary Key: `{primary_keys[name]}`\n\n"
 
     # add columns specifications
     prompt += "## Columns Specifications:\n\n"
     prompt += f"{json.dumps({name: config.model_dump() for name, config in columns.items()}, indent=2)}\n\n"
 
-    # define foreign keys
-    if foreign_keys is not None:
-        prompt += "## Foreign Keys:\n\n"
-        prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
-
     # add previous rows as context to help the LLM generate consistent data
     if previous_rows:
         prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
         prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
 
+    # define foreign keys
+    if foreign_keys:
+        prompt += "## Foreign Keys:\n\n"
+        prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
+
     # add context table name, primary key and data
-    if
+    if foreign_keys and foreign_keys[0].referenced_table != name:  # self-dependency is not considered as context
+        assert context_data is not None
         fk = foreign_keys[0]
         prompt += f"## Context Table: `{fk.referenced_table}`\n\n"
 
@@ -273,8 +268,12 @@ def _create_table_prompt(
         prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"
 
     # add non-context table names, primary keys and data
-    if
+    if foreign_keys and len(foreign_keys) > 1:
         for fk in foreign_keys[1:]:
+            if fk.referenced_table == name:  # self-dependency is not considered as non-context
+                continue
+            assert non_context_data is not None
+            assert fk.referenced_table in non_context_data
             prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
 
             prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
@@ -284,15 +283,17 @@ def _create_table_prompt(
 
     # add instructions
     prompt += "\n## Instructions:\n\n"
-    if
-
-
-
+    if not foreign_keys:
+        assert batch_size is not None
+        prompt += f"Generate {batch_size} rows for the `{name}` table.\n\n"
+    else:
         prompt += (
-            f"Generate data for the `{
+            f"Generate data for the `{name}` table. "
             f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
-            f"The
-            f"
+            f"The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
+            f"If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
+            f"In this case, ensure that the generated foreign keys are consistent with generated primary keys of the table. "
+            f"Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
         )
 
     if previous_rows:
@@ -309,13 +310,13 @@ def _create_table_prompt(
 
 def _create_table_rows_generator(
     *,
-
-
+    name: str,
+    prompt: str,
+    columns: dict[str, ColumnConfig],
+    foreign_keys: list[ForeignKeyConfig] | None,
     primary_keys: dict[str, str] | None,
-    sample_size: int | None,
     generated_data: dict[str, pd.DataFrame] | None,
-
-    top_p: float,
+    sample_size: int,
     batch_size: int,
     previous_rows_size: int,
     non_context_size: int | None,
@@ -383,37 +384,38 @@ def _create_table_rows_generator(
         for i in range(0, len(data), batch_size):
             yield data.iloc[i : i + batch_size]
 
+    # ensure model supports response_format and json schema
+    supported_params = litellm.get_supported_openai_params(model=llm_config.model)
+    assert "response_format" in supported_params
+    assert litellm.supports_response_schema(llm_config.model), (
+        "The model does not support structured output / JSON mode."
+    )
+
     # derive context data (if first foreign key is present) and harmonize sample size accordingly
     context_data: pd.DataFrame | None = None
-    if
-        context_table_name =
+    if foreign_keys and foreign_keys[0].referenced_table != name:  # self-dependency is not considered as context
+        context_table_name = foreign_keys[0].referenced_table
         assert generated_data is not None
         assert context_table_name in generated_data
         context_data = generated_data[context_table_name]
         sample_size = len(context_data)
-    assert sample_size is not None
 
     # derive non-context data (if more than one foreign key is present)
     non_context_data: dict[str, pd.DataFrame] = {}
-    if
+    if foreign_keys and len(foreign_keys) > 1:
         assert generated_data is not None
         assert non_context_size is not None
-        for fk in
+        for fk in foreign_keys[1:]:
+            if fk.referenced_table == name:  # self-dependency is not considered as non-context
+                continue
             non_context_table_name = fk.referenced_table
             assert non_context_table_name in generated_data
             non_context_data[non_context_table_name] = generated_data[non_context_table_name]
 
-    # ensure model supports response_format and json schema
-    supported_params = litellm.get_supported_openai_params(model=llm_config.model)
-    assert "response_format" in supported_params
-    assert litellm.supports_response_schema(llm_config.model), (
-        "The model does not support structured output / JSON mode."
-    )
-
     litellm_kwargs = {
-        "response_format": create_table_response_format(columns=
-        "temperature": temperature,
-        "top_p": top_p,
+        "response_format": create_table_response_format(columns=columns),
+        "temperature": llm_config.temperature,
+        "top_p": llm_config.top_p,
         "model": llm_config.model,
         "api_key": llm_config.api_key,
         "stream": True,
@@ -427,18 +429,17 @@ def _create_table_rows_generator(
             if non_context_data
             else None
         )
-
-
-
-
-
-
-
-
-
-
-
-        prompt = _create_table_prompt(**prompt_kwargs)
+        prompt = _create_table_prompt(
+            name=name,
+            prompt=prompt,
+            columns=columns,
+            primary_keys=primary_keys,
+            batch_size=batch_size,
+            foreign_keys=foreign_keys,
+            context_data=context_batch,
+            non_context_data=non_context_batch,
+            previous_rows=list(previous_rows),
+        )
         messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}]
 
         response = litellm.completion(messages=messages, **litellm_kwargs)
@@ -464,7 +465,8 @@
 
 
 def _convert_table_rows_generator_to_df(
-    table_rows_generator: Generator[dict],
+    table_rows_generator: Generator[dict],
+    columns: dict[str, ColumnConfig],
 ) -> pd.DataFrame:
     def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
         for column_name, column_config in columns.items():
@@ -485,7 +487,7 @@ def _convert_table_rows_generator_to_df(
         return df
 
     df = pd.DataFrame(list(table_rows_generator))
-    df = align_df_dtypes_with_mock_dtypes(df,
+    df = align_df_dtypes_with_mock_dtypes(df, columns)
     return df
 
 
@@ -498,30 +500,32 @@ def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig
     return sample_size
 
 
-def
-
+def _build_execution_plan(config: MockConfig) -> list[str]:
+    def build_dependency_mappings(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
+        child_to_parents = {}
+        parent_to_children = {}
 
-
-
-
+        for table_name in config.root:
+            child_to_parents[table_name] = set()
+            parent_to_children[table_name] = set()
 
-
-
-
-
-
-
+        for table_name, table_config in config.root.items():
+            if table_config.foreign_keys:
+                for fk in table_config.foreign_keys:
+                    referenced_table = fk.referenced_table
+                    child_to_parents[table_name].add(referenced_table)
+                    parent_to_children[referenced_table].add(table_name)
 
-
-
+        root_tables = []
+        for table_name, parents in child_to_parents.items():
+            if not parents or parents == {table_name}:  # no dependencies or only self-dependency
+                root_tables.append(table_name)
+        return child_to_parents, parent_to_children, root_tables
 
+    child_to_parents, parent_to_children, root_tables = build_dependency_mappings(config)
 
-def _build_execution_plan(
-    parent_to_children: dict[str, list[str]], child_to_parents: dict[str, list[str]], subject_tables: list[str]
-) -> list[str]:
     execution_plan = []
-    bfs_queue = list(
+    bfs_queue = list(root_tables)
    processed = set()
 
     while bfs_queue:
```
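`_build_execution_plan` now takes the whole `MockConfig` and derives the parent/child maps internally; tables whose only dependency is themselves count as root tables. A sketch of the resulting ordering, assuming the internal names from this diff (`MockConfig` and `_build_execution_plan` in `mostlyai.mock.core`) and that these trimmed column specs pass validation:

```python
from mostlyai.mock.core import MockConfig, _build_execution_plan

tables = {
    "orders": {
        "columns": {
            "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
            "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
        },
        "primary_key": "order_id",
        "foreign_keys": [{"column": "customer_id", "referenced_table": "customers"}],
    },
    "customers": {
        "columns": {"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"}},
        "primary_key": "customer_id",
    },
}

# BFS from the root tables: parents are always generated before children,
# regardless of the order in which tables appear in the config.
plan = _build_execution_plan(MockConfig(tables))
print(plan)  # ['customers', 'orders']
```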
```diff
@@ -530,7 +534,10 @@
             continue
 
         # ensure all parents are processed before processing this table
-        unprocessed_parents = [
+        unprocessed_parents = []
+        for parent in child_to_parents[table_name]:
+            if parent not in processed and parent != table_name:  # exclude self-dependency
+                unprocessed_parents.append(parent)
         if unprocessed_parents:
             bfs_queue.extend(unprocessed_parents)
             bfs_queue.append(table_name)
@@ -553,6 +560,7 @@ def sample(
     api_key: str | None = None,
     temperature: float = 1.0,
     top_p: float = 0.95,
+    return_type: Literal["auto", "dict"] = "auto",
 ) -> pd.DataFrame | dict[str, pd.DataFrame]:
     """
     Generate mock data by prompting an LLM.
@@ -577,6 +585,7 @@
         api_key (str | None): The API key to use for the LLM. If not provided, LiteLLM will take it from the environment variables.
         temperature (float): The temperature to use for the LLM. Default is 1.0.
         top_p (float): The top-p value to use for the LLM. Default is 0.95.
+        return_type (Literal["auto", "dict"]): The format of the returned data. Default is "auto".
 
     Returns:
         - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
```
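The new `return_type` flag pins down a return shape that was previously implicit in the table count. A short usage sketch; the model id is an example and an API key in the environment is assumed:

```python
from mostlyai import mock

tables = {
    "guests": {
        "prompt": "Guests of an Alpine ski hotel in Austria",
        "columns": {
            "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
        },
    }
}

# "auto" (default): a single-table config comes back as a bare DataFrame
df = mock.sample(tables=tables, sample_size=5, model="openai/gpt-4.1-nano")

# "dict": always {table_name: DataFrame}, even for a single table; the new
# MCP server below calls sample(..., return_type="dict") for exactly this reason
data = mock.sample(tables=tables, sample_size=5, model="openai/gpt-4.1-nano", return_type="dict")
print(data["guests"].head())
```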
```diff
@@ -588,7 +597,7 @@ def sample(
 
     tables = {
         "guests": {
-            "
+            "prompt": "Guests of an Alpine ski hotel in Austria",
             "columns": {
                 "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
                 "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -611,7 +620,7 @@ def sample(
 
     tables = {
         "customers": {
-            "
+            "prompt": "Customers of a hardware store",
             "columns": {
                 "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
                 "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
@@ -619,7 +628,7 @@ def sample(
             "primary_key": "customer_id",
         },
         "warehouses": {
-            "
+            "prompt": "Warehouses of a hardware store",
             "columns": {
                 "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
                 "name": {"prompt": "the name of the warehouse", "dtype": "string"},
@@ -627,7 +636,7 @@ def sample(
             "primary_key": "warehouse_id",
         },
         "orders": {
-            "
+            "prompt": "Orders of a Customer",
             "columns": {
                 "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
                 "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
@@ -640,7 +649,7 @@ def sample(
                 {
                     "column": "customer_id",
                     "referenced_table": "customers",
-                    "
+                    "prompt": "each customer has anywhere between 2 and 3 orders",
                 },
                 {
                     "column": "warehouse_id",
@@ -649,7 +658,7 @@ def sample(
             ],
         },
         "items": {
-            "
+            "prompt": "Items in an Order",
             "columns": {
                 "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
                 "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -660,7 +669,7 @@ def sample(
                 {
                     "column": "order_id",
                     "referenced_table": "orders",
-                    "
+                    "prompt": "each order has between 1 and 2 items",
                 }
             ],
         },
@@ -674,47 +683,30 @@ def sample(
     """
 
     config = MockConfig(tables)
+    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
 
     sample_size = _harmonize_sample_size(sample_size, config)
     primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
 
-
-    execution_plan: list[str] = _build_execution_plan(parent_to_children, child_to_parents, subject_tables)
+    execution_plan: list[str] = _build_execution_plan(config)
 
-
+    data: dict[str, pd.DataFrame] = {}
 
     for table_name in execution_plan:
         table_config = config.root[table_name]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # sequencial table
-        df = _sample_table(
-            table_name=table_name,
-            table_config=table_config,
-            primary_keys=primary_keys,
-            sample_size=None,
-            generated_data=results,
-            temperature=temperature,
-            top_p=top_p,
-            batch_size=1,  # generate one sequence at a time
-            previous_rows_size=10,  # present 10 previously generated rows to the LLM
-            non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
-            llm_config=LLMConfig(model=model, api_key=api_key),
-        )
-        results[table_name] = df
-
-    return results if len(results) > 1 else next(iter(results.values()))
+        df = _sample_table(
+            name=table_name,
+            prompt=table_config.prompt,
+            columns=table_config.columns,
+            foreign_keys=table_config.foreign_keys,
+            primary_keys=primary_keys,
+            generated_data=data,
+            sample_size=sample_size[table_name],
+            batch_size=30,  # generate 30 root table rows at a time
+            previous_rows_size=10,  # present 10 previously generated rows to the LLM
+            non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
+            llm_config=llm_config,
+        )
+        data[table_name] = df
 
+    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
```
mostlyai_mock-0.0.9/mostlyai/mock/mcp_server.py

```diff
@@ -0,0 +1,85 @@
+import os
+import tempfile
+import zipfile
+
+import requests
+from fastmcp import Context, FastMCP
+
+from mostlyai import mock
+
+SAMPLE_MOCK_TOOL_DESCRIPTION = f"""
+It is proxy to the `mostlyai.mock.sample` function.
+
+This function returns an URL to the generated CSV bundle (as ZIP file).
+Print this URL in Markdown format, so user can easily download the data.
+
+What comes after the `=============================` is the documentation of the `mostlyai.mock.sample` function.
+
+=============================
+{mock.sample.__doc__}
+"""
+
+mcp = FastMCP(name="MostlyAI Mock MCP Server")
+
+
+def _upload_to_0x0st(data: dict) -> str:
+    with tempfile.TemporaryDirectory() as temp_dir:
+        zip_path = os.path.join(temp_dir, "mock_data.zip")
+        with zipfile.ZipFile(zip_path, "w") as zip_file:
+            for table_name, df in data.items():
+                csv_path = os.path.join(temp_dir, f"{table_name}.csv")
+                df.to_csv(csv_path, index=False)
+                zip_file.write(csv_path, arcname=f"{table_name}.csv")
+
+        with open(zip_path, "rb") as f:
+            response = requests.post(
+                "https://0x0.st",
+                files={"file": f},
+                data={"expires": "24", "secret": ""},
+                headers={"User-Agent": "MockData/1.0"},
+            )
+
+        if response.status_code == 200:
+            url = response.text.strip()
+            return url
+        else:
+            raise Exception(f"Failed to upload ZIP: HTTP {response.status_code}")
+
+
+@mcp.tool(description=SAMPLE_MOCK_TOOL_DESCRIPTION)
+def sample_mock_data(
+    *,
+    tables: dict[str, dict],
+    sample_size: int,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    ctx: Context,
+) -> str:
+    # Notes:
+    # 1. Returning DataFrames directly results in converting them into truncated string.
+    # 2. The logs / progress bars are not propagated to the MCP Client. There is a dedicated API to do that (e.g. `ctx.info(...)`)
+    # 3. MCP Server inherits only selected environment variables (PATH, USER...); one way to pass LLM keys is through client configuration (`mcpServers->env`)
+    # 4. Some MCP Clients, e.g. Cursor, do not like Unions or Optionals in type hints
+    ctx.info(f"Generating mock data for `{len(tables)}` tables")
+    data = mock.sample(
+        tables=tables,
+        sample_size=sample_size,
+        model=model,
+        api_key=api_key,
+        temperature=temperature,
+        top_p=top_p,
+        return_type="dict",
+    )
+    ctx.info(f"Generated mock data for `{len(tables)}` tables")
+    url = _upload_to_0x0st(data)
+    return url
+
+
+def main():
+    mcp.run(transport="stdio")
+
+
+if __name__ == "__main__":
+    main()
```
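The server speaks MCP over stdio and is also exposed as the `mcp-server` console script registered in pyproject.toml below. A minimal sketch of a programmatic launch; per the notes in the code above, LLM API keys must reach this process through environment variables, for example via the client's `mcpServers -> env` block:

```python
# Minimal sketch: start the MCP server programmatically. Equivalent to running
# the `mcp-server` console script from pyproject.toml (see the hunk below).
import os

from mostlyai.mock import mcp_server

# assumption for illustration: in practice the MCP client injects this key
# through its `mcpServers -> env` configuration rather than hardcoding it
os.environ.setdefault("OPENAI_API_KEY", "sk-...")

if __name__ == "__main__":
    mcp_server.main()  # blocks, serving the `sample_mock_data` tool over stdio
```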
{mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "mostlyai-mock"
-version = "0.0.7"
+version = "0.0.9"
 description = "Synthetic Mock Data"
 authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
 requires-python = ">=3.10"
@@ -29,8 +29,12 @@ dependencies = [
     "pandas>=2.0.0",
     "pyarrow>=14.0.0",
     "litellm>=1.67.0",
+    "fastmcp>=2.0.0,<3.0.0",
 ]
 
+[project.scripts]
+mcp-server = "mostlyai.mock.mcp_server:main"
+
 [project.urls]
 homepage = "https://github.com/mostly-ai/mostlyai-mock"
 repository = "https://github.com/mostly-ai/mostlyai-mock"
```
{mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/.gitignore: File without changes
{mostlyai_mock-0.0.7 → mostlyai_mock-0.0.9}/LICENSE: File without changes