mostlyai-mock 0.0.6__tar.gz → 0.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/PKG-INFO +87 -31
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/README.md +84 -30
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/mostlyai/mock/core.py +198 -137
- mostlyai_mock-0.0.8/mostlyai/mock/mcp.py +46 -0
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/pyproject.toml +6 -1
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/.gitignore +0 -0
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/LICENSE +0 -0
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.0.6
+Version: 0.0.8
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -24,16 +24,18 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
+Requires-Dist: fastmcp<3.0.0,>=2.0.0
 Requires-Dist: litellm>=1.67.0
 Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
+Requires-Dist: typer<1.0.0,>=0.9.0
 Description-Content-Type: text/markdown

 # Synthetic Mock Data 🔮

-[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)
+[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)

 Create data out of nothing. Prompt LLMs for Tabular Data.

@@ -72,7 +74,7 @@ from mostlyai import mock

 tables = {
     "guests": {
-        "
+        "prompt": "Guests of an Alpine ski hotel in Austria",
         "columns": {
             "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
             "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -112,17 +114,26 @@ from mostlyai import mock

 tables = {
     "customers": {
-        "
+        "prompt": "Customers of a hardware store",
         "columns": {
             "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
             "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
         },
         "primary_key": "customer_id",
     },
+    "warehouses": {
+        "prompt": "Warehouses of a hardware store",
+        "columns": {
+            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+            "name": {"prompt": "the name of the warehouse", "dtype": "string"},
+        },
+        "primary_key": "warehouse_id",
+    },
     "orders": {
-        "
+        "prompt": "Orders of a Customer",
         "columns": {
             "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
             "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
             "text": {"prompt": "order text description", "dtype": "string"},
             "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -132,12 +143,16 @@ tables = {
             {
                 "column": "customer_id",
                 "referenced_table": "customers",
-                "
-            }
+                "prompt": "each customer has anywhere between 2 and 3 orders",
+            },
+            {
+                "column": "warehouse_id",
+                "referenced_table": "warehouses",
+            },
         ],
     },
     "items": {
-        "
+        "prompt": "Items in an Order",
         "columns": {
             "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
             "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -148,7 +163,7 @@ tables = {
             {
                 "column": "order_id",
                 "referenced_table": "orders",
-                "
+                "prompt": "each order has between 1 and 2 items",
             }
         ],
     },
@@ -159,28 +174,69 @@ data = mock.sample(
     model="openai/gpt-4.1"
 )
 print(data["customers"])
-# customer_id
-# 0 1
-# 1 2
+# customer_id name
+# 0 1 Matthew Carlson
+# 1 2 Priya Shah
+print(data["warehouses"])
+# warehouse_id name
+# 0 1 Central Distribution Hub
+# 1 2 Northgate Storage Facility
 print(data["orders"])
-# customer_id
-# 0 1
-# 1 1
-# 2 1
-# 3 2
-# 4 2
-# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
+# customer_id warehouse_id order_id text amount
+# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
+# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
+# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
+# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
+# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
 print(data["items"])
-#
-# 0
-# 1
-# 2
-# 3
-# 4
-# 5
-# 6
-# 7
-# 8
-# 9
-# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
+# item_id order_id name price
+# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
+# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
+# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
+# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
+# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
+# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
+# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
+# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
+# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
+# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
 ```
+
+6. Create your first self-referencing synthetic table
+
+```python
+from mostlyai import mock
+
+tables = {
+    "employees": {
+        "prompt": "Employees of a company",
+        "columns": {
+            "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
+            "name": {"prompt": "first name and last name of the president", "dtype": "string"},
+            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
+            "role": {"prompt": "the role of the employee", "dtype": "string"},
+        },
+        "primary_key": "employee_id",
+        "foreign_keys": [
+            {
+                "column": "boss_id",
+                "referenced_table": "employees",
+                "prompt": "each boss has at most 3 employees",
+            },
+        ],
+    }
+}
+df = sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+print(df)
+# employee_id name boss_id role
+# 0 1 Sandra Phillips <NA> President
+# 1 2 Marcus Tran 1 Chief Financial Officer
+# 2 3 Ava Whittaker 1 Chief Technology Officer
+# 3 4 Sophie Martin 1 Chief Operations Officer
+# 4 5 Chad Nelson 2 Finance Manager
+# 5 6 Ethan Glover 2 Senior Accountant
+# 6 7 Kimberly Ortiz 2 Junior Accountant
+# 7 8 Lucas Romero 3 IT Manager
+# 8 9 Priya Desai 3 Lead Software Engineer
+# 9 10 Felix Bennett 3 Senior Systems Analyst
+```
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/README.md

@@ -1,6 +1,6 @@
 # Synthetic Mock Data 🔮

-[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)
+[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)

 Create data out of nothing. Prompt LLMs for Tabular Data.

@@ -39,7 +39,7 @@ from mostlyai import mock

 tables = {
     "guests": {
-        "
+        "prompt": "Guests of an Alpine ski hotel in Austria",
         "columns": {
             "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
             "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -79,17 +79,26 @@ from mostlyai import mock

 tables = {
     "customers": {
-        "
+        "prompt": "Customers of a hardware store",
         "columns": {
             "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
             "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
         },
         "primary_key": "customer_id",
     },
+    "warehouses": {
+        "prompt": "Warehouses of a hardware store",
+        "columns": {
+            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+            "name": {"prompt": "the name of the warehouse", "dtype": "string"},
+        },
+        "primary_key": "warehouse_id",
+    },
     "orders": {
-        "
+        "prompt": "Orders of a Customer",
         "columns": {
             "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
             "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
             "text": {"prompt": "order text description", "dtype": "string"},
             "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -99,12 +108,16 @@ tables = {
             {
                 "column": "customer_id",
                 "referenced_table": "customers",
-                "
-            }
+                "prompt": "each customer has anywhere between 2 and 3 orders",
+            },
+            {
+                "column": "warehouse_id",
+                "referenced_table": "warehouses",
+            },
         ],
     },
     "items": {
-        "
+        "prompt": "Items in an Order",
         "columns": {
             "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
             "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -115,7 +128,7 @@ tables = {
             {
                 "column": "order_id",
                 "referenced_table": "orders",
-                "
+                "prompt": "each order has between 1 and 2 items",
             }
         ],
     },
@@ -126,28 +139,69 @@ data = mock.sample(
     model="openai/gpt-4.1"
 )
 print(data["customers"])
-# customer_id
-# 0 1
-# 1 2
+# customer_id name
+# 0 1 Matthew Carlson
+# 1 2 Priya Shah
+print(data["warehouses"])
+# warehouse_id name
+# 0 1 Central Distribution Hub
+# 1 2 Northgate Storage Facility
 print(data["orders"])
-# customer_id
-# 0 1
-# 1 1
-# 2 1
-# 3 2
-# 4 2
-# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
+# customer_id warehouse_id order_id text amount
+# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
+# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
+# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
+# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
+# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
 print(data["items"])
-#
-# 0
-# 1
-# 2
-# 3
-# 4
-# 5
-# 6
-# 7
-# 8
-# 9
-# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
+# item_id order_id name price
+# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
+# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
+# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
+# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
+# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
+# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
+# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
+# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
+# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
+# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
 ```
+
+6. Create your first self-referencing synthetic table
+
+```python
+from mostlyai import mock
+
+tables = {
+    "employees": {
+        "prompt": "Employees of a company",
+        "columns": {
+            "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
+            "name": {"prompt": "first name and last name of the president", "dtype": "string"},
+            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
+            "role": {"prompt": "the role of the employee", "dtype": "string"},
+        },
+        "primary_key": "employee_id",
+        "foreign_keys": [
+            {
+                "column": "boss_id",
+                "referenced_table": "employees",
+                "prompt": "each boss has at most 3 employees",
+            },
+        ],
+    }
+}
+df = sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+print(df)
+# employee_id name boss_id role
+# 0 1 Sandra Phillips <NA> President
+# 1 2 Marcus Tran 1 Chief Financial Officer
+# 2 3 Ava Whittaker 1 Chief Technology Officer
+# 3 4 Sophie Martin 1 Chief Operations Officer
+# 4 5 Chad Nelson 2 Finance Manager
+# 5 6 Ethan Glover 2 Senior Accountant
+# 6 7 Kimberly Ortiz 2 Junior Accountant
+# 7 8 Lucas Romero 3 IT Manager
+# 8 9 Priya Desai 3 Lead Software Engineer
+# 9 10 Felix Bennett 3 Senior Systems Analyst
+```
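The new multi-key schema above ties `orders` to both `customers` and `warehouses`. As a quick, illustrative sanity check (not part of the package), referential integrity of such output can be verified with plain pandas; the tiny frames below are hypothetical stand-ins for the printed sample output:

```python
import pandas as pd

# Illustrative only: minimal frames mirroring the printed example above.
customers = pd.DataFrame({"customer_id": [1, 2]})
orders = pd.DataFrame({"customer_id": [1, 1, 2], "order_id": ["ORD-10294", "ORD-10541", "ORD-11017"]})

def fk_is_consistent(child: pd.DataFrame, child_col: str, parent: pd.DataFrame, parent_col: str) -> bool:
    # Every foreign-key value in the child table must appear among the parent's keys.
    return bool(child[child_col].isin(parent[parent_col]).all())

print(fk_is_consistent(orders, "customer_id", customers, "customer_id"))  # True
```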
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/mostlyai/mock/core.py

@@ -44,8 +44,10 @@ across tables.


 class LLMConfig(BaseModel):
-    model: str
+    model: str = "openai/gpt-4.1-nano"
     api_key: str | None = None
+    temperature: float = 1.0
+    top_p: float = 0.95


 class MockConfig(RootModel[dict[str, "TableConfig"]]):
@@ -100,7 +102,8 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
             if table_name in path:
                 cycle_start = path.index(table_name)
                 cycle = path[cycle_start:] + [table_name]
-
+                if len(cycle) > 2:  # len(cycle) == 2 means self-referencing table, which is allowed
+                    raise ValueError(f"Circular dependency detected: {' -> '.join(cycle)}.")
             if table_name in visited:
                 return
             visited.add(table_name)
@@ -116,10 +119,10 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):


 class TableConfig(BaseModel):
-
+    prompt: str = ""
     columns: dict[str, ColumnConfig] = Field(..., min_items=1)
     primary_key: str | None = None
-    foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list
+    foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)


 class ColumnConfig(BaseModel):
@@ -163,7 +166,7 @@ class ColumnConfig(BaseModel):
             DType.DATETIME: (str, "strings"),
         }[self.dtype]
         try:
-            self.values = [cast_fn(c) for c in self.values]
+            self.values = [cast_fn(c) if pd.notna(c) else None for c in self.values]
         except ValueError:
             raise ValueError(
                 f"All values must be convertible to {convertible_to} when dtype is '{self.dtype.value}'"
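One behavioral change in the hunks above: a dependency cycle of length two (a table referencing itself) no longer raises. A minimal standalone sketch of that rule, not the package's own validator:

```python
def check_cycle(path: list[str], table_name: str) -> None:
    # Same rule as in the hunk above: a self-reference (cycle of length 2) is allowed,
    # anything longer is rejected.
    if table_name in path:
        cycle = path[path.index(table_name):] + [table_name]
        if len(cycle) > 2:
            raise ValueError(f"Circular dependency detected: {' -> '.join(cycle)}.")

check_cycle(["employees"], "employees")            # employees -> employees: allowed
try:
    check_cycle(["orders", "items"], "orders")     # orders -> items -> orders: rejected
except ValueError as e:
    print(e)
```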
@@ -184,85 +187,78 @@ class DType(str, Enum):
 class ForeignKeyConfig(BaseModel):
     column: str
     referenced_table: str
-
+    prompt: str | None = None


 def _sample_table(
     *,
-
-
+    name: str,
+    prompt: str,
+    columns: dict[str, ColumnConfig],
+    foreign_keys: list[ForeignKeyConfig] | None,
     primary_keys: dict[str, str] | None,
-
-
-    temperature: float,
-    top_p: float,
+    generated_data: dict[str, pd.DataFrame] | None,
+    sample_size: int,
     batch_size: int,
     previous_rows_size: int,
+    non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> pd.DataFrame:
-    assert (sample_size is None) != (context_data is None), (
-        "Exactly one of sample_size or context_data must be provided"
-    )
-    if sample_size is None:
-        sample_size = len(context_data)
     table_rows_generator = _create_table_rows_generator(
-
-
+        name=name,
+        prompt=prompt,
+        columns=columns,
         primary_keys=primary_keys,
+        foreign_keys=foreign_keys,
+        generated_data=generated_data,
         sample_size=sample_size,
-        context_data=context_data,
-        temperature=temperature,
-        top_p=top_p,
         batch_size=batch_size,
         previous_rows_size=previous_rows_size,
+        non_context_size=non_context_size,
         llm_config=llm_config,
     )
-    table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{
-    table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator,
+    table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
+    table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
     return table_df


 def _create_table_prompt(
     *,
-
-
+    name: str,
+    prompt: str,
     columns: dict[str, ColumnConfig],
     primary_keys: dict[str, str] | None,
     batch_size: int | None,
     foreign_keys: list[ForeignKeyConfig] | None,
     context_data: pd.DataFrame | None,
-
+    non_context_data: dict[str, pd.DataFrame] | None,
+    previous_rows: list[dict] | None,
 ) -> str:
-
-
-        assert context_data is None
-    else:
-        assert foreign_keys is not None
-        assert context_data is not None
-        assert primary_keys is not None
-
-    # add description
-    prompt = f"# {table_description}\n\n"
+    # add table prompt
+    prompt = f"# {prompt}\n\n"

     # define table
-    prompt += f"## Table: {
+    prompt += f"## Table: {name}\n\n"
+
+    prompt += f"## Table Primary Key: `{primary_keys[name]}`\n\n"

     # add columns specifications
     prompt += "## Columns Specifications:\n\n"
     prompt += f"{json.dumps({name: config.model_dump() for name, config in columns.items()}, indent=2)}\n\n"

-    # define foreign keys
-    if foreign_keys is not None:
-        prompt += "## Foreign Keys:\n\n"
-        prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
-
     # add previous rows as context to help the LLM generate consistent data
     if previous_rows:
         prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
         prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"

+    # define foreign keys
+    if foreign_keys:
+        prompt += "## Foreign Keys:\n\n"
+        prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
+
     # add context table name, primary key and data
-    if
+    if foreign_keys and foreign_keys[0].referenced_table != name:  # self-dependency is not considered as context
+        assert context_data is not None
         fk = foreign_keys[0]
         prompt += f"## Context Table: `{fk.referenced_table}`\n\n"

@@ -271,16 +267,35 @@ def _create_table_prompt(
         prompt += f"## Context Table Data:\n\n"
         prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"

+    # add non-context table names, primary keys and data
+    if foreign_keys and len(foreign_keys) > 1:
+        for fk in foreign_keys[1:]:
+            if fk.referenced_table == name:  # self-dependency is not considered as non-context
+                continue
+            assert non_context_data is not None
+            assert fk.referenced_table in non_context_data
+            prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
+
+            prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+
+            prompt += f"## Non-Context Table Data:\n\n"
+            prompt += f"{non_context_data[fk.referenced_table].to_json(orient='records', indent=2)}\n\n"
+
     # add instructions
     prompt += "\n## Instructions:\n\n"
-    if
-
+    if not foreign_keys:
+        assert batch_size is not None
+        prompt += f"Generate {batch_size} rows for the `{name}` table.\n\n"
     else:
         prompt += (
-            f"Generate data for the `{
-            f"The Foreign Key column may only contain values from Context Table Data. "
-            f"
+            f"Generate data for the `{name}` table. "
+            f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
+            f"The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
+            f"If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
+            f"In this case, ensure that the generated foreign keys are consistent with generated primary keys of the table. "
+            f"Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
         )
+
     if previous_rows:
         prompt += (
             "Generate new rows that maintain consistency with the previous rows where appropriate. "
@@ -295,15 +310,16 @@ def _create_table_prompt(

 def _create_table_rows_generator(
     *,
-
-
+    name: str,
+    prompt: str,
+    columns: dict[str, ColumnConfig],
+    foreign_keys: list[ForeignKeyConfig] | None,
     primary_keys: dict[str, str] | None,
+    generated_data: dict[str, pd.DataFrame] | None,
     sample_size: int,
-    temperature: float,
-    top_p: float,
-    context_data: pd.DataFrame | None,
     batch_size: int,
     previous_rows_size: int,
+    non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> Generator[dict]:
     def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
@@ -311,14 +327,14 @@ def _create_table_rows_generator(
         if column_config.values or column_config.dtype is DType.CATEGORY:
             return Literal[tuple(column_config.values)]
         return {
-            DType.INTEGER: int,
-            DType.FLOAT: float,
-            DType.STRING: str,
-            DType.BOOLEAN: bool,
+            DType.INTEGER: int | None,
+            DType.FLOAT: float | None,
+            DType.STRING: str | None,
+            DType.BOOLEAN: bool | None,
             # response_format has limited support for JSON Schema features
             # thus we represent dates and datetimes as strings
-            DType.DATE: str,
-            DType.DATETIME: str,
+            DType.DATE: str | None,
+            DType.DATETIME: str | None,
         }[column_config.dtype]

     fields = {}
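The response-format change above makes every scalar field optional so the LLM may return nulls. A minimal sketch of how such a row schema can be assembled with pydantic's `create_model` (illustrative; the column names are made up and this is not the package's exact construction):

```python
from typing import Literal
from pydantic import BaseModel, create_model

def make_row_model(fields: dict[str, object]) -> type[BaseModel]:
    # Each annotation is passed through as-is; scalar columns use `<type> | None`
    # so that missing values are representable, mirroring the change above.
    return create_model("Row", **{name: (annotation, ...) for name, annotation in fields.items()})

Row = make_row_model({
    "age": int | None,                        # nullable integer column
    "nationality": Literal["AT", "DE", "CH"]  # categorical column keeps its fixed values
})
print(Row(age=None, nationality="AT"))
```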
@@ -375,10 +391,31 @@ def _create_table_rows_generator(
                 "The model does not support structured output / JSON mode."
             )

+    # derive context data (if first foreign key is present) and harmonize sample size accordingly
+    context_data: pd.DataFrame | None = None
+    if foreign_keys and foreign_keys[0].referenced_table != name:  # self-dependency is not considered as context
+        context_table_name = foreign_keys[0].referenced_table
+        assert generated_data is not None
+        assert context_table_name in generated_data
+        context_data = generated_data[context_table_name]
+        sample_size = len(context_data)
+
+    # derive non-context data (if more than one foreign key is present)
+    non_context_data: dict[str, pd.DataFrame] = {}
+    if foreign_keys and len(foreign_keys) > 1:
+        assert generated_data is not None
+        assert non_context_size is not None
+        for fk in foreign_keys[1:]:
+            if fk.referenced_table == name:  # self-dependency is not considered as non-context
+                continue
+            non_context_table_name = fk.referenced_table
+            assert non_context_table_name in generated_data
+            non_context_data[non_context_table_name] = generated_data[non_context_table_name]
+
     litellm_kwargs = {
-        "response_format": create_table_response_format(columns=
-        "temperature": temperature,
-        "top_p": top_p,
+        "response_format": create_table_response_format(columns=columns),
+        "temperature": llm_config.temperature,
+        "top_p": llm_config.top_p,
         "model": llm_config.model,
         "api_key": llm_config.api_key,
         "stream": True,
@@ -387,17 +424,22 @@ def _create_table_rows_generator(
     yielded_sequences = 0
     previous_rows = deque(maxlen=previous_rows_size)
     for context_batch in batch_infinitely(context_data):
-
-
-
-
-
-
-
-
-
-
-
+        non_context_batch = (
+            {table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()}
+            if non_context_data
+            else None
+        )
+        prompt = _create_table_prompt(
+            name=name,
+            prompt=prompt,
+            columns=columns,
+            primary_keys=primary_keys,
+            batch_size=batch_size,
+            foreign_keys=foreign_keys,
+            context_data=context_batch,
+            non_context_data=non_context_batch,
+            previous_rows=list(previous_rows),
+        )
         messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}]

         response = litellm.completion(messages=messages, **litellm_kwargs)
@@ -423,16 +465,21 @@ def _create_table_rows_generator(


 def _convert_table_rows_generator_to_df(
-    table_rows_generator: Generator[dict],
+    table_rows_generator: Generator[dict],
+    columns: dict[str, ColumnConfig],
 ) -> pd.DataFrame:
     def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
         for column_name, column_config in columns.items():
             if column_config.dtype in [DType.DATE, DType.DATETIME]:
                 df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
-            elif column_config.dtype
-                df[column_name] = pd.to_numeric(df[column_name], errors="coerce",
+            elif column_config.dtype is DType.INTEGER:
+                df[column_name] = pd.to_numeric(df[column_name], errors="coerce", downcast="integer").astype(
+                    "int64[pyarrow]"
+                )
+            elif column_config.dtype is DType.FLOAT:
+                df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("double[pyarrow]")
             elif column_config.dtype is DType.BOOLEAN:
-                df[column_name] = df[column_name].astype(
+                df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("boolean[pyarrow]")
             elif column_config.dtype is DType.CATEGORY:
                 df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
             else:
@@ -440,7 +487,7 @@ def _convert_table_rows_generator_to_df(
         return df

     df = pd.DataFrame(list(table_rows_generator))
-    df = align_df_dtypes_with_mock_dtypes(df,
+    df = align_df_dtypes_with_mock_dtypes(df, columns)
     return df


@@ -453,28 +500,32 @@ def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig
     return sample_size


-def
-
-
+def _build_execution_plan(config: MockConfig) -> list[str]:
+    def build_dependency_mappings(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
+        child_to_parents = {}
+        parent_to_children = {}

-
-
-
+        for table_name in config.root:
+            child_to_parents[table_name] = set()
+            parent_to_children[table_name] = set()

-
-
-
-
-
-
+        for table_name, table_config in config.root.items():
+            if table_config.foreign_keys:
+                for fk in table_config.foreign_keys:
+                    referenced_table = fk.referenced_table
+                    child_to_parents[table_name].add(referenced_table)
+                    parent_to_children[referenced_table].add(table_name)

-
-
+        root_tables = []
+        for table_name, parents in child_to_parents.items():
+            if not parents or parents == {table_name}:  # no dependencies or only self-dependency
+                root_tables.append(table_name)
+        return child_to_parents, parent_to_children, root_tables

+    child_to_parents, parent_to_children, root_tables = build_dependency_mappings(config)

-def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tables: list[str]) -> list[str]:
     execution_plan = []
-    bfs_queue = list(
+    bfs_queue = list(root_tables)
     processed = set()

     while bfs_queue:
@@ -482,6 +533,16 @@ def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tabl
         if table_name in processed:
             continue

+        # ensure all parents are processed before processing this table
+        unprocessed_parents = []
+        for parent in child_to_parents[table_name]:
+            if parent not in processed and parent != table_name:  # exclude self-dependency
+                unprocessed_parents.append(parent)
+        if unprocessed_parents:
+            bfs_queue.extend(unprocessed_parents)
+            bfs_queue.append(table_name)
+            continue
+
         execution_plan.append(table_name)
         processed.add(table_name)

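The new execution plan guarantees that every referenced table is generated before its children while ignoring self-references. A simplified re-implementation of that ordering (not the library's `_build_execution_plan`), applied to the hardware-store schema from the README:

```python
from collections import deque

def execution_plan(parents: dict[str, set[str]]) -> list[str]:
    # BFS over the dependency graph; a table is emitted only once all of its
    # parents (excluding itself) have been emitted, mirroring the hunk above.
    order: list[str] = []
    processed: set[str] = set()
    queue = deque(t for t, p in parents.items() if not p or p == {t})  # root tables
    while queue:
        table = queue.popleft()
        if table in processed:
            continue
        missing = [p for p in parents[table] if p not in processed and p != table]
        if missing:
            queue.extend(missing)
            queue.append(table)
            continue
        order.append(table)
        processed.add(table)
        queue.extend(t for t, p in parents.items() if table in p)  # enqueue children
    return order

print(execution_plan({
    "customers": set(),
    "warehouses": set(),
    "orders": {"customers", "warehouses"},
    "items": {"orders"},
}))  # ['customers', 'warehouses', 'orders', 'items']
```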
@@ -499,6 +560,7 @@ def sample(
     api_key: str | None = None,
     temperature: float = 1.0,
     top_p: float = 0.95,
+    return_type: Literal["auto", "dict"] = "auto",
 ) -> pd.DataFrame | dict[str, pd.DataFrame]:
     """
     Generate mock data by prompting an LLM.
@@ -523,6 +585,7 @@ def sample(
         api_key (str | None): The API key to use for the LLM. If not provided, LiteLLM will take it from the environment variables.
         temperature (float): The temperature to use for the LLM. Default is 1.0.
         top_p (float): The top-p value to use for the LLM. Default is 0.95.
+        return_type (Literal["auto", "dict"]): The format of the returned data. Default is "auto".

     Returns:
         - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
@@ -534,7 +597,7 @@ def sample(

     tables = {
         "guests": {
-            "
+            "prompt": "Guests of an Alpine ski hotel in Austria",
             "columns": {
                 "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
                 "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -557,17 +620,26 @@ def sample(

     tables = {
         "customers": {
-            "
+            "prompt": "Customers of a hardware store",
             "columns": {
                 "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
                 "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
             },
             "primary_key": "customer_id",
         },
+        "warehouses": {
+            "prompt": "Warehouses of a hardware store",
+            "columns": {
+                "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+                "name": {"prompt": "the name of the warehouse", "dtype": "string"},
+            },
+            "primary_key": "warehouse_id",
+        },
         "orders": {
-            "
+            "prompt": "Orders of a Customer",
             "columns": {
                 "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+                "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
                 "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
                 "text": {"prompt": "order text description", "dtype": "string"},
                 "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -577,12 +649,16 @@ def sample(
                 {
                     "column": "customer_id",
                     "referenced_table": "customers",
-                    "
-                }
+                    "prompt": "each customer has anywhere between 2 and 3 orders",
+                },
+                {
+                    "column": "warehouse_id",
+                    "referenced_table": "warehouses",
+                },
             ],
         },
         "items": {
-            "
+            "prompt": "Items in an Order",
             "columns": {
                 "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
                 "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -593,59 +669,44 @@ def sample(
                 {
                     "column": "order_id",
                     "referenced_table": "orders",
-                    "
+                    "prompt": "each order has between 1 and 2 items",
                 }
             ],
         },
     }
     data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
     df_customers = data["customers"]
+    df_warehouses = data["warehouses"]
     df_orders = data["orders"]
     df_items = data["items"]
     ```
     """

     config = MockConfig(tables)
+    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)

     sample_size = _harmonize_sample_size(sample_size, config)
     primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}

-
-    execution_plan: list[str] = _build_execution_plan(parent_to_children, subject_tables)
+    execution_plan: list[str] = _build_execution_plan(config)

-
+    data: dict[str, pd.DataFrame] = {}

     for table_name in execution_plan:
         table_config = config.root[table_name]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        referenced_table = table_config.foreign_keys[0].referenced_table
-        df = _sample_table(
-            table_name=table_name,
-            table_config=table_config,
-            primary_keys=primary_keys,
-            sample_size=None,
-            context_data=results[referenced_table],
-            temperature=temperature,
-            top_p=top_p,
-            batch_size=1,  # generate one sequence at a time
-            previous_rows_size=5,
-            llm_config=LLMConfig(model=model, api_key=api_key),
-        )
-        results[table_name] = df
-
-    return results if len(results) > 1 else next(iter(results.values()))
+        df = _sample_table(
+            name=table_name,
+            prompt=table_config.prompt,
+            columns=table_config.columns,
+            foreign_keys=table_config.foreign_keys,
+            primary_keys=primary_keys,
+            generated_data=data,
+            sample_size=sample_size[table_name],
+            batch_size=30,  # generate 30 root table rows at a time
+            previous_rows_size=10,  # present 10 previously generated rows to the LLM
+            non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
+            llm_config=llm_config,
+        )
+        data[table_name] = df
+
+    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
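A short usage sketch of the new `return_type` parameter (assumes an LLM API key is configured in the environment; the table spec mirrors the README example, and the call is illustrative rather than a recorded run):

```python
from mostlyai import mock

tables = {
    "guests": {
        "prompt": "Guests of an Alpine ski hotel in Austria",
        "columns": {"name": {"prompt": "first name and last name of the guest", "dtype": "string"}},
    }
}

# return_type defaults to "auto": a single-table spec comes back as one DataFrame.
df = mock.sample(tables=tables, sample_size=3, model="openai/gpt-4.1-nano")

# With return_type="dict", the result is always a mapping {"guests": DataFrame},
# which is what the new MCP tool below relies on.
data = mock.sample(tables=tables, sample_size=3, model="openai/gpt-4.1-nano", return_type="dict")
```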
mostlyai_mock-0.0.8/mostlyai/mock/mcp.py

@@ -0,0 +1,46 @@
+import json
+
+import pandas as pd
+from fastmcp import Context, FastMCP
+
+from mostlyai import mock
+
+mcp = FastMCP(name="MostlyAI Mock MCP Server")
+
+
+@mcp.tool(description=mock.sample.__doc__)
+def sample_mock_data(
+    *,
+    tables: dict[str, dict],
+    sample_size: int,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    ctx: Context,
+) -> str:
+    # Notes:
+    # 1. Returning DataFrames directly results in converting them into truncated string.
+    # 2. The logs / progress bars are not propagated to the MCP Client. There is a dedicated API to do that (e.g. `ctx.info(...)`)
+    # 3. MCP Server inherits only selected environment variables (PATH, USER...); one way to pass LLM keys is through client configuration (`mcpServers->env`)
+    # 4. Some MCP Clients, e.g. Cursor, do not like Unions or Optionals in type hints
+    ctx.info(f"Generating mock data for `{len(tables)}` tables")
+    data = mock.sample(
+        tables=tables,
+        sample_size=sample_size,
+        model=model,
+        api_key=api_key,
+        temperature=temperature,
+        top_p=top_p,
+        return_type="dict",
+    )
+    ctx.info(f"Generated mock data for `{len(tables)}` tables")
+    return {k: v.to_dict(orient="records") for k, v in data.items()}
+
+
+def main():
+    mcp.run(transport="stdio")
+
+
+if __name__ == "__main__":
+    main()
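The module doubles as an entry point: the `mcp-server` console script added in pyproject.toml points at `mostlyai.mock.mcp:main`, and the same server can be started programmatically. A minimal sketch of that equivalence:

```python
# Programmatic equivalent of the `mcp-server` console script: imports the new
# module and runs its FastMCP server over stdio, as defined above.
from mostlyai.mock import mcp

if __name__ == "__main__":
    mcp.main()
```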
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "mostlyai-mock"
-version = "0.0.6"
+version = "0.0.8"
 description = "Synthetic Mock Data"
 authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
 requires-python = ">=3.10"
@@ -29,8 +29,13 @@ dependencies = [
     "pandas>=2.0.0",
     "pyarrow>=14.0.0",
     "litellm>=1.67.0",
+    "typer>=0.9.0,<1.0.0",
+    "fastmcp>=2.0.0,<3.0.0",
 ]

+[project.scripts]
+mcp-server = "mostlyai.mock.mcp:main"
+
 [project.urls]
 homepage = "https://github.com/mostly-ai/mostlyai-mock"
 repository = "https://github.com/mostly-ai/mostlyai-mock"
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/.gitignore: File without changes
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/LICENSE: File without changes