mostlyai-mock 0.0.6__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.7}/PKG-INFO +40 -25
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.7}/README.md +39 -24
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.7}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.7}/mostlyai/mock/core.py +101 -32
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.7}/pyproject.toml +1 -1
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.7}/.gitignore +0 -0
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.7}/LICENSE +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mostlyai-mock
|
3
|
-
Version: 0.0.6
|
3
|
+
Version: 0.0.7
|
4
4
|
Summary: Synthetic Mock Data
|
5
5
|
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
6
|
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
@@ -33,7 +33,7 @@ Description-Content-Type: text/markdown
|
|
33
33
|
|
34
34
|
# Synthetic Mock Data 🔮
|
35
35
|
|
36
|
-
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)  
|
36
|
+
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)  
|
37
37
|
|
38
38
|
Create data out of nothing. Prompt LLMs for Tabular Data.
|
39
39
|
|
@@ -119,10 +119,19 @@ tables = {
|
|
119
119
|
},
|
120
120
|
"primary_key": "customer_id",
|
121
121
|
},
|
122
|
+
"warehouses": {
|
123
|
+
"description": "Warehouses of a hardware store",
|
124
|
+
"columns": {
|
125
|
+
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
|
126
|
+
"name": {"prompt": "the name of the warehouse", "dtype": "string"},
|
127
|
+
},
|
128
|
+
"primary_key": "warehouse_id",
|
129
|
+
},
|
122
130
|
"orders": {
|
123
131
|
"description": "Orders of a Customer",
|
124
132
|
"columns": {
|
125
133
|
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
134
|
+
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
|
126
135
|
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
127
136
|
"text": {"prompt": "order text description", "dtype": "string"},
|
128
137
|
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
@@ -133,7 +142,11 @@ tables = {
|
|
133
142
|
"column": "customer_id",
|
134
143
|
"referenced_table": "customers",
|
135
144
|
"description": "each customer has anywhere between 2 and 3 orders",
|
136
|
-
}
|
145
|
+
},
|
146
|
+
{
|
147
|
+
"column": "warehouse_id",
|
148
|
+
"referenced_table": "warehouses",
|
149
|
+
},
|
137
150
|
],
|
138
151
|
},
|
139
152
|
"items": {
|
@@ -159,28 +172,30 @@ data = mock.sample(
|
|
159
172
|
model="openai/gpt-4.1"
|
160
173
|
)
|
161
174
|
print(data["customers"])
|
162
|
-
# customer_id
|
163
|
-
# 0 1
|
164
|
-
# 1 2
|
175
|
+
# customer_id name
|
176
|
+
# 0 1 Matthew Carlson
|
177
|
+
# 1 2 Priya Shah
|
178
|
+
print(data["warehouses"])
|
179
|
+
# warehouse_id name
|
180
|
+
# 0 1 Central Distribution Hub
|
181
|
+
# 1 2 Northgate Storage Facility
|
165
182
|
print(data["orders"])
|
166
|
-
# customer_id
|
167
|
-
# 0 1
|
168
|
-
# 1 1
|
169
|
-
# 2 1
|
170
|
-
# 3 2
|
171
|
-
# 4 2
|
172
|
-
# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
|
183
|
+
# customer_id warehouse_id order_id text amount
|
184
|
+
# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
|
185
|
+
# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
|
186
|
+
# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
|
187
|
+
# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
|
188
|
+
# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
|
173
189
|
print(data["items"])
|
174
|
-
#
|
175
|
-
# 0
|
176
|
-
# 1
|
177
|
-
# 2
|
178
|
-
# 3
|
179
|
-
# 4
|
180
|
-
# 5
|
181
|
-
# 6
|
182
|
-
# 7
|
183
|
-
# 8
|
184
|
-
# 9
|
185
|
-
# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
|
190
|
+
# item_id order_id name price
|
191
|
+
# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
|
192
|
+
# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
|
193
|
+
# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
|
194
|
+
# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
|
195
|
+
# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
|
196
|
+
# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
|
197
|
+
# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
|
198
|
+
# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
|
199
|
+
# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
|
200
|
+
# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
|
186
201
|
```
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Synthetic Mock Data 🔮
|
2
2
|
|
3
|
-
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)  
|
3
|
+
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)  
|
4
4
|
|
5
5
|
Create data out of nothing. Prompt LLMs for Tabular Data.
|
6
6
|
|
@@ -86,10 +86,19 @@ tables = {
|
|
86
86
|
},
|
87
87
|
"primary_key": "customer_id",
|
88
88
|
},
|
89
|
+
"warehouses": {
|
90
|
+
"description": "Warehouses of a hardware store",
|
91
|
+
"columns": {
|
92
|
+
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
|
93
|
+
"name": {"prompt": "the name of the warehouse", "dtype": "string"},
|
94
|
+
},
|
95
|
+
"primary_key": "warehouse_id",
|
96
|
+
},
|
89
97
|
"orders": {
|
90
98
|
"description": "Orders of a Customer",
|
91
99
|
"columns": {
|
92
100
|
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
101
|
+
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
|
93
102
|
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
94
103
|
"text": {"prompt": "order text description", "dtype": "string"},
|
95
104
|
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
@@ -100,7 +109,11 @@ tables = {
|
|
100
109
|
"column": "customer_id",
|
101
110
|
"referenced_table": "customers",
|
102
111
|
"description": "each customer has anywhere between 2 and 3 orders",
|
103
|
-
}
|
112
|
+
},
|
113
|
+
{
|
114
|
+
"column": "warehouse_id",
|
115
|
+
"referenced_table": "warehouses",
|
116
|
+
},
|
104
117
|
],
|
105
118
|
},
|
106
119
|
"items": {
|
@@ -126,28 +139,30 @@ data = mock.sample(
|
|
126
139
|
model="openai/gpt-4.1"
|
127
140
|
)
|
128
141
|
print(data["customers"])
|
129
|
-
# customer_id
|
130
|
-
# 0 1
|
131
|
-
# 1 2
|
142
|
+
# customer_id name
|
143
|
+
# 0 1 Matthew Carlson
|
144
|
+
# 1 2 Priya Shah
|
145
|
+
print(data["warehouses"])
|
146
|
+
# warehouse_id name
|
147
|
+
# 0 1 Central Distribution Hub
|
148
|
+
# 1 2 Northgate Storage Facility
|
132
149
|
print(data["orders"])
|
133
|
-
# customer_id
|
134
|
-
# 0 1
|
135
|
-
# 1 1
|
136
|
-
# 2 1
|
137
|
-
# 3 2
|
138
|
-
# 4 2
|
139
|
-
# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
|
150
|
+
# customer_id warehouse_id order_id text amount
|
151
|
+
# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
|
152
|
+
# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
|
153
|
+
# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
|
154
|
+
# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
|
155
|
+
# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
|
140
156
|
print(data["items"])
|
141
|
-
#
|
142
|
-
# 0
|
143
|
-
# 1
|
144
|
-
# 2
|
145
|
-
# 3
|
146
|
-
# 4
|
147
|
-
# 5
|
148
|
-
# 6
|
149
|
-
# 7
|
150
|
-
# 8
|
151
|
-
# 9
|
152
|
-
# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
|
157
|
+
# item_id order_id name price
|
158
|
+
# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
|
159
|
+
# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
|
160
|
+
# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
|
161
|
+
# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
|
162
|
+
# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
|
163
|
+
# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
|
164
|
+
# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
|
165
|
+
# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
|
166
|
+
# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
|
167
|
+
# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
|
153
168
|
```
|
@@ -100,7 +100,10 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
|
|
100
100
|
if table_name in path:
|
101
101
|
cycle_start = path.index(table_name)
|
102
102
|
cycle = path[cycle_start:] + [table_name]
|
103
|
-
|
103
|
+
msg = f"Circular dependency detected: {' -> '.join(cycle)}."
|
104
|
+
if len(cycle) == 2:
|
105
|
+
msg += " Self-referencing tables are not yet supported."
|
106
|
+
raise ValueError(msg)
|
104
107
|
if table_name in visited:
|
105
108
|
return
|
106
109
|
visited.add(table_name)
|
@@ -119,7 +122,7 @@ class TableConfig(BaseModel):
|
|
119
122
|
description: str = ""
|
120
123
|
columns: dict[str, ColumnConfig] = Field(..., min_items=1)
|
121
124
|
primary_key: str | None = None
|
122
|
-
foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list
|
125
|
+
foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
|
123
126
|
|
124
127
|
|
125
128
|
class ColumnConfig(BaseModel):
|
@@ -163,7 +166,7 @@ class ColumnConfig(BaseModel):
|
|
163
166
|
DType.DATETIME: (str, "strings"),
|
164
167
|
}[self.dtype]
|
165
168
|
try:
|
166
|
-
self.values = [cast_fn(c) for c in self.values]
|
169
|
+
self.values = [cast_fn(c) if pd.notna(c) else None for c in self.values]
|
167
170
|
except ValueError:
|
168
171
|
raise ValueError(
|
169
172
|
f"All values must be convertible to {convertible_to} when dtype is '{self.dtype.value}'"
|
@@ -193,28 +196,25 @@ def _sample_table(
|
|
193
196
|
table_config: TableConfig,
|
194
197
|
primary_keys: dict[str, str] | None,
|
195
198
|
sample_size: int | None,
|
196
|
-
|
199
|
+
generated_data: dict[str, pd.DataFrame] | None,
|
197
200
|
temperature: float,
|
198
201
|
top_p: float,
|
199
202
|
batch_size: int,
|
200
203
|
previous_rows_size: int,
|
204
|
+
non_context_size: int | None,
|
201
205
|
llm_config: LLMConfig,
|
202
206
|
) -> pd.DataFrame:
|
203
|
-
assert (sample_size is None) != (context_data is None), (
|
204
|
-
"Exactly one of sample_size or context_data must be provided"
|
205
|
-
)
|
206
|
-
if sample_size is None:
|
207
|
-
sample_size = len(context_data)
|
208
207
|
table_rows_generator = _create_table_rows_generator(
|
209
208
|
table_name=table_name,
|
210
209
|
table_config=table_config,
|
211
210
|
primary_keys=primary_keys,
|
212
211
|
sample_size=sample_size,
|
213
|
-
|
212
|
+
generated_data=generated_data,
|
214
213
|
temperature=temperature,
|
215
214
|
top_p=top_p,
|
216
215
|
batch_size=batch_size,
|
217
216
|
previous_rows_size=previous_rows_size,
|
217
|
+
non_context_size=non_context_size,
|
218
218
|
llm_config=llm_config,
|
219
219
|
)
|
220
220
|
table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{table_name}`".ljust(45))
|
@@ -231,6 +231,7 @@ def _create_table_prompt(
|
|
231
231
|
batch_size: int | None,
|
232
232
|
foreign_keys: list[ForeignKeyConfig] | None,
|
233
233
|
context_data: pd.DataFrame | None,
|
234
|
+
non_context_data: dict[str, pd.DataFrame],
|
234
235
|
previous_rows: list[dict],
|
235
236
|
) -> str:
|
236
237
|
if batch_size is not None:
|
@@ -271,16 +272,29 @@ def _create_table_prompt(
|
|
271
272
|
prompt += f"## Context Table Data:\n\n"
|
272
273
|
prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"
|
273
274
|
|
275
|
+
# add non-context table names, primary keys and data
|
276
|
+
if non_context_data:
|
277
|
+
for fk in foreign_keys[1:]:
|
278
|
+
prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
|
279
|
+
|
280
|
+
prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
|
281
|
+
|
282
|
+
prompt += f"## Non-Context Table Data:\n\n"
|
283
|
+
prompt += f"{non_context_data[fk.referenced_table].to_json(orient='records', indent=2)}\n\n"
|
284
|
+
|
274
285
|
# add instructions
|
275
286
|
prompt += "\n## Instructions:\n\n"
|
276
287
|
if batch_size is not None:
|
277
288
|
prompt += f"Generate {batch_size} rows for the `{table_name}` table.\n\n"
|
278
|
-
|
289
|
+
|
290
|
+
if context_data is not None:
|
279
291
|
prompt += (
|
280
292
|
f"Generate data for the `{table_name}` table. "
|
281
|
-
f"The Foreign Key column may only contain values from Context Table Data. "
|
293
|
+
f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
|
294
|
+
f"The second Foreign Key column from Foreign Keys section (if exists) may only contain values from Non-Context Table Data. "
|
282
295
|
f"Pay attention to description of the Foreign Key column to understand the relationship.\n\n"
|
283
296
|
)
|
297
|
+
|
284
298
|
if previous_rows:
|
285
299
|
prompt += (
|
286
300
|
"Generate new rows that maintain consistency with the previous rows where appropriate. "
|
@@ -298,12 +312,13 @@ def _create_table_rows_generator(
|
|
298
312
|
table_name: str,
|
299
313
|
table_config: TableConfig,
|
300
314
|
primary_keys: dict[str, str] | None,
|
301
|
-
sample_size: int,
|
315
|
+
sample_size: int | None,
|
316
|
+
generated_data: dict[str, pd.DataFrame] | None,
|
302
317
|
temperature: float,
|
303
318
|
top_p: float,
|
304
|
-
context_data: pd.DataFrame | None,
|
305
319
|
batch_size: int,
|
306
320
|
previous_rows_size: int,
|
321
|
+
non_context_size: int | None,
|
307
322
|
llm_config: LLMConfig,
|
308
323
|
) -> Generator[dict]:
|
309
324
|
def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
|
@@ -311,14 +326,14 @@ def _create_table_rows_generator(
|
|
311
326
|
if column_config.values or column_config.dtype is DType.CATEGORY:
|
312
327
|
return Literal[tuple(column_config.values)]
|
313
328
|
return {
|
314
|
-
DType.INTEGER: int,
|
315
|
-
DType.FLOAT: float,
|
316
|
-
DType.STRING: str,
|
317
|
-
DType.BOOLEAN: bool,
|
329
|
+
DType.INTEGER: int | None,
|
330
|
+
DType.FLOAT: float | None,
|
331
|
+
DType.STRING: str | None,
|
332
|
+
DType.BOOLEAN: bool | None,
|
318
333
|
# response_format has limited support for JSON Schema features
|
319
334
|
# thus we represent dates and datetimes as strings
|
320
|
-
DType.DATE: str,
|
321
|
-
DType.DATETIME: str,
|
335
|
+
DType.DATE: str | None,
|
336
|
+
DType.DATETIME: str | None,
|
322
337
|
}[column_config.dtype]
|
323
338
|
|
324
339
|
fields = {}
|
@@ -368,6 +383,26 @@ def _create_table_rows_generator(
|
|
368
383
|
for i in range(0, len(data), batch_size):
|
369
384
|
yield data.iloc[i : i + batch_size]
|
370
385
|
|
386
|
+
# derive context data (if first foreign key is present) and harmonize sample size accordingly
|
387
|
+
context_data: pd.DataFrame | None = None
|
388
|
+
if table_config.foreign_keys:
|
389
|
+
context_table_name = table_config.foreign_keys[0].referenced_table
|
390
|
+
assert generated_data is not None
|
391
|
+
assert context_table_name in generated_data
|
392
|
+
context_data = generated_data[context_table_name]
|
393
|
+
sample_size = len(context_data)
|
394
|
+
assert sample_size is not None
|
395
|
+
|
396
|
+
# derive non-context data (if more than one foreign key is present)
|
397
|
+
non_context_data: dict[str, pd.DataFrame] = {}
|
398
|
+
if table_config.foreign_keys and len(table_config.foreign_keys) > 1:
|
399
|
+
assert generated_data is not None
|
400
|
+
assert non_context_size is not None
|
401
|
+
for fk in table_config.foreign_keys[1:]:
|
402
|
+
non_context_table_name = fk.referenced_table
|
403
|
+
assert non_context_table_name in generated_data
|
404
|
+
non_context_data[non_context_table_name] = generated_data[non_context_table_name]
|
405
|
+
|
371
406
|
# ensure model supports response_format and json schema
|
372
407
|
supported_params = litellm.get_supported_openai_params(model=llm_config.model)
|
373
408
|
assert "response_format" in supported_params
|
@@ -387,6 +422,11 @@ def _create_table_rows_generator(
|
|
387
422
|
yielded_sequences = 0
|
388
423
|
previous_rows = deque(maxlen=previous_rows_size)
|
389
424
|
for context_batch in batch_infinitely(context_data):
|
425
|
+
non_context_batch = (
|
426
|
+
{table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()}
|
427
|
+
if non_context_data
|
428
|
+
else None
|
429
|
+
)
|
390
430
|
prompt_kwargs = {
|
391
431
|
"table_name": table_name,
|
392
432
|
"table_description": table_config.description,
|
@@ -395,6 +435,7 @@ def _create_table_rows_generator(
|
|
395
435
|
"batch_size": batch_size if context_batch is None else None,
|
396
436
|
"foreign_keys": table_config.foreign_keys if context_batch is not None else None,
|
397
437
|
"context_data": context_batch if context_batch is not None else None,
|
438
|
+
"non_context_data": non_context_batch if non_context_batch else None,
|
398
439
|
"previous_rows": list(previous_rows),
|
399
440
|
}
|
400
441
|
prompt = _create_table_prompt(**prompt_kwargs)
|
@@ -429,10 +470,14 @@ def _convert_table_rows_generator_to_df(
|
|
429
470
|
for column_name, column_config in columns.items():
|
430
471
|
if column_config.dtype in [DType.DATE, DType.DATETIME]:
|
431
472
|
df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
|
432
|
-
elif column_config.dtype
|
433
|
-
df[column_name] = pd.to_numeric(df[column_name], errors="coerce",
|
473
|
+
elif column_config.dtype is DType.INTEGER:
|
474
|
+
df[column_name] = pd.to_numeric(df[column_name], errors="coerce", downcast="integer").astype(
|
475
|
+
"int64[pyarrow]"
|
476
|
+
)
|
477
|
+
elif column_config.dtype is DType.FLOAT:
|
478
|
+
df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("double[pyarrow]")
|
434
479
|
elif column_config.dtype is DType.BOOLEAN:
|
435
|
-
df[column_name] = df[column_name].astype(
|
480
|
+
df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("boolean[pyarrow]")
|
436
481
|
elif column_config.dtype is DType.CATEGORY:
|
437
482
|
df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
|
438
483
|
else:
|
@@ -472,7 +517,9 @@ def _build_dependency_graph(config: MockConfig) -> tuple[dict[str, list[str]], d
|
|
472
517
|
return child_to_parents, parent_to_children, subject_tables
|
473
518
|
|
474
519
|
|
475
|
-
def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tables: list[str]) -> list[str]:
|
520
|
+
def _build_execution_plan(
|
521
|
+
parent_to_children: dict[str, list[str]], child_to_parents: dict[str, list[str]], subject_tables: list[str]
|
522
|
+
) -> list[str]:
|
476
523
|
execution_plan = []
|
477
524
|
bfs_queue = list(subject_tables)
|
478
525
|
processed = set()
|
@@ -482,6 +529,13 @@ def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tabl
|
|
482
529
|
if table_name in processed:
|
483
530
|
continue
|
484
531
|
|
532
|
+
# ensure all parents are processed before processing this table
|
533
|
+
unprocessed_parents = [p for p in child_to_parents[table_name] if p not in processed]
|
534
|
+
if unprocessed_parents:
|
535
|
+
bfs_queue.extend(unprocessed_parents)
|
536
|
+
bfs_queue.append(table_name)
|
537
|
+
continue
|
538
|
+
|
485
539
|
execution_plan.append(table_name)
|
486
540
|
processed.add(table_name)
|
487
541
|
|
@@ -564,10 +618,19 @@ def sample(
|
|
564
618
|
},
|
565
619
|
"primary_key": "customer_id",
|
566
620
|
},
|
621
|
+
"warehouses": {
|
622
|
+
"description": "Warehouses of a hardware store",
|
623
|
+
"columns": {
|
624
|
+
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
|
625
|
+
"name": {"prompt": "the name of the warehouse", "dtype": "string"},
|
626
|
+
},
|
627
|
+
"primary_key": "warehouse_id",
|
628
|
+
},
|
567
629
|
"orders": {
|
568
630
|
"description": "Orders of a Customer",
|
569
631
|
"columns": {
|
570
632
|
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
633
|
+
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
|
571
634
|
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
572
635
|
"text": {"prompt": "order text description", "dtype": "string"},
|
573
636
|
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
@@ -578,7 +641,11 @@ def sample(
|
|
578
641
|
"column": "customer_id",
|
579
642
|
"referenced_table": "customers",
|
580
643
|
"description": "each customer has anywhere between 2 and 3 orders",
|
581
|
-
}
|
644
|
+
},
|
645
|
+
{
|
646
|
+
"column": "warehouse_id",
|
647
|
+
"referenced_table": "warehouses",
|
648
|
+
},
|
582
649
|
],
|
583
650
|
},
|
584
651
|
"items": {
|
@@ -600,6 +667,7 @@ def sample(
|
|
600
667
|
}
|
601
668
|
data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
|
602
669
|
df_customers = data["customers"]
|
670
|
+
df_warehouses = data["warehouses"]
|
603
671
|
df_orders = data["orders"]
|
604
672
|
df_items = data["items"]
|
605
673
|
```
|
@@ -611,7 +679,7 @@ def sample(
|
|
611
679
|
primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
|
612
680
|
|
613
681
|
child_to_parents, parent_to_children, subject_tables = _build_dependency_graph(config)
|
614
|
-
execution_plan: list[str] = _build_execution_plan(parent_to_children, subject_tables)
|
682
|
+
execution_plan: list[str] = _build_execution_plan(parent_to_children, child_to_parents, subject_tables)
|
615
683
|
|
616
684
|
results: dict[str, pd.DataFrame] = {}
|
617
685
|
|
@@ -624,26 +692,27 @@ def sample(
|
|
624
692
|
table_config=table_config,
|
625
693
|
primary_keys=None,
|
626
694
|
sample_size=sample_size[table_name],
|
627
|
-
|
695
|
+
generated_data=None,
|
628
696
|
temperature=temperature,
|
629
697
|
top_p=top_p,
|
630
|
-
batch_size=
|
631
|
-
previous_rows_size=
|
698
|
+
batch_size=30, # generate 30 subjects at a time
|
699
|
+
previous_rows_size=10, # present 10 previously generated rows to the LLM
|
700
|
+
non_context_size=None,
|
632
701
|
llm_config=LLMConfig(model=model, api_key=api_key),
|
633
702
|
)
|
634
703
|
else:
|
635
704
|
# sequencial table
|
636
|
-
referenced_table = table_config.foreign_keys[0].referenced_table
|
637
705
|
df = _sample_table(
|
638
706
|
table_name=table_name,
|
639
707
|
table_config=table_config,
|
640
708
|
primary_keys=primary_keys,
|
641
709
|
sample_size=None,
|
642
|
-
|
710
|
+
generated_data=results,
|
643
711
|
temperature=temperature,
|
644
712
|
top_p=top_p,
|
645
713
|
batch_size=1, # generate one sequence at a time
|
646
|
-
previous_rows_size=
|
714
|
+
previous_rows_size=10, # present 10 previously generated rows to the LLM
|
715
|
+
non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
|
647
716
|
llm_config=LLMConfig(model=model, api_key=api_key),
|
648
717
|
)
|
649
718
|
results[table_name] = df
|
File without changes
|
File without changes
|