mostlyai-mock 0.0.6__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.0.6
+Version: 0.0.7
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -33,7 +33,7 @@ Description-Content-Type: text/markdown
 
 # Synthetic Mock Data 🔮
 
-[![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mostlyai-mock)
+[![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock)
 
 Create data out of nothing. Prompt LLMs for Tabular Data.
 
@@ -119,10 +119,19 @@ tables = {
         },
         "primary_key": "customer_id",
     },
+    "warehouses": {
+        "description": "Warehouses of a hardware store",
+        "columns": {
+            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+            "name": {"prompt": "the name of the warehouse", "dtype": "string"},
+        },
+        "primary_key": "warehouse_id",
+    },
     "orders": {
         "description": "Orders of a Customer",
         "columns": {
             "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
             "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
             "text": {"prompt": "order text description", "dtype": "string"},
             "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -133,7 +142,11 @@ tables = {
                 "column": "customer_id",
                 "referenced_table": "customers",
                 "description": "each customer has anywhere between 2 and 3 orders",
-            }
+            },
+            {
+                "column": "warehouse_id",
+                "referenced_table": "warehouses",
+            },
         ],
     },
     "items": {
@@ -159,28 +172,30 @@ data = mock.sample(
     model="openai/gpt-4.1"
 )
 print(data["customers"])
-# customer_id name
-# 0 1 Michael Torres
-# 1 2 Elaine Kim
+# customer_id name
+# 0 1 Matthew Carlson
+# 1 2 Priya Shah
+print(data["warehouses"])
+# warehouse_id name
+# 0 1 Central Distribution Hub
+# 1 2 Northgate Storage Facility
 print(data["orders"])
-# customer_id order_id text amount
-# 0 1 ORD20240612001 Home office desk and ergonomic chair bundle 412.95
-# 1 1 ORD20240517322 Wireless noise-cancelling headphones 226.49
-# 2 1 ORD20240430307 Smart LED desk lamp with USB charging port 69.99
-# 3 2 ORD20240614015 Eco-friendly bamboo kitchen utensil set 39.95
-# 4 2 ORD20240528356 Air fryer with digital touch screen, 5-quart c... 129.99
-# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
+# customer_id warehouse_id order_id text amount
+# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
+# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
+# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
+# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
+# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
 print(data["items"])
-# item_id order_id name price
-# 0 ITEM100001A ORD20240612001 Ergonomic Mesh Office Chair 179.99
-# 1 ITEM100001B ORD20240612001 Adjustable Home Office Desk 232.96
-# 2 ITEM100002A ORD20240517322 Wireless Noise-Cancelling Headphones 226.49
-# 3 ITEM100003A ORD20240430307 Smart LED Desk Lamp 59.99
-# 4 ITEM100003B ORD20240430307 USB Charging Cable (Desk Lamp Compatible) 10.0
-# 5 ITEM100004A ORD20240614015 Bamboo Cooking Spoon 13.49
-# 6 ITEM100004B ORD20240614015 Bamboo Slotted Turner 12.99
-# 7 ITEM100005A ORD20240528356 Digital Air Fryer (5-Quart, Black) 115.99
-# 8 ITEM100005B ORD20240528356 Silicone Liner for Air Fryer (5-Quart) 13.99
-# 9 ITEM100006A ORD20240510078 Double-Walled Glass Coffee Mug (12oz) 13.75
-# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
+# item_id order_id name price
+# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
+# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
+# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
+# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
+# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
+# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
+# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
+# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
+# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
+# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
 ```
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Synthetic Mock Data 🔮
 
-[![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mostlyai-mock)
+[![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock)
 
 Create data out of nothing. Prompt LLMs for Tabular Data.
 
@@ -86,10 +86,19 @@ tables = {
         },
         "primary_key": "customer_id",
     },
+    "warehouses": {
+        "description": "Warehouses of a hardware store",
+        "columns": {
+            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+            "name": {"prompt": "the name of the warehouse", "dtype": "string"},
+        },
+        "primary_key": "warehouse_id",
+    },
     "orders": {
         "description": "Orders of a Customer",
         "columns": {
             "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
             "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
             "text": {"prompt": "order text description", "dtype": "string"},
             "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -100,7 +109,11 @@ tables = {
                 "column": "customer_id",
                 "referenced_table": "customers",
                 "description": "each customer has anywhere between 2 and 3 orders",
-            }
+            },
+            {
+                "column": "warehouse_id",
+                "referenced_table": "warehouses",
+            },
         ],
     },
     "items": {
@@ -126,28 +139,30 @@ data = mock.sample(
     model="openai/gpt-4.1"
 )
 print(data["customers"])
-# customer_id name
-# 0 1 Michael Torres
-# 1 2 Elaine Kim
+# customer_id name
+# 0 1 Matthew Carlson
+# 1 2 Priya Shah
+print(data["warehouses"])
+# warehouse_id name
+# 0 1 Central Distribution Hub
+# 1 2 Northgate Storage Facility
 print(data["orders"])
-# customer_id order_id text amount
-# 0 1 ORD20240612001 Home office desk and ergonomic chair bundle 412.95
-# 1 1 ORD20240517322 Wireless noise-cancelling headphones 226.49
-# 2 1 ORD20240430307 Smart LED desk lamp with USB charging port 69.99
-# 3 2 ORD20240614015 Eco-friendly bamboo kitchen utensil set 39.95
-# 4 2 ORD20240528356 Air fryer with digital touch screen, 5-quart c... 129.99
-# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
+# customer_id warehouse_id order_id text amount
+# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
+# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
+# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
+# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
+# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
 print(data["items"])
-# item_id order_id name price
-# 0 ITEM100001A ORD20240612001 Ergonomic Mesh Office Chair 179.99
-# 1 ITEM100001B ORD20240612001 Adjustable Home Office Desk 232.96
-# 2 ITEM100002A ORD20240517322 Wireless Noise-Cancelling Headphones 226.49
-# 3 ITEM100003A ORD20240430307 Smart LED Desk Lamp 59.99
-# 4 ITEM100003B ORD20240430307 USB Charging Cable (Desk Lamp Compatible) 10.0
-# 5 ITEM100004A ORD20240614015 Bamboo Cooking Spoon 13.49
-# 6 ITEM100004B ORD20240614015 Bamboo Slotted Turner 12.99
-# 7 ITEM100005A ORD20240528356 Digital Air Fryer (5-Quart, Black) 115.99
-# 8 ITEM100005B ORD20240528356 Silicone Liner for Air Fryer (5-Quart) 13.99
-# 9 ITEM100006A ORD20240510078 Double-Walled Glass Coffee Mug (12oz) 13.75
-# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
+# item_id order_id name price
+# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
+# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
+# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
+# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
+# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
+# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
+# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
+# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
+# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
+# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
 ```
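
The new `warehouses` table in the README example above is referenced through a second, non-context foreign key on `orders`. As a minimal, self-contained illustration of what that relationship guarantees, the sketch below copies a few rows from the sampled output shown above (in practice you would use the `data` dict returned by `mock.sample` directly):

```python
import pandas as pd

# a few rows copied from the sampled output shown above
warehouses = pd.DataFrame(
    {"warehouse_id": [1, 2], "name": ["Central Distribution Hub", "Northgate Storage Facility"]}
)
orders = pd.DataFrame(
    {
        "customer_id": [1, 1, 1, 2, 2],
        "warehouse_id": [2, 1, 1, 1, 2],
        "order_id": ["ORD-10294", "ORD-10541", "ORD-10802", "ORD-11017", "ORD-11385"],
    }
)

# the non-context foreign key may only reference existing warehouses ...
assert orders["warehouse_id"].isin(warehouses["warehouse_id"]).all()

# ... while the context foreign key still drives the per-customer sequences
# (2-3 orders per customer, per the foreign key description)
print(orders.groupby("customer_id").size())
```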
--- a/mostlyai/mock/__init__.py
+++ b/mostlyai/mock/__init__.py
@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 
 __all__ = ["sample"]
-__version__ = "0.0.6"  # Do not set this manually. Use poetry version [params].
+__version__ = "0.0.7"  # Do not set this manually. Use poetry version [params].
--- a/mostlyai/mock/core.py
+++ b/mostlyai/mock/core.py
@@ -100,7 +100,10 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
             if table_name in path:
                 cycle_start = path.index(table_name)
                 cycle = path[cycle_start:] + [table_name]
-                raise ValueError(f"Circular dependency detected: {' -> '.join(cycle)}")
+                msg = f"Circular dependency detected: {' -> '.join(cycle)}."
+                if len(cycle) == 2:
+                    msg += " Self-referencing tables are not yet supported."
+                raise ValueError(msg)
             if table_name in visited:
                 return
             visited.add(table_name)
@@ -119,7 +122,7 @@ class TableConfig(BaseModel):
     description: str = ""
     columns: dict[str, ColumnConfig] = Field(..., min_items=1)
     primary_key: str | None = None
-    foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list, min_length=0, max_length=1)
+    foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
 
 
 class ColumnConfig(BaseModel):
@@ -163,7 +166,7 @@ class ColumnConfig(BaseModel):
             DType.DATETIME: (str, "strings"),
         }[self.dtype]
         try:
-            self.values = [cast_fn(c) for c in self.values]
+            self.values = [cast_fn(c) if pd.notna(c) else None for c in self.values]
         except ValueError:
             raise ValueError(
                 f"All values must be convertible to {convertible_to} when dtype is '{self.dtype.value}'"
@@ -193,28 +196,25 @@ def _sample_table(
     table_config: TableConfig,
     primary_keys: dict[str, str] | None,
     sample_size: int | None,
-    context_data: pd.DataFrame | None,
+    generated_data: dict[str, pd.DataFrame] | None,
     temperature: float,
     top_p: float,
     batch_size: int,
     previous_rows_size: int,
+    non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> pd.DataFrame:
-    assert (sample_size is None) != (context_data is None), (
-        "Exactly one of sample_size or context_data must be provided"
-    )
-    if sample_size is None:
-        sample_size = len(context_data)
     table_rows_generator = _create_table_rows_generator(
         table_name=table_name,
         table_config=table_config,
         primary_keys=primary_keys,
         sample_size=sample_size,
-        context_data=context_data,
+        generated_data=generated_data,
         temperature=temperature,
         top_p=top_p,
         batch_size=batch_size,
         previous_rows_size=previous_rows_size,
+        non_context_size=non_context_size,
         llm_config=llm_config,
     )
     table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{table_name}`".ljust(45))
@@ -231,6 +231,7 @@ def _create_table_prompt(
     batch_size: int | None,
     foreign_keys: list[ForeignKeyConfig] | None,
     context_data: pd.DataFrame | None,
+    non_context_data: dict[str, pd.DataFrame],
     previous_rows: list[dict],
 ) -> str:
     if batch_size is not None:
@@ -271,16 +272,29 @@ def _create_table_prompt(
         prompt += f"## Context Table Data:\n\n"
         prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"
 
+    # add non-context table names, primary keys and data
+    if non_context_data:
+        for fk in foreign_keys[1:]:
+            prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
+
+            prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+
+            prompt += f"## Non-Context Table Data:\n\n"
+            prompt += f"{non_context_data[fk.referenced_table].to_json(orient='records', indent=2)}\n\n"
+
     # add instructions
     prompt += "\n## Instructions:\n\n"
     if batch_size is not None:
         prompt += f"Generate {batch_size} rows for the `{table_name}` table.\n\n"
-    else:
+
+    if context_data is not None:
         prompt += (
             f"Generate data for the `{table_name}` table. "
-            f"The Foreign Key column may only contain values from Context Table Data. "
+            f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
+            f"The second Foreign Key column from Foreign Keys section (if exists) may only contain values from Non-Context Table Data. "
            f"Pay attention to description of the Foreign Key column to understand the relationship.\n\n"
         )
+
     if previous_rows:
         prompt += (
             "Generate new rows that maintain consistency with the previous rows where appropriate. "
@@ -298,12 +312,13 @@ def _create_table_rows_generator(
     table_name: str,
     table_config: TableConfig,
     primary_keys: dict[str, str] | None,
-    sample_size: int,
+    sample_size: int | None,
+    generated_data: dict[str, pd.DataFrame] | None,
     temperature: float,
     top_p: float,
-    context_data: pd.DataFrame | None,
     batch_size: int,
     previous_rows_size: int,
+    non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> Generator[dict]:
     def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
@@ -311,14 +326,14 @@ def _create_table_rows_generator(
         if column_config.values or column_config.dtype is DType.CATEGORY:
             return Literal[tuple(column_config.values)]
         return {
-            DType.INTEGER: int,
-            DType.FLOAT: float,
-            DType.STRING: str,
-            DType.BOOLEAN: bool,
+            DType.INTEGER: int | None,
+            DType.FLOAT: float | None,
+            DType.STRING: str | None,
+            DType.BOOLEAN: bool | None,
             # response_format has limited support for JSON Schema features
             # thus we represent dates and datetimes as strings
-            DType.DATE: str,
-            DType.DATETIME: str,
+            DType.DATE: str | None,
+            DType.DATETIME: str | None,
         }[column_config.dtype]
 
     fields = {}
@@ -368,6 +383,26 @@ def _create_table_rows_generator(
         for i in range(0, len(data), batch_size):
             yield data.iloc[i : i + batch_size]
 
+    # derive context data (if first foreign key is present) and harmonize sample size accordingly
+    context_data: pd.DataFrame | None = None
+    if table_config.foreign_keys:
+        context_table_name = table_config.foreign_keys[0].referenced_table
+        assert generated_data is not None
+        assert context_table_name in generated_data
+        context_data = generated_data[context_table_name]
+        sample_size = len(context_data)
+    assert sample_size is not None
+
+    # derive non-context data (if more than one foreign key is present)
+    non_context_data: dict[str, pd.DataFrame] = {}
+    if table_config.foreign_keys and len(table_config.foreign_keys) > 1:
+        assert generated_data is not None
+        assert non_context_size is not None
+        for fk in table_config.foreign_keys[1:]:
+            non_context_table_name = fk.referenced_table
+            assert non_context_table_name in generated_data
+            non_context_data[non_context_table_name] = generated_data[non_context_table_name]
+
     # ensure model supports response_format and json schema
     supported_params = litellm.get_supported_openai_params(model=llm_config.model)
     assert "response_format" in supported_params
@@ -387,6 +422,11 @@ def _create_table_rows_generator(
     yielded_sequences = 0
     previous_rows = deque(maxlen=previous_rows_size)
     for context_batch in batch_infinitely(context_data):
+        non_context_batch = (
+            {table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()}
+            if non_context_data
+            else None
+        )
         prompt_kwargs = {
             "table_name": table_name,
             "table_description": table_config.description,
@@ -395,6 +435,7 @@ def _create_table_rows_generator(
             "batch_size": batch_size if context_batch is None else None,
             "foreign_keys": table_config.foreign_keys if context_batch is not None else None,
             "context_data": context_batch if context_batch is not None else None,
+            "non_context_data": non_context_batch if non_context_batch else None,
             "previous_rows": list(previous_rows),
         }
         prompt = _create_table_prompt(**prompt_kwargs)
@@ -429,10 +470,14 @@ def _convert_table_rows_generator_to_df(
     for column_name, column_config in columns.items():
         if column_config.dtype in [DType.DATE, DType.DATETIME]:
             df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
-        elif column_config.dtype in [DType.INTEGER, DType.FLOAT]:
-            df[column_name] = pd.to_numeric(df[column_name], errors="coerce", dtype_backend="pyarrow")
+        elif column_config.dtype is DType.INTEGER:
+            df[column_name] = pd.to_numeric(df[column_name], errors="coerce", downcast="integer").astype(
+                "int64[pyarrow]"
+            )
+        elif column_config.dtype is DType.FLOAT:
+            df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("double[pyarrow]")
         elif column_config.dtype is DType.BOOLEAN:
-            df[column_name] = df[column_name].astype(bool)
+            df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("boolean[pyarrow]")
         elif column_config.dtype is DType.CATEGORY:
             df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
         else:
@@ -472,7 +517,9 @@ def _build_dependency_graph(config: MockConfig) -> tuple[dict[str, list[str]], d
     return child_to_parents, parent_to_children, subject_tables
 
 
-def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tables: list[str]) -> list[str]:
+def _build_execution_plan(
+    parent_to_children: dict[str, list[str]], child_to_parents: dict[str, list[str]], subject_tables: list[str]
+) -> list[str]:
     execution_plan = []
     bfs_queue = list(subject_tables)
     processed = set()
@@ -482,6 +529,13 @@ def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tabl
         if table_name in processed:
             continue
 
+        # ensure all parents are processed before processing this table
+        unprocessed_parents = [p for p in child_to_parents[table_name] if p not in processed]
+        if unprocessed_parents:
+            bfs_queue.extend(unprocessed_parents)
+            bfs_queue.append(table_name)
+            continue
+
         execution_plan.append(table_name)
         processed.add(table_name)
 
@@ -564,10 +618,19 @@ def sample(
            },
            "primary_key": "customer_id",
        },
+        "warehouses": {
+            "description": "Warehouses of a hardware store",
+            "columns": {
+                "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+                "name": {"prompt": "the name of the warehouse", "dtype": "string"},
+            },
+            "primary_key": "warehouse_id",
+        },
        "orders": {
            "description": "Orders of a Customer",
            "columns": {
                "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+                "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
                "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
                "text": {"prompt": "order text description", "dtype": "string"},
                "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -578,7 +641,11 @@ def sample(
                    "column": "customer_id",
                    "referenced_table": "customers",
                    "description": "each customer has anywhere between 2 and 3 orders",
-                }
+                },
+                {
+                    "column": "warehouse_id",
+                    "referenced_table": "warehouses",
+                },
            ],
        },
        "items": {
@@ -600,6 +667,7 @@ def sample(
    }
    data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
    df_customers = data["customers"]
+    df_warehouses = data["warehouses"]
    df_orders = data["orders"]
    df_items = data["items"]
    ```
@@ -611,7 +679,7 @@ def sample(
     primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
 
     child_to_parents, parent_to_children, subject_tables = _build_dependency_graph(config)
-    execution_plan: list[str] = _build_execution_plan(parent_to_children, subject_tables)
+    execution_plan: list[str] = _build_execution_plan(parent_to_children, child_to_parents, subject_tables)
 
     results: dict[str, pd.DataFrame] = {}
 
@@ -624,26 +692,27 @@ def sample(
                 table_config=table_config,
                 primary_keys=None,
                 sample_size=sample_size[table_name],
-                context_data=None,
+                generated_data=None,
                 temperature=temperature,
                 top_p=top_p,
-                batch_size=20,  # generate 20 subjects at a time
-                previous_rows_size=5,
+                batch_size=30,  # generate 30 subjects at a time
+                previous_rows_size=10,  # present 10 previously generated rows to the LLM
+                non_context_size=None,
                 llm_config=LLMConfig(model=model, api_key=api_key),
             )
         else:
             # sequencial table
-            referenced_table = table_config.foreign_keys[0].referenced_table
             df = _sample_table(
                 table_name=table_name,
                 table_config=table_config,
                 primary_keys=primary_keys,
                 sample_size=None,
-                context_data=results[referenced_table],
+                generated_data=results,
                 temperature=temperature,
                 top_p=top_p,
                 batch_size=1,  # generate one sequence at a time
-                previous_rows_size=5,
+                previous_rows_size=10,  # present 10 previously generated rows to the LLM
+                non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
                 llm_config=LLMConfig(model=model, api_key=api_key),
             )
         results[table_name] = df
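
The reworked `_build_execution_plan` above now also receives `child_to_parents`, so a table with several parents (such as `orders`, which references both `customers` and `warehouses`) is only scheduled once every referenced table has been generated. Below is a minimal, self-contained sketch of that ordering logic; the queue handling after a table is scheduled (pushing its children) is not shown in the hunk and is assumed here, and the example graph simply mirrors the README schema:

```python
from collections import deque


def build_execution_plan(
    parent_to_children: dict[str, list[str]],
    child_to_parents: dict[str, list[str]],
    subject_tables: list[str],
) -> list[str]:
    # BFS from the subject tables (tables without foreign keys); a table is
    # deferred until all of its parents have been scheduled
    execution_plan: list[str] = []
    bfs_queue = deque(subject_tables)
    processed: set[str] = set()
    while bfs_queue:
        table_name = bfs_queue.popleft()
        if table_name in processed:
            continue
        unprocessed_parents = [p for p in child_to_parents[table_name] if p not in processed]
        if unprocessed_parents:
            # re-queue the table behind its missing parents
            bfs_queue.extend(unprocessed_parents)
            bfs_queue.append(table_name)
            continue
        execution_plan.append(table_name)
        processed.add(table_name)
        bfs_queue.extend(parent_to_children.get(table_name, []))  # assumed queue handling
    return execution_plan


# example graph mirroring the README schema
child_to_parents = {"customers": [], "warehouses": [], "orders": ["customers", "warehouses"], "items": ["orders"]}
parent_to_children = {"customers": ["orders"], "warehouses": ["orders"], "orders": ["items"], "items": []}
print(build_execution_plan(parent_to_children, child_to_parents, ["customers", "warehouses"]))
# ['customers', 'warehouses', 'orders', 'items']
```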
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mostlyai-mock"
-version = "0.0.6"
+version = "0.0.7"
 description = "Synthetic Mock Data"
 authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
 requires-python = ">=3.10"
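
On the dtype side, the `_convert_table_rows_generator_to_df` hunk in core.py above moves integer and float columns to nullable pyarrow-backed dtypes, which matters now that the response format allows `int | None` and friends. A rough standalone sketch of that conversion pattern (assuming pandas 2.x with pyarrow installed; the values are made up):

```python
import pandas as pd

# hypothetical raw values for an "integer" column, as an LLM batch might return
# them now that missing values are allowed
raw = pd.Series([1, None, "3", "n/a"])

# errors="coerce" turns anything unparsable into NaN ...
as_numeric = pd.to_numeric(raw, errors="coerce", downcast="integer")

# ... and the pyarrow-backed dtype keeps those entries as <NA> while the column
# itself remains an integer column (instead of silently staying float64)
as_int = as_numeric.astype("int64[pyarrow]")
print(as_int)  # -> 1, <NA>, 3, <NA> with dtype int64[pyarrow]

# float columns follow the same pattern with "double[pyarrow]"
as_float = pd.to_numeric(pd.Series(["649.25", None]), errors="coerce").astype("double[pyarrow]")
print(as_float.dtype)  # double[pyarrow]
```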