mostlyai-mock 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +134 -42
- {mostlyai_mock-0.1.5.dist-info → mostlyai_mock-0.1.7.dist-info}/METADATA +2 -1
- mostlyai_mock-0.1.7.dist-info/RECORD +8 -0
- mostlyai_mock-0.1.5.dist-info/RECORD +0 -8
- {mostlyai_mock-0.1.5.dist-info → mostlyai_mock-0.1.7.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.1.5.dist-info → mostlyai_mock-0.1.7.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.1.5.dist-info → mostlyai_mock-0.1.7.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
@@ -14,6 +14,7 @@
|
|
14
14
|
|
15
15
|
from __future__ import annotations
|
16
16
|
|
17
|
+
import itertools
|
17
18
|
import json
|
18
19
|
from collections import deque
|
19
20
|
from collections.abc import Generator
|
@@ -22,6 +23,7 @@ from typing import Any, Literal
|
|
22
23
|
|
23
24
|
import litellm
|
24
25
|
import pandas as pd
|
26
|
+
import tenacity
|
25
27
|
from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
|
26
28
|
from tqdm import tqdm
|
27
29
|
|
@@ -246,52 +248,85 @@ def _create_table_prompt(
|
|
246
248
|
prompt = f"# {prompt}\n\n"
|
247
249
|
|
248
250
|
# define table
|
249
|
-
prompt += f"## Table: {name}
|
251
|
+
prompt += f"## Target Table: `{name}`\n\n"
|
250
252
|
|
251
|
-
prompt += f"
|
253
|
+
prompt += f"### Target Table Primary Key: `{primary_keys[name]}`\n\n"
|
252
254
|
|
253
255
|
# add columns specifications
|
254
|
-
prompt += "
|
255
|
-
|
256
|
+
prompt += "### Target Table Column Specifications:\n\n"
|
257
|
+
column_specifications = {
|
258
|
+
name: config.model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True)
|
259
|
+
for name, config in columns.items()
|
260
|
+
}
|
261
|
+
if existing_data is not None:
|
262
|
+
# do not generate values for columns that already exist in existing data
|
263
|
+
column_specifications = {
|
264
|
+
column: spec for column, spec in column_specifications.items() if column not in existing_data.columns
|
265
|
+
}
|
266
|
+
prompt += f"{json.dumps(column_specifications, indent=2)}\n\n"
|
256
267
|
|
257
268
|
# add previous rows as context to help the LLM generate consistent data
|
269
|
+
has_previous_rows_section = False
|
258
270
|
if previous_rows:
|
259
|
-
|
271
|
+
has_previous_rows_section = True
|
272
|
+
prompt += f"\n## Previous `{len(previous_rows)}` Rows of Target Table `{name}`:\n\n"
|
260
273
|
prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
|
261
274
|
|
262
275
|
# add existing data to augment
|
276
|
+
has_existing_data_section = False
|
263
277
|
if existing_data is not None:
|
264
|
-
|
278
|
+
has_existing_data_section = True
|
279
|
+
prompt += f"\n## Existing Data of Target Table `{name}` to Augment:\n\n"
|
265
280
|
prompt += f"{existing_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
|
266
281
|
|
267
|
-
# define foreign keys
|
268
|
-
|
269
|
-
|
270
|
-
|
282
|
+
# define self referencing foreign keys
|
283
|
+
has_self_referencing_foreign_keys_section = False
|
284
|
+
self_referencing_foreign_keys = [fk for fk in foreign_keys if fk.referenced_table == name]
|
285
|
+
if self_referencing_foreign_keys:
|
286
|
+
has_self_referencing_foreign_keys_section = True
|
287
|
+
prompt += f"## Self Referencing Foreign Keys in Target Table `{name}`\n\n"
|
288
|
+
for fk in self_referencing_foreign_keys:
|
289
|
+
prompt += f"### Primary Key Column: `{primary_keys[name]}`\n\n"
|
290
|
+
|
291
|
+
prompt += f"### Foreign Key Column: `{fk.column}`\n\n"
|
292
|
+
|
293
|
+
prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
|
294
|
+
|
295
|
+
foreign_keys = [fk for fk in foreign_keys if fk.referenced_table != name] # exclude self-dependency going forward
|
271
296
|
|
272
297
|
# add context table name, primary key and data
|
273
|
-
|
298
|
+
has_context_table_section = False
|
299
|
+
if foreign_keys:
|
300
|
+
has_context_table_section = True
|
274
301
|
assert context_data is not None
|
275
302
|
fk = foreign_keys[0]
|
276
303
|
prompt += f"## Context Table: `{fk.referenced_table}`\n\n"
|
277
304
|
|
278
|
-
prompt += f"
|
305
|
+
prompt += f"### Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
|
279
306
|
|
280
|
-
prompt += "
|
307
|
+
prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
|
308
|
+
|
309
|
+
prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
|
310
|
+
|
311
|
+
prompt += "### Context Table Data:\n\n"
|
281
312
|
prompt += f"{context_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
|
282
313
|
|
283
314
|
# add non-context table names, primary keys and data
|
315
|
+
has_non_context_tables_section = False
|
284
316
|
if foreign_keys and len(foreign_keys) > 1:
|
317
|
+
has_non_context_tables_section = True
|
285
318
|
for fk in foreign_keys[1:]:
|
286
|
-
if fk.referenced_table == name: # self-dependency is not considered as non-context
|
287
|
-
continue
|
288
319
|
assert non_context_data is not None
|
289
320
|
assert fk.referenced_table in non_context_data
|
290
321
|
prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
|
291
322
|
|
292
|
-
prompt += f"
|
323
|
+
prompt += f"### Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
|
293
324
|
|
294
|
-
prompt += "
|
325
|
+
prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
|
326
|
+
|
327
|
+
prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
|
328
|
+
|
329
|
+
prompt += "### Non-Context Table Data:\n\n"
|
295
330
|
prompt += (
|
296
331
|
f"{non_context_data[fk.referenced_table].to_json(orient='records', date_format='iso', indent=2)}\n\n"
|
297
332
|
)
|
@@ -304,42 +339,62 @@ def _create_table_prompt(
|
|
304
339
|
n_rows = None
|
305
340
|
if existing_data is not None:
|
306
341
|
n_rows = len(existing_data)
|
307
|
-
elif not foreign_keys:
|
342
|
+
elif not foreign_keys and not self_referencing_foreign_keys:
|
308
343
|
assert batch_size is not None
|
309
344
|
n_rows = batch_size
|
310
345
|
|
311
|
-
prompt += f"{verb.capitalize()} data for the `{name}
|
346
|
+
prompt += f"{verb.capitalize()} data for the Target Table `{name}`.\n\n"
|
312
347
|
if n_rows is not None:
|
313
348
|
prompt += f"Number of rows to {verb}: `{n_rows}`.\n\n"
|
314
349
|
|
315
|
-
if
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
350
|
+
if has_context_table_section:
|
351
|
+
assert foreign_keys
|
352
|
+
prompt += f"Target Table Foreign Key column `{foreign_keys[0].column}` may only contain values from `Context Table Data`."
|
353
|
+
if has_previous_rows_section:
|
354
|
+
prompt += " Never use values from `Previous Rows of Target Table` section."
|
355
|
+
prompt += " Respect the `Description of the Relationship` of `Context Table` section to understand the relationship, in particular the number of rows to generate."
|
356
|
+
prompt += "\n\n"
|
357
|
+
|
358
|
+
if has_self_referencing_foreign_keys_section:
|
359
|
+
prompt += "Target Table Self Referencing Foreign Key columns defined in `Self Referencing Foreign Keys` must be consistent with the `Target Table Primary Key`."
|
360
|
+
prompt += " Respect the `Description of the Relationship` of `Self Referencing Foreign Keys` section to understand the relationship."
|
361
|
+
prompt += "\n\n"
|
362
|
+
|
363
|
+
if has_non_context_tables_section:
|
364
|
+
assert len(foreign_keys) > 1
|
365
|
+
prompt += "All other Target Table Foreign Key columns may only contain values from `Non-Context Table Data` of relevant `Non-Context Table` sections."
|
366
|
+
prompt += " Respect the `Description of the Relationship` of relevant `Non-Context Table` section to understand the relationship."
|
367
|
+
prompt += "\n\n"
|
368
|
+
|
369
|
+
if has_existing_data_section:
|
370
|
+
assert existing_data is not None
|
325
371
|
prompt += (
|
326
372
|
f"You are given existing data for the `{name}` table and asked to generate "
|
327
|
-
f"values for the missing columns. The existing data contains column(s): {
|
328
|
-
f"You need to generate values for column(s): {
|
373
|
+
f"values for the missing columns. The existing data contains column(s): {list(existing_data.columns)}. "
|
374
|
+
f"You need to generate values for column(s): {list(columns.keys() - existing_data.columns)}. "
|
329
375
|
f"Ensure that the generated values are contextually appropriate and consistent with the existing data. "
|
330
376
|
f"Use the existing columns' values to inform the generation of new values. "
|
331
377
|
f"Don't generate new rows, only augment the existing data.\n\n"
|
332
378
|
)
|
333
379
|
|
334
|
-
if
|
380
|
+
if has_previous_rows_section:
|
381
|
+
assert previous_rows is not None
|
335
382
|
prompt += (
|
336
383
|
f"{verb.capitalize()} new rows that maintain consistency with the previous rows where appropriate. "
|
337
384
|
"Don't copy previous rows in the output. "
|
338
385
|
"Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
|
339
386
|
)
|
387
|
+
|
340
388
|
prompt += f"Do not use code to {verb} the data.\n\n"
|
341
|
-
prompt += "Return the full data as a JSON string.\n"
|
342
389
|
|
390
|
+
prompt += "Return data as a JSON string."
|
391
|
+
prompt += " The JSON string should have 'rows' key at the top level. The value of 'rows' key should be a list of JSON objects."
|
392
|
+
prompt += " Each JSON object should have column names as keys and values as column values."
|
393
|
+
if existing_data is not None:
|
394
|
+
prompt += (
|
395
|
+
f" Only include the following columns in the JSON string: {list(columns.keys() - existing_data.columns)}."
|
396
|
+
)
|
397
|
+
prompt += "\n"
|
343
398
|
return prompt
|
344
399
|
|
345
400
|
|
@@ -357,7 +412,9 @@ def _create_table_rows_generator(
|
|
357
412
|
non_context_size: int | None,
|
358
413
|
llm_config: LLMConfig,
|
359
414
|
) -> Generator[dict]:
|
360
|
-
def create_table_response_format(
|
415
|
+
def create_table_response_format(
|
416
|
+
columns: dict[str, ColumnConfig], existing_data: pd.DataFrame | None
|
417
|
+
) -> tuple[type[BaseModel], int]:
|
361
418
|
def create_annotation(column_config: ColumnConfig) -> type:
|
362
419
|
if column_config.values or column_config.dtype is DType.CATEGORY:
|
363
420
|
return Literal[tuple(column_config.values)]
|
@@ -374,11 +431,14 @@ def _create_table_rows_generator(
|
|
374
431
|
|
375
432
|
fields = {}
|
376
433
|
for column_name, column_config in columns.items():
|
434
|
+
if existing_data is not None and column_name in existing_data.columns:
|
435
|
+
continue # skip columns that already exist in existing data
|
377
436
|
annotation = create_annotation(column_config)
|
378
437
|
fields[column_name] = (annotation, Field(...))
|
379
438
|
TableRow = create_model("TableRow", **fields)
|
380
439
|
TableRows = create_model("TableRows", rows=(list[TableRow], ...))
|
381
|
-
|
440
|
+
n_enforced_columns = len(fields)
|
441
|
+
return TableRows, n_enforced_columns
|
382
442
|
|
383
443
|
def yield_rows_from_json_chunks_stream(response: litellm.CustomStreamWrapper) -> Generator[dict]:
|
384
444
|
# starting with dirty buffer is to handle the `{"rows": []}` case
|
@@ -419,6 +479,18 @@ def _create_table_rows_generator(
|
|
419
479
|
for i in range(0, len(data), batch_size):
|
420
480
|
yield data.iloc[i : i + batch_size]
|
421
481
|
|
482
|
+
def completion_with_retries(*args, **kwargs):
|
483
|
+
n_attempts = 3
|
484
|
+
|
485
|
+
def print_on_retry(_):
|
486
|
+
print(" * Trying again... * ", end="", flush=True)
|
487
|
+
|
488
|
+
# try up to 3 times, print a message to the user on each retry
|
489
|
+
retryer = tenacity.Retrying(
|
490
|
+
stop=tenacity.stop_after_attempt(n_attempts), reraise=True, before_sleep=print_on_retry
|
491
|
+
)
|
492
|
+
return retryer(litellm.completion, *args, **kwargs)
|
493
|
+
|
422
494
|
if not llm_config.model.startswith("litellm_proxy/"):
|
423
495
|
# ensure model supports response_format and json schema (this check does not work with litellm_proxy)
|
424
496
|
supported_params = litellm.get_supported_openai_params(model=llm_config.model) or []
|
@@ -453,7 +525,6 @@ def _create_table_rows_generator(
|
|
453
525
|
non_context_data[non_context_table_name] = data[non_context_table_name]
|
454
526
|
|
455
527
|
litellm_kwargs = {
|
456
|
-
"response_format": create_table_response_format(columns=columns),
|
457
528
|
"temperature": llm_config.temperature,
|
458
529
|
"top_p": llm_config.top_p,
|
459
530
|
"model": llm_config.model,
|
@@ -494,6 +565,10 @@ def _create_table_rows_generator(
|
|
494
565
|
if batch_size >= remaining_rows:
|
495
566
|
batch_size = remaining_rows + 2 # +2 because LLM may not always count the rows correctly
|
496
567
|
|
568
|
+
response_format, n_enforced_columns = create_table_response_format(
|
569
|
+
columns=columns, existing_data=existing_batch
|
570
|
+
)
|
571
|
+
|
497
572
|
llm_prompt = _create_table_prompt(
|
498
573
|
name=name,
|
499
574
|
prompt=prompt,
|
@@ -508,12 +583,20 @@ def _create_table_rows_generator(
|
|
508
583
|
)
|
509
584
|
messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": llm_prompt}]
|
510
585
|
|
511
|
-
|
512
|
-
|
586
|
+
if n_enforced_columns != 0:
|
587
|
+
response = completion_with_retries(messages=messages, response_format=response_format, **litellm_kwargs)
|
588
|
+
rows_stream = yield_rows_from_json_chunks_stream(response)
|
589
|
+
else:
|
590
|
+
# skip roundtrip to LLM in case all columns are provided in existing data
|
591
|
+
rows_stream = itertools.repeat({})
|
513
592
|
|
593
|
+
batch_row_idx = 0
|
514
594
|
while True:
|
515
595
|
try:
|
516
|
-
|
596
|
+
row_generated_part = next(rows_stream)
|
597
|
+
row_existing_part = existing_batch.iloc[batch_row_idx].to_dict() if existing_batch is not None else {}
|
598
|
+
row = {**row_existing_part, **row_generated_part}
|
599
|
+
row = {column: row[column] for column in columns.keys()} # keep columns order according to user's spec
|
517
600
|
except StopIteration:
|
518
601
|
break # move to next batch
|
519
602
|
previous_rows.append(row)
|
@@ -523,6 +606,7 @@ def _create_table_rows_generator(
|
|
523
606
|
yielded_sequences += 1
|
524
607
|
if yielded_sequences >= sample_size:
|
525
608
|
return # move to next table
|
609
|
+
batch_row_idx += 1
|
526
610
|
if context_batch is not None:
|
527
611
|
# for each context_batch, full sequences are generated
|
528
612
|
yielded_sequences += len(context_batch)
|
@@ -653,7 +737,7 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
|
|
653
737
|
def sample(
|
654
738
|
*,
|
655
739
|
tables: dict[str, dict],
|
656
|
-
sample_size: int | dict[str, int] =
|
740
|
+
sample_size: int | dict[str, int] = 4,
|
657
741
|
existing_data: dict[str, pd.DataFrame] | None = None,
|
658
742
|
model: str = "openai/gpt-4.1-nano",
|
659
743
|
api_key: str | None = None,
|
@@ -664,12 +748,20 @@ def sample(
|
|
664
748
|
"""
|
665
749
|
Generate mock data from scratch or enrich existing data by prompting an LLM.
|
666
750
|
|
751
|
+
While faker and numpy are useful to create fake data, this utility is unique as it allows
|
752
|
+
the creation of coherent, realistic multi-table tabular mock data
|
753
|
+
or the enrichment of existing datasets with new, context-aware columns.
|
754
|
+
|
755
|
+
It is particularly useful for quickly simulating production-like datasets for testing or prototyping purposes.
|
756
|
+
It is advised to limit mocking to small datasets for performance reasons (rows * cols < 100).
|
757
|
+
It might take a couple of minutes for bigger datasets.
|
758
|
+
|
667
759
|
Args:
|
668
760
|
tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
|
669
761
|
sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
|
670
762
|
If a single integer is provided, the same number of rows will be generated for each subject table.
|
671
763
|
If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
|
672
|
-
Default is
|
764
|
+
Default is 4. Ignored if existing_data is provided.
|
673
765
|
If a table has a foreign key, the sample size is determined by the corresponding foreign key prompt. If nothing specified, a few rows per parent record are generated.
|
674
766
|
existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
|
675
767
|
Default is None.
|
@@ -889,7 +981,7 @@ def sample(
|
|
889
981
|
primary_keys=primary_keys,
|
890
982
|
data=data,
|
891
983
|
sample_size=sample_size[table_name],
|
892
|
-
batch_size=
|
984
|
+
batch_size=20, # generate 20 root table rows at a time
|
893
985
|
previous_rows_size=10, # present 10 previously generated rows to the LLM
|
894
986
|
non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
|
895
987
|
llm_config=llm_config,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mostlyai-mock
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.7
|
4
4
|
Summary: LLM-generated Mock Data
|
5
5
|
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
6
|
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
@@ -30,6 +30,7 @@ Requires-Dist: numpy>=1.26.3
|
|
30
30
|
Requires-Dist: pandas>=2.0.0
|
31
31
|
Requires-Dist: pyarrow>=14.0.0
|
32
32
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
33
|
+
Requires-Dist: tenacity>=9.1.2
|
33
34
|
Description-Content-Type: text/markdown
|
34
35
|
|
35
36
|
# LLM-generated Mock Data 🔮
|
@@ -0,0 +1,8 @@
|
|
1
|
+
mostlyai/mock/__init__.py,sha256=Cmo4Ko8-X41gSewcEpNTTvw7bpRUrtn6B5Cmnwric-Q,714
|
2
|
+
mostlyai/mock/core.py,sha256=L-PbOTSIR1cfBeMZL8-v5k7VhxBfKAoyw230soBwQWc,42754
|
3
|
+
mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
|
4
|
+
mostlyai_mock-0.1.7.dist-info/METADATA,sha256=6tLpoqLx-LOI-Cr_O_xWm4LI5PBfa4nt1FkrqdNIpQA,13918
|
5
|
+
mostlyai_mock-0.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
+
mostlyai_mock-0.1.7.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
+
mostlyai_mock-0.1.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
+
mostlyai_mock-0.1.7.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
mostlyai/mock/__init__.py,sha256=-bfsVZJQ0OkN5b3IRP3F9aUCiA8Eq1-RmAqBmTg0O0g,714
|
2
|
-
mostlyai/mock/core.py,sha256=V7KG7nOQPU95v6lRoSIfJuYivS0pNZ3rbiNC6SqDZSc,38075
|
3
|
-
mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
|
4
|
-
mostlyai_mock-0.1.5.dist-info/METADATA,sha256=LfugCsu7ANDZk2ozNFHDxgCqY42etJIdkXcfc-S-cUE,13887
|
5
|
-
mostlyai_mock-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
-
mostlyai_mock-0.1.5.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
-
mostlyai_mock-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
-
mostlyai_mock-0.1.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|