mostlyai-mock 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 
 __all__ = ["sample"]
-__version__ = "0.1.5" # Do not set this manually. Use poetry version [params].
+__version__ = "0.1.7" # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import itertools
 import json
 from collections import deque
 from collections.abc import Generator
@@ -22,6 +23,7 @@ from typing import Any, Literal
 
 import litellm
 import pandas as pd
+import tenacity
 from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
 from tqdm import tqdm
 
@@ -246,52 +248,85 @@ def _create_table_prompt(
     prompt = f"# {prompt}\n\n"
 
     # define table
-    prompt += f"## Table: {name}\n\n"
+    prompt += f"## Target Table: `{name}`\n\n"
 
-    prompt += f"## Table Primary Key: `{primary_keys[name]}`\n\n"
+    prompt += f"### Target Table Primary Key: `{primary_keys[name]}`\n\n"
 
     # add columns specifications
-    prompt += "## Columns Specifications:\n\n"
-    prompt += f"{json.dumps({name: config.model_dump() for name, config in columns.items()}, indent=2)}\n\n"
+    prompt += "### Target Table Column Specifications:\n\n"
+    column_specifications = {
+        name: config.model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True)
+        for name, config in columns.items()
+    }
+    if existing_data is not None:
+        # do not generate values for columns that already exist in existing data
+        column_specifications = {
+            column: spec for column, spec in column_specifications.items() if column not in existing_data.columns
+        }
+    prompt += f"{json.dumps(column_specifications, indent=2)}\n\n"
 
     # add previous rows as context to help the LLM generate consistent data
+    has_previous_rows_section = False
     if previous_rows:
-        prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
+        has_previous_rows_section = True
+        prompt += f"\n## Previous `{len(previous_rows)}` Rows of Target Table `{name}`:\n\n"
         prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
 
     # add existing data to augment
+    has_existing_data_section = False
     if existing_data is not None:
-        prompt += "\n## Existing Data to Augment:\n\n"
+        has_existing_data_section = True
+        prompt += f"\n## Existing Data of Target Table `{name}` to Augment:\n\n"
         prompt += f"{existing_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
 
-    # define foreign keys
-    if foreign_keys:
-        prompt += "## Foreign Keys:\n\n"
-        prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
+    # define self referencing foreign keys
+    has_self_referencing_foreign_keys_section = False
+    self_referencing_foreign_keys = [fk for fk in foreign_keys if fk.referenced_table == name]
+    if self_referencing_foreign_keys:
+        has_self_referencing_foreign_keys_section = True
+        prompt += f"## Self Referencing Foreign Keys in Target Table `{name}`\n\n"
+        for fk in self_referencing_foreign_keys:
+            prompt += f"### Primary Key Column: `{primary_keys[name]}`\n\n"
+
+            prompt += f"### Foreign Key Column: `{fk.column}`\n\n"
+
+            prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+    foreign_keys = [fk for fk in foreign_keys if fk.referenced_table != name]  # exclude self-dependency going forward
 
     # add context table name, primary key and data
-    if foreign_keys and foreign_keys[0].referenced_table != name:  # self-dependency is not considered as context
+    has_context_table_section = False
+    if foreign_keys:
+        has_context_table_section = True
         assert context_data is not None
         fk = foreign_keys[0]
         prompt += f"## Context Table: `{fk.referenced_table}`\n\n"
 
-        prompt += f"## Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+        prompt += f"### Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
 
-        prompt += "## Context Table Data:\n\n"
+        prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
+
+        prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+        prompt += "### Context Table Data:\n\n"
         prompt += f"{context_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
 
     # add non-context table names, primary keys and data
+    has_non_context_tables_section = False
     if foreign_keys and len(foreign_keys) > 1:
+        has_non_context_tables_section = True
         for fk in foreign_keys[1:]:
-            if fk.referenced_table == name:  # self-dependency is not considered as non-context
-                continue
             assert non_context_data is not None
             assert fk.referenced_table in non_context_data
             prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
 
-            prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+            prompt += f"### Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
 
-            prompt += "## Non-Context Table Data:\n\n"
+            prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
+
+            prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+            prompt += "### Non-Context Table Data:\n\n"
             prompt += (
                 f"{non_context_data[fk.referenced_table].to_json(orient='records', date_format='iso', indent=2)}\n\n"
            )
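The column specifications are now serialized with pydantic's exclusion flags, which keeps the JSON embedded in the LLM prompt compact. A minimal sketch of the effect, using a simplified stand-in for the package's `ColumnConfig` model (field names here are illustrative):

from pydantic import BaseModel

class ColumnConfig(BaseModel):  # simplified stand-in for the package's ColumnConfig
    prompt: str = ""
    dtype: str | None = None
    values: list[str] = []

cfg = ColumnConfig(prompt="customer age", dtype="integer")

# model_dump() carries every field, including untouched defaults
print(cfg.model_dump())
# {'prompt': 'customer age', 'dtype': 'integer', 'values': []}

# the flags used in the diff drop defaults, unset fields, and None values
print(cfg.model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True))
# {'prompt': 'customer age', 'dtype': 'integer'}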
@@ -304,42 +339,62 @@ def _create_table_prompt(
     n_rows = None
     if existing_data is not None:
         n_rows = len(existing_data)
-    elif not foreign_keys:
+    elif not foreign_keys and not self_referencing_foreign_keys:
         assert batch_size is not None
         n_rows = batch_size
 
-    prompt += f"{verb.capitalize()} data for the `{name}` table.\n\n"
+    prompt += f"{verb.capitalize()} data for the Target Table `{name}`.\n\n"
     if n_rows is not None:
         prompt += f"Number of rows to {verb}: `{n_rows}`.\n\n"
 
-    if foreign_keys:
-        prompt += (
-            "The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
-            "The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
-            "If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
-            "In this case, ensure that the foreign keys are consistent with primary keys of the table. "
-            "Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
-        )
-
-    if existing_data is not None:
+    if has_context_table_section:
+        assert foreign_keys
+        prompt += f"Target Table Foreign Key column `{foreign_keys[0].column}` may only contain values from `Context Table Data`."
+        if has_previous_rows_section:
+            prompt += " Never use values from `Previous Rows of Target Table` section."
+        prompt += " Respect the `Description of the Relationship` of `Context Table` section to understand the relationship, in particular the number of rows to generate."
+        prompt += "\n\n"
+
+    if has_self_referencing_foreign_keys_section:
+        prompt += "Target Table Self Referencing Foreign Key columns defined in `Self Referencing Foreign Keys` must be consistent with the `Target Table Primary Key`."
+        prompt += " Respect the `Description of the Relationship` of `Self Referencing Foreign Keys` section to understand the relationship."
+        prompt += "\n\n"
+
+    if has_non_context_tables_section:
+        assert len(foreign_keys) > 1
+        prompt += "All other Target Table Foreign Key columns may only contain values from `Non-Context Table Data` of relevant `Non-Context Table` sections."
+        prompt += " Respect the `Description of the Relationship` of relevant `Non-Context Table` section to understand the relationship."
+        prompt += "\n\n"
+
+    if has_existing_data_section:
+        assert existing_data is not None
         prompt += (
             f"You are given existing data for the `{name}` table and asked to generate "
-            f"values for the missing columns. The existing data contains column(s): {', '.join(existing_data.columns)}. "
-            f"You need to generate values for column(s): {', '.join(columns.keys() - existing_data.columns)}. "
+            f"values for the missing columns. The existing data contains column(s): {list(existing_data.columns)}. "
+            f"You need to generate values for column(s): {list(columns.keys() - existing_data.columns)}. "
             f"Ensure that the generated values are contextually appropriate and consistent with the existing data. "
             f"Use the existing columns' values to inform the generation of new values. "
             f"Don't generate new rows, only augment the existing data.\n\n"
         )
 
-    if previous_rows:
+    if has_previous_rows_section:
+        assert previous_rows is not None
         prompt += (
             f"{verb.capitalize()} new rows that maintain consistency with the previous rows where appropriate. "
             "Don't copy previous rows in the output. "
             "Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
         )
+
     prompt += f"Do not use code to {verb} the data.\n\n"
-    prompt += "Return the full data as a JSON string.\n"
 
+    prompt += "Return data as a JSON string."
+    prompt += " The JSON string should have 'rows' key at the top level. The value of 'rows' key should be a list of JSON objects."
+    prompt += " Each JSON object should have column names as keys and values as column values."
+    if existing_data is not None:
+        prompt += (
+            f" Only include the following columns in the JSON string: {list(columns.keys() - existing_data.columns)}."
+        )
+    prompt += "\n"
     return prompt
 
 
@@ -357,7 +412,9 @@ def _create_table_rows_generator(
     non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> Generator[dict]:
-    def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
+    def create_table_response_format(
+        columns: dict[str, ColumnConfig], existing_data: pd.DataFrame | None
+    ) -> tuple[type[BaseModel], int]:
         def create_annotation(column_config: ColumnConfig) -> type:
             if column_config.values or column_config.dtype is DType.CATEGORY:
                 return Literal[tuple(column_config.values)]
@@ -374,11 +431,14 @@ def _create_table_rows_generator(
 
         fields = {}
         for column_name, column_config in columns.items():
+            if existing_data is not None and column_name in existing_data.columns:
+                continue  # skip columns that already exist in existing data
             annotation = create_annotation(column_config)
             fields[column_name] = (annotation, Field(...))
         TableRow = create_model("TableRow", **fields)
         TableRows = create_model("TableRows", rows=(list[TableRow], ...))
-        return TableRows
+        n_enforced_columns = len(fields)
+        return TableRows, n_enforced_columns
 
     def yield_rows_from_json_chunks_stream(response: litellm.CustomStreamWrapper) -> Generator[dict]:
         # starting with dirty buffer is to handle the `{"rows": []}` case
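`create_table_response_format` now builds the response schema only for the columns the LLM must actually generate and reports how many it enforced. A standalone sketch of that pattern, with hypothetical column names and a plain `str` annotation in place of the per-column types the real code derives:

from pydantic import Field, create_model

columns = ["id", "name", "sentiment"]   # hypothetical user-specified columns
existing_columns = {"id", "name"}       # assumed already present in existing_data

fields = {
    col: (str, Field(...))  # the real code derives richer per-column annotations
    for col in columns
    if col not in existing_columns  # skip columns that must not be regenerated
}
TableRow = create_model("TableRow", **fields)
TableRows = create_model("TableRows", rows=(list[TableRow], ...))

n_enforced_columns = len(fields)  # 0 means the LLM roundtrip can be skipped entirely
print(n_enforced_columns)  # 1 -> only "sentiment" is enforced by the schema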
@@ -419,6 +479,18 @@ def _create_table_rows_generator(
         for i in range(0, len(data), batch_size):
             yield data.iloc[i : i + batch_size]
 
+    def completion_with_retries(*args, **kwargs):
+        n_attempts = 3
+
+        def print_on_retry(_):
+            print(" * Trying again... * ", end="", flush=True)
+
+        # try up to 3 times, print a message to the user on each retry
+        retryer = tenacity.Retrying(
+            stop=tenacity.stop_after_attempt(n_attempts), reraise=True, before_sleep=print_on_retry
+        )
+        return retryer(litellm.completion, *args, **kwargs)
+
     if not llm_config.model.startswith("litellm_proxy/"):
         # ensure model supports response_format and json schema (this check does not work with litellm_proxy)
         supported_params = litellm.get_supported_openai_params(model=llm_config.model) or []
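The new `completion_with_retries` helper wraps `litellm.completion` in a `tenacity.Retrying` controller. A self-contained sketch of the same retry pattern, with a stand-in for the flaky LLM call:

import tenacity

attempts = {"n": 0}

def flaky_call() -> str:
    # fails twice, then succeeds -- stands in for a transient litellm error
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ConnectionError("transient failure")
    return "ok"

# same pattern as completion_with_retries: up to 3 attempts, re-raise the
# final error, and invoke a callback before sleeping between attempts
retryer = tenacity.Retrying(
    stop=tenacity.stop_after_attempt(3),
    reraise=True,
    before_sleep=lambda retry_state: print(" * Trying again... * "),
)
print(retryer(flaky_call))  # prints the retry notice twice, then "ok"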
@@ -453,7 +525,6 @@ def _create_table_rows_generator(
         non_context_data[non_context_table_name] = data[non_context_table_name]
 
     litellm_kwargs = {
-        "response_format": create_table_response_format(columns=columns),
         "temperature": llm_config.temperature,
         "top_p": llm_config.top_p,
         "model": llm_config.model,
@@ -494,6 +565,10 @@ def _create_table_rows_generator(
         if batch_size >= remaining_rows:
             batch_size = remaining_rows + 2  # +2 because LLM may not always count the rows correctly
 
+        response_format, n_enforced_columns = create_table_response_format(
+            columns=columns, existing_data=existing_batch
+        )
+
         llm_prompt = _create_table_prompt(
             name=name,
             prompt=prompt,
@@ -508,12 +583,20 @@ def _create_table_rows_generator(
         )
         messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": llm_prompt}]
 
-        response = litellm.completion(messages=messages, **litellm_kwargs)
-        rows_stream = yield_rows_from_json_chunks_stream(response)
+        if n_enforced_columns != 0:
+            response = completion_with_retries(messages=messages, response_format=response_format, **litellm_kwargs)
+            rows_stream = yield_rows_from_json_chunks_stream(response)
+        else:
+            # skip roundtrip to LLM in case all columns are provided in existing data
+            rows_stream = itertools.repeat({})
 
+        batch_row_idx = 0
         while True:
             try:
-                row = next(rows_stream)
+                row_generated_part = next(rows_stream)
+                row_existing_part = existing_batch.iloc[batch_row_idx].to_dict() if existing_batch is not None else {}
+                row = {**row_existing_part, **row_generated_part}
+                row = {column: row[column] for column in columns.keys()}  # keep columns order according to user's spec
             except StopIteration:
                 break  # move to next batch
             previous_rows.append(row)
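Each emitted row is now assembled from two parts: the columns carried over from `existing_batch` and the columns generated by the LLM, reordered to match the user's spec. A minimal sketch with hypothetical data (the real code pulls `row_generated_part` from the streaming response, or from `itertools.repeat({})` when nothing needs generating):

import pandas as pd

columns = ["id", "name", "sentiment"]  # user's column order
existing_batch = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
generated_parts = [{"sentiment": "positive"}, {"sentiment": "neutral"}]

for batch_row_idx, row_generated_part in enumerate(generated_parts):
    row_existing_part = existing_batch.iloc[batch_row_idx].to_dict()
    row = {**row_existing_part, **row_generated_part}   # merge both parts
    row = {column: row[column] for column in columns}   # restore user's order
    print(row)
# {'id': 1, 'name': 'Alice', 'sentiment': 'positive'}
# {'id': 2, 'name': 'Bob', 'sentiment': 'neutral'}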
@@ -523,6 +606,7 @@ def _create_table_rows_generator(
                 yielded_sequences += 1
                 if yielded_sequences >= sample_size:
                     return  # move to next table
+            batch_row_idx += 1
         if context_batch is not None:
             # for each context_batch, full sequences are generated
             yielded_sequences += len(context_batch)
@@ -653,7 +737,7 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
 def sample(
     *,
     tables: dict[str, dict],
-    sample_size: int | dict[str, int] = 10,
+    sample_size: int | dict[str, int] = 4,
     existing_data: dict[str, pd.DataFrame] | None = None,
     model: str = "openai/gpt-4.1-nano",
     api_key: str | None = None,
@@ -664,12 +748,20 @@ def sample(
     """
     Generate mock data from scratch or enrich existing data by prompting an LLM.
 
+    While faker and numpy are useful to create fake data, this utility is unique as it allows
+    the creation of coherent, realistic multi-table tabular mock data
+    or the enrichment of existing datasets with new, context-aware columns.
+
+    It is particularly useful for quickly simulating production-like datasets for testing or prototyping purposes.
+    It is advised to limit mocking to small datasets for performance reasons (rows * cols < 100).
+    It might take a couple of minutes for bigger datasets.
+
     Args:
         tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
         sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
             If a single integer is provided, the same number of rows will be generated for each subject table.
             If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
-            Default is 10. Ignored if existing_data is provided.
+            Default is 4. Ignored if existing_data is provided.
             If a table has a foreign key, the sample size is determined by the corresponding foreign key prompt. If nothing specified, a few rows per parent record are generated.
         existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
             Default is None.
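The expanded docstring positions the package relative to faker/numpy and recommends keeping datasets small. A minimal invocation under the new default (`sample_size=4`); the table spec below is illustrative, not taken verbatim from the package docs, and an LLM API key (e.g. OPENAI_API_KEY) is assumed to be set in the environment:

from mostlyai import mock

tables = {
    "guests": {  # illustrative single-table spec
        "prompt": "Guests of an Alpine ski hotel in Austria",
        "columns": {
            "name": {"prompt": "first and last name of the guest", "dtype": "string"},
            "nationality": {"prompt": "2-letter country code", "dtype": "string"},
            "gender": {"dtype": "category", "values": ["male", "female"]},
        },
    }
}

# generates 4 rows for the subject table under the new default sample_size
df = mock.sample(tables=tables, model="openai/gpt-4.1-nano")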
@@ -889,7 +981,7 @@ def sample(
         primary_keys=primary_keys,
         data=data,
         sample_size=sample_size[table_name],
-        batch_size=20,  # generate 20 root table rows at a time
+        batch_size=20,  # generate 20 root table rows at a time
         previous_rows_size=10,  # present 10 previously generated rows to the LLM
         non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
         llm_config=llm_config,
mostlyai_mock-0.1.5.dist-info/METADATA → mostlyai_mock-0.1.7.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.5
+Version: 0.1.7
 Summary: LLM-generated Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -30,6 +30,7 @@ Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
+Requires-Dist: tenacity>=9.1.2
 Description-Content-Type: text/markdown
 
 # LLM-generated Mock Data 🔮
mostlyai_mock-0.1.5.dist-info/RECORD → mostlyai_mock-0.1.7.dist-info/RECORD RENAMED
@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=Cmo4Ko8-X41gSewcEpNTTvw7bpRUrtn6B5Cmnwric-Q,714
+mostlyai/mock/core.py,sha256=L-PbOTSIR1cfBeMZL8-v5k7VhxBfKAoyw230soBwQWc,42754
+mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
+mostlyai_mock-0.1.7.dist-info/METADATA,sha256=6tLpoqLx-LOI-Cr_O_xWm4LI5PBfa4nt1FkrqdNIpQA,13918
+mostlyai_mock-0.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.7.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.7.dist-info/RECORD,,
@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=-bfsVZJQ0OkN5b3IRP3F9aUCiA8Eq1-RmAqBmTg0O0g,714
-mostlyai/mock/core.py,sha256=V7KG7nOQPU95v6lRoSIfJuYivS0pNZ3rbiNC6SqDZSc,38075
-mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
-mostlyai_mock-0.1.5.dist-info/METADATA,sha256=LfugCsu7ANDZk2ozNFHDxgCqY42etJIdkXcfc-S-cUE,13887
-mostlyai_mock-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.5.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.5.dist-info/RECORD,,