mostlyai-mock 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 
 __all__ = ["sample"]
-__version__ = "0.1.5" # Do not set this manually. Use poetry version [params].
+__version__ = "0.1.7" # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import itertools
 import json
 from collections import deque
 from collections.abc import Generator
@@ -22,6 +23,7 @@ from typing import Any, Literal
 
 import litellm
 import pandas as pd
+import tenacity
 from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
 from tqdm import tqdm
 
@@ -246,52 +248,85 @@ def _create_table_prompt(
     prompt = f"# {prompt}\n\n"
 
     # define table
-    prompt += f"## Table: {name}\n\n"
+    prompt += f"## Target Table: `{name}`\n\n"
 
-    prompt += f"## Table Primary Key: `{primary_keys[name]}`\n\n"
+    prompt += f"### Target Table Primary Key: `{primary_keys[name]}`\n\n"
 
     # add columns specifications
-    prompt += "## Columns Specifications:\n\n"
-    prompt += f"{json.dumps({name: config.model_dump() for name, config in columns.items()}, indent=2)}\n\n"
+    prompt += "### Target Table Column Specifications:\n\n"
+    column_specifications = {
+        name: config.model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True)
+        for name, config in columns.items()
+    }
+    if existing_data is not None:
+        # do not generate values for columns that already exist in existing data
+        column_specifications = {
+            column: spec for column, spec in column_specifications.items() if column not in existing_data.columns
+        }
+    prompt += f"{json.dumps(column_specifications, indent=2)}\n\n"
 
     # add previous rows as context to help the LLM generate consistent data
+    has_previous_rows_section = False
     if previous_rows:
-        prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
+        has_previous_rows_section = True
+        prompt += f"\n## Previous `{len(previous_rows)}` Rows of Target Table `{name}`:\n\n"
         prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
 
     # add existing data to augment
+    has_existing_data_section = False
     if existing_data is not None:
-        prompt += "\n## Existing Data to Augment:\n\n"
+        has_existing_data_section = True
+        prompt += f"\n## Existing Data of Target Table `{name}` to Augment:\n\n"
         prompt += f"{existing_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
 
-    # define foreign keys
-    if foreign_keys:
-        prompt += "## Foreign Keys:\n\n"
-        prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
+    # define self referencing foreign keys
+    has_self_referencing_foreign_keys_section = False
+    self_referencing_foreign_keys = [fk for fk in foreign_keys if fk.referenced_table == name]
+    if self_referencing_foreign_keys:
+        has_self_referencing_foreign_keys_section = True
+        prompt += f"## Self Referencing Foreign Keys in Target Table `{name}`\n\n"
+        for fk in self_referencing_foreign_keys:
+            prompt += f"### Primary Key Column: `{primary_keys[name]}`\n\n"
+
+            prompt += f"### Foreign Key Column: `{fk.column}`\n\n"
+
+            prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+    foreign_keys = [fk for fk in foreign_keys if fk.referenced_table != name]  # exclude self-dependency going forward
 
     # add context table name, primary key and data
-    if foreign_keys and foreign_keys[0].referenced_table != name:  # self-dependency is not considered as context
+    has_context_table_section = False
+    if foreign_keys:
+        has_context_table_section = True
         assert context_data is not None
         fk = foreign_keys[0]
         prompt += f"## Context Table: `{fk.referenced_table}`\n\n"
 
-        prompt += f"## Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+        prompt += f"### Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
 
-        prompt += "## Context Table Data:\n\n"
+        prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
+
+        prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+        prompt += "### Context Table Data:\n\n"
         prompt += f"{context_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
 
     # add non-context table names, primary keys and data
+    has_non_context_tables_section = False
     if foreign_keys and len(foreign_keys) > 1:
+        has_non_context_tables_section = True
         for fk in foreign_keys[1:]:
-            if fk.referenced_table == name:  # self-dependency is not considered as non-context
-                continue
             assert non_context_data is not None
             assert fk.referenced_table in non_context_data
             prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
 
-            prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+            prompt += f"### Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
 
-            prompt += "## Non-Context Table Data:\n\n"
+            prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
+
+            prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+            prompt += "### Non-Context Table Data:\n\n"
             prompt += (
                 f"{non_context_data[fk.referenced_table].to_json(orient='records', date_format='iso', indent=2)}\n\n"
            )
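The column specifications are now serialized with pydantic's exclusion flags, which keeps the JSON embedded in the LLM prompt compact. A minimal sketch of the effect, using a simplified stand-in for the package's `ColumnConfig` model (field names here are illustrative):

from pydantic import BaseModel

class ColumnConfig(BaseModel):  # simplified stand-in for the package's ColumnConfig
    prompt: str = ""
    dtype: str | None = None
    values: list[str] = []

cfg = ColumnConfig(prompt="customer age", dtype="integer")

# model_dump() carries every field, including untouched defaults
print(cfg.model_dump())
# {'prompt': 'customer age', 'dtype': 'integer', 'values': []}

# the flags used in the diff drop defaults, unset fields, and None values
print(cfg.model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True))
# {'prompt': 'customer age', 'dtype': 'integer'}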
@@ -304,42 +339,62 @@ def _create_table_prompt(
     n_rows = None
     if existing_data is not None:
         n_rows = len(existing_data)
-    elif not foreign_keys:
+    elif not foreign_keys and not self_referencing_foreign_keys:
         assert batch_size is not None
         n_rows = batch_size
 
-    prompt += f"{verb.capitalize()} data for the `{name}` table.\n\n"
+    prompt += f"{verb.capitalize()} data for the Target Table `{name}`.\n\n"
     if n_rows is not None:
         prompt += f"Number of rows to {verb}: `{n_rows}`.\n\n"
 
-    if foreign_keys:
-        prompt += (
-            "The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
-            "The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
-            "If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
-            "In this case, ensure that the foreign keys are consistent with primary keys of the table. "
-            "Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
-        )
-
-    if existing_data is not None:
+    if has_context_table_section:
+        assert foreign_keys
+        prompt += f"Target Table Foreign Key column `{foreign_keys[0].column}` may only contain values from `Context Table Data`."
+        if has_previous_rows_section:
+            prompt += " Never use values from `Previous Rows of Target Table` section."
+        prompt += " Respect the `Description of the Relationship` of `Context Table` section to understand the relationship, in particular the number of rows to generate."
+        prompt += "\n\n"
+
+    if has_self_referencing_foreign_keys_section:
+        prompt += "Target Table Self Referencing Foreign Key columns defined in `Self Referencing Foreign Keys` must be consistent with the `Target Table Primary Key`."
+        prompt += " Respect the `Description of the Relationship` of `Self Referencing Foreign Keys` section to understand the relationship."
+        prompt += "\n\n"
+
+    if has_non_context_tables_section:
+        assert len(foreign_keys) > 1
+        prompt += "All other Target Table Foreign Key columns may only contain values from `Non-Context Table Data` of relevant `Non-Context Table` sections."
+        prompt += " Respect the `Description of the Relationship` of relevant `Non-Context Table` section to understand the relationship."
+        prompt += "\n\n"
+
+    if has_existing_data_section:
+        assert existing_data is not None
         prompt += (
             f"You are given existing data for the `{name}` table and asked to generate "
-            f"values for the missing columns. The existing data contains column(s): {', '.join(existing_data.columns)}. "
-            f"You need to generate values for column(s): {', '.join(columns.keys() - existing_data.columns)}. "
+            f"values for the missing columns. The existing data contains column(s): {list(existing_data.columns)}. "
+            f"You need to generate values for column(s): {list(columns.keys() - existing_data.columns)}. "
             f"Ensure that the generated values are contextually appropriate and consistent with the existing data. "
             f"Use the existing columns' values to inform the generation of new values. "
             f"Don't generate new rows, only augment the existing data.\n\n"
         )
 
-    if previous_rows:
+    if has_previous_rows_section:
+        assert previous_rows is not None
         prompt += (
             f"{verb.capitalize()} new rows that maintain consistency with the previous rows where appropriate. "
             "Don't copy previous rows in the output. "
             "Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
         )
+
     prompt += f"Do not use code to {verb} the data.\n\n"
-    prompt += "Return the full data as a JSON string.\n"
 
+    prompt += "Return data as a JSON string."
+    prompt += " The JSON string should have 'rows' key at the top level. The value of 'rows' key should be a list of JSON objects."
+    prompt += " Each JSON object should have column names as keys and values as column values."
+    if existing_data is not None:
+        prompt += (
+            f" Only include the following columns in the JSON string: {list(columns.keys() - existing_data.columns)}."
+        )
+    prompt += "\n"
     return prompt
 
 
@@ -357,7 +412,9 @@ def _create_table_rows_generator(
     non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> Generator[dict]:
-    def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
+    def create_table_response_format(
+        columns: dict[str, ColumnConfig], existing_data: pd.DataFrame | None
+    ) -> tuple[type[BaseModel], int]:
         def create_annotation(column_config: ColumnConfig) -> type:
             if column_config.values or column_config.dtype is DType.CATEGORY:
                 return Literal[tuple(column_config.values)]
@@ -374,11 +431,14 @@ def _create_table_rows_generator(
 
         fields = {}
         for column_name, column_config in columns.items():
+            if existing_data is not None and column_name in existing_data.columns:
+                continue  # skip columns that already exist in existing data
             annotation = create_annotation(column_config)
             fields[column_name] = (annotation, Field(...))
         TableRow = create_model("TableRow", **fields)
         TableRows = create_model("TableRows", rows=(list[TableRow], ...))
-        return TableRows
+        n_enforced_columns = len(fields)
+        return TableRows, n_enforced_columns
 
     def yield_rows_from_json_chunks_stream(response: litellm.CustomStreamWrapper) -> Generator[dict]:
         # starting with dirty buffer is to handle the `{"rows": []}` case
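`create_table_response_format` now builds the response schema only for the columns the LLM must actually generate and reports how many it enforced. A standalone sketch of that pattern, with hypothetical column names and a plain `str` annotation in place of the per-column types the real code derives:

from pydantic import Field, create_model

columns = ["id", "name", "sentiment"]   # hypothetical user-specified columns
existing_columns = {"id", "name"}       # assumed already present in existing_data

fields = {
    col: (str, Field(...))  # the real code derives richer per-column annotations
    for col in columns
    if col not in existing_columns  # skip columns that must not be regenerated
}
TableRow = create_model("TableRow", **fields)
TableRows = create_model("TableRows", rows=(list[TableRow], ...))

n_enforced_columns = len(fields)  # 0 means the LLM roundtrip can be skipped entirely
print(n_enforced_columns)  # 1 -> only "sentiment" is enforced by the schema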
@@ -419,6 +479,18 @@ def _create_table_rows_generator(
         for i in range(0, len(data), batch_size):
             yield data.iloc[i : i + batch_size]
 
+    def completion_with_retries(*args, **kwargs):
+        n_attempts = 3
+
+        def print_on_retry(_):
+            print(" * Trying again... * ", end="", flush=True)
+
+        # try up to 3 times, print a message to the user on each retry
+        retryer = tenacity.Retrying(
+            stop=tenacity.stop_after_attempt(n_attempts), reraise=True, before_sleep=print_on_retry
+        )
+        return retryer(litellm.completion, *args, **kwargs)
+
     if not llm_config.model.startswith("litellm_proxy/"):
         # ensure model supports response_format and json schema (this check does not work with litellm_proxy)
         supported_params = litellm.get_supported_openai_params(model=llm_config.model) or []
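The new `completion_with_retries` helper wraps `litellm.completion` in a `tenacity.Retrying` controller. A self-contained sketch of the same retry pattern, with a stand-in for the flaky LLM call:

import tenacity

attempts = {"n": 0}

def flaky_call() -> str:
    # fails twice, then succeeds -- stands in for a transient litellm error
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ConnectionError("transient failure")
    return "ok"

# same pattern as completion_with_retries: up to 3 attempts, re-raise the
# final error, and invoke a callback before sleeping between attempts
retryer = tenacity.Retrying(
    stop=tenacity.stop_after_attempt(3),
    reraise=True,
    before_sleep=lambda retry_state: print(" * Trying again... * "),
)
print(retryer(flaky_call))  # prints the retry notice twice, then "ok"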
@@ -453,7 +525,6 @@ def _create_table_rows_generator(
         non_context_data[non_context_table_name] = data[non_context_table_name]
 
     litellm_kwargs = {
-        "response_format": create_table_response_format(columns=columns),
         "temperature": llm_config.temperature,
         "top_p": llm_config.top_p,
         "model": llm_config.model,
@@ -494,6 +565,10 @@ def _create_table_rows_generator(
         if batch_size >= remaining_rows:
             batch_size = remaining_rows + 2  # +2 because LLM may not always count the rows correctly
 
+        response_format, n_enforced_columns = create_table_response_format(
+            columns=columns, existing_data=existing_batch
+        )
+
         llm_prompt = _create_table_prompt(
             name=name,
             prompt=prompt,
@@ -508,12 +583,20 @@ def _create_table_rows_generator(
         )
         messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": llm_prompt}]
 
-        response = litellm.completion(messages=messages, **litellm_kwargs)
-        rows_stream = yield_rows_from_json_chunks_stream(response)
+        if n_enforced_columns != 0:
+            response = completion_with_retries(messages=messages, response_format=response_format, **litellm_kwargs)
+            rows_stream = yield_rows_from_json_chunks_stream(response)
+        else:
+            # skip roundtrip to LLM in case all columns are provided in existing data
+            rows_stream = itertools.repeat({})
 
+        batch_row_idx = 0
         while True:
             try:
-                row = next(rows_stream)
+                row_generated_part = next(rows_stream)
+                row_existing_part = existing_batch.iloc[batch_row_idx].to_dict() if existing_batch is not None else {}
+                row = {**row_existing_part, **row_generated_part}
+                row = {column: row[column] for column in columns.keys()}  # keep columns order according to user's spec
             except StopIteration:
                 break  # move to next batch
             previous_rows.append(row)
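Each emitted row is now assembled from two parts: the columns carried over from `existing_batch` and the columns generated by the LLM, reordered to match the user's spec. A minimal sketch with hypothetical data (the real code pulls `row_generated_part` from the streaming response, or from `itertools.repeat({})` when nothing needs generating):

import pandas as pd

columns = ["id", "name", "sentiment"]  # user's column order
existing_batch = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
generated_parts = [{"sentiment": "positive"}, {"sentiment": "neutral"}]

for batch_row_idx, row_generated_part in enumerate(generated_parts):
    row_existing_part = existing_batch.iloc[batch_row_idx].to_dict()
    row = {**row_existing_part, **row_generated_part}   # merge both parts
    row = {column: row[column] for column in columns}   # restore user's order
    print(row)
# {'id': 1, 'name': 'Alice', 'sentiment': 'positive'}
# {'id': 2, 'name': 'Bob', 'sentiment': 'neutral'}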
@@ -523,6 +606,7 @@ def _create_table_rows_generator(
                 yielded_sequences += 1
                 if yielded_sequences >= sample_size:
                     return  # move to next table
+            batch_row_idx += 1
         if context_batch is not None:
             # for each context_batch, full sequences are generated
             yielded_sequences += len(context_batch)
@@ -653,7 +737,7 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
 def sample(
     *,
     tables: dict[str, dict],
-    sample_size: int | dict[str, int] = 10,
+    sample_size: int | dict[str, int] = 4,
     existing_data: dict[str, pd.DataFrame] | None = None,
     model: str = "openai/gpt-4.1-nano",
     api_key: str | None = None,
@@ -664,12 +748,20 @@ def sample(
     """
     Generate mock data from scratch or enrich existing data by prompting an LLM.
 
+    While faker and numpy are useful to create fake data, this utility is unique as it allows
+    the creation of coherent, realistic multi-table tabular mock data
+    or the enrichment of existing datasets with new, context-aware columns.
+
+    It is particularly useful for quickly simulating production-like datasets for testing or prototyping purposes.
+    It is advised to limit mocking to small datasets for performance reasons (rows * cols < 100).
+    It might take a couple of minutes for bigger datasets.
+
     Args:
         tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
         sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
             If a single integer is provided, the same number of rows will be generated for each subject table.
             If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
-            Default is 10. Ignored if existing_data is provided.
+            Default is 4. Ignored if existing_data is provided.
             If a table has a foreign key, the sample size is determined by the corresponding foreign key prompt. If nothing specified, a few rows per parent record are generated.
         existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
             Default is None.
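The expanded docstring positions the package relative to faker/numpy and recommends keeping datasets small. A minimal invocation under the new default (`sample_size=4`); the table spec below is illustrative, not taken verbatim from the package docs, and an LLM API key (e.g. OPENAI_API_KEY) is assumed to be set in the environment:

from mostlyai import mock

tables = {
    "guests": {  # illustrative single-table spec
        "prompt": "Guests of an Alpine ski hotel in Austria",
        "columns": {
            "name": {"prompt": "first and last name of the guest", "dtype": "string"},
            "nationality": {"prompt": "2-letter country code", "dtype": "string"},
            "gender": {"dtype": "category", "values": ["male", "female"]},
        },
    }
}

# generates 4 rows for the subject table under the new default sample_size
df = mock.sample(tables=tables, model="openai/gpt-4.1-nano")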
@@ -889,7 +981,7 @@ def sample(
         primary_keys=primary_keys,
         data=data,
         sample_size=sample_size[table_name],
-        batch_size=20,  # generate 20 root table rows at a time
+        batch_size=20,  # generate 20 root table rows at a time
         previous_rows_size=10,  # present 10 previously generated rows to the LLM
         non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
         llm_config=llm_config,
mostlyai_mock-0.1.5.dist-info/METADATA → mostlyai_mock-0.1.7.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.5
+Version: 0.1.7
 Summary: LLM-generated Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -30,6 +30,7 @@ Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
+Requires-Dist: tenacity>=9.1.2
 Description-Content-Type: text/markdown
 
 # LLM-generated Mock Data 🔮
mostlyai_mock-0.1.5.dist-info/RECORD → mostlyai_mock-0.1.7.dist-info/RECORD RENAMED
@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=Cmo4Ko8-X41gSewcEpNTTvw7bpRUrtn6B5Cmnwric-Q,714
+mostlyai/mock/core.py,sha256=L-PbOTSIR1cfBeMZL8-v5k7VhxBfKAoyw230soBwQWc,42754
+mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
+mostlyai_mock-0.1.7.dist-info/METADATA,sha256=6tLpoqLx-LOI-Cr_O_xWm4LI5PBfa4nt1FkrqdNIpQA,13918
+mostlyai_mock-0.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.7.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.7.dist-info/RECORD,,
@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=-bfsVZJQ0OkN5b3IRP3F9aUCiA8Eq1-RmAqBmTg0O0g,714
-mostlyai/mock/core.py,sha256=V7KG7nOQPU95v6lRoSIfJuYivS0pNZ3rbiNC6SqDZSc,38075
-mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
-mostlyai_mock-0.1.5.dist-info/METADATA,sha256=LfugCsu7ANDZk2ozNFHDxgCqY42etJIdkXcfc-S-cUE,13887
-mostlyai_mock-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.5.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.5.dist-info/RECORD,,