mostlyai-mock 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +168 -48
- mostlyai/mock/mcp_server.py +14 -0
- {mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/METADATA +3 -5
- mostlyai_mock-0.1.6.dist-info/RECORD +8 -0
- mostlyai_mock-0.1.4.dist-info/RECORD +0 -8
- {mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
@@ -14,21 +14,23 @@
 
 from __future__ import annotations
 
+import itertools
 import json
 from collections import deque
 from collections.abc import Generator
 from enum import Enum
-from typing import Any, Literal
+from typing import Any, Literal
 
 import litellm
 import pandas as pd
+import tenacity
 from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
 from tqdm import tqdm
 
 litellm.suppress_debug_info = True
 
 SYSTEM_PROMPT = """
-You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
+You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
 
 Your task is to:
 
@@ -58,7 +60,7 @@ class LLMConfig(BaseModel):
 
 
 class MockConfig(RootModel[dict[str, "TableConfig"]]):
-    root: dict[str, TableConfig] = Field(...,
+    root: dict[str, TableConfig] = Field(..., min_length=1)
 
     @field_validator("root")
     @classmethod
@@ -127,7 +129,7 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
 
 class TableConfig(BaseModel):
     prompt: str = ""
-    columns: dict[str, ColumnConfig] = Field(...,
+    columns: dict[str, ColumnConfig] = Field(..., min_length=1)
     primary_key: str | None = None
     foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
 
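Both `MockConfig.root` and `TableConfig.columns` now carry a `min_length=1` constraint, so an empty table or column mapping is rejected at validation time. A minimal sketch of the pydantic v2 behaviour this relies on, using a hypothetical stand-in model:

```python
from pydantic import BaseModel, Field, ValidationError

class DemoTableConfig(BaseModel):
    # hypothetical stand-in for TableConfig: the mapping must hold at least one entry
    columns: dict[str, dict] = Field(..., min_length=1)

DemoTableConfig(columns={"age": {"dtype": "integer"}})  # passes validation

try:
    DemoTableConfig(columns={})  # rejected: fewer than min_length items
except ValidationError as exc:
    print(exc.error_count(), "validation error")
```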
@@ -246,52 +248,85 @@ def _create_table_prompt(
     prompt = f"# {prompt}\n\n"
 
     # define table
-    prompt += f"## Table: {name}
+    prompt += f"## Target Table: `{name}`\n\n"
 
-    prompt += f"
+    prompt += f"### Target Table Primary Key: `{primary_keys[name]}`\n\n"
 
     # add columns specifications
-    prompt += "
-
+    prompt += "### Target Table Column Specifications:\n\n"
+    column_specifications = {
+        name: config.model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True)
+        for name, config in columns.items()
+    }
+    if existing_data is not None:
+        # do not generate values for columns that already exist in existing data
+        column_specifications = {
+            column: spec for column, spec in column_specifications.items() if column not in existing_data.columns
+        }
+    prompt += f"{json.dumps(column_specifications, indent=2)}\n\n"
 
     # add previous rows as context to help the LLM generate consistent data
+    has_previous_rows_section = False
     if previous_rows:
-
+        has_previous_rows_section = True
+        prompt += f"\n## Previous `{len(previous_rows)}` Rows of Target Table `{name}`:\n\n"
         prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
 
     # add existing data to augment
+    has_existing_data_section = False
     if existing_data is not None:
-
+        has_existing_data_section = True
+        prompt += f"\n## Existing Data of Target Table `{name}` to Augment:\n\n"
         prompt += f"{existing_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
 
-    # define foreign keys
-
-
-
+    # define self referencing foreign keys
+    has_self_referencing_foreign_keys_section = False
+    self_referencing_foreign_keys = [fk for fk in foreign_keys if fk.referenced_table == name]
+    if self_referencing_foreign_keys:
+        has_self_referencing_foreign_keys_section = True
+        prompt += f"## Self Referencing Foreign Keys in Target Table `{name}`\n\n"
+        for fk in self_referencing_foreign_keys:
+            prompt += f"### Primary Key Column: `{primary_keys[name]}`\n\n"
+
+            prompt += f"### Foreign Key Column: `{fk.column}`\n\n"
+
+            prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+    foreign_keys = [fk for fk in foreign_keys if fk.referenced_table != name]  # exclude self-dependency going forward
 
     # add context table name, primary key and data
-
+    has_context_table_section = False
+    if foreign_keys:
+        has_context_table_section = True
         assert context_data is not None
         fk = foreign_keys[0]
         prompt += f"## Context Table: `{fk.referenced_table}`\n\n"
 
-        prompt += f"
+        prompt += f"### Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+
+        prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
 
-        prompt += "
+        prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+        prompt += "### Context Table Data:\n\n"
         prompt += f"{context_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
 
     # add non-context table names, primary keys and data
+    has_non_context_tables_section = False
     if foreign_keys and len(foreign_keys) > 1:
+        has_non_context_tables_section = True
         for fk in foreign_keys[1:]:
-            if fk.referenced_table == name:  # self-dependency is not considered as non-context
-                continue
             assert non_context_data is not None
             assert fk.referenced_table in non_context_data
             prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
 
-            prompt += f"
+            prompt += f"### Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+
+            prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
+
+            prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
 
-            prompt += "
+            prompt += "### Non-Context Table Data:\n\n"
             prompt += (
                 f"{non_context_data[fk.referenced_table].to_json(orient='records', date_format='iso', indent=2)}\n\n"
             )
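The column specifications embedded in the prompt are now built with `model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True)`, which keeps only the fields the user actually set. A small sketch with a hypothetical, simplified column config:

```python
from pydantic import BaseModel

class DemoColumnConfig(BaseModel):
    # hypothetical, simplified stand-in for ColumnConfig
    prompt: str = ""
    dtype: str | None = None
    values: list[str] = []

cfg = DemoColumnConfig(prompt="age in years", dtype="integer")
# unset, default, and None fields are dropped, keeping the prompt JSON compact
print(cfg.model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True))
# -> {'prompt': 'age in years', 'dtype': 'integer'}
```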
@@ -304,42 +339,62 @@ def _create_table_prompt(
     n_rows = None
     if existing_data is not None:
         n_rows = len(existing_data)
-    elif not foreign_keys:
+    elif not foreign_keys and not self_referencing_foreign_keys:
         assert batch_size is not None
         n_rows = batch_size
 
-    prompt += f"{verb.capitalize()} data for the `{name}
+    prompt += f"{verb.capitalize()} data for the Target Table `{name}`.\n\n"
     if n_rows is not None:
         prompt += f"Number of rows to {verb}: `{n_rows}`.\n\n"
 
-    if
-
-
-
-
-
-
-
-
-
+    if has_context_table_section:
+        assert foreign_keys
+        prompt += f"Target Table Foreign Key column `{foreign_keys[0].column}` may only contain values from `Context Table Data`."
+        if has_previous_rows_section:
+            prompt += " Never use values from `Previous Rows of Target Table` section."
+        prompt += " Respect the `Description of the Relationship` of `Context Table` section to understand the relationship, in particular the number of rows to generate."
+        prompt += "\n\n"
+
+    if has_self_referencing_foreign_keys_section:
+        prompt += "Target Table Self Referencing Foreign Key columns defined in `Self Referencing Foreign Keys` must be consistent with the `Target Table Primary Key`."
+        prompt += " Respect the `Description of the Relationship` of `Self Referencing Foreign Keys` section to understand the relationship."
+        prompt += "\n\n"
+
+    if has_non_context_tables_section:
+        assert len(foreign_keys) > 1
+        prompt += "All other Target Table Foreign Key columns may only contain values from `Non-Context Table Data` of relevant `Non-Context Table` sections."
+        prompt += " Respect the `Description of the Relationship` of relevant `Non-Context Table` section to understand the relationship."
+        prompt += "\n\n"
+
+    if has_existing_data_section:
+        assert existing_data is not None
         prompt += (
             f"You are given existing data for the `{name}` table and asked to generate "
-            f"values for the missing columns. The existing data contains column(s): {
-            f"You need to generate values for column(s): {
+            f"values for the missing columns. The existing data contains column(s): {list(existing_data.columns)}. "
+            f"You need to generate values for column(s): {list(columns.keys() - existing_data.columns)}. "
             f"Ensure that the generated values are contextually appropriate and consistent with the existing data. "
             f"Use the existing columns' values to inform the generation of new values. "
             f"Don't generate new rows, only augment the existing data.\n\n"
         )
 
-    if
+    if has_previous_rows_section:
+        assert previous_rows is not None
         prompt += (
             f"{verb.capitalize()} new rows that maintain consistency with the previous rows where appropriate. "
             "Don't copy previous rows in the output. "
             "Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
         )
+
     prompt += f"Do not use code to {verb} the data.\n\n"
-    prompt += "Return the full data as a JSON string.\n"
 
+    prompt += "Return data as a JSON string."
+    prompt += " The JSON string should have 'rows' key at the top level. The value of 'rows' key should be a list of JSON objects."
+    prompt += " Each JSON object should have column names as keys and values as column values."
+    if existing_data is not None:
+        prompt += (
+            f" Only include the following columns in the JSON string: {list(columns.keys() - existing_data.columns)}."
+        )
+    prompt += "\n"
     return prompt
 
 
@@ -357,8 +412,10 @@ def _create_table_rows_generator(
     non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> Generator[dict]:
-    def create_table_response_format(
-
+    def create_table_response_format(
+        columns: dict[str, ColumnConfig], existing_data: pd.DataFrame | None
+    ) -> tuple[type[BaseModel], int]:
+        def create_annotation(column_config: ColumnConfig) -> type:
             if column_config.values or column_config.dtype is DType.CATEGORY:
                 return Literal[tuple(column_config.values)]
             return {
@@ -374,11 +431,14 @@ def _create_table_rows_generator(
 
         fields = {}
         for column_name, column_config in columns.items():
+            if existing_data is not None and column_name in existing_data.columns:
+                continue  # skip columns that already exist in existing data
             annotation = create_annotation(column_config)
             fields[column_name] = (annotation, Field(...))
         TableRow = create_model("TableRow", **fields)
         TableRows = create_model("TableRows", rows=(list[TableRow], ...))
-
+        n_enforced_columns = len(fields)
+        return TableRows, n_enforced_columns
 
     def yield_rows_from_json_chunks_stream(response: litellm.CustomStreamWrapper) -> Generator[dict]:
         # starting with dirty buffer is to handle the `{"rows": []}` case
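`create_table_response_format` now skips columns that are already present in `existing_data` and reports how many columns are actually enforced. The dynamic model construction follows the usual pydantic `create_model` pattern; a standalone sketch with hypothetical columns:

```python
from typing import Literal
from pydantic import Field, create_model

# hypothetical column set: "gender" is restricted to two values, "age" is free-form
fields = {
    "gender": (Literal["male", "female"], Field(...)),
    "age": (int, Field(...)),
}
TableRow = create_model("TableRow", **fields)
TableRows = create_model("TableRows", rows=(list[TableRow], ...))

n_enforced_columns = len(fields)  # 2
print(list(TableRows.model_json_schema()["$defs"]["TableRow"]["properties"]))
# -> ['gender', 'age']
```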
@@ -419,6 +479,18 @@ def _create_table_rows_generator(
         for i in range(0, len(data), batch_size):
             yield data.iloc[i : i + batch_size]
 
+    def completion_with_retries(*args, **kwargs):
+        n_attempts = 3
+
+        def print_on_retry(_):
+            print(" * Trying again... * ", end="", flush=True)
+
+        # try up to 3 times, print a message to the user on each retry
+        retryer = tenacity.Retrying(
+            stop=tenacity.stop_after_attempt(n_attempts), reraise=True, before_sleep=print_on_retry
+        )
+        return retryer(litellm.completion, *args, **kwargs)
+
     if not llm_config.model.startswith("litellm_proxy/"):
         # ensure model supports response_format and json schema (this check does not work with litellm_proxy)
         supported_params = litellm.get_supported_openai_params(model=llm_config.model) or []
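The new `completion_with_retries` helper wraps `litellm.completion` in `tenacity.Retrying` so transient failures are retried up to three times before the original exception is re-raised. A self-contained sketch of the same retry pattern against a stand-in flaky function:

```python
import tenacity

attempts = {"n": 0}

def flaky_call() -> str:
    # stand-in for litellm.completion: fails twice, then succeeds
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise RuntimeError("transient failure")
    return "ok"

retryer = tenacity.Retrying(
    stop=tenacity.stop_after_attempt(3),  # give up after the third attempt
    reraise=True,  # re-raise the original exception instead of a RetryError
    before_sleep=lambda _state: print(" * Trying again... * ", end="", flush=True),
)
print(retryer(flaky_call))  # prints the retry notice twice, then "ok"
```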
@@ -453,7 +525,6 @@ def _create_table_rows_generator(
             non_context_data[non_context_table_name] = data[non_context_table_name]
 
     litellm_kwargs = {
-        "response_format": create_table_response_format(columns=columns),
         "temperature": llm_config.temperature,
         "top_p": llm_config.top_p,
         "model": llm_config.model,
@@ -488,6 +559,16 @@ def _create_table_rows_generator(
                 table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()
             }
 
+        if context_batch is None:
+            # for root tables, scale down batch size in order to prevent excessive generations
+            remaining_rows = sample_size - yielded_sequences
+            if batch_size >= remaining_rows:
+                batch_size = remaining_rows + 2  # +2 because LLM may not always count the rows correctly
+
+        response_format, n_enforced_columns = create_table_response_format(
+            columns=columns, existing_data=existing_batch
+        )
+
         llm_prompt = _create_table_prompt(
             name=name,
             prompt=prompt,
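For root tables, the batch size is now clamped to the number of rows still missing (plus a small buffer) so the final request does not over-generate. The arithmetic, with hypothetical numbers:

```python
# hypothetical state: 50 rows requested in total, 43 already yielded, default batch of 20
sample_size = 50
yielded_sequences = 43
batch_size = 20

remaining_rows = sample_size - yielded_sequences  # 7
if batch_size >= remaining_rows:
    # +2 because the LLM may not always count the rows correctly
    batch_size = remaining_rows + 2
print(batch_size)  # 9 instead of a full batch of 20
```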
@@ -502,12 +583,20 @@ def _create_table_rows_generator(
         )
         messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": llm_prompt}]
 
-
-
+        if n_enforced_columns != 0:
+            response = completion_with_retries(messages=messages, response_format=response_format, **litellm_kwargs)
+            rows_stream = yield_rows_from_json_chunks_stream(response)
+        else:
+            # skip roundtrip to LLM in case all columns are provided in existing data
+            rows_stream = itertools.repeat({})
 
+        batch_row_idx = 0
         while True:
             try:
-
+                row_generated_part = next(rows_stream)
+                row_existing_part = existing_batch.iloc[batch_row_idx].to_dict() if existing_batch is not None else {}
+                row = {**row_existing_part, **row_generated_part}
+                row = {column: row[column] for column in columns.keys()}  # keep columns order according to user's spec
             except StopIteration:
                 break  # move to next batch
             previous_rows.append(row)
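Each emitted row is now assembled from two parts: the columns already present in the existing batch and the columns generated by the LLM, reordered to match the user's column specification. A small sketch with hypothetical values:

```python
# hypothetical spec: "age" and "gender" come from existing data, "full_name" is generated
columns = {"full_name": None, "age": None, "gender": None}  # user's column order

row_existing_part = {"age": 25, "gender": "male"}
row_generated_part = {"full_name": "Aino Virtanen"}

row = {**row_existing_part, **row_generated_part}
row = {column: row[column] for column in columns.keys()}  # reorder to the user's spec
print(row)  # {'full_name': 'Aino Virtanen', 'age': 25, 'gender': 'male'}
```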
@@ -517,6 +606,7 @@ def _create_table_rows_generator(
                 yielded_sequences += 1
                 if yielded_sequences >= sample_size:
                     return  # move to next table
+            batch_row_idx += 1
         if context_batch is not None:
             # for each context_batch, full sequences are generated
             yielded_sequences += len(context_batch)
@@ -553,6 +643,36 @@ def _convert_table_rows_generator_to_df(
     return df
 
 
+def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataFrame] | None) -> dict[str, dict]:
+    def _infer_dtype(series: pd.Series) -> DType:
+        if pd.api.types.is_integer_dtype(series):
+            return DType.INTEGER
+        elif pd.api.types.is_float_dtype(series):
+            return DType.FLOAT
+        elif pd.api.types.is_datetime64_dtype(series):
+            return DType.DATETIME
+        elif pd.api.types.is_bool_dtype(series):
+            return DType.BOOLEAN
+        else:
+            return DType.STRING
+
+    if existing_data is None:
+        return tables
+
+    tables = tables.copy()
+    for table_name, existing_table in existing_data.items():
+        table_config = tables.setdefault(table_name, {})
+        column_configs = table_config.setdefault("columns", {})
+        existing_column_configs = {
+            existing_column: {"dtype": _infer_dtype(existing_table[existing_column])}
+            for existing_column in existing_table.columns
+            if existing_column not in column_configs
+        }
+        column_configs = {**existing_column_configs, **column_configs}
+        table_config["columns"] = column_configs
+    return tables
+
+
 def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig) -> dict[str, int]:
     if isinstance(sample_size, int):
         return {table_name: sample_size for table_name in config.root}
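`_harmonize_tables` backfills column configs for columns that only exist in `existing_data`, inferring their dtype via `pd.api.types` checks in the order integer, float, datetime, boolean, string. The same checks applied to a hypothetical DataFrame:

```python
import pandas as pd

existing_table = pd.DataFrame({
    "age": [25, 41],
    "weight_kg": [61.5, 83.0],
    "admitted_at": pd.to_datetime(["2024-01-05", "2024-02-11"]),
    "insured": [True, False],
    "full_name": ["Aino Virtanen", "Onni Korhonen"],
})

for column in existing_table.columns:
    series = existing_table[column]
    if pd.api.types.is_integer_dtype(series):
        dtype = "integer"
    elif pd.api.types.is_float_dtype(series):
        dtype = "float"
    elif pd.api.types.is_datetime64_dtype(series):
        dtype = "datetime"
    elif pd.api.types.is_bool_dtype(series):
        dtype = "boolean"
    else:
        dtype = "string"
    print(column, "->", dtype)
# age -> integer, weight_kg -> float, admitted_at -> datetime, insured -> boolean, full_name -> string
```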
@@ -756,8 +876,6 @@ def sample(
         "patients": {
             "prompt": "Patients of a hospital in Finland",
             "columns": {
-                "age": {},
-                "gender": {},
                 "full_name": {"prompt": "first name and last name of the patient", "dtype": "string"},
                 "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
                 "place_of_birth": {"prompt": "place of birth", "dtype": "string"},
@@ -769,7 +887,7 @@ def sample(
         "gender": ["male", "male", "female", "female"],
     })
     enriched_df = mock.sample(
-        tables=tables,
+        tables=tables,
         existing_data={"patients": existing_df},
         model="openai/gpt-4.1-nano"
     )
@@ -833,7 +951,9 @@ def sample(
     ```
     """
 
+    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
     config = MockConfig(tables)
+
     llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
 
     sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
@@ -853,7 +973,7 @@ def sample(
             primary_keys=primary_keys,
             data=data,
             sample_size=sample_size[table_name],
-            batch_size=
+            batch_size=20,  # generate 20 root table rows at a time
             previous_rows_size=10,  # present 10 previously generated rows to the LLM
             non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
             llm_config=llm_config,
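Taken together, these changes let `sample()` augment user-supplied data: `_harmonize_tables` fills in configs for the existing columns, and values are generated only for the missing ones. A usage sketch assembled from the docstring fragments visible in this diff (the numeric ages are illustrative):

```python
import pandas as pd
from mostlyai import mock

tables = {
    "patients": {
        "prompt": "Patients of a hospital in Finland",
        "columns": {
            "full_name": {"prompt": "first name and last name of the patient", "dtype": "string"},
            "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
        },
    }
}
existing_df = pd.DataFrame({
    "age": [25, 41, 38, 19],  # illustrative values
    "gender": ["male", "male", "female", "female"],
})
enriched_df = mock.sample(
    tables=tables,
    existing_data={"patients": existing_df},
    model="openai/gpt-4.1-nano",
)
print(enriched_df)
```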
mostlyai/mock/mcp_server.py
CHANGED
@@ -1,3 +1,17 @@
+# Copyright 2025 MOSTLY AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tempfile
 
{mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.4
+Version: 0.1.6
 Summary: LLM-generated Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -30,6 +30,7 @@ Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
+Requires-Dist: tenacity>=9.1.2
 Description-Content-Type: text/markdown
 
 # LLM-generated Mock Data 🔮
@@ -169,7 +170,7 @@ tables = {
 }
 data = mock.sample(
     tables=tables,
-    sample_size=2,
+    sample_size=2,
     model="openai/gpt-4.1"
 )
 print(data["customers"])
@@ -250,9 +251,6 @@ tables = {
     "guests": {
         "prompt": "Guests of an Alpine ski hotel in Austria",
         "columns": {
-            "guest_id": {"prompt": "the unique id of the guest", "dtype": "integer"},
-            "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
-            "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
             "gender": {"dtype": "category", "values": ["male", "female"]},
             "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
             "room_number": {"prompt": "room number", "dtype": "integer"},
mostlyai_mock-0.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=8UddMHmwpfwSb7ChuVNvIaWNLTlWkN0Cxh63CskmtBw,714
+mostlyai/mock/core.py,sha256=NFfyucqjT3iC9lqfu4dPmRnYizxtfFH1Tf3KHRRxHvg,42242
+mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
+mostlyai_mock-0.1.6.dist-info/METADATA,sha256=RMYEgGG4P3WfhavNC_4ph6dTCtumqQ3uA-swot9WKyc,13918
+mostlyai_mock-0.1.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.6.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.6.dist-info/RECORD,,
mostlyai_mock-0.1.4.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=EvV_Tp6ExzQPq4apGq_8F25qw_paNTcQEC94nIVOEog,714
-mostlyai/mock/core.py,sha256=ubarMA3VUlXdjUsCXQK_mD_kWPkTMOYvLz9G4OughGk,36532
-mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
-mostlyai_mock-0.1.4.dist-info/METADATA,sha256=jibPe0pKcwqyPBoyc7H98LPd72vkGZBStdw_yMNVvJI,14161
-mostlyai_mock-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.4.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.4.dist-info/RECORD,,
{mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/WHEEL: file without changes
{mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/entry_points.txt: file without changes
{mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/licenses/LICENSE: file without changes