PyPI - mostlyai-mock - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

mostlyai-mock 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

mostlyai/mock/__init__.py CHANGED Viewed

@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 __all__ = ["sample"]
-__version__ = "0.1.1"  # Do not set this manually. Use poetry version [params].
+__version__ = "0.1.3"  # Do not set this manually. Use poetry version [params].

mostlyai/mock/core.py CHANGED Viewed

@@ -25,21 +25,28 @@ import pandas as pd
 from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
 from tqdm import tqdm
+litellm.suppress_debug_info = True
 SYSTEM_PROMPT = """
-You are a specialized synthetic data generator designed to create
-highly realistic, contextually appropriate data based on schema definitions. Your task is to:
+You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
+Your task is to:
 1. Generate data that strictly adheres to the provided schema constraints (data types, ranges, formats)
 2. Ensure logical consistency across related tables and foreign key relationships
 3. Create contextually appropriate values that reflect real-world patterns and distributions
 4. Produce diverse, non-repetitive data that avoids obvious patterns
 5. Respect uniqueness constraints and other data integrity rules
-6. Return well-formatted JSON output that can be directly parsed.
-7. Don't use markdown formatting.
+6. When enriching existing data, ensure that new values are consistent with existing values
+7. Return well-formatted JSON output that can be directly parsed
+8. Don't use markdown formatting
 For numeric fields, generate realistic distributions rather than random values. For text fields, create contextually \
 appropriate content. For dates and timestamps, ensure logical chronology. Always maintain referential integrity \
 across tables.
+When enriching existing data, carefully analyze the patterns and relationships in the existing columns \
+to generate compatible and realistic values for the missing columns.
 """
@@ -197,7 +204,7 @@ def _sample_table(
     columns: dict[str, ColumnConfig],
     foreign_keys: list[ForeignKeyConfig] | None,
     primary_keys: dict[str, str] | None,
-    generated_data: dict[str, pd.DataFrame] | None,
+    data: dict[str, pd.DataFrame],
     sample_size: int,
     batch_size: int,
     previous_rows_size: int,
@@ -210,7 +217,7 @@ def _sample_table(
         columns=columns,
         primary_keys=primary_keys,
         foreign_keys=foreign_keys,
-        generated_data=generated_data,
+        data=data,
         sample_size=sample_size,
         batch_size=batch_size,
         previous_rows_size=previous_rows_size,
@@ -230,6 +237,7 @@ def _create_table_prompt(
     primary_keys: dict[str, str] | None,
     batch_size: int | None,
     foreign_keys: list[ForeignKeyConfig] | None,
+    existing_data: pd.DataFrame | None,
     context_data: pd.DataFrame | None,
     non_context_data: dict[str, pd.DataFrame] | None,
     previous_rows: list[dict] | None,
@@ -251,6 +259,11 @@ def _create_table_prompt(
         prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
         prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
+    # add existing data to augment
+    if existing_data is not None:
+        prompt += f"\n## Existing Data to Augment:\n\n"
+        prompt += f"{existing_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
     # define foreign keys
     if foreign_keys:
         prompt += "## Foreign Keys:\n\n"
@@ -285,26 +298,46 @@ def _create_table_prompt(
     # add instructions
     prompt += "\n## Instructions:\n\n"
-    if not foreign_keys:
+    verb = "generate" if existing_data is None else "augment"
+    n_rows = None
+    if existing_data is not None:
+        n_rows = len(existing_data)
+    elif not foreign_keys:
         assert batch_size is not None
-        prompt += f"Generate {batch_size} rows for the `{name}` table.\n\n"
-    else:
+        n_rows = batch_size
+    prompt += f"{verb.capitalize()} data for the `{name}` table.\n\n"
+    if n_rows is not None:
+        prompt += f"Number of rows to {verb}: `{n_rows}`.\n\n"
+    if foreign_keys:
         prompt += (
-            f"Generate data for the `{name}` table. "
             f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
             f"The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
             f"If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
-            f"In this case, ensure that the generated foreign keys are consistent with generated primary keys of the table. "
+            f"In this case, ensure that the foreign keys are consistent with primary keys of the table. "
             f"Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
         )
+    if existing_data is not None:
+        prompt += (
+            f"You are given existing data for the `{name}` table and asked to generate "
+            f"values for the missing columns. The existing data contains column(s): {', '.join(existing_data.columns)}. "
+            f"You need to generate values for column(s): {', '.join(columns.keys() - existing_data.columns)}. "
+            f"Ensure that the generated values are contextually appropriate and consistent with the existing data. "
+            f"Use the existing columns' values to inform the generation of new values. "
+            f"Don't generate new rows, only augment the existing data.\n\n"
+        )
     if previous_rows:
         prompt += (
-            "Generate new rows that maintain consistency with the previous rows where appropriate. "
+            f"{verb.capitalize()} new rows that maintain consistency with the previous rows where appropriate. "
             "Don't copy previous rows in the output. "
             "Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
         )
-    prompt += "Do not use code to generate the data.\n\n"
+    prompt += f"Do not use code to {verb} the data.\n\n"
     prompt += "Return the full data as a JSON string.\n"
     return prompt
@@ -317,7 +350,7 @@ def _create_table_rows_generator(
     columns: dict[str, ColumnConfig],
     foreign_keys: list[ForeignKeyConfig] | None,
     primary_keys: dict[str, str] | None,
-    generated_data: dict[str, pd.DataFrame] | None,
+    data: dict[str, pd.DataFrame],
     sample_size: int,
     batch_size: int,
     previous_rows_size: int,
@@ -393,27 +426,31 @@ def _create_table_rows_generator(
             "The model does not support structured output / JSON mode."
         )
+    # derive data for augmentation
+    existing_data: pd.DataFrame | None = None
+    if name in data:
+        existing_data = data[name]
+        sample_size = len(existing_data)
     # derive context data (if first foreign key is present) and harmonize sample size accordingly
     context_data: pd.DataFrame | None = None
     if foreign_keys and foreign_keys[0].referenced_table != name:  # self-dependency is not considered as context
         context_table_name = foreign_keys[0].referenced_table
-        assert generated_data is not None
-        assert context_table_name in generated_data
-        context_data = generated_data[context_table_name]
+        assert context_table_name in data
+        context_data = data[context_table_name]
         batch_size = 1  # generate one sequence at a time
         sample_size = len(context_data)
     # derive non-context data (if more than one foreign key is present)
     non_context_data: dict[str, pd.DataFrame] = {}
     if foreign_keys and len(foreign_keys) > 1:
-        assert generated_data is not None
         assert non_context_size is not None
         for fk in foreign_keys[1:]:
             if fk.referenced_table == name:  # self-dependency is not considered as non-context
                 continue
             non_context_table_name = fk.referenced_table
-            assert non_context_table_name in generated_data
-            non_context_data[non_context_table_name] = generated_data[non_context_table_name]
+            assert non_context_table_name in data
+            non_context_data[non_context_table_name] = data[non_context_table_name]
     litellm_kwargs = {
         "response_format": create_table_response_format(columns=columns),
@@ -424,14 +461,33 @@ def _create_table_rows_generator(
         "stream": True,
     }
+    batch_idx = 0
     yielded_sequences = 0
     previous_rows = deque(maxlen=previous_rows_size)
     for context_batch in batch_infinitely(context_data):
-        non_context_batch = (
-            {table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()}
-            if non_context_data
-            else None
-        )
+        # pick existing rows for current batch
+        existing_batch: pd.DataFrame | None = None
+        if existing_data is not None:
+            if context_batch is None:
+                # progressively pick portions of existing data in case of root tables
+                assert batch_size is not None
+                existing_batch = existing_data.iloc[batch_idx * batch_size : (batch_idx + 1) * batch_size]
+            else:
+                # pick existing rows that match current context batch
+                assert foreign_keys is not None
+                context_table_name, foreign_key = foreign_keys[0].referenced_table, foreign_keys[0].column
+                context_primary_key = primary_keys[context_table_name]
+                existing_batch = existing_data[existing_data[foreign_key].isin(context_batch[context_primary_key])]
+            if existing_batch.empty:
+                existing_batch = None
+        # sample candidate rows from non-context tables for current batch
+        non_context_batch: dict[str, pd.DataFrame] | None = None
+        if non_context_data:
+            non_context_batch = {
+                table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()
+            }
         llm_prompt = _create_table_prompt(
             name=name,
             prompt=prompt,
@@ -439,6 +495,7 @@ def _create_table_rows_generator(
             primary_keys=primary_keys,
             batch_size=batch_size,
             foreign_keys=foreign_keys,
+            existing_data=existing_batch,
             context_data=context_batch,
             non_context_data=non_context_batch,
             previous_rows=list(previous_rows),
@@ -466,6 +523,8 @@ def _create_table_rows_generator(
             if yielded_sequences >= sample_size:
                 return  # move to next table
+        batch_idx += 1
 def _convert_table_rows_generator_to_df(
     table_rows_generator: Generator[dict],
@@ -559,6 +618,7 @@ def sample(
     *,
     tables: dict[str, dict],
     sample_size: int | dict[str, int] = 10,
+    existing_data: dict[str, pd.DataFrame] | None = None,
     model: str = "openai/gpt-4.1-nano",
     api_key: str | None = None,
     temperature: float = 1.0,
@@ -574,12 +634,14 @@ def sample(
             If a single integer is provided, the same number of rows will be generated for each subject table.
             If a dictionary is provided, the number of rows to generate for each subject table can be specified
             individually.
-            Default is 10.
+            Default is 10. Ignored if existing_data is provided.
+        existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
+            Default is None.
         model (str): The LiteLLM chat completion model to be used. Requires support for structured output / JSON mode.
             Examples include:
-            - `openai/gpt-4.1-nano` (default)
-            - `openai/gpt-4.1-mini`
-            - `openai/gpt-4.1`
+            - `openai/gpt-4.1-nano` (default; fastest)
+            - `openai/gpt-4.1-mini` (slower, but smarter)
+            - `openai/gpt-4.1` (slowest, but smartest)
             - `gemini/gemini-2.0-flash`
             - `gemini/gemini-2.5-flash-preview-04-17`
             - `groq/llama-3.3-70b-versatile`
@@ -628,7 +690,7 @@ def sample(
                 "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
                 "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
             },
-            "primary_key": "customer_id",
+            "primary_key": "customer_id",  # single string; no composite keys allowed
         },
         "warehouses": {
             "prompt": "Warehouses of a hardware store",
@@ -683,17 +745,73 @@ def sample(
     df_orders = data["orders"]
     df_items = data["items"]
     ```
+    Example of data augmentation:
+    ```python
+    from mostlyai import mock
+    import pandas as pd
+    tables = {
+        "customers": {
+            "prompt": "Customers of a hardware store",
+            "columns": {
+                "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
+                "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
+                "email": {"prompt": "email address of the customer", "dtype": "string"},
+                "phone": {"prompt": "phone number of the customer", "dtype": "string"},
+                "loyalty_level": {"dtype": "category", "values": ["bronze", "silver", "gold", "platinum"]},
+            },
+            "primary_key": "customer_id",
+        },
+        "orders": {
+            "prompt": "Orders of a Customer",
+            "columns": {
+                "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
+                "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+                "order_date": {"prompt": "the date when the order was placed", "dtype": "date"},
+                "total_amount": {"prompt": "order amount in USD", "dtype": "float"},
+                "status": {"dtype": "category", "values": ["pending", "shipped", "delivered", "cancelled"]},
+            },
+            "primary_key": "order_id",
+            "foreign_keys": [
+                {
+                    "column": "customer_id",
+                    "referenced_table": "customers",
+                    "prompt": "each customer has anywhere between 1 and 3 orders",
+                },
+            ],
+        },
+    }
+    existing_customers = pd.DataFrame({
+        "customer_id": [101, 102, 103],
+        "name": ["John Davis", "Maria Garcia", "Wei Chen"],
+    })
+    existing_orders = pd.DataFrame({
+        "order_id": ["ORD-001", "ORD-002"],
+        "customer_id": [101, 101],
+    })
+    data = mock.sample(
+        tables=tables,
+        existing_data={
+            "customers": existing_customers,
+            "orders": existing_orders,
+        },
+        model="openai/gpt-4.1-nano"
+    )
+    df_customers = data["customers"]
+    df_orders = data["orders"]
+    ```
     """
     config = MockConfig(tables)
     llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
-    sample_size = _harmonize_sample_size(sample_size, config)
+    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
     primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
     execution_plan: list[str] = _build_execution_plan(config)
-    data: dict[str, pd.DataFrame] = {}
+    data: dict[str, pd.DataFrame] = existing_data or {}
     for table_name in execution_plan:
         table_config = config.root[table_name]
@@ -703,7 +821,7 @@ def sample(
             columns=table_config.columns,
             foreign_keys=table_config.foreign_keys,
             primary_keys=primary_keys,
-            generated_data=data,
+            data=data,
             sample_size=sample_size[table_name],
             batch_size=30,  # generate 30 root table rows at a time
             previous_rows_size=10,  # present 10 previously generated rows to the LLM

{mostlyai_mock-0.1.1.dist-info → mostlyai_mock-0.1.3.dist-info}/METADATA RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.1
-Summary: Synthetic Mock Data
+Version: 0.1.3
+Summary: LLM-generated Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
@@ -32,7 +32,7 @@ Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Description-Content-Type: text/markdown
-# Synthetic Mock Data 🔮
+# LLM-generated Mock Data 🔮
 [![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock)
@@ -66,7 +66,7 @@ os.environ["OPENAI_API_KEY"] = "your-api-key"
 Note: You will need to obtain your API key directly from the LLM service provider (e.g. for Open AI from [here](https://platform.openai.com/api-keys)). The LLM endpoint will be determined by the chosen `model` when making calls to `mock.sample`.
-3. Create your first basic synthetic table from scratch
+3. Create your first basic mock table from scratch
 ```python
 from mostlyai import mock
@@ -88,7 +88,7 @@ tables = {
     }
 }
 df = mock.sample(
-    tables=tables,  # provide table and column definitions
+    tables=tables,   # provide table and column definitions
     sample_size=10,  # generate 10 records
     model="openai/gpt-4.1-nano",  # select the LLM model (optional)
 )
@@ -106,7 +106,7 @@ print(df)
 # 9          FR    Louis Martin    male   44    1980-12-05 2025-01-07 10:40:00   False            270.0          103
 ```
-4. Create your first multi-table synthetic dataset
+4. Create your first multi-table mock dataset
 ```python
 from mostlyai import mock
@@ -168,7 +168,7 @@ tables = {
     },
 }
 data = mock.sample(
-    tables=tables,
+    tables=tables,
     sample_size=2,
     model="openai/gpt-4.1"
 )
@@ -201,7 +201,7 @@ print(data["items"])
 # 9  ITM-84312  ORD-11385                   Standard Delivery Service    48.5
 ```
-6. Create your first self-referencing synthetic table
+6. Create your first self-referencing mock table
 ```python
 from mostlyai import mock
@@ -240,6 +240,44 @@ print(df)
 # 9           10    Felix Bennett        3    Senior Systems Analyst
 ```
+7. Enrich existing data with additional columns
+```python
+from mostlyai import mock
+import pandas as pd
+tables = {
+    "guests": {
+        "prompt": "Guests of an Alpine ski hotel in Austria",
+        "columns": {
+            "guest_id": {"prompt": "the unique id of the guest", "dtype": "integer"},
+            "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
+            "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
+            "gender": {"dtype": "category", "values": ["male", "female"]},
+            "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
+            "room_number": {"prompt": "room number", "dtype": "integer"},
+            "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
+        },
+        "primary_key": "guest_id",
+    }
+}
+existing_guests = pd.DataFrame({
+    "guest_id": [1, 2, 3],
+    "name": ["Anna Schmidt", "Marco Rossi", "Sophie Dupont"],
+    "nationality": ["DE", "IT", "FR"],
+})
+df = mock.sample(
+    tables=tables,
+    existing_data={"guests": existing_guests},
+    model="openai/gpt-4.1-nano"
+)
+print(df)
+#    guest_id           name nationality  gender  age  room_number is_vip
+# 0         1   Anna Schmidt          DE  female   29          101   True
+# 1         2    Marco Rossi          IT    male   34          102  False
+# 2         3  Sophie Dupont          FR  female   27          103  False
+```
 ## MCP Server
 This repo comes with MCP Server. It can be easily consumed by any MCP Client by providing the following configuration:

mostlyai_mock-0.1.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=38sp2aKJVtPa3koRxanlBS6fe_ccVQvIieILlKb-xuw,714
+mostlyai/mock/core.py,sha256=lO5OzuOz7bvjaLHpfiN-wyjFBPD0oSHSqEA4v8q436Y,35318
+mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
+mostlyai_mock-0.1.3.dist-info/METADATA,sha256=rkHeGDlNUM2cqSxWY_R47FWXsOLktpdl_COja8zYz28,14161
+mostlyai_mock-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.3.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.3.dist-info/RECORD,,

mostlyai_mock-0.1.1.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=rwv3TboU77Sn6Py635JgvQu64d_R2s1Nc0dIDDbHAZA,714
-mostlyai/mock/core.py,sha256=MEDVp_woSXlD0JanS3ocxWBa_XilpaWzPhsvNzTZuX0,30138
-mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
-mostlyai_mock-0.1.1.dist-info/METADATA,sha256=tY5BvODgzoiqox8yS8ISfxWtVB1wbch1KNW8CikRImc,12713
-mostlyai_mock-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.1.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.1.dist-info/RECORD,,

{mostlyai_mock-0.1.1.dist-info → mostlyai_mock-0.1.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{mostlyai_mock-0.1.1.dist-info → mostlyai_mock-0.1.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{mostlyai_mock-0.1.1.dist-info → mostlyai_mock-0.1.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

mostlyai-mock 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

mostlyai-mock 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl