mostlyai-mock 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the content changes between publicly released versions of this package as published to a supported registry. The information is provided for informational purposes only and reflects the differences between the package versions as they appear in the public registry.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.1.4" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.1.5" # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -18,7 +18,7 @@ import json
18
18
  from collections import deque
19
19
  from collections.abc import Generator
20
20
  from enum import Enum
21
- from typing import Any, Literal, Type
21
+ from typing import Any, Literal
22
22
 
23
23
  import litellm
24
24
  import pandas as pd
@@ -28,7 +28,7 @@ from tqdm import tqdm
28
28
  litellm.suppress_debug_info = True
29
29
 
30
30
  SYSTEM_PROMPT = """
31
- You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
31
+ You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
32
32
 
33
33
  Your task is to:
34
34
 
@@ -58,7 +58,7 @@ class LLMConfig(BaseModel):
58
58
 
59
59
 
60
60
  class MockConfig(RootModel[dict[str, "TableConfig"]]):
61
- root: dict[str, TableConfig] = Field(..., min_items=1)
61
+ root: dict[str, TableConfig] = Field(..., min_length=1)
62
62
 
63
63
  @field_validator("root")
64
64
  @classmethod
@@ -127,7 +127,7 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
127
127
 
128
128
  class TableConfig(BaseModel):
129
129
  prompt: str = ""
130
- columns: dict[str, ColumnConfig] = Field(..., min_items=1)
130
+ columns: dict[str, ColumnConfig] = Field(..., min_length=1)
131
131
  primary_key: str | None = None
132
132
  foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
133
133
 
@@ -261,7 +261,7 @@ def _create_table_prompt(
261
261
 
262
262
  # add existing data to augment
263
263
  if existing_data is not None:
264
- prompt += f"\n## Existing Data to Augment:\n\n"
264
+ prompt += "\n## Existing Data to Augment:\n\n"
265
265
  prompt += f"{existing_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
266
266
 
267
267
  # define foreign keys
@@ -314,11 +314,11 @@ def _create_table_prompt(
314
314
 
315
315
  if foreign_keys:
316
316
  prompt += (
317
- f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
318
- f"The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
319
- f"If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
320
- f"In this case, ensure that the foreign keys are consistent with primary keys of the table. "
321
- f"Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
317
+ "The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
318
+ "The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
319
+ "If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
320
+ "In this case, ensure that the foreign keys are consistent with primary keys of the table. "
321
+ "Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
322
322
  )
323
323
 
324
324
  if existing_data is not None:
@@ -358,7 +358,7 @@ def _create_table_rows_generator(
358
358
  llm_config: LLMConfig,
359
359
  ) -> Generator[dict]:
360
360
  def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
361
- def create_annotation(column_config: ColumnConfig) -> Type:
361
+ def create_annotation(column_config: ColumnConfig) -> type:
362
362
  if column_config.values or column_config.dtype is DType.CATEGORY:
363
363
  return Literal[tuple(column_config.values)]
364
364
  return {
@@ -488,6 +488,12 @@ def _create_table_rows_generator(
488
488
  table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()
489
489
  }
490
490
 
491
+ if context_batch is None:
492
+ # for root tables, scale down batch size in order to prevent excessive generations
493
+ remaining_rows = sample_size - yielded_sequences
494
+ if batch_size >= remaining_rows:
495
+ batch_size = remaining_rows + 2 # +2 because LLM may not always count the rows correctly
496
+
491
497
  llm_prompt = _create_table_prompt(
492
498
  name=name,
493
499
  prompt=prompt,
@@ -553,6 +559,36 @@ def _convert_table_rows_generator_to_df(
553
559
  return df
554
560
 
555
561
 
562
+ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataFrame] | None) -> dict[str, dict]:
563
+ def _infer_dtype(series: pd.Series) -> DType:
564
+ if pd.api.types.is_integer_dtype(series):
565
+ return DType.INTEGER
566
+ elif pd.api.types.is_float_dtype(series):
567
+ return DType.FLOAT
568
+ elif pd.api.types.is_datetime64_dtype(series):
569
+ return DType.DATETIME
570
+ elif pd.api.types.is_bool_dtype(series):
571
+ return DType.BOOLEAN
572
+ else:
573
+ return DType.STRING
574
+
575
+ if existing_data is None:
576
+ return tables
577
+
578
+ tables = tables.copy()
579
+ for table_name, existing_table in existing_data.items():
580
+ table_config = tables.setdefault(table_name, {})
581
+ column_configs = table_config.setdefault("columns", {})
582
+ existing_column_configs = {
583
+ existing_column: {"dtype": _infer_dtype(existing_table[existing_column])}
584
+ for existing_column in existing_table.columns
585
+ if existing_column not in column_configs
586
+ }
587
+ column_configs = {**existing_column_configs, **column_configs}
588
+ table_config["columns"] = column_configs
589
+ return tables
590
+
591
+
556
592
  def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig) -> dict[str, int]:
557
593
  if isinstance(sample_size, int):
558
594
  return {table_name: sample_size for table_name in config.root}
@@ -756,8 +792,6 @@ def sample(
756
792
  "patients": {
757
793
  "prompt": "Patients of a hospital in Finland",
758
794
  "columns": {
759
- "age": {},
760
- "gender": {},
761
795
  "full_name": {"prompt": "first name and last name of the patient", "dtype": "string"},
762
796
  "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
763
797
  "place_of_birth": {"prompt": "place of birth", "dtype": "string"},
@@ -769,7 +803,7 @@ def sample(
769
803
  "gender": ["male", "male", "female", "female"],
770
804
  })
771
805
  enriched_df = mock.sample(
772
- tables=tables,
806
+ tables=tables,
773
807
  existing_data={"patients": existing_df},
774
808
  model="openai/gpt-4.1-nano"
775
809
  )
@@ -833,7 +867,9 @@ def sample(
833
867
  ```
834
868
  """
835
869
 
870
+ tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
836
871
  config = MockConfig(tables)
872
+
837
873
  llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
838
874
 
839
875
  sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
@@ -1,3 +1,17 @@
1
+ # Copyright 2025 MOSTLY AI
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
1
15
  import os
2
16
  import tempfile
3
17
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: LLM-generated Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -169,7 +169,7 @@ tables = {
169
169
  }
170
170
  data = mock.sample(
171
171
  tables=tables,
172
- sample_size=2,
172
+ sample_size=2,
173
173
  model="openai/gpt-4.1"
174
174
  )
175
175
  print(data["customers"])
@@ -250,9 +250,6 @@ tables = {
250
250
  "guests": {
251
251
  "prompt": "Guests of an Alpine ski hotel in Austria",
252
252
  "columns": {
253
- "guest_id": {"prompt": "the unique id of the guest", "dtype": "integer"},
254
- "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
255
- "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
256
253
  "gender": {"dtype": "category", "values": ["male", "female"]},
257
254
  "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
258
255
  "room_number": {"prompt": "room number", "dtype": "integer"},
@@ -0,0 +1,8 @@
1
+ mostlyai/mock/__init__.py,sha256=-bfsVZJQ0OkN5b3IRP3F9aUCiA8Eq1-RmAqBmTg0O0g,714
2
+ mostlyai/mock/core.py,sha256=V7KG7nOQPU95v6lRoSIfJuYivS0pNZ3rbiNC6SqDZSc,38075
3
+ mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
4
+ mostlyai_mock-0.1.5.dist-info/METADATA,sha256=LfugCsu7ANDZk2ozNFHDxgCqY42etJIdkXcfc-S-cUE,13887
5
+ mostlyai_mock-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ mostlyai_mock-0.1.5.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
7
+ mostlyai_mock-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
+ mostlyai_mock-0.1.5.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- mostlyai/mock/__init__.py,sha256=EvV_Tp6ExzQPq4apGq_8F25qw_paNTcQEC94nIVOEog,714
2
- mostlyai/mock/core.py,sha256=ubarMA3VUlXdjUsCXQK_mD_kWPkTMOYvLz9G4OughGk,36532
3
- mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
4
- mostlyai_mock-0.1.4.dist-info/METADATA,sha256=jibPe0pKcwqyPBoyc7H98LPd72vkGZBStdw_yMNVvJI,14161
5
- mostlyai_mock-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
- mostlyai_mock-0.1.4.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
7
- mostlyai_mock-0.1.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
- mostlyai_mock-0.1.4.dist-info/RECORD,,