PyPI - mostlyai-mock - Versions diffs - 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl - Mend

mostlyai-mock 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

mostlyai/mock/__init__.py CHANGED Viewed

@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 __all__ = ["sample"]
-__version__ = "0.1.16"  # Do not set this manually. Use poetry version [params].
+__version__ = "0.1.18"  # Do not set this manually. Use poetry version [params].

mostlyai/mock/core.py CHANGED Viewed

@@ -124,14 +124,14 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
         return self
     @model_validator(mode="after")
-    def ensure_primary_key_is_string_dtype(self) -> MockConfig:
+    def ensure_primary_key_is_string_or_integer_dtype(self) -> MockConfig:
         for table_name, table_config in self.root.items():
             if table_config.primary_key:
                 column_config = table_config.columns[table_config.primary_key]
-                if column_config.dtype not in [DType.STRING]:
+                if column_config.dtype not in [DType.STRING, DType.INTEGER]:
                     raise ValueError(
                         f"Primary key column '{table_config.primary_key}' in table '{table_name}' must be one of the following types:"
-                        f" {[DType.STRING.value]}"
+                        f" {[DType.STRING.value, DType.INTEGER.value]}"
                     )
         return self
@@ -248,6 +248,7 @@ async def _sample_table(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    config: MockConfig,
     progress_callback: Callable | None = None,
 ) -> pd.DataFrame:
     table_rows_generator = _create_table_rows_generator(
@@ -265,7 +266,13 @@ async def _sample_table(
         progress_callback=progress_callback,
     )
     table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
-    table_df = await _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
+    table_df = await _convert_table_rows_generator_to_df(
+        table_rows_generator=table_rows_generator,
+        columns=columns,
+        primary_key=primary_keys.get(name),
+        foreign_keys=foreign_keys,
+        config=config,
+    )
     return table_df
@@ -326,6 +333,15 @@ def _create_table_prompt(
         column_specifications = {
             column: spec for column, spec in column_specifications.items() if column not in existing_data.columns
         }
+    # ensure primary keys stay as string in the prompt, even if dtype is integer
+    if target_primary_key and target_primary_key in column_specifications:
+        if columns[target_primary_key].dtype == DType.INTEGER:
+            column_specifications[target_primary_key]["dtype"] = DType.STRING.value
+    # ensure foreign keys referencing integer primary keys also stay as string in the prompt
+    for fk in foreign_keys:
+        if fk.column in column_specifications:
+            if columns[fk.column].dtype == DType.INTEGER:
+                column_specifications[fk.column]["dtype"] = DType.STRING.value
     prompt += f"{json.dumps(column_specifications, indent=2)}\n\n"
     # add previous rows as context to help the LLM generate consistent data
@@ -565,11 +581,17 @@ async def _yield_rows_from_csv_chunks_stream(response: litellm.CustomStreamWrapp
 def _create_structured_output_schema(
-    columns: dict[str, ColumnConfig], existing_data: pd.DataFrame | None
+    columns: dict[str, ColumnConfig],
+    existing_data: pd.DataFrame | None,
+    primary_key: str | None,
+    foreign_keys: list[ForeignKeyConfig],
 ) -> type[BaseModel]:
-    def create_annotation(column_config: ColumnConfig) -> type:
+    def create_annotation(column_config: ColumnConfig, is_int_pk_or_fk: bool = False) -> type:
         if column_config.values or column_config.dtype is DType.CATEGORY:
             return Literal[tuple(column_config.values)]  # type: ignore
+        # ensure integer primary keys and foreign keys are treated as strings
+        if is_int_pk_or_fk:
+            return str | None
         return {
             DType.INTEGER: int | None,
             DType.FLOAT: float | None,
@@ -585,7 +607,9 @@ def _create_structured_output_schema(
     for column_name, column_config in columns.items():
         if existing_data is not None and column_name in existing_data.columns:
             continue  # skip columns that already exist in existing data
-        annotation = create_annotation(column_config)
+        is_int_pk = primary_key and column_name == primary_key and column_config.dtype == DType.INTEGER
+        is_int_fk = any(fk.column == column_name for fk in foreign_keys) and column_config.dtype == DType.INTEGER
+        annotation = create_annotation(column_config, is_int_pk or is_int_fk)
         fields[column_name] = (annotation, Field(...))
     TableRow = create_model("TableRow", **fields)
     TableRows = create_model("TableRows", rows=(list[TableRow], ...))
@@ -632,8 +656,9 @@ async def _worker(
             # construct schema for Structured Outputs (applies to JSON LLMOutputFormat only)
             structured_output_schema = None
             if llm_output_format == LLMOutputFormat.JSON:
+                pk_col = primary_keys.get(name)
                 structured_output_schema = _create_structured_output_schema(
-                    columns=columns, existing_data=existing_batch
+                    columns=columns, existing_data=existing_batch, primary_key=pk_col, foreign_keys=foreign_keys
                 )
             # construct litellm kwargs
@@ -974,14 +999,47 @@ def _align_series_dtypes_with_column_config(series: pd.Series, column_config: Co
     return series
+def _get_integer_pk_fk_columns(
+    columns: dict[str, ColumnConfig],
+    primary_key: str | None,
+    foreign_keys: list[ForeignKeyConfig],
+    config: MockConfig,
+) -> set[str]:
+    """determine which columns should be kept as strings (integer PKs and FKs that reference integer PKs)"""
+    skip_conversion = set()
+    # integer primary keys
+    if primary_key and primary_key in columns and columns[primary_key].dtype == DType.INTEGER:
+        skip_conversion.add(primary_key)
+    # foreign keys that reference integer primary keys
+    # note: FK dtype is guaranteed to match referenced PK dtype by config validation
+    for fk in foreign_keys:
+        if fk.column in columns and columns[fk.column].dtype == DType.INTEGER:
+            skip_conversion.add(fk.column)
+    return skip_conversion
 async def _convert_table_rows_generator_to_df(
     table_rows_generator: AsyncGenerator[dict],
     columns: dict[str, ColumnConfig],
+    primary_key: str | None = None,
+    foreign_keys: list[ForeignKeyConfig] | None = None,
+    config: MockConfig | None = None,
 ) -> pd.DataFrame:
     def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
         df = df.copy()
+        skip_int_conversion = (
+            _get_integer_pk_fk_columns(columns, primary_key, foreign_keys or [], config) if config else set()
+        )
         for column_name, column_config in columns.items():
-            df[column_name] = _align_series_dtypes_with_column_config(df[column_name], column_config)
+            # keep integer PKs and FKs as strings for now (post-processing will convert them)
+            if column_name in skip_int_conversion:
+                df[column_name] = df[column_name].astype("string[pyarrow]")
+            else:
+                df[column_name] = _align_series_dtypes_with_column_config(df[column_name], column_config)
         return df
     # consume entire generator
@@ -1025,11 +1083,6 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
         }
         column_configs = {**existing_column_configs, **column_configs}
-        # primary keys are always strings
-        primary_key = table_config.get("primary_key", None)
-        if primary_key is not None:
-            column_configs[primary_key]["dtype"] = DType.STRING
         table_config["columns"] = column_configs
     return tables
@@ -1129,6 +1182,45 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
     return execution_plan
+def _postprocess_table(
+    table_name: str,
+    df: pd.DataFrame,
+    table_config: TableConfig,
+    config: MockConfig,
+    pk_mappings: dict[str, dict[str, int]],
+) -> pd.DataFrame:
+    """convert integer PKs and FKs from strings to auto-incremented integers"""
+    df = df.copy()
+    # convert integer primary keys to 1, 2, 3, ... and build mapping
+    pk_col = table_config.primary_key
+    if pk_col and table_config.columns[pk_col].dtype == DType.INTEGER:
+        old_values = df[pk_col].tolist()
+        new_values = list(range(1, len(df) + 1))
+        # build mapping: old LLM-generated string values -> new auto-incremented integers
+        pk_mappings[table_name] = {str(old): new for old, new in zip(old_values, new_values)}
+        df[pk_col] = new_values
+    # convert foreign keys that reference integer primary keys
+    # note: FK dtype is guaranteed to match referenced PK dtype by config validation
+    for fk in table_config.foreign_keys:
+        # skip if not an integer FK (which means it doesn't reference an integer PK)
+        if table_config.columns[fk.column].dtype != DType.INTEGER:
+            continue
+        if fk.referenced_table not in pk_mappings:
+            continue
+        # map FK values from strings to integers
+        mapping = pk_mappings[fk.referenced_table]
+        df[fk.column] = (
+            df[fk.column].apply(lambda val: mapping.get(str(val)) if pd.notna(val) else None).astype("int64[pyarrow]")
+        )
+    return df
 async def _sample_common(
     *,
     tables: dict[str, dict],
@@ -1156,6 +1248,10 @@ async def _sample_common(
     data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+    # track mappings from old string PK values to new integer PK values
+    pk_mappings: dict[str, dict[str, int]] = {}
+    # first, generate all tables (without postprocessing)
     for table_name in execution_plan:
         table_config = config.root[table_name]
         df = await _sample_table(
@@ -1170,10 +1266,16 @@ async def _sample_common(
             non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
             n_workers=n_workers,
             llm_config=llm_config,
+            config=config,
             progress_callback=progress_callback,
         )
         data[table_name] = df
+    # then, postprocess all tables (convert integer PKs/FKs from strings to integers)
+    for table_name in execution_plan:
+        table_config = config.root[table_name]
+        data[table_name] = _postprocess_table(table_name, data[table_name], table_config, config, pk_mappings)
     return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
@@ -1266,7 +1368,7 @@ def sample(
                 "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
                 "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
             },
-            "primary_key": "customer_id",  # single string; no composite keys allowed; primary keys must have string dtype
+            "primary_key": "customer_id",  # no composite keys allowed;
         },
         "warehouses": {
             "prompt": "Warehouses of a hardware store",

{mostlyai_mock-0.1.16.dist-info → mostlyai_mock-0.1.18.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.16
+Version: 0.1.18
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -208,7 +208,7 @@ print(data["items"])
 # 9  B4-200510  B1-3010022         Bottled Spring Water (24 Pack)   34.95
 ```
-6. Create your first self-referencing mock table
+5. Create your first self-referencing mock table with auto-increment integer primary keys
 ```python
 from mostlyai import mock
@@ -217,9 +217,9 @@ tables = {
     "employees": {
         "prompt": "Employees of a company",
         "columns": {
-            "employee_id": {"prompt": "the unique id of the employee; sequential", "dtype": "string"},
-            "name": {"prompt": "first name and last name of the president", "dtype": "string"},
-            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "string"},
+            "employee_id": {"dtype": "integer"},
+            "name": {"prompt": "first name and last name of the employee", "dtype": "string"},
+            "boss_id": {"dtype": "integer"},
             "role": {"prompt": "the role of the employee", "dtype": "string"},
         },
         "primary_key": "employee_id",
@@ -234,20 +234,20 @@ tables = {
 }
 df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5", n_workers=1)
 print(df)
-#   employee_id              name boss_id                   role
-# 0        B0-1      Patricia Lee    <NA>              President
-# 1        B0-2  Edward Rodriguez    B0-1       VP of Operations
-# 2        B0-3      Maria Cortez    B0-1          VP of Finance
-# 3        B0-4     Thomas Nguyen    B0-1       VP of Technology
-# 4        B0-5        Rachel Kim    B0-2     Operations Manager
-# 5        B0-6     Jeffrey Patel    B0-2      Supply Chain Lead
-# 6        B0-7      Olivia Smith    B0-2  Facilities Supervisor
-# 7        B0-8      Brian Carter    B0-3     Accounting Manager
-# 8        B0-9   Lauren Anderson    B0-3      Financial Analyst
-# 9       B0-10   Santiago Romero    B0-3     Payroll Specialist
+#   employee_id              name  boss_id                   role
+# 0            1      Patricia Lee     <NA>              President
+# 1            2  Edward Rodriguez        1       VP of Operations
+# 2            3      Maria Cortez        1          VP of Finance
+# 3            4     Thomas Nguyen        1       VP of Technology
+# 4            5        Rachel Kim        2     Operations Manager
+# 5            6     Jeffrey Patel        2      Supply Chain Lead
+# 6            7      Olivia Smith        2  Facilities Supervisor
+# 7            8      Brian Carter        3     Accounting Manager
+# 8            9   Lauren Anderson        3      Financial Analyst
+# 9           10   Santiago Romero        3     Payroll Specialist
 ```
-7. Enrich existing data with additional columns
+6. Enrich existing data with additional columns
 ```python
 from mostlyai import mock

mostlyai_mock-0.1.18.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=UKmnKlQ7fZVvB0ckh9_nXjojAE0JGa2Kd2mT0Ci8cDU,715
+mostlyai/mock/core.py,sha256=oGSpIXINL7R1X7ZN5dtdwItaPXDD0mGvkakA0CEzmwI,66880
+mostlyai/mock/mcp_server.py,sha256=uDLg0SeMPV2VZhXviM-F769W0xlmhGwlmQiQhY0Q-Ik,2365
+mostlyai_mock-0.1.18.dist-info/METADATA,sha256=EmLjpo-D-wJefswHIMk3TCK9TvzLML-3Sjo0OEi9qAI,14257
+mostlyai_mock-0.1.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.18.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.18.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.18.dist-info/RECORD,,

mostlyai_mock-0.1.16.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=XEezyGjkXQBReW_ORi83H2WEVhLolDDLbGjxA2g2yEs,715
-mostlyai/mock/core.py,sha256=FTF0BfJowxNHm_L0RpTk6BhS1mXzvjELP-3Z96aFVMQ,62454
-mostlyai/mock/mcp_server.py,sha256=uDLg0SeMPV2VZhXviM-F769W0xlmhGwlmQiQhY0Q-Ik,2365
-mostlyai_mock-0.1.16.dist-info/METADATA,sha256=CT6lcz2cAq5W-u3VjQLr_Dg8VbuEtU-JlvsXg5OsKTk,14297
-mostlyai_mock-0.1.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.16.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.16.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.16.dist-info/RECORD,,

{mostlyai_mock-0.1.16.dist-info → mostlyai_mock-0.1.18.dist-info}/WHEEL RENAMED Viewed

File without changes

{mostlyai_mock-0.1.16.dist-info → mostlyai_mock-0.1.18.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{mostlyai_mock-0.1.16.dist-info → mostlyai_mock-0.1.18.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

mostlyai-mock 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl

mostlyai-mock 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl