mostlyai-mock 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 
 __all__ = ["sample"]
-__version__ = "0.1.13"  # Do not set this manually. Use poetry version [params].
+__version__ = "0.1.15"  # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -19,7 +19,7 @@ import concurrent.futures
 import json
 import math
 from collections import deque
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, Callable
 from enum import Enum
 from io import StringIO
 from typing import Any, Literal
@@ -40,10 +40,10 @@ class LLMOutputFormat(str, Enum):
 
 
 class LLMConfig(BaseModel):
-    model: str = "openai/gpt-4.1-nano"
-    api_key: str | None = None
-    temperature: float = 1.0
-    top_p: float = 0.95
+    model: str
+    api_key: str | None
+    temperature: float
+    top_p: float
 
 
 class MockConfig(RootModel[dict[str, "TableConfig"]]):
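Note: with the defaults removed, `LLMConfig` can no longer be instantiated empty; in Pydantic v2 an un-defaulted field (even `str | None`) is required, so callers must now pass every field explicitly, as the new `_sample_common` does further below. A minimal sketch of the new contract (the constructor values here are illustrative, taken from the defaults that moved to the public entry points):

```python
from pydantic import BaseModel

class LLMConfig(BaseModel):
    model: str
    api_key: str | None
    temperature: float
    top_p: float

# All four fields are now required at construction time;
# the defaults live on sample()/_asample() instead.
llm_config = LLMConfig(model="openai/gpt-4.1-nano", api_key=None, temperature=1.0, top_p=0.95)
```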
@@ -248,6 +248,7 @@ async def _sample_table(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> pd.DataFrame:
     table_rows_generator = _create_table_rows_generator(
         name=name,
@@ -261,19 +262,15 @@ async def _sample_table(
         non_context_size=non_context_size,
         n_workers=n_workers,
         llm_config=llm_config,
+        progress_callback=progress_callback,
     )
     table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
     table_df = await _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
     return table_df
 
 
-def _sample_table_sync(*args, **kwargs) -> pd.DataFrame:
-    return asyncio.run(_sample_table(*args, **kwargs))
-
-
 def _create_system_prompt(llm_output_format: LLMOutputFormat) -> str:
-    return f"""
-You are a specialized data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
+    return f"""You are a specialized data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
 
 Your task is to:
 
@@ -291,8 +288,7 @@ appropriate content. For dates and timestamps, ensure logical chronology. Always
 across tables.
 
 When enriching existing data, carefully analyze the patterns and relationships in the existing columns \
-to generate compatible and realistic values for the missing columns.
-"""
+to generate compatible and realistic values for the missing columns."""
 
 
 def _create_table_prompt(
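Note: both prompt edits strip whitespace that the old triple-quoted strings leaked into the system prompt sent to the LLM. A minimal illustration (the variable names are ours, not the package's):

```python
old = f"""
You are a specialized data generator."""
new = f"""You are a specialized data generator."""

assert old.startswith("\n")      # old prompt began with a stray newline
assert not new.startswith("\n")  # new prompt starts at the first word
```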
@@ -717,7 +713,7 @@ async def _worker(
             if do_repeat_task:
                 # allow 10 retries across all workers before propagating the exception to the orchestrator
                 await retry_queue.put(1)
-                if retry_queue.qsize() < 10:
+                if retry_queue.qsize() <= 10:
                     # put task back to the front of the batch queue
                     await batch_queue.put((batch_idx, task))
                 else:
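Note: this is an off-by-one fix. Each retry first enqueues a token, so with `<` the exception propagated as soon as the 10th token arrived, i.e. after only 9 requeues; `<=` makes the behavior match the "10 retries" comment. A standalone sketch of the counting (the queue and names are illustrative, not the package's):

```python
import asyncio

async def main() -> None:
    retry_queue: asyncio.Queue[int] = asyncio.Queue()
    retries = 0
    while True:
        await retry_queue.put(1)       # record one retry attempt
        if retry_queue.qsize() <= 10:  # old code used `<`, stopping after 9
            retries += 1               # the task would be re-enqueued here
        else:
            break                      # budget exhausted: propagate the error
    print(retries)  # 10 with `<=`; the old `<` allowed only 9

asyncio.run(main())
```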
@@ -766,6 +762,7 @@ async def _create_table_rows_generator(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> AsyncGenerator[dict]:
     batch_size = 20  # generate 20 root table rows at a time
 
@@ -807,6 +804,13 @@
             assert non_context_table_name in data
             non_context_data[non_context_table_name] = data[non_context_table_name]
 
+    # calculate ideal batch size that spreads the workload evenly across workers
+    ideal_batch_size = max(math.ceil(sample_size / n_workers), 5)
+    if ideal_batch_size < batch_size:
+        # never increase batch_size beyond initial value
+        # this is especially important for sequential tables, where batch_size is currently assumed to be 1 everywhere
+        batch_size = ideal_batch_size
+
     # calculate batch_sizes
     assert sample_size is not None, "sample_size should have been filled by this point"
     n_total_batches = len(context_batches) if context_batches is not None else math.ceil(sample_size / batch_size)
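Note: the sizing logic is equivalent to `min(max(ceil(sample_size / n_workers), 5), batch_size)`. A worked example under the values visible in this diff (`batch_size = 20`, `n_workers` capped at 10):

```python
import math

batch_size, n_workers = 20, 10
for sample_size in (8, 37, 400):
    ideal = max(math.ceil(sample_size / n_workers), 5)
    effective = min(ideal, batch_size)  # never increase beyond the initial 20
    print(sample_size, effective)
# 8   -> 5   (ceil(8/10) = 1, raised to the floor of 5)
# 37  -> 5   (ceil(37/10) = 4, raised to the floor of 5)
# 400 -> 20  (ceil(400/10) = 40, capped at the initial 20)
```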
@@ -911,6 +915,12 @@
                 if n_yielded_sequences >= sample_size:
                     break
             n_completed_batches += 1
+            if progress_callback:
+                await progress_callback(
+                    progress=n_completed_batches,
+                    total=n_total_batches,
+                    message=f"Generating rows for table `{name}`: {n_completed_batches}/{n_total_batches}",
+                )
             result_queue.task_done()
 
     # gracefully shutdown workers
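Note: the callback is awaited with keyword arguments only, so any async callable accepting `progress`, `total`, and `message` keywords is compatible. A minimal sketch of such a callback (the name is ours, not part of the package):

```python
async def log_progress(*, progress: int, total: int, message: str | None = None) -> None:
    # matches the keyword-only call site above
    print(f"[{progress}/{total}] {message or ''}")
```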
@@ -1101,6 +1111,54 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
     return execution_plan
 
 
+async def _sample_common(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+):
+    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
+    config = MockConfig(tables)
+
+    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
+
+    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
+    primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
+
+    n_workers = max(min(n_workers, 10), 1)
+
+    execution_plan: list[str] = _build_execution_plan(config)
+
+    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+
+    for table_name in execution_plan:
+        table_config = config.root[table_name]
+        df = await _sample_table(
+            name=table_name,
+            prompt=table_config.prompt,
+            columns=table_config.columns,
+            foreign_keys=table_config.foreign_keys,
+            primary_keys=primary_keys,
+            data=data,
+            sample_size=sample_size.get(table_name),
+            previous_rows_size=10,  # present 10 previously generated rows to the LLM
+            non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
+            n_workers=n_workers,
+            llm_config=llm_config,
+            progress_callback=progress_callback,
+        )
+        data[table_name] = df
+
+    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
+
+
 def sample(
     *,
     tables: dict[str, dict],
@@ -1121,11 +1179,12 @@ def sample(
     or the enrichment of existing datasets with new, context-aware columns.
 
     It is particularly useful for quickly simulating production-like datasets for testing or prototyping purposes.
-    It is advised to limit mocking to small datasets for performance reasons (rows * cols < 100).
+    It is advised to limit mocking to small datasets for performance reasons (rows * cols < 1000).
     It might take a couple of minutes for bigger datasets.
 
     Args:
         tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
+            Note: Avoid using double quotes (`"`) and other special characters in column names.
         sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
             If a single integer is provided, the same number of rows will be generated for each subject table.
             If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
@@ -1329,42 +1388,51 @@
     ```
     """
 
-    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
-    config = MockConfig(tables)
+    def sample_common_sync(*args, **kwargs) -> pd.DataFrame | dict[str, pd.DataFrame]:
+        return asyncio.run(_sample_common(*args, **kwargs))
 
-    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
-
-    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
-    primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
-
-    n_workers = max(min(n_workers, 10), 1)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(
+            sample_common_sync,
+            tables=tables,
+            sample_size=sample_size,
+            existing_data=existing_data,
+            model=model,
+            api_key=api_key,
+            temperature=temperature,
+            top_p=top_p,
+            n_workers=n_workers,
+            return_type=return_type,
+            progress_callback=None,
+        )
+        return future.result()
 
-    execution_plan: list[str] = _build_execution_plan(config)
 
-    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+async def _asample(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+) -> pd.DataFrame | dict[str, pd.DataFrame]:
+    return await _sample_common(
+        tables=tables,
+        sample_size=sample_size,
+        existing_data=existing_data,
+        model=model,
+        api_key=api_key,
+        temperature=temperature,
+        top_p=top_p,
+        n_workers=n_workers,
+        return_type=return_type,
+        progress_callback=progress_callback,
+    )
 
-    # synchronous `sample` function makes independent calls to asynchronous `_sample_table` function
-    # in order to avoid conflicts with potentially existing event loop (e.g. in Jupyter environment),
-    # a new thread is spawned for each call to `_sample_table`
-    # NOTE: initialize executor only once, doing that inside the loop might lead to deadlocks
-    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-        for table_name in execution_plan:
-            table_config = config.root[table_name]
-            future = executor.submit(
-                _sample_table_sync,
-                name=table_name,
-                prompt=table_config.prompt,
-                columns=table_config.columns,
-                foreign_keys=table_config.foreign_keys,
-                primary_keys=primary_keys,
-                data=data,
-                sample_size=sample_size.get(table_name),
-                previous_rows_size=10,  # present 10 previously generated rows to the LLM
-                non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
-                n_workers=n_workers,
-                llm_config=llm_config,
-            )
-            df = future.result()
-            data[table_name] = df
 
-    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
+_asample.__doc__ = sample.__doc__
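Note: the refactor consolidates the orchestration loop into the async `_sample_common` and keeps two thin entry points: the synchronous `sample` still pushes `asyncio.run` onto a dedicated thread (so it remains usable where an event loop is already running, e.g. Jupyter, and now spawns one thread per call instead of one per table), while `_asample` can be awaited directly. A hedged usage sketch (table spec abbreviated from the README examples; an API key is required to actually run it):

```python
from mostlyai import mock
from mostlyai.mock.core import _asample  # private helper; signature taken from this diff

tables = {
    "guests": {
        "prompt": "Guests of a hotel",
        "columns": {"name": {"prompt": "first and last name", "dtype": "string"}},
    }
}

df = mock.sample(tables=tables, sample_size=4)  # safe inside notebooks: runs on its own thread

async def generate():
    # from async code, await the async variant and optionally pass a progress callback
    return await _asample(tables=tables, sample_size=4)
```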
mostlyai/mock/mcp_server.py CHANGED
@@ -16,16 +16,16 @@ import os
 import tempfile
 
 import pandas as pd
-from fastmcp import FastMCP
+from fastmcp import Context, FastMCP
 
-from mostlyai import mock
+from mostlyai.mock.core import _asample
 
 SAMPLE_MOCK_TOOL_DESCRIPTION = f"""
 Synthetic Mock Data.
 
 Use LLMs to generate any Tabular Data towards your needs. Create from scratch, expand existing datasets, or enrich tables with new columns.
 
-This tool is a proxy to the `mostlyai.mock.sample` function, but returns a dictionary of paths to the generated CSV files.
+This tool is a proxy to the `mostlyai.mock.core._asample` function, but returns a dictionary of paths to the generated CSV files.
 
 Present the result nicely to the user, in Markdown format. Example:
 
@@ -33,8 +33,8 @@ Mock data can be found under the following paths:
 - `/tmp/tmpl41bwa6n/players.csv`
 - `/tmp/tmpl41bwa6n/seasons.csv`
 
-== mostlyai.mock.sample DocString ==
-{mock.sample.__doc__}
+== mostlyai.mock.core._asample docstring ==
+{_asample.__doc__}
 """
 
 mcp = FastMCP(name="MostlyAI Mock MCP Server")
@@ -51,7 +51,8 @@ def _store_locally(data: dict[str, pd.DataFrame]) -> dict[str, str]:
 
 
 @mcp.tool(description=SAMPLE_MOCK_TOOL_DESCRIPTION)
-def mock_data(
+async def mock_data(
+    ctx: Context,
     *,
     tables: dict[str, dict],
     sample_size: int,
@@ -60,7 +61,7 @@ def mock_data(
     temperature: float = 1.0,
     top_p: float = 0.95,
 ) -> dict[str, str]:
-    data = mock.sample(
+    data = await _asample(
         tables=tables,
         sample_size=sample_size,
         model=model,
@@ -68,6 +69,7 @@ def mock_data(
         temperature=temperature,
         top_p=top_p,
         return_type="dict",
+        progress_callback=ctx.report_progress,
     )
     locations = _store_locally(data)
     return locations
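Note: `ctx.report_progress` is FastMCP's progress-reporting coroutine; passing it as `progress_callback` means connected MCP clients receive one progress notification per completed batch, via the keyword-only `progress`/`total`/`message` call shown in the `_create_table_rows_generator` hunk above.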
{mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.15.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.13
+Version: 0.1.15
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -24,13 +24,16 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
-Requires-Dist: fastmcp<3.0.0,>=2.0.0
 Requires-Dist: litellm>=1.67.0
 Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: tenacity>=9.1.2
+Provides-Extra: litellm-proxy
+Requires-Dist: litellm[proxy]>=1.67.0; extra == 'litellm-proxy'
+Provides-Extra: mcp
+Requires-Dist: fastmcp<3.0.0,>=2.0.0; extra == 'mcp'
 Description-Content-Type: text/markdown
 
 # Synthetic Mock Data 🔮
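Note: `fastmcp` is no longer an unconditional dependency; running the MCP server now requires the new `mcp` extra (e.g. `pip install "mostlyai-mock[mcp]"`), and the `litellm-proxy` extra similarly gates litellm's optional proxy support.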
@@ -229,7 +232,7 @@
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1", n_workers=1)
 print(df)
 #   employee_id          name  boss_id       role
 # 0        B0-1  Patricia Lee     <NA>  President
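Note: the self-referencing `boss_id` example now passes `n_workers=1`, presumably so rows are generated strictly in order and each new employee can reference a boss that already exists.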
@@ -285,18 +288,18 @@ This repo comes with MCP Server. It can be easily consumed by any MCP Client by
 
 ```json
 {
   "mcpServers": {
     "mostlyai-mock-mcp": {
       "command": "uvx",
-      "args": ["--from", "mostlyai-mock", "mcp-server"],
+      "args": ["--from", "mostlyai-mock[mcp]", "mcp-server"],
       "env": {
         "OPENAI_API_KEY": "PROVIDE YOUR KEY",
         "GEMINI_API_KEY": "PROVIDE YOUR KEY",
         "GROQ_API_KEY": "PROVIDE YOUR KEY",
         "ANTHROPIC_API_KEY": "PROVIDE YOUR KEY"
       }
     }
   }
 }
 ```
 
@@ -306,5 +309,5 @@ For example:
 
 Troubleshooting:
 1. If the MCP Client fails to detect the MCP Server, provide the absolute path in the `command` field, for example: `/Users/johnsmith/.local/bin/uvx`
-2. To debug MCP Server issues, you can use MCP Inspector by running: `npx @modelcontextprotocol/inspector -- uvx --from mostlyai-mock mcp-server`
-3. In order to develop locally, modify the configuration by replacing `"command": "uv"` (or use the full path to `uv` if needed) and `"args": ["--directory", "/Users/johnsmith/mostlyai-mock", "run", "mcp-server"]`
+2. To debug MCP Server issues, you can use MCP Inspector by running: `npx @modelcontextprotocol/inspector -- uvx --from mostlyai-mock[mcp] mcp-server`
+3. In order to develop locally, modify the configuration by replacing `"command": "uv"` (or use the full path to `uv` if needed) and `"args": ["--directory", "/Users/johnsmith/mostlyai-mock", "run", "--extra", "mcp", "mcp-server"]`
{mostlyai_mock-0.1.13.dist-info → mostlyai_mock-0.1.15.dist-info}/RECORD RENAMED
@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=uv2DLnOleN6BNMfAMleXJCPcOZvM_tMTRy5njUIKDag,715
+mostlyai/mock/core.py,sha256=JDJ9nVpRR2WochxumSdQS96zak0OV1frkJOwlQsPVBw,61715
+mostlyai/mock/mcp_server.py,sha256=0Vn1jmrdNAvUZSviaaU7Lhn7L7iHFyd8kGFigM0-4s0,2367
+mostlyai_mock-0.1.15.dist-info/METADATA,sha256=OG3NRdCcH2qycRQ5HrzyLtJwtl74lRw5Py1JtqfB2YI,14305
+mostlyai_mock-0.1.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.15.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.15.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.15.dist-info/RECORD,,
@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=r4GBSmJmB1SGtviYtZwY5b3GBzhK_mt8czzk-py4flo,715
-mostlyai/mock/core.py,sha256=nu0PSX3Xt8l6_95cIrJ7Wt0SbJvfrLD3t0CFIidOLcM,59573
-mostlyai/mock/mcp_server.py,sha256=MrVUrIsAZsFzjK1suwNl1fxS1ES-wpc-YSM8cS8Fqcw,2259
-mostlyai_mock-0.1.13.dist-info/METADATA,sha256=un3lLINiMi8HkVcmsIr64U-OQQiqT5LsgiGam1aNTj4,14110
-mostlyai_mock-0.1.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.13.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.13.dist-info/RECORD,,