mostlyai-mock 0.1.12.tar.gz → 0.1.14.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.12
+Version: 0.1.14
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -229,7 +229,7 @@ tables = {
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1", n_workers=1)
 print(df)
 # employee_id name boss_id role
 # 0 B0-1 Patricia Lee <NA> President
@@ -194,7 +194,7 @@ tables = {
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1", n_workers=1)
 print(df)
 # employee_id name boss_id role
 # 0 B0-1 Patricia Lee <NA> President
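Note: both copies of the README example (the long description in PKG-INFO above and README.md here) now pass n_workers=1, which disables concurrent LLM batch requests for the self-referencing employees example. A minimal usage sketch assuming the public mock.sample API shown in the README and an OpenAI API key in the environment; the guests table below is illustrative, not taken from this diff:

    # hedged sketch: same call pattern as the updated README example
    from mostlyai import mock

    tables = {
        "guests": {
            "prompt": "Guests of an Alpine ski hotel in Austria",
            "columns": {
                "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
                "nationality": {"prompt": "2-letter code for the nationality", "dtype": "category"},
            },
        }
    }
    # n_workers=1 turns off concurrent batch generation; the library clamps the value to the 1..10 range
    df = mock.sample(tables=tables, sample_size=5, model="openai/gpt-4.1", n_workers=1)
    print(df)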
@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 
 __all__ = ["sample"]
-__version__ = "0.1.12"  # Do not set this manually. Use poetry version [params].
+__version__ = "0.1.14"  # Do not set this manually. Use poetry version [params].
@@ -19,7 +19,7 @@ import concurrent.futures
 import json
 import math
 from collections import deque
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, Callable
 from enum import Enum
 from io import StringIO
 from typing import Any, Literal
@@ -248,6 +248,7 @@ async def _sample_table(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> pd.DataFrame:
     table_rows_generator = _create_table_rows_generator(
         name=name,
@@ -261,16 +262,13 @@ async def _sample_table(
         non_context_size=non_context_size,
         n_workers=n_workers,
         llm_config=llm_config,
+        progress_callback=progress_callback,
     )
     table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
     table_df = await _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
     return table_df
 
 
-def _sample_table_sync(*args, **kwargs) -> pd.DataFrame:
-    return asyncio.run(_sample_table(*args, **kwargs))
-
-
 def _create_system_prompt(llm_output_format: LLMOutputFormat) -> str:
     return f"""
 You are a specialized data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
@@ -766,6 +764,7 @@ async def _create_table_rows_generator(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    progress_callback: Callable | None = None,
 ) -> AsyncGenerator[dict]:
     batch_size = 20  # generate 20 root table rows at a time
 
@@ -807,6 +806,13 @@
             assert non_context_table_name in data
             non_context_data[non_context_table_name] = data[non_context_table_name]
 
+    # calculate ideal batch size that spreads the workload evenly across workers
+    ideal_batch_size = max(math.ceil(sample_size / n_workers), 5)
+    if ideal_batch_size < batch_size:
+        # never increase batch_size beyond initial value
+        # this is especially important for sequential tables, where batch_size is currently assumed to be 1 everywhere
+        batch_size = ideal_batch_size
+
     # calculate batch_sizes
     assert sample_size is not None, "sample_size should have been filled by this point"
     n_total_batches = len(context_batches) if context_batches is not None else math.ceil(sample_size / batch_size)
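Note: this added block in the core module shrinks the batch size so the requested sample_size is spread roughly evenly across the available workers, with a floor of 5 rows per batch, and it never grows beyond the initial batch_size of 20. A standalone sketch of the same arithmetic (helper name is hypothetical):

    import math

    def effective_batch_size(sample_size: int, n_workers: int, batch_size: int = 20) -> int:
        # mirrors the diff: ideal size per worker, floored at 5, never above the initial batch_size
        ideal_batch_size = max(math.ceil(sample_size / n_workers), 5)
        return min(batch_size, ideal_batch_size)

    print(effective_batch_size(200, 10))  # 20 -> enough rows to keep full batches of 20
    print(effective_batch_size(100, 10))  # 10 -> 100 rows split into 10 batches of 10
    print(effective_batch_size(40, 10))   # 5  -> ceil(40/10)=4 is floored to 5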
@@ -911,6 +917,12 @@
                 if n_yielded_sequences >= sample_size:
                     break
         n_completed_batches += 1
+        if progress_callback:
+            await progress_callback(
+                progress=n_completed_batches,
+                total=n_total_batches,
+                message=f"Generating rows for table `{name}`: {n_completed_batches}/{n_total_batches}",
+            )
         result_queue.task_done()
 
     # gracefully shutdown workers
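Note: the row generator now reports per-batch progress through the optional progress_callback, which is awaited with the keyword arguments progress, total, and message. Any awaitable accepting those names should be compatible; a hypothetical example:

    async def log_progress(*, progress: int, total: int, message: str | None = None) -> None:
        # prints e.g. "Generating rows for table `employees`: 3/5"
        print(message or f"{progress}/{total}")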
@@ -924,9 +936,9 @@ def _align_series_dtypes_with_column_config(series: pd.Series, column_config: Co
     series = series.copy()
     if column_config.dtype in [DType.DATE, DType.DATETIME]:
 
-        def harmonize_datetime(x):
+        def harmonize_datetime(x: Any):
             try:
-                return dateutil.parser.parse(x)
+                return dateutil.parser.parse(str(x))
             except Exception:
                 return pd.NaT
 
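Note: dateutil.parser.parse expects a string, so wrapping the value in str(...) lets non-string values returned by the LLM (numbers, timestamps, None) be coerced before parsing, while anything unparseable still falls back to pd.NaT. A small standalone illustration, assuming python-dateutil and pandas are installed:

    import dateutil.parser
    import pandas as pd

    def harmonize_datetime(x):
        try:
            return dateutil.parser.parse(str(x))  # str() lets non-string values through
        except Exception:
            return pd.NaT  # anything unparseable becomes NaT

    print(harmonize_datetime("2024-05-01"))  # 2024-05-01 00:00:00
    print(harmonize_datetime(20240501))      # parsed from the string "20240501"
    print(harmonize_datetime(None))          # NaT, since "None" is not a date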
@@ -1101,6 +1113,54 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
     return execution_plan
 
 
+async def _sample_common(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+):
+    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
+    config = MockConfig(tables)
+
+    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
+
+    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
+    primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
+
+    n_workers = max(min(n_workers, 10), 1)
+
+    execution_plan: list[str] = _build_execution_plan(config)
+
+    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+
+    for table_name in execution_plan:
+        table_config = config.root[table_name]
+        df = await _sample_table(
+            name=table_name,
+            prompt=table_config.prompt,
+            columns=table_config.columns,
+            foreign_keys=table_config.foreign_keys,
+            primary_keys=primary_keys,
+            data=data,
+            sample_size=sample_size.get(table_name),
+            previous_rows_size=10,  # present 10 previously generated rows to the LLM
+            non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
+            n_workers=n_workers,
+            llm_config=llm_config,
+            progress_callback=progress_callback,
+        )
+        data[table_name] = df
+
+    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
+
+
 def sample(
     *,
     tables: dict[str, dict],
@@ -1329,42 +1389,51 @@ def sample(
     ```
     """
 
-    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
-    config = MockConfig(tables)
+    def sample_common_sync(*args, **kwargs) -> pd.DataFrame | dict[str, pd.DataFrame]:
+        return asyncio.run(_sample_common(*args, **kwargs))
 
-    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
-
-    sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
-    primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
-
-    n_workers = max(min(n_workers, 10), 1)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(
+            sample_common_sync,
+            tables=tables,
+            sample_size=sample_size,
+            existing_data=existing_data,
+            model=model,
+            api_key=api_key,
+            temperature=temperature,
+            top_p=top_p,
+            n_workers=n_workers,
+            return_type=return_type,
+            progress_callback=None,
+        )
+        return future.result()
 
-    execution_plan: list[str] = _build_execution_plan(config)
 
-    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
+async def _asample(
+    *,
+    tables: dict[str, dict],
+    sample_size: int | dict[str, int] = 4,
+    existing_data: dict[str, pd.DataFrame] | None = None,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    n_workers: int = 10,
+    return_type: Literal["auto", "dict"] = "auto",
+    progress_callback: Callable | None = None,
+) -> pd.DataFrame | dict[str, pd.DataFrame]:
+    return await _sample_common(
+        tables=tables,
+        sample_size=sample_size,
+        existing_data=existing_data,
+        model=model,
+        api_key=api_key,
+        temperature=temperature,
+        top_p=top_p,
+        n_workers=n_workers,
+        return_type=return_type,
+        progress_callback=progress_callback,
+    )
 
-    # synchronous `sample` function makes independent calls to asynchronous `_sample_table` function
-    # in order to avoid conflicts with potentially existing event loop (e.g. in Jupyter environment),
-    # a new thread is spawned for each call to `_sample_table`
-    # NOTE: initialize executor only once, doing that inside the loop might lead to deadlocks
-    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-        for table_name in execution_plan:
-            table_config = config.root[table_name]
-            future = executor.submit(
-                _sample_table_sync,
-                name=table_name,
-                prompt=table_config.prompt,
-                columns=table_config.columns,
-                foreign_keys=table_config.foreign_keys,
-                primary_keys=primary_keys,
-                data=data,
-                sample_size=sample_size.get(table_name),
-                previous_rows_size=10,  # present 10 previously generated rows to the LLM
-                non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
-                n_workers=n_workers,
-                llm_config=llm_config,
-            )
-            df = future.result()
-            data[table_name] = df
 
-    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
+_asample.__doc__ = sample.__doc__
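Note: the orchestration loop that previously lived inside sample has moved into the new _sample_common coroutine. The public sample function now runs it via asyncio.run on a single worker thread, which avoids clashing with an already running event loop (e.g. in Jupyter), while the new module-private _asample awaits it directly and forwards a progress_callback. A hedged sketch of driving the async entry point; since _asample is private, the supported public API remains mostlyai.mock.sample:

    # illustrative only: calling the private async entry point from async code
    import asyncio
    from mostlyai.mock.core import _asample

    async def on_progress(*, progress: int, total: int, message: str | None = None) -> None:
        print(message or f"{progress}/{total}")

    async def main():
        tables = {
            "colors": {
                "prompt": "Primary colors",
                "columns": {"name": {"prompt": "color name", "dtype": "string"}},
            }
        }
        df = await _asample(tables=tables, sample_size=3, progress_callback=on_progress)
        print(df)

    asyncio.run(main())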
@@ -16,16 +16,16 @@ import os
 import tempfile
 
 import pandas as pd
-from fastmcp import FastMCP
+from fastmcp import Context, FastMCP
 
-from mostlyai import mock
+from mostlyai.mock.core import _asample
 
 SAMPLE_MOCK_TOOL_DESCRIPTION = f"""
 Synthetic Mock Data.
 
 Use LLMs to generate any Tabular Data towards your needs. Create from scratch, expand existing datasets, or enrich tables with new columns.
 
-This tool is a proxy to the `mostlyai.mock.sample` function, but returns a dictionary of paths to the generated CSV files.
+This tool is a proxy to the `mostlyai.mock.core._asample` function, but returns a dictionary of paths to the generated CSV files.
 
 Present the result nicely to the user, in Markdown format. Example:
 
@@ -33,8 +33,8 @@ Mock data can be found under the following paths:
 - `/tmp/tmpl41bwa6n/players.csv`
 - `/tmp/tmpl41bwa6n/seasons.csv`
 
-== mostlyai.mock.sample DocString ==
-{mock.sample.__doc__}
+== mostlyai.mock.core._asample docstring ==
+{_asample.__doc__}
 """
 
 mcp = FastMCP(name="MostlyAI Mock MCP Server")
@@ -51,7 +51,8 @@ def _store_locally(data: dict[str, pd.DataFrame]) -> dict[str, str]:
 
 
 @mcp.tool(description=SAMPLE_MOCK_TOOL_DESCRIPTION)
-def mock_data(
+async def mock_data(
+    ctx: Context,
     *,
     tables: dict[str, dict],
     sample_size: int,
@@ -60,7 +61,7 @@ def mock_data(
     temperature: float = 1.0,
     top_p: float = 0.95,
 ) -> dict[str, str]:
-    data = mock.sample(
+    data = await _asample(
         tables=tables,
         sample_size=sample_size,
         model=model,
@@ -68,6 +69,7 @@ def mock_data(
         temperature=temperature,
         top_p=top_p,
         return_type="dict",
+        progress_callback=ctx.report_progress,
     )
     locations = _store_locally(data)
     return locations
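Note: the MCP tool becomes a coroutine that receives a FastMCP Context and passes ctx.report_progress as the progress callback, so row-generation progress is forwarded to the MCP client instead of only appearing in the server-side tqdm bar. A minimal, hypothetical FastMCP tool showing the same pattern in isolation (assuming Context.report_progress accepts the progress/total/message keywords, as this diff implies):

    from fastmcp import Context, FastMCP

    demo = FastMCP(name="progress-demo")

    @demo.tool()
    async def count_to(ctx: Context, n: int = 3) -> str:
        for i in range(1, n + 1):
            await ctx.report_progress(progress=i, total=n, message=f"step {i}/{n}")
        return "done"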
@@ -1,6 +1,6 @@
 [project]
 name = "mostlyai-mock"
-version = "0.1.12"
+version = "0.1.14"
 description = "Synthetic Mock Data"
 authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
 requires-python = ">=3.10"