mostlyai-mock 0.0.6__tar.gz → 0.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/PKG-INFO +87 -31
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/README.md +84 -30
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/mostlyai/mock/core.py +198 -137
- mostlyai_mock-0.0.8/mostlyai/mock/mcp.py +46 -0
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/pyproject.toml +6 -1
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/.gitignore +0 -0
- {mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/LICENSE +0 -0
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.0.6
+Version: 0.0.8
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -24,16 +24,18 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
+Requires-Dist: fastmcp<3.0.0,>=2.0.0
 Requires-Dist: litellm>=1.67.0
 Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
+Requires-Dist: typer<1.0.0,>=0.9.0
 Description-Content-Type: text/markdown

 # Synthetic Mock Data 🔮

-[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)
+[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)

 Create data out of nothing. Prompt LLMs for Tabular Data.

@@ -72,7 +74,7 @@ from mostlyai import mock

 tables = {
     "guests": {
-        "
+        "prompt": "Guests of an Alpine ski hotel in Austria",
         "columns": {
             "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
             "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -112,17 +114,26 @@ from mostlyai import mock

 tables = {
     "customers": {
-        "
+        "prompt": "Customers of a hardware store",
         "columns": {
             "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
             "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
         },
         "primary_key": "customer_id",
     },
+    "warehouses": {
+        "prompt": "Warehouses of a hardware store",
+        "columns": {
+            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+            "name": {"prompt": "the name of the warehouse", "dtype": "string"},
+        },
+        "primary_key": "warehouse_id",
+    },
     "orders": {
-        "
+        "prompt": "Orders of a Customer",
         "columns": {
             "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
             "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
             "text": {"prompt": "order text description", "dtype": "string"},
             "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -132,12 +143,16 @@ tables = {
             {
                 "column": "customer_id",
                 "referenced_table": "customers",
-                "
-            }
+                "prompt": "each customer has anywhere between 2 and 3 orders",
+            },
+            {
+                "column": "warehouse_id",
+                "referenced_table": "warehouses",
+            },
         ],
     },
     "items": {
-        "
+        "prompt": "Items in an Order",
         "columns": {
             "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
             "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -148,7 +163,7 @@ tables = {
             {
                 "column": "order_id",
                 "referenced_table": "orders",
-                "
+                "prompt": "each order has between 1 and 2 items",
             }
         ],
     },
@@ -159,28 +174,69 @@ data = mock.sample(
     model="openai/gpt-4.1"
 )
 print(data["customers"])
-# customer_id
-# 0 1
-# 1 2
+# customer_id name
+# 0 1 Matthew Carlson
+# 1 2 Priya Shah
+print(data["warehouses"])
+# warehouse_id name
+# 0 1 Central Distribution Hub
+# 1 2 Northgate Storage Facility
 print(data["orders"])
-# customer_id
-# 0 1
-# 1 1
-# 2 1
-# 3 2
-# 4 2
-# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
+# customer_id warehouse_id order_id text amount
+# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
+# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
+# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
+# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
+# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
 print(data["items"])
-#
-# 0
-# 1
-# 2
-# 3
-# 4
-# 5
-# 6
-# 7
-# 8
-# 9
-# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
+# item_id order_id name price
+# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
+# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
+# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
+# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
+# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
+# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
+# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
+# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
+# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
+# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
 ```
+
+6. Create your first self-referencing synthetic table
+
+```python
+from mostlyai import mock
+
+tables = {
+    "employees": {
+        "prompt": "Employees of a company",
+        "columns": {
+            "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
+            "name": {"prompt": "first name and last name of the president", "dtype": "string"},
+            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
+            "role": {"prompt": "the role of the employee", "dtype": "string"},
+        },
+        "primary_key": "employee_id",
+        "foreign_keys": [
+            {
+                "column": "boss_id",
+                "referenced_table": "employees",
+                "prompt": "each boss has at most 3 employees",
+            },
+        ],
+    }
+}
+df = sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+print(df)
+# employee_id name boss_id role
+# 0 1 Sandra Phillips <NA> President
+# 1 2 Marcus Tran 1 Chief Financial Officer
+# 2 3 Ava Whittaker 1 Chief Technology Officer
+# 3 4 Sophie Martin 1 Chief Operations Officer
+# 4 5 Chad Nelson 2 Finance Manager
+# 5 6 Ethan Glover 2 Senior Accountant
+# 6 7 Kimberly Ortiz 2 Junior Accountant
+# 7 8 Lucas Romero 3 IT Manager
+# 8 9 Priya Desai 3 Lead Software Engineer
+# 9 10 Felix Bennett 3 Senior Systems Analyst
+```
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/README.md

@@ -1,6 +1,6 @@
 # Synthetic Mock Data 🔮

-[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)
+[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)

 Create data out of nothing. Prompt LLMs for Tabular Data.

@@ -39,7 +39,7 @@ from mostlyai import mock

 tables = {
     "guests": {
-        "
+        "prompt": "Guests of an Alpine ski hotel in Austria",
         "columns": {
             "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
             "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -79,17 +79,26 @@ from mostlyai import mock

 tables = {
     "customers": {
-        "
+        "prompt": "Customers of a hardware store",
         "columns": {
             "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
             "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
         },
         "primary_key": "customer_id",
     },
+    "warehouses": {
+        "prompt": "Warehouses of a hardware store",
+        "columns": {
+            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+            "name": {"prompt": "the name of the warehouse", "dtype": "string"},
+        },
+        "primary_key": "warehouse_id",
+    },
     "orders": {
-        "
+        "prompt": "Orders of a Customer",
         "columns": {
             "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
             "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
             "text": {"prompt": "order text description", "dtype": "string"},
             "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -99,12 +108,16 @@ tables = {
             {
                 "column": "customer_id",
                 "referenced_table": "customers",
-                "
-            }
+                "prompt": "each customer has anywhere between 2 and 3 orders",
+            },
+            {
+                "column": "warehouse_id",
+                "referenced_table": "warehouses",
+            },
         ],
     },
     "items": {
-        "
+        "prompt": "Items in an Order",
         "columns": {
             "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
             "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -115,7 +128,7 @@ tables = {
             {
                 "column": "order_id",
                 "referenced_table": "orders",
-                "
+                "prompt": "each order has between 1 and 2 items",
             }
         ],
     },
@@ -126,28 +139,69 @@ data = mock.sample(
     model="openai/gpt-4.1"
 )
 print(data["customers"])
-# customer_id
-# 0 1
-# 1 2
+# customer_id name
+# 0 1 Matthew Carlson
+# 1 2 Priya Shah
+print(data["warehouses"])
+# warehouse_id name
+# 0 1 Central Distribution Hub
+# 1 2 Northgate Storage Facility
 print(data["orders"])
-# customer_id
-# 0 1
-# 1 1
-# 2 1
-# 3 2
-# 4 2
-# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
+# customer_id warehouse_id order_id text amount
+# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
+# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
+# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
+# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
+# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
 print(data["items"])
-#
-# 0
-# 1
-# 2
-# 3
-# 4
-# 5
-# 6
-# 7
-# 8
-# 9
-# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
+# item_id order_id name price
+# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
+# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
+# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
+# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
+# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
+# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
+# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
+# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
+# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
+# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
 ```
+
+6. Create your first self-referencing synthetic table
+
+```python
+from mostlyai import mock
+
+tables = {
+    "employees": {
+        "prompt": "Employees of a company",
+        "columns": {
+            "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
+            "name": {"prompt": "first name and last name of the president", "dtype": "string"},
+            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
+            "role": {"prompt": "the role of the employee", "dtype": "string"},
+        },
+        "primary_key": "employee_id",
+        "foreign_keys": [
+            {
+                "column": "boss_id",
+                "referenced_table": "employees",
+                "prompt": "each boss has at most 3 employees",
+            },
+        ],
+    }
+}
+df = sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
+print(df)
+# employee_id name boss_id role
+# 0 1 Sandra Phillips <NA> President
+# 1 2 Marcus Tran 1 Chief Financial Officer
+# 2 3 Ava Whittaker 1 Chief Technology Officer
+# 3 4 Sophie Martin 1 Chief Operations Officer
+# 4 5 Chad Nelson 2 Finance Manager
+# 5 6 Ethan Glover 2 Senior Accountant
+# 6 7 Kimberly Ortiz 2 Junior Accountant
+# 7 8 Lucas Romero 3 IT Manager
+# 8 9 Priya Desai 3 Lead Software Engineer
+# 9 10 Felix Bennett 3 Senior Systems Analyst
+```
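The new multi-key schema above ties `orders` to both `customers` and `warehouses`. As a quick, illustrative sanity check (not part of the package), referential integrity of such output can be verified with plain pandas; the tiny frames below are hypothetical stand-ins for the printed sample output:

```python
import pandas as pd

# Illustrative only: minimal frames mirroring the printed example above.
customers = pd.DataFrame({"customer_id": [1, 2]})
orders = pd.DataFrame({"customer_id": [1, 1, 2], "order_id": ["ORD-10294", "ORD-10541", "ORD-11017"]})

def fk_is_consistent(child: pd.DataFrame, child_col: str, parent: pd.DataFrame, parent_col: str) -> bool:
    # Every foreign-key value in the child table must appear among the parent's keys.
    return bool(child[child_col].isin(parent[parent_col]).all())

print(fk_is_consistent(orders, "customer_id", customers, "customer_id"))  # True
```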
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/mostlyai/mock/core.py

@@ -44,8 +44,10 @@ across tables.


 class LLMConfig(BaseModel):
-    model: str
+    model: str = "openai/gpt-4.1-nano"
     api_key: str | None = None
+    temperature: float = 1.0
+    top_p: float = 0.95


 class MockConfig(RootModel[dict[str, "TableConfig"]]):
@@ -100,7 +102,8 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
             if table_name in path:
                 cycle_start = path.index(table_name)
                 cycle = path[cycle_start:] + [table_name]
-
+                if len(cycle) > 2:  # len(cycle) == 2 means self-referencing table, which is allowed
+                    raise ValueError(f"Circular dependency detected: {' -> '.join(cycle)}.")
             if table_name in visited:
                 return
             visited.add(table_name)
@@ -116,10 +119,10 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):


 class TableConfig(BaseModel):
-
+    prompt: str = ""
     columns: dict[str, ColumnConfig] = Field(..., min_items=1)
     primary_key: str | None = None
-    foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list
+    foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)


 class ColumnConfig(BaseModel):
@@ -163,7 +166,7 @@ class ColumnConfig(BaseModel):
             DType.DATETIME: (str, "strings"),
         }[self.dtype]
         try:
-            self.values = [cast_fn(c) for c in self.values]
+            self.values = [cast_fn(c) if pd.notna(c) else None for c in self.values]
         except ValueError:
             raise ValueError(
                 f"All values must be convertible to {convertible_to} when dtype is '{self.dtype.value}'"
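One behavioral change in the hunks above: a dependency cycle of length two (a table referencing itself) no longer raises. A minimal standalone sketch of that rule, not the package's own validator:

```python
def check_cycle(path: list[str], table_name: str) -> None:
    # Same rule as in the hunk above: a self-reference (cycle of length 2) is allowed,
    # anything longer is rejected.
    if table_name in path:
        cycle = path[path.index(table_name):] + [table_name]
        if len(cycle) > 2:
            raise ValueError(f"Circular dependency detected: {' -> '.join(cycle)}.")

check_cycle(["employees"], "employees")            # employees -> employees: allowed
try:
    check_cycle(["orders", "items"], "orders")     # orders -> items -> orders: rejected
except ValueError as e:
    print(e)
```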
@@ -184,85 +187,78 @@ class DType(str, Enum):
 class ForeignKeyConfig(BaseModel):
     column: str
     referenced_table: str
-
+    prompt: str | None = None


 def _sample_table(
     *,
-
-
+    name: str,
+    prompt: str,
+    columns: dict[str, ColumnConfig],
+    foreign_keys: list[ForeignKeyConfig] | None,
     primary_keys: dict[str, str] | None,
-
-
-    temperature: float,
-    top_p: float,
+    generated_data: dict[str, pd.DataFrame] | None,
+    sample_size: int,
     batch_size: int,
     previous_rows_size: int,
+    non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> pd.DataFrame:
-    assert (sample_size is None) != (context_data is None), (
-        "Exactly one of sample_size or context_data must be provided"
-    )
-    if sample_size is None:
-        sample_size = len(context_data)
     table_rows_generator = _create_table_rows_generator(
-
-
+        name=name,
+        prompt=prompt,
+        columns=columns,
         primary_keys=primary_keys,
+        foreign_keys=foreign_keys,
+        generated_data=generated_data,
         sample_size=sample_size,
-        context_data=context_data,
-        temperature=temperature,
-        top_p=top_p,
         batch_size=batch_size,
         previous_rows_size=previous_rows_size,
+        non_context_size=non_context_size,
         llm_config=llm_config,
     )
-    table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{
-    table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator,
+    table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
+    table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
     return table_df


 def _create_table_prompt(
     *,
-
-
+    name: str,
+    prompt: str,
     columns: dict[str, ColumnConfig],
     primary_keys: dict[str, str] | None,
     batch_size: int | None,
     foreign_keys: list[ForeignKeyConfig] | None,
     context_data: pd.DataFrame | None,
-
+    non_context_data: dict[str, pd.DataFrame] | None,
+    previous_rows: list[dict] | None,
 ) -> str:
-
-
-        assert context_data is None
-    else:
-        assert foreign_keys is not None
-        assert context_data is not None
-        assert primary_keys is not None
-
-    # add description
-    prompt = f"# {table_description}\n\n"
+    # add table prompt
+    prompt = f"# {prompt}\n\n"

     # define table
-    prompt += f"## Table: {
+    prompt += f"## Table: {name}\n\n"
+
+    prompt += f"## Table Primary Key: `{primary_keys[name]}`\n\n"

     # add columns specifications
     prompt += "## Columns Specifications:\n\n"
     prompt += f"{json.dumps({name: config.model_dump() for name, config in columns.items()}, indent=2)}\n\n"

-    # define foreign keys
-    if foreign_keys is not None:
-        prompt += "## Foreign Keys:\n\n"
-        prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
-
     # add previous rows as context to help the LLM generate consistent data
     if previous_rows:
         prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
         prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"

+    # define foreign keys
+    if foreign_keys:
+        prompt += "## Foreign Keys:\n\n"
+        prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
+
     # add context table name, primary key and data
-    if
+    if foreign_keys and foreign_keys[0].referenced_table != name:  # self-dependency is not considered as context
+        assert context_data is not None
         fk = foreign_keys[0]
         prompt += f"## Context Table: `{fk.referenced_table}`\n\n"

@@ -271,16 +267,35 @@ def _create_table_prompt(
         prompt += f"## Context Table Data:\n\n"
         prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"

+    # add non-context table names, primary keys and data
+    if foreign_keys and len(foreign_keys) > 1:
+        for fk in foreign_keys[1:]:
+            if fk.referenced_table == name:  # self-dependency is not considered as non-context
+                continue
+            assert non_context_data is not None
+            assert fk.referenced_table in non_context_data
+            prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
+
+            prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+
+            prompt += f"## Non-Context Table Data:\n\n"
+            prompt += f"{non_context_data[fk.referenced_table].to_json(orient='records', indent=2)}\n\n"
+
     # add instructions
     prompt += "\n## Instructions:\n\n"
-    if
-
+    if not foreign_keys:
+        assert batch_size is not None
+        prompt += f"Generate {batch_size} rows for the `{name}` table.\n\n"
     else:
         prompt += (
-            f"Generate data for the `{
-            f"The Foreign Key column may only contain values from Context Table Data. "
-            f"
+            f"Generate data for the `{name}` table. "
+            f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
+            f"The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
+            f"If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
+            f"In this case, ensure that the generated foreign keys are consistent with generated primary keys of the table. "
+            f"Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
         )
+
     if previous_rows:
         prompt += (
             "Generate new rows that maintain consistency with the previous rows where appropriate. "
@@ -295,15 +310,16 @@ def _create_table_prompt(

 def _create_table_rows_generator(
     *,
-
-
+    name: str,
+    prompt: str,
+    columns: dict[str, ColumnConfig],
+    foreign_keys: list[ForeignKeyConfig] | None,
     primary_keys: dict[str, str] | None,
+    generated_data: dict[str, pd.DataFrame] | None,
     sample_size: int,
-    temperature: float,
-    top_p: float,
-    context_data: pd.DataFrame | None,
     batch_size: int,
     previous_rows_size: int,
+    non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> Generator[dict]:
     def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
@@ -311,14 +327,14 @@ def _create_table_rows_generator(
         if column_config.values or column_config.dtype is DType.CATEGORY:
             return Literal[tuple(column_config.values)]
         return {
-            DType.INTEGER: int,
-            DType.FLOAT: float,
-            DType.STRING: str,
-            DType.BOOLEAN: bool,
+            DType.INTEGER: int | None,
+            DType.FLOAT: float | None,
+            DType.STRING: str | None,
+            DType.BOOLEAN: bool | None,
             # response_format has limited support for JSON Schema features
             # thus we represent dates and datetimes as strings
-            DType.DATE: str,
-            DType.DATETIME: str,
+            DType.DATE: str | None,
+            DType.DATETIME: str | None,
         }[column_config.dtype]

     fields = {}
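The response-format change above makes every scalar field optional so the LLM may return nulls. A minimal sketch of how such a row schema can be assembled with pydantic's `create_model` (illustrative; the column names are made up and this is not the package's exact construction):

```python
from typing import Literal
from pydantic import BaseModel, create_model

def make_row_model(fields: dict[str, object]) -> type[BaseModel]:
    # Each annotation is passed through as-is; scalar columns use `<type> | None`
    # so that missing values are representable, mirroring the change above.
    return create_model("Row", **{name: (annotation, ...) for name, annotation in fields.items()})

Row = make_row_model({
    "age": int | None,                        # nullable integer column
    "nationality": Literal["AT", "DE", "CH"]  # categorical column keeps its fixed values
})
print(Row(age=None, nationality="AT"))
```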
@@ -375,10 +391,31 @@ def _create_table_rows_generator(
                 "The model does not support structured output / JSON mode."
             )

+    # derive context data (if first foreign key is present) and harmonize sample size accordingly
+    context_data: pd.DataFrame | None = None
+    if foreign_keys and foreign_keys[0].referenced_table != name:  # self-dependency is not considered as context
+        context_table_name = foreign_keys[0].referenced_table
+        assert generated_data is not None
+        assert context_table_name in generated_data
+        context_data = generated_data[context_table_name]
+        sample_size = len(context_data)
+
+    # derive non-context data (if more than one foreign key is present)
+    non_context_data: dict[str, pd.DataFrame] = {}
+    if foreign_keys and len(foreign_keys) > 1:
+        assert generated_data is not None
+        assert non_context_size is not None
+        for fk in foreign_keys[1:]:
+            if fk.referenced_table == name:  # self-dependency is not considered as non-context
+                continue
+            non_context_table_name = fk.referenced_table
+            assert non_context_table_name in generated_data
+            non_context_data[non_context_table_name] = generated_data[non_context_table_name]
+
     litellm_kwargs = {
-        "response_format": create_table_response_format(columns=
-        "temperature": temperature,
-        "top_p": top_p,
+        "response_format": create_table_response_format(columns=columns),
+        "temperature": llm_config.temperature,
+        "top_p": llm_config.top_p,
         "model": llm_config.model,
         "api_key": llm_config.api_key,
         "stream": True,
@@ -387,17 +424,22 @@ def _create_table_rows_generator(
     yielded_sequences = 0
     previous_rows = deque(maxlen=previous_rows_size)
     for context_batch in batch_infinitely(context_data):
-
-
-
-
-
-
-
-
-
-
-
+        non_context_batch = (
+            {table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()}
+            if non_context_data
+            else None
+        )
+        prompt = _create_table_prompt(
+            name=name,
+            prompt=prompt,
+            columns=columns,
+            primary_keys=primary_keys,
+            batch_size=batch_size,
+            foreign_keys=foreign_keys,
+            context_data=context_batch,
+            non_context_data=non_context_batch,
+            previous_rows=list(previous_rows),
+        )
         messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}]

         response = litellm.completion(messages=messages, **litellm_kwargs)
@@ -423,16 +465,21 @@ def _create_table_rows_generator(


 def _convert_table_rows_generator_to_df(
-    table_rows_generator: Generator[dict],
+    table_rows_generator: Generator[dict],
+    columns: dict[str, ColumnConfig],
 ) -> pd.DataFrame:
     def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
         for column_name, column_config in columns.items():
             if column_config.dtype in [DType.DATE, DType.DATETIME]:
                 df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
-            elif column_config.dtype
-                df[column_name] = pd.to_numeric(df[column_name], errors="coerce",
+            elif column_config.dtype is DType.INTEGER:
+                df[column_name] = pd.to_numeric(df[column_name], errors="coerce", downcast="integer").astype(
+                    "int64[pyarrow]"
+                )
+            elif column_config.dtype is DType.FLOAT:
+                df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("double[pyarrow]")
             elif column_config.dtype is DType.BOOLEAN:
-                df[column_name] = df[column_name].astype(
+                df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("boolean[pyarrow]")
             elif column_config.dtype is DType.CATEGORY:
                 df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
             else:
@@ -440,7 +487,7 @@ def _convert_table_rows_generator_to_df(
         return df

     df = pd.DataFrame(list(table_rows_generator))
-    df = align_df_dtypes_with_mock_dtypes(df,
+    df = align_df_dtypes_with_mock_dtypes(df, columns)
     return df


@@ -453,28 +500,32 @@ def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig
     return sample_size


-def
-
-
+def _build_execution_plan(config: MockConfig) -> list[str]:
+    def build_dependency_mappings(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
+        child_to_parents = {}
+        parent_to_children = {}

-
-
-
+        for table_name in config.root:
+            child_to_parents[table_name] = set()
+            parent_to_children[table_name] = set()

-
-
-
-
-
-
+        for table_name, table_config in config.root.items():
+            if table_config.foreign_keys:
+                for fk in table_config.foreign_keys:
+                    referenced_table = fk.referenced_table
+                    child_to_parents[table_name].add(referenced_table)
+                    parent_to_children[referenced_table].add(table_name)

-
-
+        root_tables = []
+        for table_name, parents in child_to_parents.items():
+            if not parents or parents == {table_name}:  # no dependencies or only self-dependency
+                root_tables.append(table_name)
+        return child_to_parents, parent_to_children, root_tables

+    child_to_parents, parent_to_children, root_tables = build_dependency_mappings(config)

-def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tables: list[str]) -> list[str]:
     execution_plan = []
-    bfs_queue = list(
+    bfs_queue = list(root_tables)
     processed = set()

     while bfs_queue:
@@ -482,6 +533,16 @@ def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tabl
         if table_name in processed:
             continue

+        # ensure all parents are processed before processing this table
+        unprocessed_parents = []
+        for parent in child_to_parents[table_name]:
+            if parent not in processed and parent != table_name:  # exclude self-dependency
+                unprocessed_parents.append(parent)
+        if unprocessed_parents:
+            bfs_queue.extend(unprocessed_parents)
+            bfs_queue.append(table_name)
+            continue
+
         execution_plan.append(table_name)
         processed.add(table_name)

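The new execution plan guarantees that every referenced table is generated before its children while ignoring self-references. A simplified re-implementation of that ordering (not the library's `_build_execution_plan`), applied to the hardware-store schema from the README:

```python
from collections import deque

def execution_plan(parents: dict[str, set[str]]) -> list[str]:
    # BFS over the dependency graph; a table is emitted only once all of its
    # parents (excluding itself) have been emitted, mirroring the hunk above.
    order: list[str] = []
    processed: set[str] = set()
    queue = deque(t for t, p in parents.items() if not p or p == {t})  # root tables
    while queue:
        table = queue.popleft()
        if table in processed:
            continue
        missing = [p for p in parents[table] if p not in processed and p != table]
        if missing:
            queue.extend(missing)
            queue.append(table)
            continue
        order.append(table)
        processed.add(table)
        queue.extend(t for t, p in parents.items() if table in p)  # enqueue children
    return order

print(execution_plan({
    "customers": set(),
    "warehouses": set(),
    "orders": {"customers", "warehouses"},
    "items": {"orders"},
}))  # ['customers', 'warehouses', 'orders', 'items']
```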
@@ -499,6 +560,7 @@ def sample(
     api_key: str | None = None,
     temperature: float = 1.0,
     top_p: float = 0.95,
+    return_type: Literal["auto", "dict"] = "auto",
 ) -> pd.DataFrame | dict[str, pd.DataFrame]:
     """
     Generate mock data by prompting an LLM.
@@ -523,6 +585,7 @@ def sample(
         api_key (str | None): The API key to use for the LLM. If not provided, LiteLLM will take it from the environment variables.
         temperature (float): The temperature to use for the LLM. Default is 1.0.
         top_p (float): The top-p value to use for the LLM. Default is 0.95.
+        return_type (Literal["auto", "dict"]): The format of the returned data. Default is "auto".

     Returns:
         - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
@@ -534,7 +597,7 @@ def sample(

     tables = {
         "guests": {
-            "
+            "prompt": "Guests of an Alpine ski hotel in Austria",
             "columns": {
                 "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
                 "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -557,17 +620,26 @@ def sample(

     tables = {
         "customers": {
-            "
+            "prompt": "Customers of a hardware store",
             "columns": {
                 "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
                 "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
             },
             "primary_key": "customer_id",
         },
+        "warehouses": {
+            "prompt": "Warehouses of a hardware store",
+            "columns": {
+                "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+                "name": {"prompt": "the name of the warehouse", "dtype": "string"},
+            },
+            "primary_key": "warehouse_id",
+        },
         "orders": {
-            "
+            "prompt": "Orders of a Customer",
             "columns": {
                 "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+                "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
                 "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
                 "text": {"prompt": "order text description", "dtype": "string"},
                 "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -577,12 +649,16 @@ def sample(
                 {
                     "column": "customer_id",
                     "referenced_table": "customers",
-                    "
-                }
+                    "prompt": "each customer has anywhere between 2 and 3 orders",
+                },
+                {
+                    "column": "warehouse_id",
+                    "referenced_table": "warehouses",
+                },
             ],
         },
         "items": {
-            "
+            "prompt": "Items in an Order",
             "columns": {
                 "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
                 "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -593,59 +669,44 @@ def sample(
                 {
                     "column": "order_id",
                     "referenced_table": "orders",
-                    "
+                    "prompt": "each order has between 1 and 2 items",
                 }
             ],
         },
     }
     data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
     df_customers = data["customers"]
+    df_warehouses = data["warehouses"]
     df_orders = data["orders"]
     df_items = data["items"]
     ```
     """

     config = MockConfig(tables)
+    llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)

     sample_size = _harmonize_sample_size(sample_size, config)
     primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}

-
-    execution_plan: list[str] = _build_execution_plan(parent_to_children, subject_tables)
+    execution_plan: list[str] = _build_execution_plan(config)

-
+    data: dict[str, pd.DataFrame] = {}

     for table_name in execution_plan:
         table_config = config.root[table_name]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        referenced_table = table_config.foreign_keys[0].referenced_table
-        df = _sample_table(
-            table_name=table_name,
-            table_config=table_config,
-            primary_keys=primary_keys,
-            sample_size=None,
-            context_data=results[referenced_table],
-            temperature=temperature,
-            top_p=top_p,
-            batch_size=1,  # generate one sequence at a time
-            previous_rows_size=5,
-            llm_config=LLMConfig(model=model, api_key=api_key),
-        )
-        results[table_name] = df
-
-    return results if len(results) > 1 else next(iter(results.values()))
+        df = _sample_table(
+            name=table_name,
+            prompt=table_config.prompt,
+            columns=table_config.columns,
+            foreign_keys=table_config.foreign_keys,
+            primary_keys=primary_keys,
+            generated_data=data,
+            sample_size=sample_size[table_name],
+            batch_size=30,  # generate 30 root table rows at a time
+            previous_rows_size=10,  # present 10 previously generated rows to the LLM
+            non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
+            llm_config=llm_config,
+        )
+        data[table_name] = df
+
+    return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
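A short usage sketch of the new `return_type` parameter (assumes an LLM API key is configured in the environment; the table spec mirrors the README example, and the call is illustrative rather than a recorded run):

```python
from mostlyai import mock

tables = {
    "guests": {
        "prompt": "Guests of an Alpine ski hotel in Austria",
        "columns": {"name": {"prompt": "first name and last name of the guest", "dtype": "string"}},
    }
}

# return_type defaults to "auto": a single-table spec comes back as one DataFrame.
df = mock.sample(tables=tables, sample_size=3, model="openai/gpt-4.1-nano")

# With return_type="dict", the result is always a mapping {"guests": DataFrame},
# which is what the new MCP tool below relies on.
data = mock.sample(tables=tables, sample_size=3, model="openai/gpt-4.1-nano", return_type="dict")
```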
mostlyai_mock-0.0.8/mostlyai/mock/mcp.py

@@ -0,0 +1,46 @@
+import json
+
+import pandas as pd
+from fastmcp import Context, FastMCP
+
+from mostlyai import mock
+
+mcp = FastMCP(name="MostlyAI Mock MCP Server")
+
+
+@mcp.tool(description=mock.sample.__doc__)
+def sample_mock_data(
+    *,
+    tables: dict[str, dict],
+    sample_size: int,
+    model: str = "openai/gpt-4.1-nano",
+    api_key: str | None = None,
+    temperature: float = 1.0,
+    top_p: float = 0.95,
+    ctx: Context,
+) -> str:
+    # Notes:
+    # 1. Returning DataFrames directly results in converting them into truncated string.
+    # 2. The logs / progress bars are not propagated to the MCP Client. There is a dedicated API to do that (e.g. `ctx.info(...)`)
+    # 3. MCP Server inherits only selected environment variables (PATH, USER...); one way to pass LLM keys is through client configuration (`mcpServers->env`)
+    # 4. Some MCP Clients, e.g. Cursor, do not like Unions or Optionals in type hints
+    ctx.info(f"Generating mock data for `{len(tables)}` tables")
+    data = mock.sample(
+        tables=tables,
+        sample_size=sample_size,
+        model=model,
+        api_key=api_key,
+        temperature=temperature,
+        top_p=top_p,
+        return_type="dict",
+    )
+    ctx.info(f"Generated mock data for `{len(tables)}` tables")
+    return {k: v.to_dict(orient="records") for k, v in data.items()}
+
+
+def main():
+    mcp.run(transport="stdio")
+
+
+if __name__ == "__main__":
+    main()
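The module doubles as an entry point: the `mcp-server` console script added in pyproject.toml points at `mostlyai.mock.mcp:main`, and the same server can be started programmatically. A minimal sketch of that equivalence:

```python
# Programmatic equivalent of the `mcp-server` console script: imports the new
# module and runs its FastMCP server over stdio, as defined above.
from mostlyai.mock import mcp

if __name__ == "__main__":
    mcp.main()
```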
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "mostlyai-mock"
-version = "0.0.6"
+version = "0.0.8"
 description = "Synthetic Mock Data"
 authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
 requires-python = ">=3.10"
@@ -29,8 +29,13 @@ dependencies = [
     "pandas>=2.0.0",
     "pyarrow>=14.0.0",
     "litellm>=1.67.0",
+    "typer>=0.9.0,<1.0.0",
+    "fastmcp>=2.0.0,<3.0.0",
 ]

+[project.scripts]
+mcp-server = "mostlyai.mock.mcp:main"
+
 [project.urls]
 homepage = "https://github.com/mostly-ai/mostlyai-mock"
 repository = "https://github.com/mostly-ai/mostlyai-mock"
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/.gitignore: File without changes
{mostlyai_mock-0.0.6 → mostlyai_mock-0.0.8}/LICENSE: File without changes