PyPI - mostlyai-mock - Versions diffs - 0.1.10__tar.gz → 0.1.12__tar.gz - Mend

mostlyai-mock 0.1.10__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

{mostlyai_mock-0.1.10 → mostlyai_mock-0.1.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.10
+Version: 0.1.12
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -95,17 +95,17 @@ df = mock.sample(
     model="openai/gpt-4.1-nano",  # select the LLM model (optional)
 )
 print(df)
-#   nationality            name  gender  age date_of_birth        checkin_time  is_vip  price_per_night  room_number
-# 0          AT     Anna Müller  female   29    1994-09-15 2025-01-05 14:30:00    True            350.0          101
-# 1          DE  Johann Schmidt    male   45    1978-11-20 2025-01-06 16:45:00   False            250.0          102
-# 2          CH      Lara Meier  female   32    1991-04-12 2025-01-05 12:00:00    True            400.0          103
-# 3          IT     Marco Rossi    male   38    1985-02-25 2025-01-07 09:15:00   False            280.0          201
-# 4          FR   Claire Dupont  female   24    2000-07-08 2025-01-07 11:20:00   False            220.0          202
-# 5          AT    Felix Gruber    male   52    1972-01-10 2025-01-06 17:50:00    True            375.0          203
-# 6          DE   Sophie Becker  female   27    1996-03-30 2025-01-08 08:30:00   False            230.0          204
-# 7          CH      Max Keller    male   31    1992-05-16 2025-01-09 14:10:00   False            290.0          101
-# 8          IT  Giulia Bianchi  female   36    1988-08-19 2025-01-05 15:55:00    True            410.0          102
-# 9          FR    Louis Martin    male   44    1980-12-05 2025-01-07 10:40:00   False            270.0          103
+#   nationality                 name  gender  age date_of_birth        checkin_time is_vip  price_per_night  room_number
+# 0          FR          Jean Dupont    male   29    1994-03-15 2025-01-10 14:30:00  False            150.0          101
+# 1          DE         Anna Schmidt  female   34    1989-07-22 2025-01-11 16:45:00   True            200.0          201
+# 2          IT          Marco Rossi    male   45    1979-11-05 2025-01-09 10:15:00  False            180.0          102
+# 3          AT         Laura Gruber  female   28    1996-02-19 2025-01-12 09:00:00  False            165.0          202
+# 4          CH         David Müller    male   37    1987-08-30 2025-01-08 17:20:00   True            210.0          203
+# 5          NL  Sophie van den Berg  female   22    2002-04-12 2025-01-10 12:00:00  False            140.0          103
+# 6          GB         James Carter    male   31    1992-09-10 2025-01-11 11:30:00  False            155.0          204
+# 7          BE        Lotte Peeters  female   26    1998-05-25 2025-01-09 15:45:00  False            160.0          201
+# 8          DK        Anders Jensen    male   33    1990-12-03 2025-01-12 08:15:00   True            220.0          202
+# 9          ES         Carlos Lopez    male   38    1985-06-14 2025-01-10 18:00:00  False            170.0          203
 ```
 4. Create your first multi-table mock dataset
@@ -117,7 +117,7 @@ tables = {
     "customers": {
         "prompt": "Customers of a hardware store",
         "columns": {
-            "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
+            "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
             "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
         },
         "primary_key": "customer_id",
@@ -125,7 +125,7 @@ tables = {
     "warehouses": {
         "prompt": "Warehouses of a hardware store",
         "columns": {
-            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "string"},
             "name": {"prompt": "the name of the warehouse", "dtype": "string"},
         },
         "primary_key": "warehouse_id",
@@ -133,8 +133,8 @@ tables = {
     "orders": {
         "prompt": "Orders of a Customer",
         "columns": {
-            "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
-            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
+            "customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
+            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "string"},
             "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
             "text": {"prompt": "order text description", "dtype": "string"},
             "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -167,40 +167,42 @@ tables = {
                 "prompt": "each order has between 1 and 2 items",
             }
         ],
+        "primary_key": "item_id",
     },
 }
 data = mock.sample(
     tables=tables,
     sample_size=2,
-    model="openai/gpt-4.1"
+    model="openai/gpt-4.1",
+    n_workers=1,
 )
 print(data["customers"])
-#    customer_id             name
-# 0            1  Matthew Carlson
-# 1            2       Priya Shah
+#   customer_id             name
+# 0   B0-100235  Danielle Rogers
+# 1   B0-100236       Edward Kim
 print(data["warehouses"])
-#    warehouse_id                        name
-# 0             1    Central Distribution Hub
-# 1             2  Northgate Storage Facility
+#   warehouse_id                          name
+# 0       B0-001  Downtown Distribution Center
+# 1       B0-002     Westside Storage Facility
 print(data["orders"])
-#    customer_id  warehouse_id   order_id                                               text  amount
-# 0            1             2  ORD-10294  3-tier glass shelving units, expedited deliver...  649.25
-# 1            1             1  ORD-10541  Office desk chairs, set of 6, with assembly se...   824.9
-# 2            1             1  ORD-10802  Executive standing desk, walnut finish, standa...   519.0
-# 3            2             1  ORD-11017  Maple conference table, cable management inclu...  1225.5
-# 4            2             2  ORD-11385  Set of ergonomic task chairs, black mesh, stan...  767.75
+#   customer_id warehouse_id    order_id                                               text   amount
+# 0   B0-100235       B0-002  B0-3010021  Office furniture replenishment - desks, chairs...  1268.35
+# 1   B0-100235       B0-001  B0-3010022  Bulk stationery order: printer paper, notebook...    449.9
+# 2   B0-100235       B0-001  B0-3010023  Electronics restock: monitors and wireless key...    877.6
+# 3   B0-100236       B0-001  B1-3010021  Monthly cleaning supplies: disinfectant, trash...   314.75
+# 4   B0-100236       B0-002  B1-3010022  Breakroom essentials restock: coffee, tea, and...   182.45
 print(data["items"])
-#      item_id   order_id                                        name   price
-# 0  ITM-80265  ORD-10294         3-Tier Tempered Glass Shelving Unit   409.0
-# 1  ITM-80266  ORD-10294  Brushed Aluminum Shelf Brackets (Set of 4)  240.25
-# 2  ITM-81324  ORD-10541              Ergonomic Mesh-Back Desk Chair   132.5
-# 3  ITM-81325  ORD-10541  Professional Office Chair Assembly Service    45.0
-# 4  ITM-82101  ORD-10802      Executive Standing Desk, Walnut Finish   469.0
-# 5  ITM-82102  ORD-10802         Desk Installation and Setup Service    50.0
-# 6  ITM-83391  ORD-11017             Maple Conference Table, 10-Seat  1125.5
-# 7  ITM-83392  ORD-11017       Integrated Table Cable Management Kit   100.0
-# 8  ITM-84311  ORD-11385            Ergonomic Task Chair, Black Mesh  359.25
-# 9  ITM-84312  ORD-11385                   Standard Delivery Service    48.5
+#      item_id    order_id                                   name   price
+# 0  B0-200501  B0-3010021                  Ergonomic Office Desk  545.99
+# 1  B0-200502  B0-3010021              Mesh Back Executive Chair   399.5
+# 2  B1-200503  B0-3010022   Multipack Printer Paper (500 sheets)  129.95
+# 3  B1-200504  B0-3010022             Spiral Notebooks - 12 Pack   59.99
+# 4  B2-200505  B0-3010023               27" LED Computer Monitor  489.95
+# 5  B2-200506  B0-3010023            Wireless Ergonomic Keyboard  387.65
+# 6  B3-200507  B1-3010021  Industrial Disinfectant Solution (5L)  148.95
+# 7  B3-200508  B1-3010021  Commercial Trash Liners - Case of 100    84.5
+# 8  B4-200509  B1-3010022        Premium Ground Coffee (2lb Bag)   74.99
+# 9  B4-200510  B1-3010022         Bottled Spring Water (24 Pack)   34.95
 ```
 6. Create your first self-referencing mock table
@@ -212,9 +214,9 @@ tables = {
     "employees": {
         "prompt": "Employees of a company",
         "columns": {
-            "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
+            "employee_id": {"prompt": "the unique id of the employee; sequential", "dtype": "string"},
             "name": {"prompt": "first name and last name of the president", "dtype": "string"},
-            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
+            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "string"},
             "role": {"prompt": "the role of the employee", "dtype": "string"},
         },
         "primary_key": "employee_id",
@@ -229,17 +231,17 @@ tables = {
 }
 df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
 print(df)
-#    employee_id             name  boss_id                      role
-# 0            1  Sandra Phillips     <NA>                 President
-# 1            2      Marcus Tran        1   Chief Financial Officer
-# 2            3    Ava Whittaker        1  Chief Technology Officer
-# 3            4    Sophie Martin        1  Chief Operations Officer
-# 4            5      Chad Nelson        2           Finance Manager
-# 5            6     Ethan Glover        2         Senior Accountant
-# 6            7   Kimberly Ortiz        2         Junior Accountant
-# 7            8     Lucas Romero        3                IT Manager
-# 8            9      Priya Desai        3    Lead Software Engineer
-# 9           10    Felix Bennett        3    Senior Systems Analyst
+#   employee_id              name boss_id                   role
+# 0        B0-1      Patricia Lee    <NA>              President
+# 1        B0-2  Edward Rodriguez    B0-1       VP of Operations
+# 2        B0-3      Maria Cortez    B0-1          VP of Finance
+# 3        B0-4     Thomas Nguyen    B0-1       VP of Technology
+# 4        B0-5        Rachel Kim    B0-2     Operations Manager
+# 5        B0-6     Jeffrey Patel    B0-2      Supply Chain Lead
+# 6        B0-7      Olivia Smith    B0-2  Facilities Supervisor
+# 7        B0-8      Brian Carter    B0-3     Accounting Manager
+# 8        B0-9   Lauren Anderson    B0-3      Financial Analyst
+# 9       B0-10   Santiago Romero    B0-3     Payroll Specialist
 ```
 7. Enrich existing data with additional columns
@@ -271,10 +273,10 @@ df = mock.sample(
     model="openai/gpt-4.1-nano"
 )
 print(df)
-#    guest_id           name nationality  gender  age  room_number is_vip
-# 0         1   Anna Schmidt          DE  female   29          101   True
-# 1         2    Marco Rossi          IT    male   34          102  False
-# 2         3  Sophie Dupont          FR  female   27          103  False
+#   guest_id           name nationality  gender  age  room_number is_vip
+# 0        1   Anna Schmidt          DE  female   30          102  False
+# 1        2    Marco Rossi          IT    male   27          215   True
+# 2        3  Sophie Dupont          FR  female   22          108  False
 ```
 ## MCP Server

{mostlyai_mock-0.1.10 → mostlyai_mock-0.1.12}/README.md RENAMED Viewed

@@ -60,17 +60,17 @@ df = mock.sample(
     model="openai/gpt-4.1-nano",  # select the LLM model (optional)
 )
 print(df)
-#   nationality            name  gender  age date_of_birth        checkin_time  is_vip  price_per_night  room_number
-# 0          AT     Anna Müller  female   29    1994-09-15 2025-01-05 14:30:00    True            350.0          101
-# 1          DE  Johann Schmidt    male   45    1978-11-20 2025-01-06 16:45:00   False            250.0          102
-# 2          CH      Lara Meier  female   32    1991-04-12 2025-01-05 12:00:00    True            400.0          103
-# 3          IT     Marco Rossi    male   38    1985-02-25 2025-01-07 09:15:00   False            280.0          201
-# 4          FR   Claire Dupont  female   24    2000-07-08 2025-01-07 11:20:00   False            220.0          202
-# 5          AT    Felix Gruber    male   52    1972-01-10 2025-01-06 17:50:00    True            375.0          203
-# 6          DE   Sophie Becker  female   27    1996-03-30 2025-01-08 08:30:00   False            230.0          204
-# 7          CH      Max Keller    male   31    1992-05-16 2025-01-09 14:10:00   False            290.0          101
-# 8          IT  Giulia Bianchi  female   36    1988-08-19 2025-01-05 15:55:00    True            410.0          102
-# 9          FR    Louis Martin    male   44    1980-12-05 2025-01-07 10:40:00   False            270.0          103
+#   nationality                 name  gender  age date_of_birth        checkin_time is_vip  price_per_night  room_number
+# 0          FR          Jean Dupont    male   29    1994-03-15 2025-01-10 14:30:00  False            150.0          101
+# 1          DE         Anna Schmidt  female   34    1989-07-22 2025-01-11 16:45:00   True            200.0          201
+# 2          IT          Marco Rossi    male   45    1979-11-05 2025-01-09 10:15:00  False            180.0          102
+# 3          AT         Laura Gruber  female   28    1996-02-19 2025-01-12 09:00:00  False            165.0          202
+# 4          CH         David Müller    male   37    1987-08-30 2025-01-08 17:20:00   True            210.0          203
+# 5          NL  Sophie van den Berg  female   22    2002-04-12 2025-01-10 12:00:00  False            140.0          103
+# 6          GB         James Carter    male   31    1992-09-10 2025-01-11 11:30:00  False            155.0          204
+# 7          BE        Lotte Peeters  female   26    1998-05-25 2025-01-09 15:45:00  False            160.0          201
+# 8          DK        Anders Jensen    male   33    1990-12-03 2025-01-12 08:15:00   True            220.0          202
+# 9          ES         Carlos Lopez    male   38    1985-06-14 2025-01-10 18:00:00  False            170.0          203
 ```
 4. Create your first multi-table mock dataset
@@ -82,7 +82,7 @@ tables = {
     "customers": {
         "prompt": "Customers of a hardware store",
         "columns": {
-            "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
+            "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
             "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
         },
         "primary_key": "customer_id",
@@ -90,7 +90,7 @@ tables = {
     "warehouses": {
         "prompt": "Warehouses of a hardware store",
         "columns": {
-            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+            "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "string"},
             "name": {"prompt": "the name of the warehouse", "dtype": "string"},
         },
         "primary_key": "warehouse_id",
@@ -98,8 +98,8 @@ tables = {
     "orders": {
         "prompt": "Orders of a Customer",
         "columns": {
-            "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
-            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
+            "customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
+            "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "string"},
             "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
             "text": {"prompt": "order text description", "dtype": "string"},
             "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -132,40 +132,42 @@ tables = {
                 "prompt": "each order has between 1 and 2 items",
             }
         ],
+        "primary_key": "item_id",
     },
 }
 data = mock.sample(
     tables=tables,
     sample_size=2,
-    model="openai/gpt-4.1"
+    model="openai/gpt-4.1",
+    n_workers=1,
 )
 print(data["customers"])
-#    customer_id             name
-# 0            1  Matthew Carlson
-# 1            2       Priya Shah
+#   customer_id             name
+# 0   B0-100235  Danielle Rogers
+# 1   B0-100236       Edward Kim
 print(data["warehouses"])
-#    warehouse_id                        name
-# 0             1    Central Distribution Hub
-# 1             2  Northgate Storage Facility
+#   warehouse_id                          name
+# 0       B0-001  Downtown Distribution Center
+# 1       B0-002     Westside Storage Facility
 print(data["orders"])
-#    customer_id  warehouse_id   order_id                                               text  amount
-# 0            1             2  ORD-10294  3-tier glass shelving units, expedited deliver...  649.25
-# 1            1             1  ORD-10541  Office desk chairs, set of 6, with assembly se...   824.9
-# 2            1             1  ORD-10802  Executive standing desk, walnut finish, standa...   519.0
-# 3            2             1  ORD-11017  Maple conference table, cable management inclu...  1225.5
-# 4            2             2  ORD-11385  Set of ergonomic task chairs, black mesh, stan...  767.75
+#   customer_id warehouse_id    order_id                                               text   amount
+# 0   B0-100235       B0-002  B0-3010021  Office furniture replenishment - desks, chairs...  1268.35
+# 1   B0-100235       B0-001  B0-3010022  Bulk stationery order: printer paper, notebook...    449.9
+# 2   B0-100235       B0-001  B0-3010023  Electronics restock: monitors and wireless key...    877.6
+# 3   B0-100236       B0-001  B1-3010021  Monthly cleaning supplies: disinfectant, trash...   314.75
+# 4   B0-100236       B0-002  B1-3010022  Breakroom essentials restock: coffee, tea, and...   182.45
 print(data["items"])
-#      item_id   order_id                                        name   price
-# 0  ITM-80265  ORD-10294         3-Tier Tempered Glass Shelving Unit   409.0
-# 1  ITM-80266  ORD-10294  Brushed Aluminum Shelf Brackets (Set of 4)  240.25
-# 2  ITM-81324  ORD-10541              Ergonomic Mesh-Back Desk Chair   132.5
-# 3  ITM-81325  ORD-10541  Professional Office Chair Assembly Service    45.0
-# 4  ITM-82101  ORD-10802      Executive Standing Desk, Walnut Finish   469.0
-# 5  ITM-82102  ORD-10802         Desk Installation and Setup Service    50.0
-# 6  ITM-83391  ORD-11017             Maple Conference Table, 10-Seat  1125.5
-# 7  ITM-83392  ORD-11017       Integrated Table Cable Management Kit   100.0
-# 8  ITM-84311  ORD-11385            Ergonomic Task Chair, Black Mesh  359.25
-# 9  ITM-84312  ORD-11385                   Standard Delivery Service    48.5
+#      item_id    order_id                                   name   price
+# 0  B0-200501  B0-3010021                  Ergonomic Office Desk  545.99
+# 1  B0-200502  B0-3010021              Mesh Back Executive Chair   399.5
+# 2  B1-200503  B0-3010022   Multipack Printer Paper (500 sheets)  129.95
+# 3  B1-200504  B0-3010022             Spiral Notebooks - 12 Pack   59.99
+# 4  B2-200505  B0-3010023               27" LED Computer Monitor  489.95
+# 5  B2-200506  B0-3010023            Wireless Ergonomic Keyboard  387.65
+# 6  B3-200507  B1-3010021  Industrial Disinfectant Solution (5L)  148.95
+# 7  B3-200508  B1-3010021  Commercial Trash Liners - Case of 100    84.5
+# 8  B4-200509  B1-3010022        Premium Ground Coffee (2lb Bag)   74.99
+# 9  B4-200510  B1-3010022         Bottled Spring Water (24 Pack)   34.95
 ```
 6. Create your first self-referencing mock table
@@ -177,9 +179,9 @@ tables = {
     "employees": {
         "prompt": "Employees of a company",
         "columns": {
-            "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
+            "employee_id": {"prompt": "the unique id of the employee; sequential", "dtype": "string"},
             "name": {"prompt": "first name and last name of the president", "dtype": "string"},
-            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
+            "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "string"},
             "role": {"prompt": "the role of the employee", "dtype": "string"},
         },
         "primary_key": "employee_id",
@@ -194,17 +196,17 @@ tables = {
 }
 df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
 print(df)
-#    employee_id             name  boss_id                      role
-# 0            1  Sandra Phillips     <NA>                 President
-# 1            2      Marcus Tran        1   Chief Financial Officer
-# 2            3    Ava Whittaker        1  Chief Technology Officer
-# 3            4    Sophie Martin        1  Chief Operations Officer
-# 4            5      Chad Nelson        2           Finance Manager
-# 5            6     Ethan Glover        2         Senior Accountant
-# 6            7   Kimberly Ortiz        2         Junior Accountant
-# 7            8     Lucas Romero        3                IT Manager
-# 8            9      Priya Desai        3    Lead Software Engineer
-# 9           10    Felix Bennett        3    Senior Systems Analyst
+#   employee_id              name boss_id                   role
+# 0        B0-1      Patricia Lee    <NA>              President
+# 1        B0-2  Edward Rodriguez    B0-1       VP of Operations
+# 2        B0-3      Maria Cortez    B0-1          VP of Finance
+# 3        B0-4     Thomas Nguyen    B0-1       VP of Technology
+# 4        B0-5        Rachel Kim    B0-2     Operations Manager
+# 5        B0-6     Jeffrey Patel    B0-2      Supply Chain Lead
+# 6        B0-7      Olivia Smith    B0-2  Facilities Supervisor
+# 7        B0-8      Brian Carter    B0-3     Accounting Manager
+# 8        B0-9   Lauren Anderson    B0-3      Financial Analyst
+# 9       B0-10   Santiago Romero    B0-3     Payroll Specialist
 ```
 7. Enrich existing data with additional columns
@@ -236,10 +238,10 @@ df = mock.sample(
     model="openai/gpt-4.1-nano"
 )
 print(df)
-#    guest_id           name nationality  gender  age  room_number is_vip
-# 0         1   Anna Schmidt          DE  female   29          101   True
-# 1         2    Marco Rossi          IT    male   34          102  False
-# 2         3  Sophie Dupont          FR  female   27          103  False
+#   guest_id           name nationality  gender  age  room_number is_vip
+# 0        1   Anna Schmidt          DE  female   30          102  False
+# 1        2    Marco Rossi          IT    male   27          215   True
+# 2        3  Sophie Dupont          FR  female   22          108  False
 ```
 ## MCP Server

{mostlyai_mock-0.1.10 → mostlyai_mock-0.1.12}/mostlyai/mock/__init__.py RENAMED Viewed

@@ -15,4 +15,4 @@
 from mostlyai.mock.core import sample
 __all__ = ["sample"]
-__version__ = "0.1.10"  # Do not set this manually. Use poetry version [params].
+__version__ = "0.1.12"  # Do not set this manually. Use poetry version [params].

{mostlyai_mock-0.1.10 → mostlyai_mock-0.1.12}/mostlyai/mock/core.py RENAMED Viewed

@@ -81,8 +81,8 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
                 if fk_field.dtype != pk_field.dtype:
                     raise ValueError(
                         f"Foreign key violation in table '{table_name}': "
-                        f"Column '{fk.column}' type '{fk_field.dtype}' does not match "
-                        f"referenced primary key '{referenced_config.primary_key}' type '{pk_field.dtype}'"
+                        f"Column '{fk.column}' type '{fk_field.dtype.value}' does not match "
+                        f"referenced primary key '{referenced_config.primary_key}' type '{pk_field.dtype.value}'"
                     )
         return tables
@@ -113,6 +113,49 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
         return self
+    @model_validator(mode="after")
+    def ensure_values_are_not_provided_for_primary_key(self) -> MockConfig:
+        for table_name, table_config in self.root.items():
+            for column_name, column_config in table_config.columns.items():
+                if column_name == table_config.primary_key and column_config.values:
+                    raise ValueError(
+                        f"Values cannot be provided for primary key column '{column_name}' in table '{table_name}'"
+                    )
+        return self
+    @model_validator(mode="after")
+    def ensure_primary_key_is_string_dtype(self) -> MockConfig:
+        for table_name, table_config in self.root.items():
+            if table_config.primary_key:
+                column_config = table_config.columns[table_config.primary_key]
+                if column_config.dtype not in [DType.STRING]:
+                    raise ValueError(
+                        f"Primary key column '{table_config.primary_key}' in table '{table_name}' must be one of the following types:"
+                        f" {[DType.STRING.value]}"
+                    )
+        return self
+    def get_dependency_mappings(self) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
+        child_to_parents = {}
+        parent_to_children = {}
+        for table_name in self.root:
+            child_to_parents[table_name] = set()
+            parent_to_children[table_name] = set()
+        for table_name, table_config in self.root.items():
+            if table_config.foreign_keys:
+                for fk in table_config.foreign_keys:
+                    referenced_table = fk.referenced_table
+                    child_to_parents[table_name].add(referenced_table)
+                    parent_to_children[referenced_table].add(table_name)
+        root_tables = []
+        for table_name, parents in child_to_parents.items():
+            if not parents or parents == {table_name}:  # no dependencies or only self-dependency
+                root_tables.append(table_name)
+        return child_to_parents, parent_to_children, root_tables
 class TableConfig(BaseModel):
     prompt: str = ""
@@ -200,7 +243,7 @@ async def _sample_table(
     foreign_keys: list[ForeignKeyConfig],
     primary_keys: dict[str, str],
     data: dict[str, pd.DataFrame],
-    sample_size: int,
+    sample_size: int | None,
     previous_rows_size: int,
     non_context_size: int | None,
     n_workers: int,
@@ -225,12 +268,7 @@ async def _sample_table(
 def _sample_table_sync(*args, **kwargs) -> pd.DataFrame:
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    try:
-        return loop.run_until_complete(_sample_table(*args, **kwargs))
-    finally:
-        loop.close()
+    return asyncio.run(_sample_table(*args, **kwargs))
 def _create_system_prompt(llm_output_format: LLMOutputFormat) -> str:
@@ -263,6 +301,7 @@ def _create_table_prompt(
     prompt: str,
     columns: dict[str, ColumnConfig],
     primary_keys: dict[str, str],
+    batch_idx: int,
     batch_size: int | None,
     foreign_keys: list[ForeignKeyConfig],
     existing_data: pd.DataFrame | None,
@@ -277,7 +316,8 @@ def _create_table_prompt(
     # define table
     prompt += f"## Target Table: `{name}`\n\n"
-    prompt += f"### Target Table Primary Key: `{primary_keys[name]}`\n\n"
+    target_primary_key = primary_keys[name]
+    prompt += f"### Target Table Primary Key: `{target_primary_key}`\n\n"
     # add columns specifications
     prompt += "### Target Table Column Specifications:\n\n"
@@ -313,7 +353,7 @@ def _create_table_prompt(
         has_self_referencing_foreign_keys_section = True
         prompt += f"## Self Referencing Foreign Keys in Target Table `{name}`\n\n"
         for fk in self_referencing_foreign_keys:
-            prompt += f"### Primary Key Column: `{primary_keys[name]}`\n\n"
+            prompt += f"### Primary Key Column: `{target_primary_key}`\n\n"
             prompt += f"### Foreign Key Column: `{fk.column}`\n\n"
@@ -374,6 +414,11 @@ def _create_table_prompt(
     if n_rows is not None:
         prompt += f"Number of data rows to {verb}: `{n_rows}`.\n\n"
+    if target_primary_key is not None:
+        prompt += f"Add prefix to all values of Target Table Primary Key. The prefix is 'B{batch_idx}-'."
+        prompt += " There is one exception: if primary keys are in existing data, don't add prefix to them."
+        prompt += "\n\n"
     if has_context_table_section:
         assert foreign_keys
         prompt += f"Target Table Foreign Key column `{foreign_keys[0].column}` may only contain values from `Context Table Data`."
@@ -528,7 +573,7 @@ def _create_structured_output_schema(
 ) -> type[BaseModel]:
     def create_annotation(column_config: ColumnConfig) -> type:
         if column_config.values or column_config.dtype is DType.CATEGORY:
-            return Literal[tuple(column_config.values)]
+            return Literal[tuple(column_config.values)]  # type: ignore
         return {
             DType.INTEGER: int | None,
             DType.FLOAT: float | None,
@@ -610,8 +655,9 @@ async def _worker(
                 name=name,
                 prompt=prompt,
                 columns=columns,
-                primary_keys=primary_keys,
+                batch_idx=batch_idx,
                 batch_size=batch_size,
+                primary_keys=primary_keys,
                 foreign_keys=foreign_keys,
                 existing_data=existing_batch,
                 context_data=context_batch,
@@ -715,7 +761,7 @@ async def _create_table_rows_generator(
     foreign_keys: list[ForeignKeyConfig],
     primary_keys: dict[str, str],
     data: dict[str, pd.DataFrame],
-    sample_size: int,
+    sample_size: int | None,
     previous_rows_size: int,
     non_context_size: int | None,
     n_workers: int,
@@ -762,6 +808,7 @@ async def _create_table_rows_generator(
             non_context_data[non_context_table_name] = data[non_context_table_name]
     # calculate batch_sizes
+    assert sample_size is not None, "sample_size should have been filled by this point"
     n_total_batches = len(context_batches) if context_batches is not None else math.ceil(sample_size / batch_size)
     batch_sizes = [batch_size] * n_total_batches
     if context_batches is None:
@@ -873,6 +920,32 @@ async def _create_table_rows_generator(
     await asyncio.gather(*workers)
+def _align_series_dtypes_with_column_config(series: pd.Series, column_config: ColumnConfig) -> pd.Series:
+    series = series.copy()
+    if column_config.dtype in [DType.DATE, DType.DATETIME]:
+        def harmonize_datetime(x):
+            try:
+                return dateutil.parser.parse(x)
+            except Exception:
+                return pd.NaT
+        series = pd.to_datetime(series.apply(harmonize_datetime), errors="coerce")
+    elif column_config.dtype is DType.INTEGER:
+        series = pd.to_numeric(series, errors="coerce", downcast="integer").astype("int64[pyarrow]")
+    elif column_config.dtype is DType.FLOAT:
+        series = pd.to_numeric(series, errors="coerce").astype("double[pyarrow]")
+    elif column_config.dtype is DType.BOOLEAN:
+        series = series.map(lambda x: True if str(x).lower() == "true" else x)
+        series = series.map(lambda x: False if str(x).lower() == "false" else x)
+        series = pd.to_numeric(series, errors="coerce").astype("boolean[pyarrow]")
+    elif column_config.dtype is DType.CATEGORY:
+        series = pd.Categorical(series, categories=column_config.values)
+    else:
+        series = series.astype("string[pyarrow]")
+    return series
 async def _convert_table_rows_generator_to_df(
     table_rows_generator: AsyncGenerator[dict],
     columns: dict[str, ColumnConfig],
@@ -880,29 +953,7 @@ async def _convert_table_rows_generator_to_df(
     def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
         df = df.copy()
         for column_name, column_config in columns.items():
-            if column_config.dtype in [DType.DATE, DType.DATETIME]:
-                def harmonize_datetime(x):
-                    try:
-                        return dateutil.parser.parse(x)
-                    except Exception:
-                        return pd.NaT
-                df[column_name] = pd.to_datetime(df[column_name].apply(harmonize_datetime), errors="coerce")
-            elif column_config.dtype is DType.INTEGER:
-                df[column_name] = pd.to_numeric(df[column_name], errors="coerce", downcast="integer").astype(
-                    "int64[pyarrow]"
-                )
-            elif column_config.dtype is DType.FLOAT:
-                df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("double[pyarrow]")
-            elif column_config.dtype is DType.BOOLEAN:
-                df[column_name] = df[column_name].map(lambda x: True if str(x).lower() == "true" else x)
-                df[column_name] = df[column_name].map(lambda x: False if str(x).lower() == "false" else x)
-                df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("boolean[pyarrow]")
-            elif column_config.dtype is DType.CATEGORY:
-                df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
-            else:
-                df[column_name] = df[column_name].astype("string[pyarrow]")
+            df[column_name] = _align_series_dtypes_with_column_config(df[column_name], column_config)
         return df
     # consume entire generator
@@ -912,6 +963,7 @@ async def _convert_table_rows_generator_to_df(
     # extract rows and convert to DataFrame
     rows = [item["row"] for item in items]
     df = pd.DataFrame(rows)
+    # harmonize dtypes
     df = align_df_dtypes_with_mock_dtypes(df, columns)
     return df
@@ -935,6 +987,8 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
     tables = tables.copy()
     for table_name, existing_table in existing_data.items():
         table_config = tables.setdefault(table_name, {})
+        # prepend column configs for existing data columns, that are not specified in the mock config
         column_configs = table_config.setdefault("columns", {})
         existing_column_configs = {
             existing_column: {"dtype": _infer_dtype(existing_table[existing_column])}
@@ -942,42 +996,82 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
             if existing_column not in column_configs
         }
         column_configs = {**existing_column_configs, **column_configs}
+        # primary keys are always strings
+        primary_key = table_config.get("primary_key", None)
+        if primary_key is not None:
+            column_configs[primary_key]["dtype"] = DType.STRING
         table_config["columns"] = column_configs
     return tables
 def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig) -> dict[str, int]:
+    _, _, root_tables = config.get_dependency_mappings()
     if isinstance(sample_size, int):
-        return {table_name: sample_size for table_name in config.root}
+        sample_size = {table_name: sample_size for table_name in root_tables}
+    for table_name in root_tables:
+        if table_name not in sample_size or sample_size[table_name] is None:
+            # set default sample size for missing or None sample sizes
+            sample_size[table_name] = 4
+        # clamp sample_size to [1, inf)
+        sample_size[table_name] = max(1, sample_size[table_name])
-    if sample_size.keys() != config.root.keys():
-        raise ValueError(f"Sample size keys must match table names: {sample_size.keys()} != {config.root.keys()}")
     return sample_size
-def _build_execution_plan(config: MockConfig) -> list[str]:
-    def build_dependency_mappings(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
-        child_to_parents = {}
-        parent_to_children = {}
+def _harmonize_existing_data(
+    existing_data: dict[str, pd.DataFrame] | None, mock_config: MockConfig
+) -> dict[str, pd.DataFrame]:
+    if existing_data is None:
+        return {}
-        for table_name in config.root:
-            child_to_parents[table_name] = set()
-            parent_to_children[table_name] = set()
+    # by this point, mock config should have been validated, so we can assume that all tables in existing_data are defined in the mock config
+    assert set(mock_config.root.keys()).issuperset(existing_data.keys())
-        for table_name, table_config in config.root.items():
-            if table_config.foreign_keys:
-                for fk in table_config.foreign_keys:
-                    referenced_table = fk.referenced_table
-                    child_to_parents[table_name].add(referenced_table)
-                    parent_to_children[referenced_table].add(table_name)
+    for existing_table_name, existing_table in existing_data.items():
+        existing_table_config = mock_config.root[existing_table_name]
-        root_tables = []
-        for table_name, parents in child_to_parents.items():
-            if not parents or parents == {table_name}:  # no dependencies or only self-dependency
-                root_tables.append(table_name)
-        return child_to_parents, parent_to_children, root_tables
+        for existing_column in existing_table.columns:
+            existing_column_config = existing_table_config.columns[existing_column]
-    child_to_parents, parent_to_children, root_tables = build_dependency_mappings(config)
+            # ensure that the existing data has compatible dtypes with the column config
+            original_series = existing_table[existing_column]
+            coerced_series = _align_series_dtypes_with_column_config(original_series, existing_column_config)
+            n_original_na = original_series.isna().sum()
+            n_coerced_na = coerced_series.isna().sum()
+            if n_original_na != n_coerced_na:
+                raise ValueError(
+                    f"Coercion of existing data column '{existing_column}' in table '{existing_table_name}' resulted in data loss. "
+                    f"Ensure that the existing data is consistent with the mock configuration."
+                )
+            # ensure that the existing data has values allowed by the column config
+            if existing_column_config.values:
+                if not set(existing_table[existing_column].unique()).issubset(existing_column_config.values):
+                    raise ValueError(
+                        f"Existing data column '{existing_column}' in table '{existing_table_name}' has values disallowed by the column config. "
+                        f"Ensure that the existing data is consistent with the mock configuration."
+                    )
+        # ensure that the existing data has unique primary keys
+        existing_table_primary_key = existing_table_config.primary_key
+        if existing_table_primary_key is not None:
+            if not existing_table[existing_table_primary_key].is_unique:
+                raise ValueError(
+                    f"Existing data table '{existing_table_name}' has non-unique primary key column '{existing_table_primary_key}'. "
+                    f"Ensure that the primary key is unique."
+                )
+            existing_table[existing_column] = coerced_series
+    return existing_data
+def _build_execution_plan(config: MockConfig) -> list[str]:
+    child_to_parents, parent_to_children, root_tables = config.get_dependency_mappings()
     execution_plan = []
     bfs_queue = list(root_tables)
@@ -1035,7 +1129,7 @@ def sample(
         sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
             If a single integer is provided, the same number of rows will be generated for each subject table.
             If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
-            Default is 4. Ignored if existing_data is provided.
+            Default is 4. Ignored if existing_data is provided. Ignored for non-root tables.
             If a table has a foreign key, the sample size is determined by the corresponding foreign key prompt. If nothing specified, a few rows per parent record are generated.
         existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
             Default is None.
@@ -1092,15 +1186,15 @@ def sample(
         "customers": {
             "prompt": "Customers of a hardware store",
             "columns": {
-                "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
+                "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
                 "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
             },
-            "primary_key": "customer_id",  # single string; no composite keys allowed
+            "primary_key": "customer_id",  # single string; no composite keys allowed; primary keys must have string dtype
         },
         "warehouses": {
             "prompt": "Warehouses of a hardware store",
             "columns": {
-                "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
+                "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "string"},
                 "name": {"prompt": "the name of the warehouse", "dtype": "string"},
             },
             "primary_key": "warehouse_id",
@@ -1108,8 +1202,8 @@ def sample(
         "orders": {
             "prompt": "Orders of a Customer",
             "columns": {
-                "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
-                "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
+                "customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
+                "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "string"},
                 "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
                 "text": {"prompt": "order text description", "dtype": "string"},
                 "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -1187,7 +1281,7 @@ def sample(
         "customers": {
             "prompt": "Customers of a hardware store",
             "columns": {
-                "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
+                "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
                 "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
                 "email": {"prompt": "email address of the customer", "dtype": "string"},
                 "phone": {"prompt": "phone number of the customer", "dtype": "string"},
@@ -1199,7 +1293,7 @@ def sample(
             "prompt": "Orders of a Customer",
             "columns": {
                 "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
-                "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
+                "customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
                 "order_date": {"prompt": "the date when the order was placed", "dtype": "date"},
                 "total_amount": {"prompt": "order amount in USD", "dtype": "float"},
                 "status": {"dtype": "category", "values": ["pending", "shipped", "delivered", "cancelled"]},
@@ -1247,12 +1341,15 @@ def sample(
     execution_plan: list[str] = _build_execution_plan(config)
-    data: dict[str, pd.DataFrame] = existing_data or {}
-    for table_name in execution_plan:
-        table_config = config.root[table_name]
+    data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
-        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+    # synchronous `sample` function makes independent calls to asynchronous `_sample_table` function
+    # in order to avoid conflicts with potentially existing event loop (e.g. in Jupyter environment),
+    # a new thread is spawned for each call to `_sample_table`
+    # NOTE: initialize executor only once, doing that inside the loop might lead to deadlocks
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        for table_name in execution_plan:
+            table_config = config.root[table_name]
             future = executor.submit(
                 _sample_table_sync,
                 name=table_name,
@@ -1261,13 +1358,13 @@ def sample(
                 foreign_keys=table_config.foreign_keys,
                 primary_keys=primary_keys,
                 data=data,
-                sample_size=sample_size[table_name],
+                sample_size=sample_size.get(table_name),
                 previous_rows_size=10,  # present 10 previously generated rows to the LLM
                 non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
                 n_workers=n_workers,
                 llm_config=llm_config,
             )
             df = future.result()
-        data[table_name] = df
+            data[table_name] = df
     return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data

{mostlyai_mock-0.1.10 → mostlyai_mock-0.1.12}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "mostlyai-mock"
-version = "0.1.10"
+version = "0.1.12"
 description = "Synthetic Mock Data"
 authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
 requires-python = ">=3.10"

{mostlyai_mock-0.1.10 → mostlyai_mock-0.1.12}/.gitignore RENAMED Viewed

File without changes

{mostlyai_mock-0.1.10 → mostlyai_mock-0.1.12}/LICENSE RENAMED Viewed

File without changes

{mostlyai_mock-0.1.10 → mostlyai_mock-0.1.12}/mostlyai/mock/mcp_server.py RENAMED Viewed

File without changes

mostlyai-mock 0.1.10__tar.gz → 0.1.12__tar.gz

mostlyai-mock 0.1.10tar.gz → 0.1.12tar.gz