mostlyai-mock 0.1.10__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.1.10
3
+ Version: 0.1.12
4
4
  Summary: Synthetic Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -95,17 +95,17 @@ df = mock.sample(
95
95
  model="openai/gpt-4.1-nano", # select the LLM model (optional)
96
96
  )
97
97
  print(df)
98
- # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
99
- # 0 AT Anna Müller female 29 1994-09-15 2025-01-05 14:30:00 True 350.0 101
100
- # 1 DE Johann Schmidt male 45 1978-11-20 2025-01-06 16:45:00 False 250.0 102
101
- # 2 CH Lara Meier female 32 1991-04-12 2025-01-05 12:00:00 True 400.0 103
102
- # 3 IT Marco Rossi male 38 1985-02-25 2025-01-07 09:15:00 False 280.0 201
103
- # 4 FR Claire Dupont female 24 2000-07-08 2025-01-07 11:20:00 False 220.0 202
104
- # 5 AT Felix Gruber male 52 1972-01-10 2025-01-06 17:50:00 True 375.0 203
105
- # 6 DE Sophie Becker female 27 1996-03-30 2025-01-08 08:30:00 False 230.0 204
106
- # 7 CH Max Keller male 31 1992-05-16 2025-01-09 14:10:00 False 290.0 101
107
- # 8 IT Giulia Bianchi female 36 1988-08-19 2025-01-05 15:55:00 True 410.0 102
108
- # 9 FR Louis Martin male 44 1980-12-05 2025-01-07 10:40:00 False 270.0 103
98
+ # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
99
+ # 0 FR Jean Dupont male 29 1994-03-15 2025-01-10 14:30:00 False 150.0 101
100
+ # 1 DE Anna Schmidt female 34 1989-07-22 2025-01-11 16:45:00 True 200.0 201
101
+ # 2 IT Marco Rossi male 45 1979-11-05 2025-01-09 10:15:00 False 180.0 102
102
+ # 3 AT Laura Gruber female 28 1996-02-19 2025-01-12 09:00:00 False 165.0 202
103
+ # 4 CH David Müller male 37 1987-08-30 2025-01-08 17:20:00 True 210.0 203
104
+ # 5 NL Sophie van den Berg female 22 2002-04-12 2025-01-10 12:00:00 False 140.0 103
105
+ # 6 GB James Carter male 31 1992-09-10 2025-01-11 11:30:00 False 155.0 204
106
+ # 7 BE Lotte Peeters female 26 1998-05-25 2025-01-09 15:45:00 False 160.0 201
107
+ # 8 DK Anders Jensen male 33 1990-12-03 2025-01-12 08:15:00 True 220.0 202
108
+ # 9 ES Carlos Lopez male 38 1985-06-14 2025-01-10 18:00:00 False 170.0 203
109
109
  ```
110
110
 
111
111
  4. Create your first multi-table mock dataset
@@ -117,7 +117,7 @@ tables = {
117
117
  "customers": {
118
118
  "prompt": "Customers of a hardware store",
119
119
  "columns": {
120
- "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
120
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
121
121
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
122
122
  },
123
123
  "primary_key": "customer_id",
@@ -125,7 +125,7 @@ tables = {
125
125
  "warehouses": {
126
126
  "prompt": "Warehouses of a hardware store",
127
127
  "columns": {
128
- "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
128
+ "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "string"},
129
129
  "name": {"prompt": "the name of the warehouse", "dtype": "string"},
130
130
  },
131
131
  "primary_key": "warehouse_id",
@@ -133,8 +133,8 @@ tables = {
133
133
  "orders": {
134
134
  "prompt": "Orders of a Customer",
135
135
  "columns": {
136
- "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
137
- "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
136
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
137
+ "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "string"},
138
138
  "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
139
139
  "text": {"prompt": "order text description", "dtype": "string"},
140
140
  "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -167,40 +167,42 @@ tables = {
167
167
  "prompt": "each order has between 1 and 2 items",
168
168
  }
169
169
  ],
170
+ "primary_key": "item_id",
170
171
  },
171
172
  }
172
173
  data = mock.sample(
173
174
  tables=tables,
174
175
  sample_size=2,
175
- model="openai/gpt-4.1"
176
+ model="openai/gpt-4.1",
177
+ n_workers=1,
176
178
  )
177
179
  print(data["customers"])
178
- # customer_id name
179
- # 0 1 Matthew Carlson
180
- # 1 2 Priya Shah
180
+ # customer_id name
181
+ # 0 B0-100235 Danielle Rogers
182
+ # 1 B0-100236 Edward Kim
181
183
  print(data["warehouses"])
182
- # warehouse_id name
183
- # 0 1 Central Distribution Hub
184
- # 1 2 Northgate Storage Facility
184
+ # warehouse_id name
185
+ # 0 B0-001 Downtown Distribution Center
186
+ # 1 B0-002 Westside Storage Facility
185
187
  print(data["orders"])
186
- # customer_id warehouse_id order_id text amount
187
- # 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
188
- # 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
189
- # 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
190
- # 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
191
- # 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
188
+ # customer_id warehouse_id order_id text amount
189
+ # 0 B0-100235 B0-002 B0-3010021 Office furniture replenishment - desks, chairs... 1268.35
190
+ # 1 B0-100235 B0-001 B0-3010022 Bulk stationery order: printer paper, notebook... 449.9
191
+ # 2 B0-100235 B0-001 B0-3010023 Electronics restock: monitors and wireless key... 877.6
192
+ # 3 B0-100236 B0-001 B1-3010021 Monthly cleaning supplies: disinfectant, trash... 314.75
193
+ # 4 B0-100236 B0-002 B1-3010022 Breakroom essentials restock: coffee, tea, and... 182.45
192
194
  print(data["items"])
193
- # item_id order_id name price
194
- # 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
195
- # 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
196
- # 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
197
- # 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
198
- # 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
199
- # 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
200
- # 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
201
- # 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
202
- # 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
203
- # 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
195
+ # item_id order_id name price
196
+ # 0 B0-200501 B0-3010021 Ergonomic Office Desk 545.99
197
+ # 1 B0-200502 B0-3010021 Mesh Back Executive Chair 399.5
198
+ # 2 B1-200503 B0-3010022 Multipack Printer Paper (500 sheets) 129.95
199
+ # 3 B1-200504 B0-3010022 Spiral Notebooks - 12 Pack 59.99
200
+ # 4 B2-200505 B0-3010023 27" LED Computer Monitor 489.95
201
+ # 5 B2-200506 B0-3010023 Wireless Ergonomic Keyboard 387.65
202
+ # 6 B3-200507 B1-3010021 Industrial Disinfectant Solution (5L) 148.95
203
+ # 7 B3-200508 B1-3010021 Commercial Trash Liners - Case of 100 84.5
204
+ # 8 B4-200509 B1-3010022 Premium Ground Coffee (2lb Bag) 74.99
205
+ # 9 B4-200510 B1-3010022 Bottled Spring Water (24 Pack) 34.95
204
206
  ```
205
207
 
206
208
  6. Create your first self-referencing mock table
@@ -212,9 +214,9 @@ tables = {
212
214
  "employees": {
213
215
  "prompt": "Employees of a company",
214
216
  "columns": {
215
- "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
217
+ "employee_id": {"prompt": "the unique id of the employee; sequential", "dtype": "string"},
216
218
  "name": {"prompt": "first name and last name of the president", "dtype": "string"},
217
- "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
219
+ "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "string"},
218
220
  "role": {"prompt": "the role of the employee", "dtype": "string"},
219
221
  },
220
222
  "primary_key": "employee_id",
@@ -229,17 +231,17 @@ tables = {
229
231
  }
230
232
  df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
231
233
  print(df)
232
- # employee_id name boss_id role
233
- # 0 1 Sandra Phillips <NA> President
234
- # 1 2 Marcus Tran 1 Chief Financial Officer
235
- # 2 3 Ava Whittaker 1 Chief Technology Officer
236
- # 3 4 Sophie Martin 1 Chief Operations Officer
237
- # 4 5 Chad Nelson 2 Finance Manager
238
- # 5 6 Ethan Glover 2 Senior Accountant
239
- # 6 7 Kimberly Ortiz 2 Junior Accountant
240
- # 7 8 Lucas Romero 3 IT Manager
241
- # 8 9 Priya Desai 3 Lead Software Engineer
242
- # 9 10 Felix Bennett 3 Senior Systems Analyst
234
+ # employee_id name boss_id role
235
+ # 0 B0-1 Patricia Lee <NA> President
236
+ # 1 B0-2 Edward Rodriguez B0-1 VP of Operations
237
+ # 2 B0-3 Maria Cortez B0-1 VP of Finance
238
+ # 3 B0-4 Thomas Nguyen B0-1 VP of Technology
239
+ # 4 B0-5 Rachel Kim B0-2 Operations Manager
240
+ # 5 B0-6 Jeffrey Patel B0-2 Supply Chain Lead
241
+ # 6 B0-7 Olivia Smith B0-2 Facilities Supervisor
242
+ # 7 B0-8 Brian Carter B0-3 Accounting Manager
243
+ # 8 B0-9 Lauren Anderson B0-3 Financial Analyst
244
+ # 9 B0-10 Santiago Romero B0-3 Payroll Specialist
243
245
  ```
244
246
 
245
247
  7. Enrich existing data with additional columns
@@ -271,10 +273,10 @@ df = mock.sample(
271
273
  model="openai/gpt-4.1-nano"
272
274
  )
273
275
  print(df)
274
- # guest_id name nationality gender age room_number is_vip
275
- # 0 1 Anna Schmidt DE female 29 101 True
276
- # 1 2 Marco Rossi IT male 34 102 False
277
- # 2 3 Sophie Dupont FR female 27 103 False
276
+ # guest_id name nationality gender age room_number is_vip
277
+ # 0 1 Anna Schmidt DE female 30 102 False
278
+ # 1 2 Marco Rossi IT male 27 215 True
279
+ # 2 3 Sophie Dupont FR female 22 108 False
278
280
  ```
279
281
 
280
282
  ## MCP Server
@@ -60,17 +60,17 @@ df = mock.sample(
60
60
  model="openai/gpt-4.1-nano", # select the LLM model (optional)
61
61
  )
62
62
  print(df)
63
- # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
64
- # 0 AT Anna Müller female 29 1994-09-15 2025-01-05 14:30:00 True 350.0 101
65
- # 1 DE Johann Schmidt male 45 1978-11-20 2025-01-06 16:45:00 False 250.0 102
66
- # 2 CH Lara Meier female 32 1991-04-12 2025-01-05 12:00:00 True 400.0 103
67
- # 3 IT Marco Rossi male 38 1985-02-25 2025-01-07 09:15:00 False 280.0 201
68
- # 4 FR Claire Dupont female 24 2000-07-08 2025-01-07 11:20:00 False 220.0 202
69
- # 5 AT Felix Gruber male 52 1972-01-10 2025-01-06 17:50:00 True 375.0 203
70
- # 6 DE Sophie Becker female 27 1996-03-30 2025-01-08 08:30:00 False 230.0 204
71
- # 7 CH Max Keller male 31 1992-05-16 2025-01-09 14:10:00 False 290.0 101
72
- # 8 IT Giulia Bianchi female 36 1988-08-19 2025-01-05 15:55:00 True 410.0 102
73
- # 9 FR Louis Martin male 44 1980-12-05 2025-01-07 10:40:00 False 270.0 103
63
+ # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
64
+ # 0 FR Jean Dupont male 29 1994-03-15 2025-01-10 14:30:00 False 150.0 101
65
+ # 1 DE Anna Schmidt female 34 1989-07-22 2025-01-11 16:45:00 True 200.0 201
66
+ # 2 IT Marco Rossi male 45 1979-11-05 2025-01-09 10:15:00 False 180.0 102
67
+ # 3 AT Laura Gruber female 28 1996-02-19 2025-01-12 09:00:00 False 165.0 202
68
+ # 4 CH David Müller male 37 1987-08-30 2025-01-08 17:20:00 True 210.0 203
69
+ # 5 NL Sophie van den Berg female 22 2002-04-12 2025-01-10 12:00:00 False 140.0 103
70
+ # 6 GB James Carter male 31 1992-09-10 2025-01-11 11:30:00 False 155.0 204
71
+ # 7 BE Lotte Peeters female 26 1998-05-25 2025-01-09 15:45:00 False 160.0 201
72
+ # 8 DK Anders Jensen male 33 1990-12-03 2025-01-12 08:15:00 True 220.0 202
73
+ # 9 ES Carlos Lopez male 38 1985-06-14 2025-01-10 18:00:00 False 170.0 203
74
74
  ```
75
75
 
76
76
  4. Create your first multi-table mock dataset
@@ -82,7 +82,7 @@ tables = {
82
82
  "customers": {
83
83
  "prompt": "Customers of a hardware store",
84
84
  "columns": {
85
- "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
85
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
86
86
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
87
87
  },
88
88
  "primary_key": "customer_id",
@@ -90,7 +90,7 @@ tables = {
90
90
  "warehouses": {
91
91
  "prompt": "Warehouses of a hardware store",
92
92
  "columns": {
93
- "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
93
+ "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "string"},
94
94
  "name": {"prompt": "the name of the warehouse", "dtype": "string"},
95
95
  },
96
96
  "primary_key": "warehouse_id",
@@ -98,8 +98,8 @@ tables = {
98
98
  "orders": {
99
99
  "prompt": "Orders of a Customer",
100
100
  "columns": {
101
- "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
102
- "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
101
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
102
+ "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "string"},
103
103
  "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
104
104
  "text": {"prompt": "order text description", "dtype": "string"},
105
105
  "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -132,40 +132,42 @@ tables = {
132
132
  "prompt": "each order has between 1 and 2 items",
133
133
  }
134
134
  ],
135
+ "primary_key": "item_id",
135
136
  },
136
137
  }
137
138
  data = mock.sample(
138
139
  tables=tables,
139
140
  sample_size=2,
140
- model="openai/gpt-4.1"
141
+ model="openai/gpt-4.1",
142
+ n_workers=1,
141
143
  )
142
144
  print(data["customers"])
143
- # customer_id name
144
- # 0 1 Matthew Carlson
145
- # 1 2 Priya Shah
145
+ # customer_id name
146
+ # 0 B0-100235 Danielle Rogers
147
+ # 1 B0-100236 Edward Kim
146
148
  print(data["warehouses"])
147
- # warehouse_id name
148
- # 0 1 Central Distribution Hub
149
- # 1 2 Northgate Storage Facility
149
+ # warehouse_id name
150
+ # 0 B0-001 Downtown Distribution Center
151
+ # 1 B0-002 Westside Storage Facility
150
152
  print(data["orders"])
151
- # customer_id warehouse_id order_id text amount
152
- # 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
153
- # 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
154
- # 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
155
- # 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
156
- # 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
153
+ # customer_id warehouse_id order_id text amount
154
+ # 0 B0-100235 B0-002 B0-3010021 Office furniture replenishment - desks, chairs... 1268.35
155
+ # 1 B0-100235 B0-001 B0-3010022 Bulk stationery order: printer paper, notebook... 449.9
156
+ # 2 B0-100235 B0-001 B0-3010023 Electronics restock: monitors and wireless key... 877.6
157
+ # 3 B0-100236 B0-001 B1-3010021 Monthly cleaning supplies: disinfectant, trash... 314.75
158
+ # 4 B0-100236 B0-002 B1-3010022 Breakroom essentials restock: coffee, tea, and... 182.45
157
159
  print(data["items"])
158
- # item_id order_id name price
159
- # 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
160
- # 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
161
- # 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
162
- # 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
163
- # 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
164
- # 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
165
- # 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
166
- # 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
167
- # 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
168
- # 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
160
+ # item_id order_id name price
161
+ # 0 B0-200501 B0-3010021 Ergonomic Office Desk 545.99
162
+ # 1 B0-200502 B0-3010021 Mesh Back Executive Chair 399.5
163
+ # 2 B1-200503 B0-3010022 Multipack Printer Paper (500 sheets) 129.95
164
+ # 3 B1-200504 B0-3010022 Spiral Notebooks - 12 Pack 59.99
165
+ # 4 B2-200505 B0-3010023 27" LED Computer Monitor 489.95
166
+ # 5 B2-200506 B0-3010023 Wireless Ergonomic Keyboard 387.65
167
+ # 6 B3-200507 B1-3010021 Industrial Disinfectant Solution (5L) 148.95
168
+ # 7 B3-200508 B1-3010021 Commercial Trash Liners - Case of 100 84.5
169
+ # 8 B4-200509 B1-3010022 Premium Ground Coffee (2lb Bag) 74.99
170
+ # 9 B4-200510 B1-3010022 Bottled Spring Water (24 Pack) 34.95
169
171
  ```
170
172
 
171
173
  6. Create your first self-referencing mock table
@@ -177,9 +179,9 @@ tables = {
177
179
  "employees": {
178
180
  "prompt": "Employees of a company",
179
181
  "columns": {
180
- "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
182
+ "employee_id": {"prompt": "the unique id of the employee; sequential", "dtype": "string"},
181
183
  "name": {"prompt": "first name and last name of the president", "dtype": "string"},
182
- "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
184
+ "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "string"},
183
185
  "role": {"prompt": "the role of the employee", "dtype": "string"},
184
186
  },
185
187
  "primary_key": "employee_id",
@@ -194,17 +196,17 @@ tables = {
194
196
  }
195
197
  df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
196
198
  print(df)
197
- # employee_id name boss_id role
198
- # 0 1 Sandra Phillips <NA> President
199
- # 1 2 Marcus Tran 1 Chief Financial Officer
200
- # 2 3 Ava Whittaker 1 Chief Technology Officer
201
- # 3 4 Sophie Martin 1 Chief Operations Officer
202
- # 4 5 Chad Nelson 2 Finance Manager
203
- # 5 6 Ethan Glover 2 Senior Accountant
204
- # 6 7 Kimberly Ortiz 2 Junior Accountant
205
- # 7 8 Lucas Romero 3 IT Manager
206
- # 8 9 Priya Desai 3 Lead Software Engineer
207
- # 9 10 Felix Bennett 3 Senior Systems Analyst
199
+ # employee_id name boss_id role
200
+ # 0 B0-1 Patricia Lee <NA> President
201
+ # 1 B0-2 Edward Rodriguez B0-1 VP of Operations
202
+ # 2 B0-3 Maria Cortez B0-1 VP of Finance
203
+ # 3 B0-4 Thomas Nguyen B0-1 VP of Technology
204
+ # 4 B0-5 Rachel Kim B0-2 Operations Manager
205
+ # 5 B0-6 Jeffrey Patel B0-2 Supply Chain Lead
206
+ # 6 B0-7 Olivia Smith B0-2 Facilities Supervisor
207
+ # 7 B0-8 Brian Carter B0-3 Accounting Manager
208
+ # 8 B0-9 Lauren Anderson B0-3 Financial Analyst
209
+ # 9 B0-10 Santiago Romero B0-3 Payroll Specialist
208
210
  ```
209
211
 
210
212
  7. Enrich existing data with additional columns
@@ -236,10 +238,10 @@ df = mock.sample(
236
238
  model="openai/gpt-4.1-nano"
237
239
  )
238
240
  print(df)
239
- # guest_id name nationality gender age room_number is_vip
240
- # 0 1 Anna Schmidt DE female 29 101 True
241
- # 1 2 Marco Rossi IT male 34 102 False
242
- # 2 3 Sophie Dupont FR female 27 103 False
241
+ # guest_id name nationality gender age room_number is_vip
242
+ # 0 1 Anna Schmidt DE female 30 102 False
243
+ # 1 2 Marco Rossi IT male 27 215 True
244
+ # 2 3 Sophie Dupont FR female 22 108 False
243
245
  ```
244
246
 
245
247
  ## MCP Server
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.1.10" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.1.12" # Do not set this manually. Use poetry version [params].
@@ -81,8 +81,8 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
81
81
  if fk_field.dtype != pk_field.dtype:
82
82
  raise ValueError(
83
83
  f"Foreign key violation in table '{table_name}': "
84
- f"Column '{fk.column}' type '{fk_field.dtype}' does not match "
85
- f"referenced primary key '{referenced_config.primary_key}' type '{pk_field.dtype}'"
84
+ f"Column '{fk.column}' type '{fk_field.dtype.value}' does not match "
85
+ f"referenced primary key '{referenced_config.primary_key}' type '{pk_field.dtype.value}'"
86
86
  )
87
87
 
88
88
  return tables
@@ -113,6 +113,49 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
113
113
 
114
114
  return self
115
115
 
116
+ @model_validator(mode="after")
117
+ def ensure_values_are_not_provided_for_primary_key(self) -> MockConfig:
118
+ for table_name, table_config in self.root.items():
119
+ for column_name, column_config in table_config.columns.items():
120
+ if column_name == table_config.primary_key and column_config.values:
121
+ raise ValueError(
122
+ f"Values cannot be provided for primary key column '{column_name}' in table '{table_name}'"
123
+ )
124
+ return self
125
+
126
+ @model_validator(mode="after")
127
+ def ensure_primary_key_is_string_dtype(self) -> MockConfig:
128
+ for table_name, table_config in self.root.items():
129
+ if table_config.primary_key:
130
+ column_config = table_config.columns[table_config.primary_key]
131
+ if column_config.dtype not in [DType.STRING]:
132
+ raise ValueError(
133
+ f"Primary key column '{table_config.primary_key}' in table '{table_name}' must be one of the following types:"
134
+ f" {[DType.STRING.value]}"
135
+ )
136
+ return self
137
+
138
+ def get_dependency_mappings(self) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
139
+ child_to_parents = {}
140
+ parent_to_children = {}
141
+
142
+ for table_name in self.root:
143
+ child_to_parents[table_name] = set()
144
+ parent_to_children[table_name] = set()
145
+
146
+ for table_name, table_config in self.root.items():
147
+ if table_config.foreign_keys:
148
+ for fk in table_config.foreign_keys:
149
+ referenced_table = fk.referenced_table
150
+ child_to_parents[table_name].add(referenced_table)
151
+ parent_to_children[referenced_table].add(table_name)
152
+
153
+ root_tables = []
154
+ for table_name, parents in child_to_parents.items():
155
+ if not parents or parents == {table_name}: # no dependencies or only self-dependency
156
+ root_tables.append(table_name)
157
+ return child_to_parents, parent_to_children, root_tables
158
+
116
159
 
117
160
  class TableConfig(BaseModel):
118
161
  prompt: str = ""
@@ -200,7 +243,7 @@ async def _sample_table(
200
243
  foreign_keys: list[ForeignKeyConfig],
201
244
  primary_keys: dict[str, str],
202
245
  data: dict[str, pd.DataFrame],
203
- sample_size: int,
246
+ sample_size: int | None,
204
247
  previous_rows_size: int,
205
248
  non_context_size: int | None,
206
249
  n_workers: int,
@@ -225,12 +268,7 @@ async def _sample_table(
225
268
 
226
269
 
227
270
  def _sample_table_sync(*args, **kwargs) -> pd.DataFrame:
228
- loop = asyncio.new_event_loop()
229
- asyncio.set_event_loop(loop)
230
- try:
231
- return loop.run_until_complete(_sample_table(*args, **kwargs))
232
- finally:
233
- loop.close()
271
+ return asyncio.run(_sample_table(*args, **kwargs))
234
272
 
235
273
 
236
274
  def _create_system_prompt(llm_output_format: LLMOutputFormat) -> str:
@@ -263,6 +301,7 @@ def _create_table_prompt(
263
301
  prompt: str,
264
302
  columns: dict[str, ColumnConfig],
265
303
  primary_keys: dict[str, str],
304
+ batch_idx: int,
266
305
  batch_size: int | None,
267
306
  foreign_keys: list[ForeignKeyConfig],
268
307
  existing_data: pd.DataFrame | None,
@@ -277,7 +316,8 @@ def _create_table_prompt(
277
316
  # define table
278
317
  prompt += f"## Target Table: `{name}`\n\n"
279
318
 
280
- prompt += f"### Target Table Primary Key: `{primary_keys[name]}`\n\n"
319
+ target_primary_key = primary_keys[name]
320
+ prompt += f"### Target Table Primary Key: `{target_primary_key}`\n\n"
281
321
 
282
322
  # add columns specifications
283
323
  prompt += "### Target Table Column Specifications:\n\n"
@@ -313,7 +353,7 @@ def _create_table_prompt(
313
353
  has_self_referencing_foreign_keys_section = True
314
354
  prompt += f"## Self Referencing Foreign Keys in Target Table `{name}`\n\n"
315
355
  for fk in self_referencing_foreign_keys:
316
- prompt += f"### Primary Key Column: `{primary_keys[name]}`\n\n"
356
+ prompt += f"### Primary Key Column: `{target_primary_key}`\n\n"
317
357
 
318
358
  prompt += f"### Foreign Key Column: `{fk.column}`\n\n"
319
359
 
@@ -374,6 +414,11 @@ def _create_table_prompt(
374
414
  if n_rows is not None:
375
415
  prompt += f"Number of data rows to {verb}: `{n_rows}`.\n\n"
376
416
 
417
+ if target_primary_key is not None:
418
+ prompt += f"Add prefix to all values of Target Table Primary Key. The prefix is 'B{batch_idx}-'."
419
+ prompt += " There is one exception: if primary keys are in existing data, don't add prefix to them."
420
+ prompt += "\n\n"
421
+
377
422
  if has_context_table_section:
378
423
  assert foreign_keys
379
424
  prompt += f"Target Table Foreign Key column `{foreign_keys[0].column}` may only contain values from `Context Table Data`."
@@ -528,7 +573,7 @@ def _create_structured_output_schema(
528
573
  ) -> type[BaseModel]:
529
574
  def create_annotation(column_config: ColumnConfig) -> type:
530
575
  if column_config.values or column_config.dtype is DType.CATEGORY:
531
- return Literal[tuple(column_config.values)]
576
+ return Literal[tuple(column_config.values)] # type: ignore
532
577
  return {
533
578
  DType.INTEGER: int | None,
534
579
  DType.FLOAT: float | None,
@@ -610,8 +655,9 @@ async def _worker(
610
655
  name=name,
611
656
  prompt=prompt,
612
657
  columns=columns,
613
- primary_keys=primary_keys,
658
+ batch_idx=batch_idx,
614
659
  batch_size=batch_size,
660
+ primary_keys=primary_keys,
615
661
  foreign_keys=foreign_keys,
616
662
  existing_data=existing_batch,
617
663
  context_data=context_batch,
@@ -715,7 +761,7 @@ async def _create_table_rows_generator(
715
761
  foreign_keys: list[ForeignKeyConfig],
716
762
  primary_keys: dict[str, str],
717
763
  data: dict[str, pd.DataFrame],
718
- sample_size: int,
764
+ sample_size: int | None,
719
765
  previous_rows_size: int,
720
766
  non_context_size: int | None,
721
767
  n_workers: int,
@@ -762,6 +808,7 @@ async def _create_table_rows_generator(
762
808
  non_context_data[non_context_table_name] = data[non_context_table_name]
763
809
 
764
810
  # calculate batch_sizes
811
+ assert sample_size is not None, "sample_size should have been filled by this point"
765
812
  n_total_batches = len(context_batches) if context_batches is not None else math.ceil(sample_size / batch_size)
766
813
  batch_sizes = [batch_size] * n_total_batches
767
814
  if context_batches is None:
@@ -873,6 +920,32 @@ async def _create_table_rows_generator(
873
920
  await asyncio.gather(*workers)
874
921
 
875
922
 
923
+ def _align_series_dtypes_with_column_config(series: pd.Series, column_config: ColumnConfig) -> pd.Series:
924
+ series = series.copy()
925
+ if column_config.dtype in [DType.DATE, DType.DATETIME]:
926
+
927
+ def harmonize_datetime(x):
928
+ try:
929
+ return dateutil.parser.parse(x)
930
+ except Exception:
931
+ return pd.NaT
932
+
933
+ series = pd.to_datetime(series.apply(harmonize_datetime), errors="coerce")
934
+ elif column_config.dtype is DType.INTEGER:
935
+ series = pd.to_numeric(series, errors="coerce", downcast="integer").astype("int64[pyarrow]")
936
+ elif column_config.dtype is DType.FLOAT:
937
+ series = pd.to_numeric(series, errors="coerce").astype("double[pyarrow]")
938
+ elif column_config.dtype is DType.BOOLEAN:
939
+ series = series.map(lambda x: True if str(x).lower() == "true" else x)
940
+ series = series.map(lambda x: False if str(x).lower() == "false" else x)
941
+ series = pd.to_numeric(series, errors="coerce").astype("boolean[pyarrow]")
942
+ elif column_config.dtype is DType.CATEGORY:
943
+ series = pd.Categorical(series, categories=column_config.values)
944
+ else:
945
+ series = series.astype("string[pyarrow]")
946
+ return series
947
+
948
+
876
949
  async def _convert_table_rows_generator_to_df(
877
950
  table_rows_generator: AsyncGenerator[dict],
878
951
  columns: dict[str, ColumnConfig],
@@ -880,29 +953,7 @@ async def _convert_table_rows_generator_to_df(
880
953
  def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
881
954
  df = df.copy()
882
955
  for column_name, column_config in columns.items():
883
- if column_config.dtype in [DType.DATE, DType.DATETIME]:
884
-
885
- def harmonize_datetime(x):
886
- try:
887
- return dateutil.parser.parse(x)
888
- except Exception:
889
- return pd.NaT
890
-
891
- df[column_name] = pd.to_datetime(df[column_name].apply(harmonize_datetime), errors="coerce")
892
- elif column_config.dtype is DType.INTEGER:
893
- df[column_name] = pd.to_numeric(df[column_name], errors="coerce", downcast="integer").astype(
894
- "int64[pyarrow]"
895
- )
896
- elif column_config.dtype is DType.FLOAT:
897
- df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("double[pyarrow]")
898
- elif column_config.dtype is DType.BOOLEAN:
899
- df[column_name] = df[column_name].map(lambda x: True if str(x).lower() == "true" else x)
900
- df[column_name] = df[column_name].map(lambda x: False if str(x).lower() == "false" else x)
901
- df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("boolean[pyarrow]")
902
- elif column_config.dtype is DType.CATEGORY:
903
- df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
904
- else:
905
- df[column_name] = df[column_name].astype("string[pyarrow]")
956
+ df[column_name] = _align_series_dtypes_with_column_config(df[column_name], column_config)
906
957
  return df
907
958
 
908
959
  # consume entire generator
@@ -912,6 +963,7 @@ async def _convert_table_rows_generator_to_df(
912
963
  # extract rows and convert to DataFrame
913
964
  rows = [item["row"] for item in items]
914
965
  df = pd.DataFrame(rows)
966
+ # harmonize dtypes
915
967
  df = align_df_dtypes_with_mock_dtypes(df, columns)
916
968
  return df
917
969
 
@@ -935,6 +987,8 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
935
987
  tables = tables.copy()
936
988
  for table_name, existing_table in existing_data.items():
937
989
  table_config = tables.setdefault(table_name, {})
990
+
991
+ # prepend column configs for existing data columns, that are not specified in the mock config
938
992
  column_configs = table_config.setdefault("columns", {})
939
993
  existing_column_configs = {
940
994
  existing_column: {"dtype": _infer_dtype(existing_table[existing_column])}
@@ -942,42 +996,82 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
942
996
  if existing_column not in column_configs
943
997
  }
944
998
  column_configs = {**existing_column_configs, **column_configs}
999
+
1000
+ # primary keys are always strings
1001
+ primary_key = table_config.get("primary_key", None)
1002
+ if primary_key is not None:
1003
+ column_configs[primary_key]["dtype"] = DType.STRING
1004
+
945
1005
  table_config["columns"] = column_configs
946
1006
  return tables
947
1007
 
948
1008
 
949
1009
  def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig) -> dict[str, int]:
1010
+ _, _, root_tables = config.get_dependency_mappings()
1011
+
950
1012
  if isinstance(sample_size, int):
951
- return {table_name: sample_size for table_name in config.root}
1013
+ sample_size = {table_name: sample_size for table_name in root_tables}
1014
+
1015
+ for table_name in root_tables:
1016
+ if table_name not in sample_size or sample_size[table_name] is None:
1017
+ # set default sample size for missing or None sample sizes
1018
+ sample_size[table_name] = 4
1019
+ # clamp sample_size to [1, inf)
1020
+ sample_size[table_name] = max(1, sample_size[table_name])
952
1021
 
953
- if sample_size.keys() != config.root.keys():
954
- raise ValueError(f"Sample size keys must match table names: {sample_size.keys()} != {config.root.keys()}")
955
1022
  return sample_size
956
1023
 
957
1024
 
958
- def _build_execution_plan(config: MockConfig) -> list[str]:
959
- def build_dependency_mappings(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
960
- child_to_parents = {}
961
- parent_to_children = {}
1025
+ def _harmonize_existing_data(
1026
+ existing_data: dict[str, pd.DataFrame] | None, mock_config: MockConfig
1027
+ ) -> dict[str, pd.DataFrame]:
1028
+ if existing_data is None:
1029
+ return {}
962
1030
 
963
- for table_name in config.root:
964
- child_to_parents[table_name] = set()
965
- parent_to_children[table_name] = set()
1031
+ # by this point, mock config should have been validated, so we can assume that all tables in existing_data are defined in the mock config
1032
+ assert set(mock_config.root.keys()).issuperset(existing_data.keys())
966
1033
 
967
- for table_name, table_config in config.root.items():
968
- if table_config.foreign_keys:
969
- for fk in table_config.foreign_keys:
970
- referenced_table = fk.referenced_table
971
- child_to_parents[table_name].add(referenced_table)
972
- parent_to_children[referenced_table].add(table_name)
1034
+ for existing_table_name, existing_table in existing_data.items():
1035
+ existing_table_config = mock_config.root[existing_table_name]
973
1036
 
974
- root_tables = []
975
- for table_name, parents in child_to_parents.items():
976
- if not parents or parents == {table_name}: # no dependencies or only self-dependency
977
- root_tables.append(table_name)
978
- return child_to_parents, parent_to_children, root_tables
1037
+ for existing_column in existing_table.columns:
1038
+ existing_column_config = existing_table_config.columns[existing_column]
979
1039
 
980
- child_to_parents, parent_to_children, root_tables = build_dependency_mappings(config)
1040
+ # ensure that the existing data has compatible dtypes with the column config
1041
+ original_series = existing_table[existing_column]
1042
+ coerced_series = _align_series_dtypes_with_column_config(original_series, existing_column_config)
1043
+ n_original_na = original_series.isna().sum()
1044
+ n_coerced_na = coerced_series.isna().sum()
1045
+ if n_original_na != n_coerced_na:
1046
+ raise ValueError(
1047
+ f"Coercion of existing data column '{existing_column}' in table '{existing_table_name}' resulted in data loss. "
1048
+ f"Ensure that the existing data is consistent with the mock configuration."
1049
+ )
1050
+
1051
+ # ensure that the existing data has values allowed by the column config
1052
+ if existing_column_config.values:
1053
+ if not set(existing_table[existing_column].unique()).issubset(existing_column_config.values):
1054
+ raise ValueError(
1055
+ f"Existing data column '{existing_column}' in table '{existing_table_name}' has values disallowed by the column config. "
1056
+ f"Ensure that the existing data is consistent with the mock configuration."
1057
+ )
1058
+
1059
+ # ensure that the existing data has unique primary keys
1060
+ existing_table_primary_key = existing_table_config.primary_key
1061
+ if existing_table_primary_key is not None:
1062
+ if not existing_table[existing_table_primary_key].is_unique:
1063
+ raise ValueError(
1064
+ f"Existing data table '{existing_table_name}' has non-unique primary key column '{existing_table_primary_key}'. "
1065
+ f"Ensure that the primary key is unique."
1066
+ )
1067
+
1068
+ existing_table[existing_column] = coerced_series
1069
+
1070
+ return existing_data
1071
+
1072
+
1073
+ def _build_execution_plan(config: MockConfig) -> list[str]:
1074
+ child_to_parents, parent_to_children, root_tables = config.get_dependency_mappings()
981
1075
 
982
1076
  execution_plan = []
983
1077
  bfs_queue = list(root_tables)
@@ -1035,7 +1129,7 @@ def sample(
1035
1129
  sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
1036
1130
  If a single integer is provided, the same number of rows will be generated for each subject table.
1037
1131
  If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
1038
- Default is 4. Ignored if existing_data is provided.
1132
+ Default is 4. Ignored if existing_data is provided. Ignored for non-root tables.
1039
1133
  If a table has a foreign key, the sample size is determined by the corresponding foreign key prompt. If nothing specified, a few rows per parent record are generated.
1040
1134
  existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
1041
1135
  Default is None.
@@ -1092,15 +1186,15 @@ def sample(
1092
1186
  "customers": {
1093
1187
  "prompt": "Customers of a hardware store",
1094
1188
  "columns": {
1095
- "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
1189
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
1096
1190
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
1097
1191
  },
1098
- "primary_key": "customer_id", # single string; no composite keys allowed
1192
+ "primary_key": "customer_id", # single string; no composite keys allowed; primary keys must have string dtype
1099
1193
  },
1100
1194
  "warehouses": {
1101
1195
  "prompt": "Warehouses of a hardware store",
1102
1196
  "columns": {
1103
- "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
1197
+ "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "string"},
1104
1198
  "name": {"prompt": "the name of the warehouse", "dtype": "string"},
1105
1199
  },
1106
1200
  "primary_key": "warehouse_id",
@@ -1108,8 +1202,8 @@ def sample(
1108
1202
  "orders": {
1109
1203
  "prompt": "Orders of a Customer",
1110
1204
  "columns": {
1111
- "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
1112
- "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
1205
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
1206
+ "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "string"},
1113
1207
  "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
1114
1208
  "text": {"prompt": "order text description", "dtype": "string"},
1115
1209
  "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -1187,7 +1281,7 @@ def sample(
1187
1281
  "customers": {
1188
1282
  "prompt": "Customers of a hardware store",
1189
1283
  "columns": {
1190
- "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
1284
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
1191
1285
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
1192
1286
  "email": {"prompt": "email address of the customer", "dtype": "string"},
1193
1287
  "phone": {"prompt": "phone number of the customer", "dtype": "string"},
@@ -1199,7 +1293,7 @@ def sample(
1199
1293
  "prompt": "Orders of a Customer",
1200
1294
  "columns": {
1201
1295
  "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
1202
- "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
1296
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
1203
1297
  "order_date": {"prompt": "the date when the order was placed", "dtype": "date"},
1204
1298
  "total_amount": {"prompt": "order amount in USD", "dtype": "float"},
1205
1299
  "status": {"dtype": "category", "values": ["pending", "shipped", "delivered", "cancelled"]},
@@ -1247,12 +1341,15 @@ def sample(
1247
1341
 
1248
1342
  execution_plan: list[str] = _build_execution_plan(config)
1249
1343
 
1250
- data: dict[str, pd.DataFrame] = existing_data or {}
1251
-
1252
- for table_name in execution_plan:
1253
- table_config = config.root[table_name]
1344
+ data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
1254
1345
 
1255
- with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
1346
+ # synchronous `sample` function makes independent calls to asynchronous `_sample_table` function
1347
+ # in order to avoid conflicts with potentially existing event loop (e.g. in Jupyter environment),
1348
+ # a new thread is spawned for each call to `_sample_table`
1349
+ # NOTE: initialize executor only once, doing that inside the loop might lead to deadlocks
1350
+ with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
1351
+ for table_name in execution_plan:
1352
+ table_config = config.root[table_name]
1256
1353
  future = executor.submit(
1257
1354
  _sample_table_sync,
1258
1355
  name=table_name,
@@ -1261,13 +1358,13 @@ def sample(
1261
1358
  foreign_keys=table_config.foreign_keys,
1262
1359
  primary_keys=primary_keys,
1263
1360
  data=data,
1264
- sample_size=sample_size[table_name],
1361
+ sample_size=sample_size.get(table_name),
1265
1362
  previous_rows_size=10, # present 10 previously generated rows to the LLM
1266
1363
  non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
1267
1364
  n_workers=n_workers,
1268
1365
  llm_config=llm_config,
1269
1366
  )
1270
1367
  df = future.result()
1271
- data[table_name] = df
1368
+ data[table_name] = df
1272
1369
 
1273
1370
  return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mostlyai-mock"
3
- version = "0.1.10"
3
+ version = "0.1.12"
4
4
  description = "Synthetic Mock Data"
5
5
  authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
6
6
  requires-python = ">=3.10"
File without changes