mostlyai-mock 0.0.5__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mostlyai_mock-0.0.7/PKG-INFO +201 -0
- mostlyai_mock-0.0.7/README.md +168 -0
- {mostlyai_mock-0.0.5 → mostlyai_mock-0.0.7}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.0.5 → mostlyai_mock-0.0.7}/mostlyai/mock/core.py +103 -34
- {mostlyai_mock-0.0.5 → mostlyai_mock-0.0.7}/pyproject.toml +18 -1
- mostlyai_mock-0.0.5/PKG-INFO +0 -117
- mostlyai_mock-0.0.5/README.md +0 -99
- {mostlyai_mock-0.0.5 → mostlyai_mock-0.0.7}/.gitignore +0 -0
- {mostlyai_mock-0.0.5 → mostlyai_mock-0.0.7}/LICENSE +0 -0
@@ -0,0 +1,201 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: mostlyai-mock
|
3
|
+
Version: 0.0.7
|
4
|
+
Summary: Synthetic Mock Data
|
5
|
+
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
|
+
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
7
|
+
Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
|
8
|
+
Author-email: MOSTLY AI <dev@mostly.ai>
|
9
|
+
License-Expression: Apache-2.0
|
10
|
+
License-File: LICENSE
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
14
|
+
Classifier: Intended Audience :: Healthcare Industry
|
15
|
+
Classifier: Intended Audience :: Information Technology
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
17
|
+
Classifier: Intended Audience :: Telecommunications Industry
|
18
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
19
|
+
Classifier: Operating System :: OS Independent
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
24
|
+
Classifier: Topic :: Software Development :: Libraries
|
25
|
+
Classifier: Typing :: Typed
|
26
|
+
Requires-Python: >=3.10
|
27
|
+
Requires-Dist: litellm>=1.67.0
|
28
|
+
Requires-Dist: numpy>=1.26.3
|
29
|
+
Requires-Dist: pandas>=2.0.0
|
30
|
+
Requires-Dist: pyarrow>=14.0.0
|
31
|
+
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
32
|
+
Description-Content-Type: text/markdown
|
33
|
+
|
34
|
+
# Synthetic Mock Data 🔮
|
35
|
+
|
36
|
+
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)  
|
37
|
+
|
38
|
+
Create data out of nothing. Prompt LLMs for Tabular Data.
|
39
|
+
|
40
|
+
## Key Features
|
41
|
+
|
42
|
+
* A lightweight Python client for prompting LLMs for mixed-type tabular data
|
43
|
+
* Select from a range of LLM endpoints that provide structured output
|
44
|
+
* Supports single-table as well as multi-table scenarios.
|
45
|
+
* Supports a variety of data types: `string`, `category`, `integer`, `float`, `boolean`, `date`, and `datetime`.
|
46
|
+
* Specify context, distributions and rules via dataset-, table- or column-level prompts.
|
47
|
+
* Tailor the diversity and realism of your generated data via temperature and top_p.
|
48
|
+
|
49
|
+
## Getting Started
|
50
|
+
|
51
|
+
1. Install the latest version of the `mostlyai-mock` python package.
|
52
|
+
|
53
|
+
```bash
|
54
|
+
pip install -U mostlyai-mock
|
55
|
+
```
|
56
|
+
|
57
|
+
2. Set the API key of your LLM endpoint (if not done yet)
|
58
|
+
|
59
|
+
```python
|
60
|
+
import os
|
61
|
+
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
62
|
+
# os.environ["GEMINI_API_KEY"] = "your-api-key"
|
63
|
+
# os.environ["GROQ_API_KEY"] = "your-api-key"
|
64
|
+
```
|
65
|
+
|
66
|
+
Note: You will need to obtain your API key directly from the LLM service provider (e.g. for OpenAI from [here](https://platform.openai.com/api-keys)). The LLM endpoint will be determined by the chosen `model` when making calls to `mock.sample`.
|
67
|
+
|
68
|
+
3. Create your first basic synthetic table from scratch
|
69
|
+
|
70
|
+
```python
|
71
|
+
from mostlyai import mock
|
72
|
+
|
73
|
+
tables = {
|
74
|
+
"guests": {
|
75
|
+
"description": "Guests of an Alpine ski hotel in Austria",
|
76
|
+
"columns": {
|
77
|
+
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
78
|
+
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
79
|
+
"gender": {"dtype": "category", "values": ["male", "female"]},
|
80
|
+
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
81
|
+
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
82
|
+
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
83
|
+
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
84
|
+
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
85
|
+
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
86
|
+
},
|
87
|
+
}
|
88
|
+
}
|
89
|
+
df = mock.sample(
|
90
|
+
tables=tables, # provide table and column definitions
|
91
|
+
sample_size=10, # generate 10 records
|
92
|
+
model="openai/gpt-4.1-nano", # select the LLM model (optional)
|
93
|
+
)
|
94
|
+
print(df)
|
95
|
+
# nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
|
96
|
+
# 0 AT Anna Müller female 29 1994-09-15 2025-01-05 14:30:00 True 350.0 101
|
97
|
+
# 1 DE Johann Schmidt male 45 1978-11-20 2025-01-06 16:45:00 False 250.0 102
|
98
|
+
# 2 CH Lara Meier female 32 1991-04-12 2025-01-05 12:00:00 True 400.0 103
|
99
|
+
# 3 IT Marco Rossi male 38 1985-02-25 2025-01-07 09:15:00 False 280.0 201
|
100
|
+
# 4 FR Claire Dupont female 24 2000-07-08 2025-01-07 11:20:00 False 220.0 202
|
101
|
+
# 5 AT Felix Gruber male 52 1972-01-10 2025-01-06 17:50:00 True 375.0 203
|
102
|
+
# 6 DE Sophie Becker female 27 1996-03-30 2025-01-08 08:30:00 False 230.0 204
|
103
|
+
# 7 CH Max Keller male 31 1992-05-16 2025-01-09 14:10:00 False 290.0 101
|
104
|
+
# 8 IT Giulia Bianchi female 36 1988-08-19 2025-01-05 15:55:00 True 410.0 102
|
105
|
+
# 9 FR Louis Martin male 44 1980-12-05 2025-01-07 10:40:00 False 270.0 103
|
106
|
+
```
|
107
|
+
|
108
|
+
4. Create your first multi-table synthetic dataset
|
109
|
+
|
110
|
+
```python
|
111
|
+
from mostlyai import mock
|
112
|
+
|
113
|
+
tables = {
|
114
|
+
"customers": {
|
115
|
+
"description": "Customers of a hardware store",
|
116
|
+
"columns": {
|
117
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
118
|
+
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
119
|
+
},
|
120
|
+
"primary_key": "customer_id",
|
121
|
+
},
|
122
|
+
"warehouses": {
|
123
|
+
"description": "Warehouses of a hardware store",
|
124
|
+
"columns": {
|
125
|
+
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
|
126
|
+
"name": {"prompt": "the name of the warehouse", "dtype": "string"},
|
127
|
+
},
|
128
|
+
"primary_key": "warehouse_id",
|
129
|
+
},
|
130
|
+
"orders": {
|
131
|
+
"description": "Orders of a Customer",
|
132
|
+
"columns": {
|
133
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
134
|
+
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
|
135
|
+
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
136
|
+
"text": {"prompt": "order text description", "dtype": "string"},
|
137
|
+
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
138
|
+
},
|
139
|
+
"primary_key": "order_id",
|
140
|
+
"foreign_keys": [
|
141
|
+
{
|
142
|
+
"column": "customer_id",
|
143
|
+
"referenced_table": "customers",
|
144
|
+
"description": "each customer has anywhere between 2 and 3 orders",
|
145
|
+
},
|
146
|
+
{
|
147
|
+
"column": "warehouse_id",
|
148
|
+
"referenced_table": "warehouses",
|
149
|
+
},
|
150
|
+
],
|
151
|
+
},
|
152
|
+
"items": {
|
153
|
+
"description": "Items in an Order",
|
154
|
+
"columns": {
|
155
|
+
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
156
|
+
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
157
|
+
"name": {"prompt": "the name of the item", "dtype": "string"},
|
158
|
+
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
159
|
+
},
|
160
|
+
"foreign_keys": [
|
161
|
+
{
|
162
|
+
"column": "order_id",
|
163
|
+
"referenced_table": "orders",
|
164
|
+
"description": "each order has between 1 and 2 items",
|
165
|
+
}
|
166
|
+
],
|
167
|
+
},
|
168
|
+
}
|
169
|
+
data = mock.sample(
|
170
|
+
tables=tables,
|
171
|
+
sample_size=2,
|
172
|
+
model="openai/gpt-4.1"
|
173
|
+
)
|
174
|
+
print(data["customers"])
|
175
|
+
# customer_id name
|
176
|
+
# 0 1 Matthew Carlson
|
177
|
+
# 1 2 Priya Shah
|
178
|
+
print(data["warehouses"])
|
179
|
+
# warehouse_id name
|
180
|
+
# 0 1 Central Distribution Hub
|
181
|
+
# 1 2 Northgate Storage Facility
|
182
|
+
print(data["orders"])
|
183
|
+
# customer_id warehouse_id order_id text amount
|
184
|
+
# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
|
185
|
+
# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
|
186
|
+
# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
|
187
|
+
# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
|
188
|
+
# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
|
189
|
+
print(data["items"])
|
190
|
+
# item_id order_id name price
|
191
|
+
# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
|
192
|
+
# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
|
193
|
+
# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
|
194
|
+
# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
|
195
|
+
# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
|
196
|
+
# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
|
197
|
+
# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
|
198
|
+
# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
|
199
|
+
# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
|
200
|
+
# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
|
201
|
+
```
|
@@ -0,0 +1,168 @@
|
|
1
|
+
# Synthetic Mock Data 🔮
|
2
|
+
|
3
|
+
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)  
|
4
|
+
|
5
|
+
Create data out of nothing. Prompt LLMs for Tabular Data.
|
6
|
+
|
7
|
+
## Key Features
|
8
|
+
|
9
|
+
* A lightweight Python client for prompting LLMs for mixed-type tabular data
|
10
|
+
* Select from a range of LLM endpoints that provide structured output
|
11
|
+
* Supports single-table as well as multi-table scenarios.
|
12
|
+
* Supports a variety of data types: `string`, `category`, `integer`, `float`, `boolean`, `date`, and `datetime`.
|
13
|
+
* Specify context, distributions and rules via dataset-, table- or column-level prompts.
|
14
|
+
* Tailor the diversity and realism of your generated data via temperature and top_p.
|
15
|
+
|
16
|
+
## Getting Started
|
17
|
+
|
18
|
+
1. Install the latest version of the `mostlyai-mock` python package.
|
19
|
+
|
20
|
+
```bash
|
21
|
+
pip install -U mostlyai-mock
|
22
|
+
```
|
23
|
+
|
24
|
+
2. Set the API key of your LLM endpoint (if not done yet)
|
25
|
+
|
26
|
+
```python
|
27
|
+
import os
|
28
|
+
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
29
|
+
# os.environ["GEMINI_API_KEY"] = "your-api-key"
|
30
|
+
# os.environ["GROQ_API_KEY"] = "your-api-key"
|
31
|
+
```
|
32
|
+
|
33
|
+
Note: You will need to obtain your API key directly from the LLM service provider (e.g. for OpenAI from [here](https://platform.openai.com/api-keys)). The LLM endpoint will be determined by the chosen `model` when making calls to `mock.sample`.
|
34
|
+
|
35
|
+
3. Create your first basic synthetic table from scratch
|
36
|
+
|
37
|
+
```python
|
38
|
+
from mostlyai import mock
|
39
|
+
|
40
|
+
tables = {
|
41
|
+
"guests": {
|
42
|
+
"description": "Guests of an Alpine ski hotel in Austria",
|
43
|
+
"columns": {
|
44
|
+
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
45
|
+
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
46
|
+
"gender": {"dtype": "category", "values": ["male", "female"]},
|
47
|
+
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
48
|
+
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
49
|
+
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
50
|
+
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
51
|
+
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
52
|
+
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
53
|
+
},
|
54
|
+
}
|
55
|
+
}
|
56
|
+
df = mock.sample(
|
57
|
+
tables=tables, # provide table and column definitions
|
58
|
+
sample_size=10, # generate 10 records
|
59
|
+
model="openai/gpt-4.1-nano", # select the LLM model (optional)
|
60
|
+
)
|
61
|
+
print(df)
|
62
|
+
# nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
|
63
|
+
# 0 AT Anna Müller female 29 1994-09-15 2025-01-05 14:30:00 True 350.0 101
|
64
|
+
# 1 DE Johann Schmidt male 45 1978-11-20 2025-01-06 16:45:00 False 250.0 102
|
65
|
+
# 2 CH Lara Meier female 32 1991-04-12 2025-01-05 12:00:00 True 400.0 103
|
66
|
+
# 3 IT Marco Rossi male 38 1985-02-25 2025-01-07 09:15:00 False 280.0 201
|
67
|
+
# 4 FR Claire Dupont female 24 2000-07-08 2025-01-07 11:20:00 False 220.0 202
|
68
|
+
# 5 AT Felix Gruber male 52 1972-01-10 2025-01-06 17:50:00 True 375.0 203
|
69
|
+
# 6 DE Sophie Becker female 27 1996-03-30 2025-01-08 08:30:00 False 230.0 204
|
70
|
+
# 7 CH Max Keller male 31 1992-05-16 2025-01-09 14:10:00 False 290.0 101
|
71
|
+
# 8 IT Giulia Bianchi female 36 1988-08-19 2025-01-05 15:55:00 True 410.0 102
|
72
|
+
# 9 FR Louis Martin male 44 1980-12-05 2025-01-07 10:40:00 False 270.0 103
|
73
|
+
```
|
74
|
+
|
75
|
+
4. Create your first multi-table synthetic dataset
|
76
|
+
|
77
|
+
```python
|
78
|
+
from mostlyai import mock
|
79
|
+
|
80
|
+
tables = {
|
81
|
+
"customers": {
|
82
|
+
"description": "Customers of a hardware store",
|
83
|
+
"columns": {
|
84
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
85
|
+
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
86
|
+
},
|
87
|
+
"primary_key": "customer_id",
|
88
|
+
},
|
89
|
+
"warehouses": {
|
90
|
+
"description": "Warehouses of a hardware store",
|
91
|
+
"columns": {
|
92
|
+
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
|
93
|
+
"name": {"prompt": "the name of the warehouse", "dtype": "string"},
|
94
|
+
},
|
95
|
+
"primary_key": "warehouse_id",
|
96
|
+
},
|
97
|
+
"orders": {
|
98
|
+
"description": "Orders of a Customer",
|
99
|
+
"columns": {
|
100
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
101
|
+
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
|
102
|
+
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
103
|
+
"text": {"prompt": "order text description", "dtype": "string"},
|
104
|
+
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
105
|
+
},
|
106
|
+
"primary_key": "order_id",
|
107
|
+
"foreign_keys": [
|
108
|
+
{
|
109
|
+
"column": "customer_id",
|
110
|
+
"referenced_table": "customers",
|
111
|
+
"description": "each customer has anywhere between 2 and 3 orders",
|
112
|
+
},
|
113
|
+
{
|
114
|
+
"column": "warehouse_id",
|
115
|
+
"referenced_table": "warehouses",
|
116
|
+
},
|
117
|
+
],
|
118
|
+
},
|
119
|
+
"items": {
|
120
|
+
"description": "Items in an Order",
|
121
|
+
"columns": {
|
122
|
+
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
123
|
+
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
124
|
+
"name": {"prompt": "the name of the item", "dtype": "string"},
|
125
|
+
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
126
|
+
},
|
127
|
+
"foreign_keys": [
|
128
|
+
{
|
129
|
+
"column": "order_id",
|
130
|
+
"referenced_table": "orders",
|
131
|
+
"description": "each order has between 1 and 2 items",
|
132
|
+
}
|
133
|
+
],
|
134
|
+
},
|
135
|
+
}
|
136
|
+
data = mock.sample(
|
137
|
+
tables=tables,
|
138
|
+
sample_size=2,
|
139
|
+
model="openai/gpt-4.1"
|
140
|
+
)
|
141
|
+
print(data["customers"])
|
142
|
+
# customer_id name
|
143
|
+
# 0 1 Matthew Carlson
|
144
|
+
# 1 2 Priya Shah
|
145
|
+
print(data["warehouses"])
|
146
|
+
# warehouse_id name
|
147
|
+
# 0 1 Central Distribution Hub
|
148
|
+
# 1 2 Northgate Storage Facility
|
149
|
+
print(data["orders"])
|
150
|
+
# customer_id warehouse_id order_id text amount
|
151
|
+
# 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
|
152
|
+
# 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
|
153
|
+
# 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
|
154
|
+
# 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
|
155
|
+
# 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
|
156
|
+
print(data["items"])
|
157
|
+
# item_id order_id name price
|
158
|
+
# 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
|
159
|
+
# 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
|
160
|
+
# 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
|
161
|
+
# 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
|
162
|
+
# 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
|
163
|
+
# 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
|
164
|
+
# 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
|
165
|
+
# 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
|
166
|
+
# 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
|
167
|
+
# 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
|
168
|
+
```
|
@@ -100,7 +100,10 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
|
|
100
100
|
if table_name in path:
|
101
101
|
cycle_start = path.index(table_name)
|
102
102
|
cycle = path[cycle_start:] + [table_name]
|
103
|
-
|
103
|
+
msg = f"Circular dependency detected: {' -> '.join(cycle)}."
|
104
|
+
if len(cycle) == 2:
|
105
|
+
msg += " Self-referencing tables are not yet supported."
|
106
|
+
raise ValueError(msg)
|
104
107
|
if table_name in visited:
|
105
108
|
return
|
106
109
|
visited.add(table_name)
|
@@ -119,7 +122,7 @@ class TableConfig(BaseModel):
|
|
119
122
|
description: str = ""
|
120
123
|
columns: dict[str, ColumnConfig] = Field(..., min_items=1)
|
121
124
|
primary_key: str | None = None
|
122
|
-
foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list
|
125
|
+
foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
|
123
126
|
|
124
127
|
|
125
128
|
class ColumnConfig(BaseModel):
|
@@ -163,7 +166,7 @@ class ColumnConfig(BaseModel):
|
|
163
166
|
DType.DATETIME: (str, "strings"),
|
164
167
|
}[self.dtype]
|
165
168
|
try:
|
166
|
-
self.values = [cast_fn(c) for c in self.values]
|
169
|
+
self.values = [cast_fn(c) if pd.notna(c) else None for c in self.values]
|
167
170
|
except ValueError:
|
168
171
|
raise ValueError(
|
169
172
|
f"All values must be convertible to {convertible_to} when dtype is '{self.dtype.value}'"
|
@@ -193,28 +196,25 @@ def _sample_table(
|
|
193
196
|
table_config: TableConfig,
|
194
197
|
primary_keys: dict[str, str] | None,
|
195
198
|
sample_size: int | None,
|
196
|
-
|
199
|
+
generated_data: dict[str, pd.DataFrame] | None,
|
197
200
|
temperature: float,
|
198
201
|
top_p: float,
|
199
202
|
batch_size: int,
|
200
203
|
previous_rows_size: int,
|
204
|
+
non_context_size: int | None,
|
201
205
|
llm_config: LLMConfig,
|
202
206
|
) -> pd.DataFrame:
|
203
|
-
assert (sample_size is None) != (context_data is None), (
|
204
|
-
"Exactly one of sample_size or context_data must be provided"
|
205
|
-
)
|
206
|
-
if sample_size is None:
|
207
|
-
sample_size = len(context_data)
|
208
207
|
table_rows_generator = _create_table_rows_generator(
|
209
208
|
table_name=table_name,
|
210
209
|
table_config=table_config,
|
211
210
|
primary_keys=primary_keys,
|
212
211
|
sample_size=sample_size,
|
213
|
-
|
212
|
+
generated_data=generated_data,
|
214
213
|
temperature=temperature,
|
215
214
|
top_p=top_p,
|
216
215
|
batch_size=batch_size,
|
217
216
|
previous_rows_size=previous_rows_size,
|
217
|
+
non_context_size=non_context_size,
|
218
218
|
llm_config=llm_config,
|
219
219
|
)
|
220
220
|
table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{table_name}`".ljust(45))
|
@@ -231,6 +231,7 @@ def _create_table_prompt(
|
|
231
231
|
batch_size: int | None,
|
232
232
|
foreign_keys: list[ForeignKeyConfig] | None,
|
233
233
|
context_data: pd.DataFrame | None,
|
234
|
+
non_context_data: dict[str, pd.DataFrame],
|
234
235
|
previous_rows: list[dict],
|
235
236
|
) -> str:
|
236
237
|
if batch_size is not None:
|
@@ -271,16 +272,29 @@ def _create_table_prompt(
|
|
271
272
|
prompt += f"## Context Table Data:\n\n"
|
272
273
|
prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"
|
273
274
|
|
275
|
+
# add non-context table names, primary keys and data
|
276
|
+
if non_context_data:
|
277
|
+
for fk in foreign_keys[1:]:
|
278
|
+
prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
|
279
|
+
|
280
|
+
prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
|
281
|
+
|
282
|
+
prompt += f"## Non-Context Table Data:\n\n"
|
283
|
+
prompt += f"{non_context_data[fk.referenced_table].to_json(orient='records', indent=2)}\n\n"
|
284
|
+
|
274
285
|
# add instructions
|
275
286
|
prompt += "\n## Instructions:\n\n"
|
276
287
|
if batch_size is not None:
|
277
288
|
prompt += f"Generate {batch_size} rows for the `{table_name}` table.\n\n"
|
278
|
-
|
289
|
+
|
290
|
+
if context_data is not None:
|
279
291
|
prompt += (
|
280
292
|
f"Generate data for the `{table_name}` table. "
|
281
|
-
f"The Foreign Key column may only contain values from Context Table Data. "
|
293
|
+
f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
|
294
|
+
f"The second Foreign Key column from Foreign Keys section (if exists) may only contain values from Non-Context Table Data. "
|
282
295
|
f"Pay attention to description of the Foreign Key column to understand the relationship.\n\n"
|
283
296
|
)
|
297
|
+
|
284
298
|
if previous_rows:
|
285
299
|
prompt += (
|
286
300
|
"Generate new rows that maintain consistency with the previous rows where appropriate. "
|
@@ -298,12 +312,13 @@ def _create_table_rows_generator(
|
|
298
312
|
table_name: str,
|
299
313
|
table_config: TableConfig,
|
300
314
|
primary_keys: dict[str, str] | None,
|
301
|
-
sample_size: int,
|
315
|
+
sample_size: int | None,
|
316
|
+
generated_data: dict[str, pd.DataFrame] | None,
|
302
317
|
temperature: float,
|
303
318
|
top_p: float,
|
304
|
-
context_data: pd.DataFrame | None,
|
305
319
|
batch_size: int,
|
306
320
|
previous_rows_size: int,
|
321
|
+
non_context_size: int | None,
|
307
322
|
llm_config: LLMConfig,
|
308
323
|
) -> Generator[dict]:
|
309
324
|
def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
|
@@ -311,14 +326,14 @@ def _create_table_rows_generator(
|
|
311
326
|
if column_config.values or column_config.dtype is DType.CATEGORY:
|
312
327
|
return Literal[tuple(column_config.values)]
|
313
328
|
return {
|
314
|
-
DType.INTEGER: int,
|
315
|
-
DType.FLOAT: float,
|
316
|
-
DType.STRING: str,
|
317
|
-
DType.BOOLEAN: bool,
|
329
|
+
DType.INTEGER: int | None,
|
330
|
+
DType.FLOAT: float | None,
|
331
|
+
DType.STRING: str | None,
|
332
|
+
DType.BOOLEAN: bool | None,
|
318
333
|
# response_format has limited support for JSON Schema features
|
319
334
|
# thus we represent dates and datetimes as strings
|
320
|
-
DType.DATE: str,
|
321
|
-
DType.DATETIME: str,
|
335
|
+
DType.DATE: str | None,
|
336
|
+
DType.DATETIME: str | None,
|
322
337
|
}[column_config.dtype]
|
323
338
|
|
324
339
|
fields = {}
|
@@ -368,6 +383,26 @@ def _create_table_rows_generator(
|
|
368
383
|
for i in range(0, len(data), batch_size):
|
369
384
|
yield data.iloc[i : i + batch_size]
|
370
385
|
|
386
|
+
# derive context data (if first foreign key is present) and harmonize sample size accordingly
|
387
|
+
context_data: pd.DataFrame | None = None
|
388
|
+
if table_config.foreign_keys:
|
389
|
+
context_table_name = table_config.foreign_keys[0].referenced_table
|
390
|
+
assert generated_data is not None
|
391
|
+
assert context_table_name in generated_data
|
392
|
+
context_data = generated_data[context_table_name]
|
393
|
+
sample_size = len(context_data)
|
394
|
+
assert sample_size is not None
|
395
|
+
|
396
|
+
# derive non-context data (if more than one foreign key is present)
|
397
|
+
non_context_data: dict[str, pd.DataFrame] = {}
|
398
|
+
if table_config.foreign_keys and len(table_config.foreign_keys) > 1:
|
399
|
+
assert generated_data is not None
|
400
|
+
assert non_context_size is not None
|
401
|
+
for fk in table_config.foreign_keys[1:]:
|
402
|
+
non_context_table_name = fk.referenced_table
|
403
|
+
assert non_context_table_name in generated_data
|
404
|
+
non_context_data[non_context_table_name] = generated_data[non_context_table_name]
|
405
|
+
|
371
406
|
# ensure model supports response_format and json schema
|
372
407
|
supported_params = litellm.get_supported_openai_params(model=llm_config.model)
|
373
408
|
assert "response_format" in supported_params
|
@@ -387,6 +422,11 @@ def _create_table_rows_generator(
|
|
387
422
|
yielded_sequences = 0
|
388
423
|
previous_rows = deque(maxlen=previous_rows_size)
|
389
424
|
for context_batch in batch_infinitely(context_data):
|
425
|
+
non_context_batch = (
|
426
|
+
{table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()}
|
427
|
+
if non_context_data
|
428
|
+
else None
|
429
|
+
)
|
390
430
|
prompt_kwargs = {
|
391
431
|
"table_name": table_name,
|
392
432
|
"table_description": table_config.description,
|
@@ -395,6 +435,7 @@ def _create_table_rows_generator(
|
|
395
435
|
"batch_size": batch_size if context_batch is None else None,
|
396
436
|
"foreign_keys": table_config.foreign_keys if context_batch is not None else None,
|
397
437
|
"context_data": context_batch if context_batch is not None else None,
|
438
|
+
"non_context_data": non_context_batch if non_context_batch else None,
|
398
439
|
"previous_rows": list(previous_rows),
|
399
440
|
}
|
400
441
|
prompt = _create_table_prompt(**prompt_kwargs)
|
@@ -429,10 +470,14 @@ def _convert_table_rows_generator_to_df(
|
|
429
470
|
for column_name, column_config in columns.items():
|
430
471
|
if column_config.dtype in [DType.DATE, DType.DATETIME]:
|
431
472
|
df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
|
432
|
-
elif column_config.dtype
|
433
|
-
df[column_name] = pd.to_numeric(df[column_name], errors="coerce",
|
473
|
+
elif column_config.dtype is DType.INTEGER:
|
474
|
+
df[column_name] = pd.to_numeric(df[column_name], errors="coerce", downcast="integer").astype(
|
475
|
+
"int64[pyarrow]"
|
476
|
+
)
|
477
|
+
elif column_config.dtype is DType.FLOAT:
|
478
|
+
df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("double[pyarrow]")
|
434
479
|
elif column_config.dtype is DType.BOOLEAN:
|
435
|
-
df[column_name] = df[column_name].astype(
|
480
|
+
df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("boolean[pyarrow]")
|
436
481
|
elif column_config.dtype is DType.CATEGORY:
|
437
482
|
df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
|
438
483
|
else:
|
@@ -472,7 +517,9 @@ def _build_dependency_graph(config: MockConfig) -> tuple[dict[str, list[str]], d
|
|
472
517
|
return child_to_parents, parent_to_children, subject_tables
|
473
518
|
|
474
519
|
|
475
|
-
def _build_execution_plan(
|
520
|
+
def _build_execution_plan(
|
521
|
+
parent_to_children: dict[str, list[str]], child_to_parents: dict[str, list[str]], subject_tables: list[str]
|
522
|
+
) -> list[str]:
|
476
523
|
execution_plan = []
|
477
524
|
bfs_queue = list(subject_tables)
|
478
525
|
processed = set()
|
@@ -482,6 +529,13 @@ def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tabl
|
|
482
529
|
if table_name in processed:
|
483
530
|
continue
|
484
531
|
|
532
|
+
# ensure all parents are processed before processing this table
|
533
|
+
unprocessed_parents = [p for p in child_to_parents[table_name] if p not in processed]
|
534
|
+
if unprocessed_parents:
|
535
|
+
bfs_queue.extend(unprocessed_parents)
|
536
|
+
bfs_queue.append(table_name)
|
537
|
+
continue
|
538
|
+
|
485
539
|
execution_plan.append(table_name)
|
486
540
|
processed.add(table_name)
|
487
541
|
|
@@ -564,10 +618,19 @@ def sample(
|
|
564
618
|
},
|
565
619
|
"primary_key": "customer_id",
|
566
620
|
},
|
621
|
+
"warehouses": {
|
622
|
+
"description": "Warehouses of a hardware store",
|
623
|
+
"columns": {
|
624
|
+
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
|
625
|
+
"name": {"prompt": "the name of the warehouse", "dtype": "string"},
|
626
|
+
},
|
627
|
+
"primary_key": "warehouse_id",
|
628
|
+
},
|
567
629
|
"orders": {
|
568
630
|
"description": "Orders of a Customer",
|
569
631
|
"columns": {
|
570
632
|
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
633
|
+
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
|
571
634
|
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
572
635
|
"text": {"prompt": "order text description", "dtype": "string"},
|
573
636
|
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
@@ -577,8 +640,12 @@ def sample(
|
|
577
640
|
{
|
578
641
|
"column": "customer_id",
|
579
642
|
"referenced_table": "customers",
|
580
|
-
"description": "each customer has anywhere between
|
581
|
-
}
|
643
|
+
"description": "each customer has anywhere between 2 and 3 orders",
|
644
|
+
},
|
645
|
+
{
|
646
|
+
"column": "warehouse_id",
|
647
|
+
"referenced_table": "warehouses",
|
648
|
+
},
|
582
649
|
],
|
583
650
|
},
|
584
651
|
"items": {
|
@@ -593,13 +660,14 @@ def sample(
|
|
593
660
|
{
|
594
661
|
"column": "order_id",
|
595
662
|
"referenced_table": "orders",
|
596
|
-
"description": "each order has between
|
663
|
+
"description": "each order has between 1 and 2 items",
|
597
664
|
}
|
598
665
|
],
|
599
666
|
},
|
600
667
|
}
|
601
668
|
data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
|
602
669
|
df_customers = data["customers"]
|
670
|
+
df_warehouses = data["warehouses"]
|
603
671
|
df_orders = data["orders"]
|
604
672
|
df_items = data["items"]
|
605
673
|
```
|
@@ -611,7 +679,7 @@ def sample(
|
|
611
679
|
primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
|
612
680
|
|
613
681
|
child_to_parents, parent_to_children, subject_tables = _build_dependency_graph(config)
|
614
|
-
execution_plan: list[str] = _build_execution_plan(parent_to_children, subject_tables)
|
682
|
+
execution_plan: list[str] = _build_execution_plan(parent_to_children, child_to_parents, subject_tables)
|
615
683
|
|
616
684
|
results: dict[str, pd.DataFrame] = {}
|
617
685
|
|
@@ -624,26 +692,27 @@ def sample(
|
|
624
692
|
table_config=table_config,
|
625
693
|
primary_keys=None,
|
626
694
|
sample_size=sample_size[table_name],
|
627
|
-
|
695
|
+
generated_data=None,
|
628
696
|
temperature=temperature,
|
629
697
|
top_p=top_p,
|
630
|
-
batch_size=
|
631
|
-
previous_rows_size=
|
698
|
+
batch_size=30, # generate 30 subjects at a time
|
699
|
+
previous_rows_size=10, # present 10 previously generated rows to the LLM
|
700
|
+
non_context_size=None,
|
632
701
|
llm_config=LLMConfig(model=model, api_key=api_key),
|
633
702
|
)
|
634
703
|
else:
|
635
704
|
# sequential table
|
636
|
-
referenced_table = table_config.foreign_keys[0].referenced_table
|
637
705
|
df = _sample_table(
|
638
706
|
table_name=table_name,
|
639
707
|
table_config=table_config,
|
640
708
|
primary_keys=primary_keys,
|
641
709
|
sample_size=None,
|
642
|
-
|
710
|
+
generated_data=results,
|
643
711
|
temperature=temperature,
|
644
712
|
top_p=top_p,
|
645
713
|
batch_size=1, # generate one sequence at a time
|
646
|
-
previous_rows_size=
|
714
|
+
previous_rows_size=10, # present 10 previously generated rows to the LLM
|
715
|
+
non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
|
647
716
|
llm_config=LLMConfig(model=model, api_key=api_key),
|
648
717
|
)
|
649
718
|
results[table_name] = df
|
@@ -1,11 +1,28 @@
|
|
1
1
|
[project]
|
2
2
|
name = "mostlyai-mock"
|
3
|
-
version = "0.0.5"
|
3
|
+
version = "0.0.7"
|
4
4
|
description = "Synthetic Mock Data"
|
5
5
|
authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
|
6
6
|
requires-python = ">=3.10"
|
7
7
|
readme = "README.md"
|
8
8
|
license = "Apache-2.0"
|
9
|
+
classifiers = [
|
10
|
+
"Development Status :: 4 - Beta",
|
11
|
+
"Intended Audience :: Developers",
|
12
|
+
"Intended Audience :: Science/Research",
|
13
|
+
"Intended Audience :: Information Technology",
|
14
|
+
"Intended Audience :: Financial and Insurance Industry",
|
15
|
+
"Intended Audience :: Healthcare Industry",
|
16
|
+
"Intended Audience :: Telecommunications Industry",
|
17
|
+
"Programming Language :: Python :: 3.10",
|
18
|
+
"Programming Language :: Python :: 3.11",
|
19
|
+
"Programming Language :: Python :: 3.12",
|
20
|
+
"Programming Language :: Python :: 3.13",
|
21
|
+
"License :: OSI Approved :: Apache Software License",
|
22
|
+
"Operating System :: OS Independent",
|
23
|
+
"Topic :: Software Development :: Libraries",
|
24
|
+
"Typing :: Typed",
|
25
|
+
]
|
9
26
|
dependencies = [
|
10
27
|
"pydantic>=2.0.0,<3.0.0",
|
11
28
|
"numpy>=1.26.3",
|
mostlyai_mock-0.0.5/PKG-INFO
DELETED
@@ -1,117 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: mostlyai-mock
|
3
|
-
Version: 0.0.5
|
4
|
-
Summary: Synthetic Mock Data
|
5
|
-
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
|
-
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
7
|
-
Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
|
8
|
-
Author-email: MOSTLY AI <dev@mostly.ai>
|
9
|
-
License-Expression: Apache-2.0
|
10
|
-
License-File: LICENSE
|
11
|
-
Requires-Python: >=3.10
|
12
|
-
Requires-Dist: litellm>=1.67.0
|
13
|
-
Requires-Dist: numpy>=1.26.3
|
14
|
-
Requires-Dist: pandas>=2.0.0
|
15
|
-
Requires-Dist: pyarrow>=14.0.0
|
16
|
-
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
17
|
-
Description-Content-Type: text/markdown
|
18
|
-
|
19
|
-
# Synthetic Mock Data 🔮
|
20
|
-
|
21
|
-
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)   
|
22
|
-
|
23
|
-
Create data out of nothing. Prompt LLMs for Tabular Data.
|
24
|
-
|
25
|
-
## Installation
|
26
|
-
|
27
|
-
The latest release of `mostlyai-mock` can be installed via pip:
|
28
|
-
|
29
|
-
```bash
|
30
|
-
pip install -U mostlyai-mock
|
31
|
-
```
|
32
|
-
|
33
|
-
Note: An API key to an LLM endpoint, with structured response, is required. It is recommended to set such a key as an environment variable (e.g. `OPENAI_API_KEY`, `GEMINI_API_KEY`, etc.). Alternatively, the key needs to be passed to every call to the library itself via the parameter `api_key`.
|
34
|
-
|
35
|
-
## Quick Start
|
36
|
-
|
37
|
-
### Single Table
|
38
|
-
|
39
|
-
```python
|
40
|
-
from mostlyai import mock
|
41
|
-
|
42
|
-
tables = {
|
43
|
-
"guests": {
|
44
|
-
"description": "Guests of an Alpine ski hotel in Austria",
|
45
|
-
"columns": {
|
46
|
-
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
47
|
-
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
48
|
-
"gender": {"dtype": "category", "values": ["male", "female"]},
|
49
|
-
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
50
|
-
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
51
|
-
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
52
|
-
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
53
|
-
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
54
|
-
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
55
|
-
},
|
56
|
-
}
|
57
|
-
}
|
58
|
-
df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
|
59
|
-
print(df)
|
60
|
-
```
|
61
|
-
|
62
|
-
### Multiple Tables
|
63
|
-
|
64
|
-
```python
|
65
|
-
from mostlyai import mock
|
66
|
-
|
67
|
-
tables = {
|
68
|
-
"customers": {
|
69
|
-
"description": "Customers of a hardware store",
|
70
|
-
"columns": {
|
71
|
-
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
72
|
-
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
73
|
-
},
|
74
|
-
"primary_key": "customer_id",
|
75
|
-
},
|
76
|
-
"orders": {
|
77
|
-
"description": "Orders of a Customer",
|
78
|
-
"columns": {
|
79
|
-
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
80
|
-
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
81
|
-
"text": {"prompt": "order text description", "dtype": "string"},
|
82
|
-
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
83
|
-
},
|
84
|
-
"primary_key": "order_id",
|
85
|
-
"foreign_keys": [
|
86
|
-
{
|
87
|
-
"column": "customer_id",
|
88
|
-
"referenced_table": "customers",
|
89
|
-
"description": "each customer has anywhere between 1 and 3 orders",
|
90
|
-
}
|
91
|
-
],
|
92
|
-
},
|
93
|
-
"items": {
|
94
|
-
"description": "Items in an Order",
|
95
|
-
"columns": {
|
96
|
-
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
97
|
-
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
98
|
-
"name": {"prompt": "the name of the item", "dtype": "string"},
|
99
|
-
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
100
|
-
},
|
101
|
-
"foreign_keys": [
|
102
|
-
{
|
103
|
-
"column": "order_id",
|
104
|
-
"referenced_table": "orders",
|
105
|
-
"description": "each order has between 2 and 5 items",
|
106
|
-
}
|
107
|
-
],
|
108
|
-
},
|
109
|
-
}
|
110
|
-
data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
|
111
|
-
df_customers = data["customers"]
|
112
|
-
df_orders = data["orders"]
|
113
|
-
df_items = data["items"]
|
114
|
-
print(df_customers)
|
115
|
-
print(df_orders)
|
116
|
-
print(df_items)
|
117
|
-
```
|
mostlyai_mock-0.0.5/README.md
DELETED
@@ -1,99 +0,0 @@
|
|
1
|
-
# Synthetic Mock Data 🔮
|
2
|
-
|
3
|
-
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)   
|
4
|
-
|
5
|
-
Create data out of nothing. Prompt LLMs for Tabular Data.
|
6
|
-
|
7
|
-
## Installation
|
8
|
-
|
9
|
-
The latest release of `mostlyai-mock` can be installed via pip:
|
10
|
-
|
11
|
-
```bash
|
12
|
-
pip install -U mostlyai-mock
|
13
|
-
```
|
14
|
-
|
15
|
-
Note: An API key to an LLM endpoint, with structured response, is required. It is recommended to set such a key as an environment variable (e.g. `OPENAI_API_KEY`, `GEMINI_API_KEY`, etc.). Alternatively, the key needs to be passed to every call to the library itself via the parameter `api_key`.
|
16
|
-
|
17
|
-
## Quick Start
|
18
|
-
|
19
|
-
### Single Table
|
20
|
-
|
21
|
-
```python
|
22
|
-
from mostlyai import mock
|
23
|
-
|
24
|
-
tables = {
|
25
|
-
"guests": {
|
26
|
-
"description": "Guests of an Alpine ski hotel in Austria",
|
27
|
-
"columns": {
|
28
|
-
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
29
|
-
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
30
|
-
"gender": {"dtype": "category", "values": ["male", "female"]},
|
31
|
-
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
32
|
-
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
33
|
-
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
34
|
-
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
35
|
-
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
36
|
-
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
37
|
-
},
|
38
|
-
}
|
39
|
-
}
|
40
|
-
df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
|
41
|
-
print(df)
|
42
|
-
```
|
43
|
-
|
44
|
-
### Multiple Tables
|
45
|
-
|
46
|
-
```python
|
47
|
-
from mostlyai import mock
|
48
|
-
|
49
|
-
tables = {
|
50
|
-
"customers": {
|
51
|
-
"description": "Customers of a hardware store",
|
52
|
-
"columns": {
|
53
|
-
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
54
|
-
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
55
|
-
},
|
56
|
-
"primary_key": "customer_id",
|
57
|
-
},
|
58
|
-
"orders": {
|
59
|
-
"description": "Orders of a Customer",
|
60
|
-
"columns": {
|
61
|
-
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
62
|
-
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
63
|
-
"text": {"prompt": "order text description", "dtype": "string"},
|
64
|
-
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
65
|
-
},
|
66
|
-
"primary_key": "order_id",
|
67
|
-
"foreign_keys": [
|
68
|
-
{
|
69
|
-
"column": "customer_id",
|
70
|
-
"referenced_table": "customers",
|
71
|
-
"description": "each customer has anywhere between 1 and 3 orders",
|
72
|
-
}
|
73
|
-
],
|
74
|
-
},
|
75
|
-
"items": {
|
76
|
-
"description": "Items in an Order",
|
77
|
-
"columns": {
|
78
|
-
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
79
|
-
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
80
|
-
"name": {"prompt": "the name of the item", "dtype": "string"},
|
81
|
-
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
82
|
-
},
|
83
|
-
"foreign_keys": [
|
84
|
-
{
|
85
|
-
"column": "order_id",
|
86
|
-
"referenced_table": "orders",
|
87
|
-
"description": "each order has between 2 and 5 items",
|
88
|
-
}
|
89
|
-
],
|
90
|
-
},
|
91
|
-
}
|
92
|
-
data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
|
93
|
-
df_customers = data["customers"]
|
94
|
-
df_orders = data["orders"]
|
95
|
-
df_items = data["items"]
|
96
|
-
print(df_customers)
|
97
|
-
print(df_orders)
|
98
|
-
print(df_items)
|
99
|
-
```
|
File without changes
|
File without changes
|