mostlyai-mock 0.0.4__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mostlyai_mock-0.0.6/PKG-INFO +186 -0
- mostlyai_mock-0.0.6/README.md +153 -0
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.6}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.6}/mostlyai/mock/core.py +118 -30
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.6}/pyproject.toml +18 -1
- mostlyai_mock-0.0.4/PKG-INFO +0 -98
- mostlyai_mock-0.0.4/README.md +0 -80
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.6}/.gitignore +0 -0
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.6}/LICENSE +0 -0
@@ -0,0 +1,186 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: mostlyai-mock
|
3
|
+
Version: 0.0.6
|
4
|
+
Summary: Synthetic Mock Data
|
5
|
+
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
|
+
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
7
|
+
Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
|
8
|
+
Author-email: MOSTLY AI <dev@mostly.ai>
|
9
|
+
License-Expression: Apache-2.0
|
10
|
+
License-File: LICENSE
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
14
|
+
Classifier: Intended Audience :: Healthcare Industry
|
15
|
+
Classifier: Intended Audience :: Information Technology
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
17
|
+
Classifier: Intended Audience :: Telecommunications Industry
|
18
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
19
|
+
Classifier: Operating System :: OS Independent
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
24
|
+
Classifier: Topic :: Software Development :: Libraries
|
25
|
+
Classifier: Typing :: Typed
|
26
|
+
Requires-Python: >=3.10
|
27
|
+
Requires-Dist: litellm>=1.67.0
|
28
|
+
Requires-Dist: numpy>=1.26.3
|
29
|
+
Requires-Dist: pandas>=2.0.0
|
30
|
+
Requires-Dist: pyarrow>=14.0.0
|
31
|
+
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
32
|
+
Description-Content-Type: text/markdown
|
33
|
+
|
34
|
+
# Synthetic Mock Data 🔮
|
35
|
+
|
36
|
+
[Documentation](https://mostly-ai.github.io/mostlyai-mock/) [PyPI](https://pypi.org/project/mostlyai-mock/)
|
37
|
+
|
38
|
+
Create data out of nothing. Prompt LLMs for Tabular Data.
|
39
|
+
|
40
|
+
## Key Features
|
41
|
+
|
42
|
+
* A light-weight python client for prompting LLMs for mixed-type tabular data
|
43
|
+
* Select from a range of LLM endpoints that provide structured output
|
44
|
+
* Supports single-table as well as multi-table scenarios.
|
45
|
+
* Supports a variety of data types: `string`, `category`, `integer`, `float`, `boolean`, `date`, and `datetime`.
|
46
|
+
* Specify context, distributions and rules via dataset-, table- or column-level prompts.
|
47
|
+
* Tailor the diversity and realism of your generated data via temperature and top_p.
|
48
|
+
|
49
|
+
## Getting Started
|
50
|
+
|
51
|
+
1. Install the latest version of the `mostlyai-mock` python package.
|
52
|
+
|
53
|
+
```bash
|
54
|
+
pip install -U mostlyai-mock
|
55
|
+
```
|
56
|
+
|
57
|
+
2. Set the API key of your LLM endpoint (if not done yet)
|
58
|
+
|
59
|
+
```python
|
60
|
+
import os
|
61
|
+
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
62
|
+
# os.environ["GEMINI_API_KEY"] = "your-api-key"
|
63
|
+
# os.environ["GROQ_API_KEY"] = "your-api-key"
|
64
|
+
```
|
65
|
+
|
66
|
+
Note: You will need to obtain your API key directly from the LLM service provider (e.g. for OpenAI from [here](https://platform.openai.com/api-keys)). The LLM endpoint will be determined by the chosen `model` when making calls to `mock.sample`.
|
67
|
+
|
68
|
+
3. Create your first basic synthetic table from scratch
|
69
|
+
|
70
|
+
```python
|
71
|
+
from mostlyai import mock
|
72
|
+
|
73
|
+
tables = {
|
74
|
+
"guests": {
|
75
|
+
"description": "Guests of an Alpine ski hotel in Austria",
|
76
|
+
"columns": {
|
77
|
+
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
78
|
+
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
79
|
+
"gender": {"dtype": "category", "values": ["male", "female"]},
|
80
|
+
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
81
|
+
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
82
|
+
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
83
|
+
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
84
|
+
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
85
|
+
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
86
|
+
},
|
87
|
+
}
|
88
|
+
}
|
89
|
+
df = mock.sample(
|
90
|
+
tables=tables, # provide table and column definitions
|
91
|
+
sample_size=10, # generate 10 records
|
92
|
+
model="openai/gpt-4.1-nano", # select the LLM model (optional)
|
93
|
+
)
|
94
|
+
print(df)
|
95
|
+
# nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
|
96
|
+
# 0 AT Anna Müller female 29 1994-09-15 2025-01-05 14:30:00 True 350.0 101
|
97
|
+
# 1 DE Johann Schmidt male 45 1978-11-20 2025-01-06 16:45:00 False 250.0 102
|
98
|
+
# 2 CH Lara Meier female 32 1991-04-12 2025-01-05 12:00:00 True 400.0 103
|
99
|
+
# 3 IT Marco Rossi male 38 1985-02-25 2025-01-07 09:15:00 False 280.0 201
|
100
|
+
# 4 FR Claire Dupont female 24 2000-07-08 2025-01-07 11:20:00 False 220.0 202
|
101
|
+
# 5 AT Felix Gruber male 52 1972-01-10 2025-01-06 17:50:00 True 375.0 203
|
102
|
+
# 6 DE Sophie Becker female 27 1996-03-30 2025-01-08 08:30:00 False 230.0 204
|
103
|
+
# 7 CH Max Keller male 31 1992-05-16 2025-01-09 14:10:00 False 290.0 101
|
104
|
+
# 8 IT Giulia Bianchi female 36 1988-08-19 2025-01-05 15:55:00 True 410.0 102
|
105
|
+
# 9 FR Louis Martin male 44 1980-12-05 2025-01-07 10:40:00 False 270.0 103
|
106
|
+
```
|
107
|
+
|
108
|
+
4. Create your first multi-table synthetic dataset
|
109
|
+
|
110
|
+
```python
|
111
|
+
from mostlyai import mock
|
112
|
+
|
113
|
+
tables = {
|
114
|
+
"customers": {
|
115
|
+
"description": "Customers of a hardware store",
|
116
|
+
"columns": {
|
117
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
118
|
+
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
119
|
+
},
|
120
|
+
"primary_key": "customer_id",
|
121
|
+
},
|
122
|
+
"orders": {
|
123
|
+
"description": "Orders of a Customer",
|
124
|
+
"columns": {
|
125
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
126
|
+
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
127
|
+
"text": {"prompt": "order text description", "dtype": "string"},
|
128
|
+
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
129
|
+
},
|
130
|
+
"primary_key": "order_id",
|
131
|
+
"foreign_keys": [
|
132
|
+
{
|
133
|
+
"column": "customer_id",
|
134
|
+
"referenced_table": "customers",
|
135
|
+
"description": "each customer has anywhere between 2 and 3 orders",
|
136
|
+
}
|
137
|
+
],
|
138
|
+
},
|
139
|
+
"items": {
|
140
|
+
"description": "Items in an Order",
|
141
|
+
"columns": {
|
142
|
+
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
143
|
+
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
144
|
+
"name": {"prompt": "the name of the item", "dtype": "string"},
|
145
|
+
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
146
|
+
},
|
147
|
+
"foreign_keys": [
|
148
|
+
{
|
149
|
+
"column": "order_id",
|
150
|
+
"referenced_table": "orders",
|
151
|
+
"description": "each order has between 1 and 2 items",
|
152
|
+
}
|
153
|
+
],
|
154
|
+
},
|
155
|
+
}
|
156
|
+
data = mock.sample(
|
157
|
+
tables=tables,
|
158
|
+
sample_size=2,
|
159
|
+
model="openai/gpt-4.1"
|
160
|
+
)
|
161
|
+
print(data["customers"])
|
162
|
+
# customer_id name
|
163
|
+
# 0 1 Michael Torres
|
164
|
+
# 1 2 Elaine Kim
|
165
|
+
print(data["orders"])
|
166
|
+
# customer_id order_id text amount
|
167
|
+
# 0 1 ORD20240612001 Home office desk and ergonomic chair bundle 412.95
|
168
|
+
# 1 1 ORD20240517322 Wireless noise-cancelling headphones 226.49
|
169
|
+
# 2 1 ORD20240430307 Smart LED desk lamp with USB charging port 69.99
|
170
|
+
# 3 2 ORD20240614015 Eco-friendly bamboo kitchen utensil set 39.95
|
171
|
+
# 4 2 ORD20240528356 Air fryer with digital touch screen, 5-quart c... 129.99
|
172
|
+
# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
|
173
|
+
print(data["items"])
|
174
|
+
# item_id order_id name price
|
175
|
+
# 0 ITEM100001A ORD20240612001 Ergonomic Mesh Office Chair 179.99
|
176
|
+
# 1 ITEM100001B ORD20240612001 Adjustable Home Office Desk 232.96
|
177
|
+
# 2 ITEM100002A ORD20240517322 Wireless Noise-Cancelling Headphones 226.49
|
178
|
+
# 3 ITEM100003A ORD20240430307 Smart LED Desk Lamp 59.99
|
179
|
+
# 4 ITEM100003B ORD20240430307 USB Charging Cable (Desk Lamp Compatible) 10.0
|
180
|
+
# 5 ITEM100004A ORD20240614015 Bamboo Cooking Spoon 13.49
|
181
|
+
# 6 ITEM100004B ORD20240614015 Bamboo Slotted Turner 12.99
|
182
|
+
# 7 ITEM100005A ORD20240528356 Digital Air Fryer (5-Quart, Black) 115.99
|
183
|
+
# 8 ITEM100005B ORD20240528356 Silicone Liner for Air Fryer (5-Quart) 13.99
|
184
|
+
# 9 ITEM100006A ORD20240510078 Double-Walled Glass Coffee Mug (12oz) 13.75
|
185
|
+
# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
|
186
|
+
```
|
@@ -0,0 +1,153 @@
|
|
1
|
+
# Synthetic Mock Data 🔮
|
2
|
+
|
3
|
+
[Documentation](https://mostly-ai.github.io/mostlyai-mock/) [PyPI](https://pypi.org/project/mostlyai-mock/)
|
4
|
+
|
5
|
+
Create data out of nothing. Prompt LLMs for Tabular Data.
|
6
|
+
|
7
|
+
## Key Features
|
8
|
+
|
9
|
+
* A light-weight python client for prompting LLMs for mixed-type tabular data
|
10
|
+
* Select from a range of LLM endpoints that provide structured output
|
11
|
+
* Supports single-table as well as multi-table scenarios.
|
12
|
+
* Supports a variety of data types: `string`, `category`, `integer`, `float`, `boolean`, `date`, and `datetime`.
|
13
|
+
* Specify context, distributions and rules via dataset-, table- or column-level prompts.
|
14
|
+
* Tailor the diversity and realism of your generated data via temperature and top_p.
|
15
|
+
|
16
|
+
## Getting Started
|
17
|
+
|
18
|
+
1. Install the latest version of the `mostlyai-mock` python package.
|
19
|
+
|
20
|
+
```bash
|
21
|
+
pip install -U mostlyai-mock
|
22
|
+
```
|
23
|
+
|
24
|
+
2. Set the API key of your LLM endpoint (if not done yet)
|
25
|
+
|
26
|
+
```python
|
27
|
+
import os
|
28
|
+
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
29
|
+
# os.environ["GEMINI_API_KEY"] = "your-api-key"
|
30
|
+
# os.environ["GROQ_API_KEY"] = "your-api-key"
|
31
|
+
```
|
32
|
+
|
33
|
+
Note: You will need to obtain your API key directly from the LLM service provider (e.g. for OpenAI from [here](https://platform.openai.com/api-keys)). The LLM endpoint will be determined by the chosen `model` when making calls to `mock.sample`.
|
34
|
+
|
35
|
+
3. Create your first basic synthetic table from scratch
|
36
|
+
|
37
|
+
```python
|
38
|
+
from mostlyai import mock
|
39
|
+
|
40
|
+
tables = {
|
41
|
+
"guests": {
|
42
|
+
"description": "Guests of an Alpine ski hotel in Austria",
|
43
|
+
"columns": {
|
44
|
+
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
45
|
+
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
46
|
+
"gender": {"dtype": "category", "values": ["male", "female"]},
|
47
|
+
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
48
|
+
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
49
|
+
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
50
|
+
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
51
|
+
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
52
|
+
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
53
|
+
},
|
54
|
+
}
|
55
|
+
}
|
56
|
+
df = mock.sample(
|
57
|
+
tables=tables, # provide table and column definitions
|
58
|
+
sample_size=10, # generate 10 records
|
59
|
+
model="openai/gpt-4.1-nano", # select the LLM model (optional)
|
60
|
+
)
|
61
|
+
print(df)
|
62
|
+
# nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
|
63
|
+
# 0 AT Anna Müller female 29 1994-09-15 2025-01-05 14:30:00 True 350.0 101
|
64
|
+
# 1 DE Johann Schmidt male 45 1978-11-20 2025-01-06 16:45:00 False 250.0 102
|
65
|
+
# 2 CH Lara Meier female 32 1991-04-12 2025-01-05 12:00:00 True 400.0 103
|
66
|
+
# 3 IT Marco Rossi male 38 1985-02-25 2025-01-07 09:15:00 False 280.0 201
|
67
|
+
# 4 FR Claire Dupont female 24 2000-07-08 2025-01-07 11:20:00 False 220.0 202
|
68
|
+
# 5 AT Felix Gruber male 52 1972-01-10 2025-01-06 17:50:00 True 375.0 203
|
69
|
+
# 6 DE Sophie Becker female 27 1996-03-30 2025-01-08 08:30:00 False 230.0 204
|
70
|
+
# 7 CH Max Keller male 31 1992-05-16 2025-01-09 14:10:00 False 290.0 101
|
71
|
+
# 8 IT Giulia Bianchi female 36 1988-08-19 2025-01-05 15:55:00 True 410.0 102
|
72
|
+
# 9 FR Louis Martin male 44 1980-12-05 2025-01-07 10:40:00 False 270.0 103
|
73
|
+
```
|
74
|
+
|
75
|
+
4. Create your first multi-table synthetic dataset
|
76
|
+
|
77
|
+
```python
|
78
|
+
from mostlyai import mock
|
79
|
+
|
80
|
+
tables = {
|
81
|
+
"customers": {
|
82
|
+
"description": "Customers of a hardware store",
|
83
|
+
"columns": {
|
84
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
85
|
+
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
86
|
+
},
|
87
|
+
"primary_key": "customer_id",
|
88
|
+
},
|
89
|
+
"orders": {
|
90
|
+
"description": "Orders of a Customer",
|
91
|
+
"columns": {
|
92
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
93
|
+
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
94
|
+
"text": {"prompt": "order text description", "dtype": "string"},
|
95
|
+
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
96
|
+
},
|
97
|
+
"primary_key": "order_id",
|
98
|
+
"foreign_keys": [
|
99
|
+
{
|
100
|
+
"column": "customer_id",
|
101
|
+
"referenced_table": "customers",
|
102
|
+
"description": "each customer has anywhere between 2 and 3 orders",
|
103
|
+
}
|
104
|
+
],
|
105
|
+
},
|
106
|
+
"items": {
|
107
|
+
"description": "Items in an Order",
|
108
|
+
"columns": {
|
109
|
+
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
110
|
+
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
111
|
+
"name": {"prompt": "the name of the item", "dtype": "string"},
|
112
|
+
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
113
|
+
},
|
114
|
+
"foreign_keys": [
|
115
|
+
{
|
116
|
+
"column": "order_id",
|
117
|
+
"referenced_table": "orders",
|
118
|
+
"description": "each order has between 1 and 2 items",
|
119
|
+
}
|
120
|
+
],
|
121
|
+
},
|
122
|
+
}
|
123
|
+
data = mock.sample(
|
124
|
+
tables=tables,
|
125
|
+
sample_size=2,
|
126
|
+
model="openai/gpt-4.1"
|
127
|
+
)
|
128
|
+
print(data["customers"])
|
129
|
+
# customer_id name
|
130
|
+
# 0 1 Michael Torres
|
131
|
+
# 1 2 Elaine Kim
|
132
|
+
print(data["orders"])
|
133
|
+
# customer_id order_id text amount
|
134
|
+
# 0 1 ORD20240612001 Home office desk and ergonomic chair bundle 412.95
|
135
|
+
# 1 1 ORD20240517322 Wireless noise-cancelling headphones 226.49
|
136
|
+
# 2 1 ORD20240430307 Smart LED desk lamp with USB charging port 69.99
|
137
|
+
# 3 2 ORD20240614015 Eco-friendly bamboo kitchen utensil set 39.95
|
138
|
+
# 4 2 ORD20240528356 Air fryer with digital touch screen, 5-quart c... 129.99
|
139
|
+
# 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
|
140
|
+
print(data["items"])
|
141
|
+
# item_id order_id name price
|
142
|
+
# 0 ITEM100001A ORD20240612001 Ergonomic Mesh Office Chair 179.99
|
143
|
+
# 1 ITEM100001B ORD20240612001 Adjustable Home Office Desk 232.96
|
144
|
+
# 2 ITEM100002A ORD20240517322 Wireless Noise-Cancelling Headphones 226.49
|
145
|
+
# 3 ITEM100003A ORD20240430307 Smart LED Desk Lamp 59.99
|
146
|
+
# 4 ITEM100003B ORD20240430307 USB Charging Cable (Desk Lamp Compatible) 10.0
|
147
|
+
# 5 ITEM100004A ORD20240614015 Bamboo Cooking Spoon 13.49
|
148
|
+
# 6 ITEM100004B ORD20240614015 Bamboo Slotted Turner 12.99
|
149
|
+
# 7 ITEM100005A ORD20240528356 Digital Air Fryer (5-Quart, Black) 115.99
|
150
|
+
# 8 ITEM100005B ORD20240528356 Silicone Liner for Air Fryer (5-Quart) 13.99
|
151
|
+
# 9 ITEM100006A ORD20240510078 Double-Walled Glass Coffee Mug (12oz) 13.75
|
152
|
+
# 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
|
153
|
+
```
|
@@ -89,6 +89,31 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
|
|
89
89
|
|
90
90
|
return tables
|
91
91
|
|
92
|
+
@model_validator(mode="after")
def validate_no_circular_dependencies(self) -> MockConfig:
    """Reject configurations whose foreign keys form a cycle between tables.

    Each table is mapped to the tables referenced by its foreign keys, and
    that mapping is walked depth-first starting from every table.

    Returns:
        The validated config itself (``self``).

    Raises:
        ValueError: If following foreign keys from a table leads back to a
            table already on the current path; the message spells out the
            offending cycle.
    """
    parents_of = {
        name: [fk.referenced_table for fk in cfg.foreign_keys]
        for name, cfg in self.root.items()
    }
    # tables already expanded once; their parents need not be walked again
    visited = set()

    def walk(current: str, trail: tuple[str, ...]) -> None:
        # `trail` holds the tables on the path from the start table to `current`
        if current in trail:
            loop = trail[trail.index(current):] + (current,)
            raise ValueError(f"Circular dependency detected: {' -> '.join(loop)}")
        if current in visited:
            return
        visited.add(current)
        extended = trail + (current,)
        for parent in parents_of[current]:
            walk(parent, extended)

    for name in parents_of:
        walk(name, ())

    return self
|
116
|
+
|
92
117
|
|
93
118
|
class TableConfig(BaseModel):
|
94
119
|
description: str = ""
|
@@ -234,7 +259,7 @@ def _create_table_prompt(
|
|
234
259
|
# add previous rows as context to help the LLM generate consistent data
|
235
260
|
if previous_rows:
|
236
261
|
prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
|
237
|
-
prompt += json.dumps(previous_rows, indent=2)
|
262
|
+
prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
|
238
263
|
|
239
264
|
# add context table name, primary key and data
|
240
265
|
if context_data is not None:
|
@@ -252,12 +277,14 @@ def _create_table_prompt(
|
|
252
277
|
prompt += f"Generate {batch_size} rows for the `{table_name}` table.\n\n"
|
253
278
|
else:
|
254
279
|
prompt += (
|
255
|
-
f"Generate
|
256
|
-
f"The Foreign Key column may only contain values from Context Table Data
|
280
|
+
f"Generate data for the `{table_name}` table. "
|
281
|
+
f"The Foreign Key column may only contain values from Context Table Data. "
|
282
|
+
f"Pay attention to description of the Foreign Key column to understand the relationship.\n\n"
|
257
283
|
)
|
258
284
|
if previous_rows:
|
259
285
|
prompt += (
|
260
286
|
"Generate new rows that maintain consistency with the previous rows where appropriate. "
|
287
|
+
"Don't copy previous rows in the output. "
|
261
288
|
"Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
|
262
289
|
)
|
263
290
|
prompt += f"Do not use code to generate the data.\n\n"
|
@@ -426,6 +453,44 @@ def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig
|
|
426
453
|
return sample_size
|
427
454
|
|
428
455
|
|
456
|
+
def _build_dependency_graph(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
    """Derive the foreign-key dependency structure of the configured tables.

    Args:
        config: The mock configuration; ``config.root`` maps table names to
            table configs, each exposing ``foreign_keys``.

    Returns:
        A 3-tuple ``(child_to_parents, parent_to_children, subject_tables)``:
        ``child_to_parents`` maps every table to the tables its foreign keys
        reference (one entry per foreign key, in declaration order),
        ``parent_to_children`` maps every table to the tables referencing it,
        and ``subject_tables`` lists the tables without any foreign key.
    """
    table_names = list(config.root)
    # every table gets an entry in both maps, even if it has no relations
    child_to_parents = {name: [] for name in table_names}
    parent_to_children = {name: [] for name in table_names}

    for child, child_config in config.root.items():
        for foreign_key in child_config.foreign_keys or ():
            parent = foreign_key.referenced_table
            child_to_parents[child].append(parent)
            parent_to_children[parent].append(child)

    subject_tables = [name for name in table_names if not child_to_parents[name]]
    return child_to_parents, parent_to_children, subject_tables
|
473
|
+
|
474
|
+
|
475
|
+
def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tables: list[str]) -> list[str]:
    """Order the tables so that every table comes after all of its parents.

    A plain breadth-first walk would schedule a child as soon as its first
    parent has been processed. For a table with several foreign keys that can
    place it ahead of a parent that is reached later, so its context data
    would not exist yet when it is sampled. This uses in-degree counting
    (Kahn's algorithm) instead: a child is released only once all of its
    parent links have been processed. For tree-shaped schemas the resulting
    order is the same as a breadth-first walk from the subject tables.

    Args:
        parent_to_children: Maps each table to the tables referencing it; a
            child appears once per foreign key pointing at that parent.
        subject_tables: Tables without foreign keys, used as starting points.

    Returns:
        Table names in generation order.
    """
    from collections import deque

    # number of parent links per table that have not been processed yet
    pending_parents: dict[str, int] = {}
    for children in parent_to_children.values():
        for child in children:
            pending_parents[child] = pending_parents.get(child, 0) + 1

    execution_plan: list[str] = []
    processed: set[str] = set()
    queue = deque(subject_tables)

    while queue:
        table_name = queue.popleft()
        if table_name in processed:
            continue

        execution_plan.append(table_name)
        processed.add(table_name)

        for child in parent_to_children[table_name]:
            pending_parents[child] -= 1
            # release the child only after its last parent link is done
            if pending_parents[child] == 0 and child not in processed:
                queue.append(child)
    return execution_plan
|
492
|
+
|
493
|
+
|
429
494
|
def sample(
|
430
495
|
*,
|
431
496
|
tables: dict[str, dict],
|
@@ -491,34 +556,52 @@ def sample(
|
|
491
556
|
from mostlyai import mock
|
492
557
|
|
493
558
|
tables = {
|
494
|
-
"
|
495
|
-
"description": "
|
559
|
+
"customers": {
|
560
|
+
"description": "Customers of a hardware store",
|
496
561
|
"columns": {
|
497
|
-
"
|
498
|
-
"name": {"prompt": "first name and last name of the
|
562
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
563
|
+
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
564
|
+
},
|
565
|
+
"primary_key": "customer_id",
|
566
|
+
},
|
567
|
+
"orders": {
|
568
|
+
"description": "Orders of a Customer",
|
569
|
+
"columns": {
|
570
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
571
|
+
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
572
|
+
"text": {"prompt": "order text description", "dtype": "string"},
|
573
|
+
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
499
574
|
},
|
500
|
-
"primary_key": "
|
575
|
+
"primary_key": "order_id",
|
576
|
+
"foreign_keys": [
|
577
|
+
{
|
578
|
+
"column": "customer_id",
|
579
|
+
"referenced_table": "customers",
|
580
|
+
"description": "each customer has anywhere between 2 and 3 orders",
|
581
|
+
}
|
582
|
+
],
|
501
583
|
},
|
502
|
-
"
|
503
|
-
"description": "
|
584
|
+
"items": {
|
585
|
+
"description": "Items in an Order",
|
504
586
|
"columns": {
|
505
|
-
"
|
506
|
-
"
|
507
|
-
"
|
508
|
-
"
|
587
|
+
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
588
|
+
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
589
|
+
"name": {"prompt": "the name of the item", "dtype": "string"},
|
590
|
+
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
509
591
|
},
|
510
592
|
"foreign_keys": [
|
511
593
|
{
|
512
|
-
"column": "
|
513
|
-
"referenced_table": "
|
514
|
-
"description": "each
|
594
|
+
"column": "order_id",
|
595
|
+
"referenced_table": "orders",
|
596
|
+
"description": "each order has between 1 and 2 items",
|
515
597
|
}
|
516
598
|
],
|
517
599
|
},
|
518
600
|
}
|
519
|
-
data = mock.sample(tables=tables, sample_size=
|
520
|
-
|
521
|
-
|
601
|
+
data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
|
602
|
+
df_customers = data["customers"]
|
603
|
+
df_orders = data["orders"]
|
604
|
+
df_items = data["items"]
|
522
605
|
```
|
523
606
|
"""
|
524
607
|
|
@@ -526,9 +609,15 @@ def sample(
|
|
526
609
|
|
527
610
|
sample_size = _harmonize_sample_size(sample_size, config)
|
528
611
|
primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
|
529
|
-
|
530
|
-
|
531
|
-
|
612
|
+
|
613
|
+
child_to_parents, parent_to_children, subject_tables = _build_dependency_graph(config)
|
614
|
+
execution_plan: list[str] = _build_execution_plan(parent_to_children, subject_tables)
|
615
|
+
|
616
|
+
results: dict[str, pd.DataFrame] = {}
|
617
|
+
|
618
|
+
for table_name in execution_plan:
|
619
|
+
table_config = config.root[table_name]
|
620
|
+
if not child_to_parents[table_name]:
|
532
621
|
# subject table
|
533
622
|
df = _sample_table(
|
534
623
|
table_name=table_name,
|
@@ -542,22 +631,21 @@ def sample(
|
|
542
631
|
previous_rows_size=5,
|
543
632
|
llm_config=LLMConfig(model=model, api_key=api_key),
|
544
633
|
)
|
545
|
-
|
546
|
-
#
|
634
|
+
else:
|
635
|
+
# sequencial table
|
636
|
+
referenced_table = table_config.foreign_keys[0].referenced_table
|
547
637
|
df = _sample_table(
|
548
638
|
table_name=table_name,
|
549
639
|
table_config=table_config,
|
550
640
|
primary_keys=primary_keys,
|
551
641
|
sample_size=None,
|
552
|
-
context_data=
|
642
|
+
context_data=results[referenced_table],
|
553
643
|
temperature=temperature,
|
554
644
|
top_p=top_p,
|
555
645
|
batch_size=1, # generate one sequence at a time
|
556
646
|
previous_rows_size=5,
|
557
647
|
llm_config=LLMConfig(model=model, api_key=api_key),
|
558
648
|
)
|
559
|
-
|
560
|
-
raise RuntimeError("Only 1 or 2 table setups are supported for now")
|
561
|
-
dfs[table_name] = df
|
649
|
+
results[table_name] = df
|
562
650
|
|
563
|
-
return
|
651
|
+
return results if len(results) > 1 else next(iter(results.values()))
|
@@ -1,11 +1,28 @@
|
|
1
1
|
[project]
|
2
2
|
name = "mostlyai-mock"
|
3
|
-
version = "0.0.
|
3
|
+
version = "0.0.6"
|
4
4
|
description = "Synthetic Mock Data"
|
5
5
|
authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
|
6
6
|
requires-python = ">=3.10"
|
7
7
|
readme = "README.md"
|
8
8
|
license = "Apache-2.0"
|
9
|
+
classifiers = [
|
10
|
+
"Development Status :: 4 - Beta",
|
11
|
+
"Intended Audience :: Developers",
|
12
|
+
"Intended Audience :: Science/Research",
|
13
|
+
"Intended Audience :: Information Technology",
|
14
|
+
"Intended Audience :: Financial and Insurance Industry",
|
15
|
+
"Intended Audience :: Healthcare Industry",
|
16
|
+
"Intended Audience :: Telecommunications Industry",
|
17
|
+
"Programming Language :: Python :: 3.10",
|
18
|
+
"Programming Language :: Python :: 3.11",
|
19
|
+
"Programming Language :: Python :: 3.12",
|
20
|
+
"Programming Language :: Python :: 3.13",
|
21
|
+
"License :: OSI Approved :: Apache Software License",
|
22
|
+
"Operating System :: OS Independent",
|
23
|
+
"Topic :: Software Development :: Libraries",
|
24
|
+
"Typing :: Typed",
|
25
|
+
]
|
9
26
|
dependencies = [
|
10
27
|
"pydantic>=2.0.0,<3.0.0",
|
11
28
|
"numpy>=1.26.3",
|
mostlyai_mock-0.0.4/PKG-INFO
DELETED
@@ -1,98 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: mostlyai-mock
|
3
|
-
Version: 0.0.4
|
4
|
-
Summary: Synthetic Mock Data
|
5
|
-
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
|
-
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
7
|
-
Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
|
8
|
-
Author-email: MOSTLY AI <dev@mostly.ai>
|
9
|
-
License-Expression: Apache-2.0
|
10
|
-
License-File: LICENSE
|
11
|
-
Requires-Python: >=3.10
|
12
|
-
Requires-Dist: litellm>=1.67.0
|
13
|
-
Requires-Dist: numpy>=1.26.3
|
14
|
-
Requires-Dist: pandas>=2.0.0
|
15
|
-
Requires-Dist: pyarrow>=14.0.0
|
16
|
-
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
17
|
-
Description-Content-Type: text/markdown
|
18
|
-
|
19
|
-
# Synthetic Mock Data 🔮
|
20
|
-
|
21
|
-
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)   
|
22
|
-
|
23
|
-
Create data out of nothing. Prompt LLMs for Tabular Data.
|
24
|
-
|
25
|
-
## Installation
|
26
|
-
|
27
|
-
The latest release of `mostlyai-mock` can be installed via pip:
|
28
|
-
|
29
|
-
```bash
|
30
|
-
pip install -U mostlyai-mock
|
31
|
-
```
|
32
|
-
|
33
|
-
Note: An API key to an LLM endpoint, with structured response, is required. It is recommended to set such a key as an environment variable (e.g. `OPENAI_API_KEY`, `GEMINI_API_KEY`, etc.). Alternatively, the key needs to be passed to every call to the library itself via the parameter `api_key`.
|
34
|
-
|
35
|
-
## Quick Start
|
36
|
-
|
37
|
-
### Single Table
|
38
|
-
|
39
|
-
```python
|
40
|
-
from mostlyai import mock
|
41
|
-
|
42
|
-
tables = {
|
43
|
-
"guests": {
|
44
|
-
"description": "Guests of an Alpine ski hotel in Austria",
|
45
|
-
"columns": {
|
46
|
-
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
47
|
-
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
48
|
-
"gender": {"dtype": "category", "values": ["male", "female"]},
|
49
|
-
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
50
|
-
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
51
|
-
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
52
|
-
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
53
|
-
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
54
|
-
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
55
|
-
},
|
56
|
-
}
|
57
|
-
}
|
58
|
-
df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
|
59
|
-
print(df)
|
60
|
-
```
|
61
|
-
|
62
|
-
### Multiple Tables
|
63
|
-
|
64
|
-
```python
|
65
|
-
from mostlyai import mock
|
66
|
-
|
67
|
-
tables = {
|
68
|
-
"guests": {
|
69
|
-
"description": "Guests of an Alpine ski hotel in Austria",
|
70
|
-
"columns": {
|
71
|
-
"id": {"prompt": "the unique id of the guest", "dtype": "integer"},
|
72
|
-
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
73
|
-
},
|
74
|
-
"primary_key": "id",
|
75
|
-
},
|
76
|
-
"purchases": {
|
77
|
-
"description": "Purchases of a Guest during their stay",
|
78
|
-
"columns": {
|
79
|
-
"guest_id": {"prompt": "the guest id for that purchase", "dtype": "integer"},
|
80
|
-
"purchase_id": {"prompt": "the unique id of the purchase", "dtype": "string"},
|
81
|
-
"text": {"prompt": "purchase text description", "dtype": "string"},
|
82
|
-
"amount": {"prompt": "purchase amount in EUR", "dtype": "float"},
|
83
|
-
},
|
84
|
-
"foreign_keys": [
|
85
|
-
{
|
86
|
-
"column": "guest_id",
|
87
|
-
"referenced_table": "guests",
|
88
|
-
"description": "each guest has anywhere between 1 and 10 purchases",
|
89
|
-
}
|
90
|
-
],
|
91
|
-
},
|
92
|
-
}
|
93
|
-
data = mock.sample(tables=tables, sample_size=5, model="openai/gpt-4.1-nano")
|
94
|
-
df_guests = data["guests"]
|
95
|
-
df_purchases = data["purchases"]
|
96
|
-
print(df_guests)
|
97
|
-
print(df_purchases)
|
98
|
-
```
|
mostlyai_mock-0.0.4/README.md
DELETED
@@ -1,80 +0,0 @@
|
|
1
|
-
# Synthetic Mock Data 🔮
|
2
|
-
|
3
|
-
[](https://mostly-ai.github.io/mostlyai-mock/) [](https://pypi.org/project/mostlyai-mock/)   
|
4
|
-
|
5
|
-
Create data out of nothing. Prompt LLMs for Tabular Data.
|
6
|
-
|
7
|
-
## Installation
|
8
|
-
|
9
|
-
The latest release of `mostlyai-mock` can be installed via pip:
|
10
|
-
|
11
|
-
```bash
|
12
|
-
pip install -U mostlyai-mock
|
13
|
-
```
|
14
|
-
|
15
|
-
Note: An API key to an LLM endpoint, with structured response, is required. It is recommended to set such a key as an environment variable (e.g. `OPENAI_API_KEY`, `GEMINI_API_KEY`, etc.). Alternatively, the key needs to be passed to every call to the library itself via the parameter `api_key`.
|
16
|
-
|
17
|
-
## Quick Start
|
18
|
-
|
19
|
-
### Single Table
|
20
|
-
|
21
|
-
```python
|
22
|
-
from mostlyai import mock
|
23
|
-
|
24
|
-
tables = {
|
25
|
-
"guests": {
|
26
|
-
"description": "Guests of an Alpine ski hotel in Austria",
|
27
|
-
"columns": {
|
28
|
-
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
29
|
-
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
30
|
-
"gender": {"dtype": "category", "values": ["male", "female"]},
|
31
|
-
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
32
|
-
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
33
|
-
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
34
|
-
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
35
|
-
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
36
|
-
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
37
|
-
},
|
38
|
-
}
|
39
|
-
}
|
40
|
-
df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
|
41
|
-
print(df)
|
42
|
-
```
|
43
|
-
|
44
|
-
### Multiple Tables
|
45
|
-
|
46
|
-
```python
|
47
|
-
from mostlyai import mock
|
48
|
-
|
49
|
-
tables = {
|
50
|
-
"guests": {
|
51
|
-
"description": "Guests of an Alpine ski hotel in Austria",
|
52
|
-
"columns": {
|
53
|
-
"id": {"prompt": "the unique id of the guest", "dtype": "integer"},
|
54
|
-
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
55
|
-
},
|
56
|
-
"primary_key": "id",
|
57
|
-
},
|
58
|
-
"purchases": {
|
59
|
-
"description": "Purchases of a Guest during their stay",
|
60
|
-
"columns": {
|
61
|
-
"guest_id": {"prompt": "the guest id for that purchase", "dtype": "integer"},
|
62
|
-
"purchase_id": {"prompt": "the unique id of the purchase", "dtype": "string"},
|
63
|
-
"text": {"prompt": "purchase text description", "dtype": "string"},
|
64
|
-
"amount": {"prompt": "purchase amount in EUR", "dtype": "float"},
|
65
|
-
},
|
66
|
-
"foreign_keys": [
|
67
|
-
{
|
68
|
-
"column": "guest_id",
|
69
|
-
"referenced_table": "guests",
|
70
|
-
"description": "each guest has anywhere between 1 and 10 purchases",
|
71
|
-
}
|
72
|
-
],
|
73
|
-
},
|
74
|
-
}
|
75
|
-
data = mock.sample(tables=tables, sample_size=5, model="openai/gpt-4.1-nano")
|
76
|
-
df_guests = data["guests"]
|
77
|
-
df_purchases = data["purchases"]
|
78
|
-
print(df_guests)
|
79
|
-
print(df_purchases)
|
80
|
-
```
|
File without changes
|
File without changes
|