mostlyai-mock 0.0.5__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,201 @@
1
+ Metadata-Version: 2.4
2
+ Name: mostlyai-mock
3
+ Version: 0.0.7
4
+ Summary: Synthetic Mock Data
5
+ Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
+ Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
7
+ Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
8
+ Author-email: MOSTLY AI <dev@mostly.ai>
9
+ License-Expression: Apache-2.0
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Financial and Insurance Industry
14
+ Classifier: Intended Audience :: Healthcare Industry
15
+ Classifier: Intended Audience :: Information Technology
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: Intended Audience :: Telecommunications Industry
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Topic :: Software Development :: Libraries
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.10
27
+ Requires-Dist: litellm>=1.67.0
28
+ Requires-Dist: numpy>=1.26.3
29
+ Requires-Dist: pandas>=2.0.0
30
+ Requires-Dist: pyarrow>=14.0.0
31
+ Requires-Dist: pydantic<3.0.0,>=2.0.0
32
+ Description-Content-Type: text/markdown
33
+
34
+ # Synthetic Mock Data 🔮
35
+
36
+ [![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock)
37
+
38
+ Create data out of nothing. Prompt LLMs for Tabular Data.
39
+
40
+ ## Key Features
41
+
42
+ * A light-weight python client for prompting LLMs for mixed-type tabular data
43
+ * Select from a range of LLM endpoints, that provide structured output
44
+ * Supports single-table as well as multi-table scenarios.
45
+ * Supports variety of data types: `string`, `categorical`, `integer`, `float`, `boolean`, `date`, and `datetime`.
46
+ * Specify context, distributions and rules via dataset-, table- or column-level prompts.
47
+ * Tailor the diversity and realism of your generated data via temperature and top_p.
48
+
49
+ ## Getting Started
50
+
51
+ 1. Install the latest version of the `mostlyai-mock` python package.
52
+
53
+ ```bash
54
+ pip install -U mostlyai-mock
55
+ ```
56
+
57
+ 2. Set the API key of your LLM endpoint (if not done yet)
58
+
59
+ ```python
60
+ import os
61
+ os.environ["OPENAI_API_KEY"] = "your-api-key"
62
+ # os.environ["GEMINI_API_KEY"] = "your-api-key"
63
+ # os.environ["GROQ_API_KEY"] = "your-api-key"
64
+ ```
65
+
66
+ Note: You will need to obtain your API key directly from the LLM service provider (e.g. for Open AI from [here](https://platform.openai.com/api-keys)). The LLM endpoint will be determined by the chosen `model` when making calls to `mock.sample`.
67
+
68
+ 3. Create your first basic synthetic table from scratch
69
+
70
+ ```python
71
+ from mostlyai import mock
72
+
73
+ tables = {
74
+ "guests": {
75
+ "description": "Guests of an Alpine ski hotel in Austria",
76
+ "columns": {
77
+ "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
78
+ "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
79
+ "gender": {"dtype": "category", "values": ["male", "female"]},
80
+ "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
81
+ "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
82
+ "checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
83
+ "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
84
+ "price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
85
+ "room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
86
+ },
87
+ }
88
+ }
89
+ df = mock.sample(
90
+ tables=tables, # provide table and column definitions
91
+ sample_size=10, # generate 10 records
92
+ model="openai/gpt-4.1-nano", # select the LLM model (optional)
93
+ )
94
+ print(df)
95
+ # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
96
+ # 0 AT Anna Müller female 29 1994-09-15 2025-01-05 14:30:00 True 350.0 101
97
+ # 1 DE Johann Schmidt male 45 1978-11-20 2025-01-06 16:45:00 False 250.0 102
98
+ # 2 CH Lara Meier female 32 1991-04-12 2025-01-05 12:00:00 True 400.0 103
99
+ # 3 IT Marco Rossi male 38 1985-02-25 2025-01-07 09:15:00 False 280.0 201
100
+ # 4 FR Claire Dupont female 24 2000-07-08 2025-01-07 11:20:00 False 220.0 202
101
+ # 5 AT Felix Gruber male 52 1972-01-10 2025-01-06 17:50:00 True 375.0 203
102
+ # 6 DE Sophie Becker female 27 1996-03-30 2025-01-08 08:30:00 False 230.0 204
103
+ # 7 CH Max Keller male 31 1992-05-16 2025-01-09 14:10:00 False 290.0 101
104
+ # 8 IT Giulia Bianchi female 36 1988-08-19 2025-01-05 15:55:00 True 410.0 102
105
+ # 9 FR Louis Martin male 44 1980-12-05 2025-01-07 10:40:00 False 270.0 103
106
+ ```
107
+
108
+ 4. Create your first multi-table synthetic dataset
109
+
110
+ ```python
111
+ from mostlyai import mock
112
+
113
+ tables = {
114
+ "customers": {
115
+ "description": "Customers of a hardware store",
116
+ "columns": {
117
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
118
+ "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
119
+ },
120
+ "primary_key": "customer_id",
121
+ },
122
+ "warehouses": {
123
+ "description": "Warehouses of a hardware store",
124
+ "columns": {
125
+ "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
126
+ "name": {"prompt": "the name of the warehouse", "dtype": "string"},
127
+ },
128
+ "primary_key": "warehouse_id",
129
+ },
130
+ "orders": {
131
+ "description": "Orders of a Customer",
132
+ "columns": {
133
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
134
+ "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
135
+ "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
136
+ "text": {"prompt": "order text description", "dtype": "string"},
137
+ "amount": {"prompt": "order amount in USD", "dtype": "float"},
138
+ },
139
+ "primary_key": "order_id",
140
+ "foreign_keys": [
141
+ {
142
+ "column": "customer_id",
143
+ "referenced_table": "customers",
144
+ "description": "each customer has anywhere between 2 and 3 orders",
145
+ },
146
+ {
147
+ "column": "warehouse_id",
148
+ "referenced_table": "warehouses",
149
+ },
150
+ ],
151
+ },
152
+ "items": {
153
+ "description": "Items in an Order",
154
+ "columns": {
155
+ "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
156
+ "order_id": {"prompt": "the order id for that item", "dtype": "string"},
157
+ "name": {"prompt": "the name of the item", "dtype": "string"},
158
+ "price": {"prompt": "the price of the item in USD", "dtype": "float"},
159
+ },
160
+ "foreign_keys": [
161
+ {
162
+ "column": "order_id",
163
+ "referenced_table": "orders",
164
+ "description": "each order has between 1 and 2 items",
165
+ }
166
+ ],
167
+ },
168
+ }
169
+ data = mock.sample(
170
+ tables=tables,
171
+ sample_size=2,
172
+ model="openai/gpt-4.1"
173
+ )
174
+ print(data["customers"])
175
+ # customer_id name
176
+ # 0 1 Matthew Carlson
177
+ # 1 2 Priya Shah
178
+ print(data["warehouses"])
179
+ # warehouse_id name
180
+ # 0 1 Central Distribution Hub
181
+ # 1 2 Northgate Storage Facility
182
+ print(data["orders"])
183
+ # customer_id warehouse_id order_id text amount
184
+ # 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
185
+ # 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
186
+ # 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
187
+ # 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
188
+ # 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
189
+ print(data["items"])
190
+ # item_id order_id name price
191
+ # 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
192
+ # 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
193
+ # 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
194
+ # 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
195
+ # 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
196
+ # 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
197
+ # 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
198
+ # 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
199
+ # 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
200
+ # 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
201
+ ```
@@ -0,0 +1,168 @@
1
+ # Synthetic Mock Data 🔮
2
+
3
+ [![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock)
4
+
5
+ Create data out of nothing. Prompt LLMs for Tabular Data.
6
+
7
+ ## Key Features
8
+
9
+ * A light-weight python client for prompting LLMs for mixed-type tabular data
10
+ * Select from a range of LLM endpoints, that provide structured output
11
+ * Supports single-table as well as multi-table scenarios.
12
+ * Supports variety of data types: `string`, `categorical`, `integer`, `float`, `boolean`, `date`, and `datetime`.
13
+ * Specify context, distributions and rules via dataset-, table- or column-level prompts.
14
+ * Tailor the diversity and realism of your generated data via temperature and top_p.
15
+
16
+ ## Getting Started
17
+
18
+ 1. Install the latest version of the `mostlyai-mock` python package.
19
+
20
+ ```bash
21
+ pip install -U mostlyai-mock
22
+ ```
23
+
24
+ 2. Set the API key of your LLM endpoint (if not done yet)
25
+
26
+ ```python
27
+ import os
28
+ os.environ["OPENAI_API_KEY"] = "your-api-key"
29
+ # os.environ["GEMINI_API_KEY"] = "your-api-key"
30
+ # os.environ["GROQ_API_KEY"] = "your-api-key"
31
+ ```
32
+
33
+ Note: You will need to obtain your API key directly from the LLM service provider (e.g. for Open AI from [here](https://platform.openai.com/api-keys)). The LLM endpoint will be determined by the chosen `model` when making calls to `mock.sample`.
34
+
35
+ 3. Create your first basic synthetic table from scratch
36
+
37
+ ```python
38
+ from mostlyai import mock
39
+
40
+ tables = {
41
+ "guests": {
42
+ "description": "Guests of an Alpine ski hotel in Austria",
43
+ "columns": {
44
+ "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
45
+ "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
46
+ "gender": {"dtype": "category", "values": ["male", "female"]},
47
+ "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
48
+ "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
49
+ "checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
50
+ "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
51
+ "price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
52
+ "room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
53
+ },
54
+ }
55
+ }
56
+ df = mock.sample(
57
+ tables=tables, # provide table and column definitions
58
+ sample_size=10, # generate 10 records
59
+ model="openai/gpt-4.1-nano", # select the LLM model (optional)
60
+ )
61
+ print(df)
62
+ # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
63
+ # 0 AT Anna Müller female 29 1994-09-15 2025-01-05 14:30:00 True 350.0 101
64
+ # 1 DE Johann Schmidt male 45 1978-11-20 2025-01-06 16:45:00 False 250.0 102
65
+ # 2 CH Lara Meier female 32 1991-04-12 2025-01-05 12:00:00 True 400.0 103
66
+ # 3 IT Marco Rossi male 38 1985-02-25 2025-01-07 09:15:00 False 280.0 201
67
+ # 4 FR Claire Dupont female 24 2000-07-08 2025-01-07 11:20:00 False 220.0 202
68
+ # 5 AT Felix Gruber male 52 1972-01-10 2025-01-06 17:50:00 True 375.0 203
69
+ # 6 DE Sophie Becker female 27 1996-03-30 2025-01-08 08:30:00 False 230.0 204
70
+ # 7 CH Max Keller male 31 1992-05-16 2025-01-09 14:10:00 False 290.0 101
71
+ # 8 IT Giulia Bianchi female 36 1988-08-19 2025-01-05 15:55:00 True 410.0 102
72
+ # 9 FR Louis Martin male 44 1980-12-05 2025-01-07 10:40:00 False 270.0 103
73
+ ```
74
+
75
+ 4. Create your first multi-table synthetic dataset
76
+
77
+ ```python
78
+ from mostlyai import mock
79
+
80
+ tables = {
81
+ "customers": {
82
+ "description": "Customers of a hardware store",
83
+ "columns": {
84
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
85
+ "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
86
+ },
87
+ "primary_key": "customer_id",
88
+ },
89
+ "warehouses": {
90
+ "description": "Warehouses of a hardware store",
91
+ "columns": {
92
+ "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
93
+ "name": {"prompt": "the name of the warehouse", "dtype": "string"},
94
+ },
95
+ "primary_key": "warehouse_id",
96
+ },
97
+ "orders": {
98
+ "description": "Orders of a Customer",
99
+ "columns": {
100
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
101
+ "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
102
+ "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
103
+ "text": {"prompt": "order text description", "dtype": "string"},
104
+ "amount": {"prompt": "order amount in USD", "dtype": "float"},
105
+ },
106
+ "primary_key": "order_id",
107
+ "foreign_keys": [
108
+ {
109
+ "column": "customer_id",
110
+ "referenced_table": "customers",
111
+ "description": "each customer has anywhere between 2 and 3 orders",
112
+ },
113
+ {
114
+ "column": "warehouse_id",
115
+ "referenced_table": "warehouses",
116
+ },
117
+ ],
118
+ },
119
+ "items": {
120
+ "description": "Items in an Order",
121
+ "columns": {
122
+ "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
123
+ "order_id": {"prompt": "the order id for that item", "dtype": "string"},
124
+ "name": {"prompt": "the name of the item", "dtype": "string"},
125
+ "price": {"prompt": "the price of the item in USD", "dtype": "float"},
126
+ },
127
+ "foreign_keys": [
128
+ {
129
+ "column": "order_id",
130
+ "referenced_table": "orders",
131
+ "description": "each order has between 1 and 2 items",
132
+ }
133
+ ],
134
+ },
135
+ }
136
+ data = mock.sample(
137
+ tables=tables,
138
+ sample_size=2,
139
+ model="openai/gpt-4.1"
140
+ )
141
+ print(data["customers"])
142
+ # customer_id name
143
+ # 0 1 Matthew Carlson
144
+ # 1 2 Priya Shah
145
+ print(data["warehouses"])
146
+ # warehouse_id name
147
+ # 0 1 Central Distribution Hub
148
+ # 1 2 Northgate Storage Facility
149
+ print(data["orders"])
150
+ # customer_id warehouse_id order_id text amount
151
+ # 0 1 2 ORD-10294 3-tier glass shelving units, expedited deliver... 649.25
152
+ # 1 1 1 ORD-10541 Office desk chairs, set of 6, with assembly se... 824.9
153
+ # 2 1 1 ORD-10802 Executive standing desk, walnut finish, standa... 519.0
154
+ # 3 2 1 ORD-11017 Maple conference table, cable management inclu... 1225.5
155
+ # 4 2 2 ORD-11385 Set of ergonomic task chairs, black mesh, stan... 767.75
156
+ print(data["items"])
157
+ # item_id order_id name price
158
+ # 0 ITM-80265 ORD-10294 3-Tier Tempered Glass Shelving Unit 409.0
159
+ # 1 ITM-80266 ORD-10294 Brushed Aluminum Shelf Brackets (Set of 4) 240.25
160
+ # 2 ITM-81324 ORD-10541 Ergonomic Mesh-Back Desk Chair 132.5
161
+ # 3 ITM-81325 ORD-10541 Professional Office Chair Assembly Service 45.0
162
+ # 4 ITM-82101 ORD-10802 Executive Standing Desk, Walnut Finish 469.0
163
+ # 5 ITM-82102 ORD-10802 Desk Installation and Setup Service 50.0
164
+ # 6 ITM-83391 ORD-11017 Maple Conference Table, 10-Seat 1125.5
165
+ # 7 ITM-83392 ORD-11017 Integrated Table Cable Management Kit 100.0
166
+ # 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
167
+ # 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
168
+ ```
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.0.5" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.0.7" # Do not set this manually. Use poetry version [params].
@@ -100,7 +100,10 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
100
100
  if table_name in path:
101
101
  cycle_start = path.index(table_name)
102
102
  cycle = path[cycle_start:] + [table_name]
103
- raise ValueError(f"Circular dependency detected: {' -> '.join(cycle)}")
103
+ msg = f"Circular dependency detected: {' -> '.join(cycle)}."
104
+ if len(cycle) == 2:
105
+ msg += " Self-referencing tables are not yet supported."
106
+ raise ValueError(msg)
104
107
  if table_name in visited:
105
108
  return
106
109
  visited.add(table_name)
@@ -119,7 +122,7 @@ class TableConfig(BaseModel):
119
122
  description: str = ""
120
123
  columns: dict[str, ColumnConfig] = Field(..., min_items=1)
121
124
  primary_key: str | None = None
122
- foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list, min_length=0, max_length=1)
125
+ foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
123
126
 
124
127
 
125
128
  class ColumnConfig(BaseModel):
@@ -163,7 +166,7 @@ class ColumnConfig(BaseModel):
163
166
  DType.DATETIME: (str, "strings"),
164
167
  }[self.dtype]
165
168
  try:
166
- self.values = [cast_fn(c) for c in self.values]
169
+ self.values = [cast_fn(c) if pd.notna(c) else None for c in self.values]
167
170
  except ValueError:
168
171
  raise ValueError(
169
172
  f"All values must be convertible to {convertible_to} when dtype is '{self.dtype.value}'"
@@ -193,28 +196,25 @@ def _sample_table(
193
196
  table_config: TableConfig,
194
197
  primary_keys: dict[str, str] | None,
195
198
  sample_size: int | None,
196
- context_data: pd.DataFrame | None,
199
+ generated_data: dict[str, pd.DataFrame] | None,
197
200
  temperature: float,
198
201
  top_p: float,
199
202
  batch_size: int,
200
203
  previous_rows_size: int,
204
+ non_context_size: int | None,
201
205
  llm_config: LLMConfig,
202
206
  ) -> pd.DataFrame:
203
- assert (sample_size is None) != (context_data is None), (
204
- "Exactly one of sample_size or context_data must be provided"
205
- )
206
- if sample_size is None:
207
- sample_size = len(context_data)
208
207
  table_rows_generator = _create_table_rows_generator(
209
208
  table_name=table_name,
210
209
  table_config=table_config,
211
210
  primary_keys=primary_keys,
212
211
  sample_size=sample_size,
213
- context_data=context_data,
212
+ generated_data=generated_data,
214
213
  temperature=temperature,
215
214
  top_p=top_p,
216
215
  batch_size=batch_size,
217
216
  previous_rows_size=previous_rows_size,
217
+ non_context_size=non_context_size,
218
218
  llm_config=llm_config,
219
219
  )
220
220
  table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{table_name}`".ljust(45))
@@ -231,6 +231,7 @@ def _create_table_prompt(
231
231
  batch_size: int | None,
232
232
  foreign_keys: list[ForeignKeyConfig] | None,
233
233
  context_data: pd.DataFrame | None,
234
+ non_context_data: dict[str, pd.DataFrame],
234
235
  previous_rows: list[dict],
235
236
  ) -> str:
236
237
  if batch_size is not None:
@@ -271,16 +272,29 @@ def _create_table_prompt(
271
272
  prompt += f"## Context Table Data:\n\n"
272
273
  prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"
273
274
 
275
+ # add non-context table names, primary keys and data
276
+ if non_context_data:
277
+ for fk in foreign_keys[1:]:
278
+ prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
279
+
280
+ prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
281
+
282
+ prompt += f"## Non-Context Table Data:\n\n"
283
+ prompt += f"{non_context_data[fk.referenced_table].to_json(orient='records', indent=2)}\n\n"
284
+
274
285
  # add instructions
275
286
  prompt += "\n## Instructions:\n\n"
276
287
  if batch_size is not None:
277
288
  prompt += f"Generate {batch_size} rows for the `{table_name}` table.\n\n"
278
- else:
289
+
290
+ if context_data is not None:
279
291
  prompt += (
280
292
  f"Generate data for the `{table_name}` table. "
281
- f"The Foreign Key column may only contain values from Context Table Data. "
293
+ f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
294
+ f"The second Foreign Key column from Foreign Keys section (if exists) may only contain values from Non-Context Table Data. "
282
295
  f"Pay attention to description of the Foreign Key column to understand the relationship.\n\n"
283
296
  )
297
+
284
298
  if previous_rows:
285
299
  prompt += (
286
300
  "Generate new rows that maintain consistency with the previous rows where appropriate. "
@@ -298,12 +312,13 @@ def _create_table_rows_generator(
298
312
  table_name: str,
299
313
  table_config: TableConfig,
300
314
  primary_keys: dict[str, str] | None,
301
- sample_size: int,
315
+ sample_size: int | None,
316
+ generated_data: dict[str, pd.DataFrame] | None,
302
317
  temperature: float,
303
318
  top_p: float,
304
- context_data: pd.DataFrame | None,
305
319
  batch_size: int,
306
320
  previous_rows_size: int,
321
+ non_context_size: int | None,
307
322
  llm_config: LLMConfig,
308
323
  ) -> Generator[dict]:
309
324
  def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
@@ -311,14 +326,14 @@ def _create_table_rows_generator(
311
326
  if column_config.values or column_config.dtype is DType.CATEGORY:
312
327
  return Literal[tuple(column_config.values)]
313
328
  return {
314
- DType.INTEGER: int,
315
- DType.FLOAT: float,
316
- DType.STRING: str,
317
- DType.BOOLEAN: bool,
329
+ DType.INTEGER: int | None,
330
+ DType.FLOAT: float | None,
331
+ DType.STRING: str | None,
332
+ DType.BOOLEAN: bool | None,
318
333
  # response_format has limited support for JSON Schema features
319
334
  # thus we represent dates and datetimes as strings
320
- DType.DATE: str,
321
- DType.DATETIME: str,
335
+ DType.DATE: str | None,
336
+ DType.DATETIME: str | None,
322
337
  }[column_config.dtype]
323
338
 
324
339
  fields = {}
@@ -368,6 +383,26 @@ def _create_table_rows_generator(
368
383
  for i in range(0, len(data), batch_size):
369
384
  yield data.iloc[i : i + batch_size]
370
385
 
386
+ # derive context data (if first foreign key is present) and harmonize sample size accordingly
387
+ context_data: pd.DataFrame | None = None
388
+ if table_config.foreign_keys:
389
+ context_table_name = table_config.foreign_keys[0].referenced_table
390
+ assert generated_data is not None
391
+ assert context_table_name in generated_data
392
+ context_data = generated_data[context_table_name]
393
+ sample_size = len(context_data)
394
+ assert sample_size is not None
395
+
396
+ # derive non-context data (if more than one foreign key is present)
397
+ non_context_data: dict[str, pd.DataFrame] = {}
398
+ if table_config.foreign_keys and len(table_config.foreign_keys) > 1:
399
+ assert generated_data is not None
400
+ assert non_context_size is not None
401
+ for fk in table_config.foreign_keys[1:]:
402
+ non_context_table_name = fk.referenced_table
403
+ assert non_context_table_name in generated_data
404
+ non_context_data[non_context_table_name] = generated_data[non_context_table_name]
405
+
371
406
  # ensure model supports response_format and json schema
372
407
  supported_params = litellm.get_supported_openai_params(model=llm_config.model)
373
408
  assert "response_format" in supported_params
@@ -387,6 +422,11 @@ def _create_table_rows_generator(
387
422
  yielded_sequences = 0
388
423
  previous_rows = deque(maxlen=previous_rows_size)
389
424
  for context_batch in batch_infinitely(context_data):
425
+ non_context_batch = (
426
+ {table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()}
427
+ if non_context_data
428
+ else None
429
+ )
390
430
  prompt_kwargs = {
391
431
  "table_name": table_name,
392
432
  "table_description": table_config.description,
@@ -395,6 +435,7 @@ def _create_table_rows_generator(
395
435
  "batch_size": batch_size if context_batch is None else None,
396
436
  "foreign_keys": table_config.foreign_keys if context_batch is not None else None,
397
437
  "context_data": context_batch if context_batch is not None else None,
438
+ "non_context_data": non_context_batch if non_context_batch else None,
398
439
  "previous_rows": list(previous_rows),
399
440
  }
400
441
  prompt = _create_table_prompt(**prompt_kwargs)
@@ -429,10 +470,14 @@ def _convert_table_rows_generator_to_df(
429
470
  for column_name, column_config in columns.items():
430
471
  if column_config.dtype in [DType.DATE, DType.DATETIME]:
431
472
  df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
432
- elif column_config.dtype in [DType.INTEGER, DType.FLOAT]:
433
- df[column_name] = pd.to_numeric(df[column_name], errors="coerce", dtype_backend="pyarrow")
473
+ elif column_config.dtype is DType.INTEGER:
474
+ df[column_name] = pd.to_numeric(df[column_name], errors="coerce", downcast="integer").astype(
475
+ "int64[pyarrow]"
476
+ )
477
+ elif column_config.dtype is DType.FLOAT:
478
+ df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("double[pyarrow]")
434
479
  elif column_config.dtype is DType.BOOLEAN:
435
- df[column_name] = df[column_name].astype(bool)
480
+ df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("boolean[pyarrow]")
436
481
  elif column_config.dtype is DType.CATEGORY:
437
482
  df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
438
483
  else:
@@ -472,7 +517,9 @@ def _build_dependency_graph(config: MockConfig) -> tuple[dict[str, list[str]], d
472
517
  return child_to_parents, parent_to_children, subject_tables
473
518
 
474
519
 
475
- def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tables: list[str]) -> list[str]:
520
+ def _build_execution_plan(
521
+ parent_to_children: dict[str, list[str]], child_to_parents: dict[str, list[str]], subject_tables: list[str]
522
+ ) -> list[str]:
476
523
  execution_plan = []
477
524
  bfs_queue = list(subject_tables)
478
525
  processed = set()
@@ -482,6 +529,13 @@ def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tabl
482
529
  if table_name in processed:
483
530
  continue
484
531
 
532
+ # ensure all parents are processed before processing this table
533
+ unprocessed_parents = [p for p in child_to_parents[table_name] if p not in processed]
534
+ if unprocessed_parents:
535
+ bfs_queue.extend(unprocessed_parents)
536
+ bfs_queue.append(table_name)
537
+ continue
538
+
485
539
  execution_plan.append(table_name)
486
540
  processed.add(table_name)
487
541
 
@@ -564,10 +618,19 @@ def sample(
564
618
  },
565
619
  "primary_key": "customer_id",
566
620
  },
621
+ "warehouses": {
622
+ "description": "Warehouses of a hardware store",
623
+ "columns": {
624
+ "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
625
+ "name": {"prompt": "the name of the warehouse", "dtype": "string"},
626
+ },
627
+ "primary_key": "warehouse_id",
628
+ },
567
629
  "orders": {
568
630
  "description": "Orders of a Customer",
569
631
  "columns": {
570
632
  "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
633
+ "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
571
634
  "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
572
635
  "text": {"prompt": "order text description", "dtype": "string"},
573
636
  "amount": {"prompt": "order amount in USD", "dtype": "float"},
@@ -577,8 +640,12 @@ def sample(
577
640
  {
578
641
  "column": "customer_id",
579
642
  "referenced_table": "customers",
580
- "description": "each customer has anywhere between 1 and 3 orders",
581
- }
643
+ "description": "each customer has anywhere between 2 and 3 orders",
644
+ },
645
+ {
646
+ "column": "warehouse_id",
647
+ "referenced_table": "warehouses",
648
+ },
582
649
  ],
583
650
  },
584
651
  "items": {
@@ -593,13 +660,14 @@ def sample(
593
660
  {
594
661
  "column": "order_id",
595
662
  "referenced_table": "orders",
596
- "description": "each order has between 2 and 5 items",
663
+ "description": "each order has between 1 and 2 items",
597
664
  }
598
665
  ],
599
666
  },
600
667
  }
601
668
  data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
602
669
  df_customers = data["customers"]
670
+ df_warehouses = data["warehouses"]
603
671
  df_orders = data["orders"]
604
672
  df_items = data["items"]
605
673
  ```
@@ -611,7 +679,7 @@ def sample(
611
679
  primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
612
680
 
613
681
  child_to_parents, parent_to_children, subject_tables = _build_dependency_graph(config)
614
- execution_plan: list[str] = _build_execution_plan(parent_to_children, subject_tables)
682
+ execution_plan: list[str] = _build_execution_plan(parent_to_children, child_to_parents, subject_tables)
615
683
 
616
684
  results: dict[str, pd.DataFrame] = {}
617
685
 
@@ -624,26 +692,27 @@ def sample(
624
692
  table_config=table_config,
625
693
  primary_keys=None,
626
694
  sample_size=sample_size[table_name],
627
- context_data=None,
695
+ generated_data=None,
628
696
  temperature=temperature,
629
697
  top_p=top_p,
630
- batch_size=20, # generate 20 subjects at a time
631
- previous_rows_size=5,
698
+ batch_size=30, # generate 30 subjects at a time
699
+ previous_rows_size=10, # present 10 previously generated rows to the LLM
700
+ non_context_size=None,
632
701
  llm_config=LLMConfig(model=model, api_key=api_key),
633
702
  )
634
703
  else:
635
704
  # sequencial table
636
- referenced_table = table_config.foreign_keys[0].referenced_table
637
705
  df = _sample_table(
638
706
  table_name=table_name,
639
707
  table_config=table_config,
640
708
  primary_keys=primary_keys,
641
709
  sample_size=None,
642
- context_data=results[referenced_table],
710
+ generated_data=results,
643
711
  temperature=temperature,
644
712
  top_p=top_p,
645
713
  batch_size=1, # generate one sequence at a time
646
- previous_rows_size=5,
714
+ previous_rows_size=10, # present 10 previously generated rows to the LLM
715
+ non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
647
716
  llm_config=LLMConfig(model=model, api_key=api_key),
648
717
  )
649
718
  results[table_name] = df
@@ -1,11 +1,28 @@
1
1
  [project]
2
2
  name = "mostlyai-mock"
3
- version = "0.0.5"
3
+ version = "0.0.7"
4
4
  description = "Synthetic Mock Data"
5
5
  authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
6
6
  requires-python = ">=3.10"
7
7
  readme = "README.md"
8
8
  license = "Apache-2.0"
9
+ classifiers = [
10
+ "Development Status :: 4 - Beta",
11
+ "Intended Audience :: Developers",
12
+ "Intended Audience :: Science/Research",
13
+ "Intended Audience :: Information Technology",
14
+ "Intended Audience :: Financial and Insurance Industry",
15
+ "Intended Audience :: Healthcare Industry",
16
+ "Intended Audience :: Telecommunications Industry",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "License :: OSI Approved :: Apache Software License",
22
+ "Operating System :: OS Independent",
23
+ "Topic :: Software Development :: Libraries",
24
+ "Typing :: Typed",
25
+ ]
9
26
  dependencies = [
10
27
  "pydantic>=2.0.0,<3.0.0",
11
28
  "numpy>=1.26.3",
@@ -1,117 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: mostlyai-mock
3
- Version: 0.0.5
4
- Summary: Synthetic Mock Data
5
- Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
- Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
7
- Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
8
- Author-email: MOSTLY AI <dev@mostly.ai>
9
- License-Expression: Apache-2.0
10
- License-File: LICENSE
11
- Requires-Python: >=3.10
12
- Requires-Dist: litellm>=1.67.0
13
- Requires-Dist: numpy>=1.26.3
14
- Requires-Dist: pandas>=2.0.0
15
- Requires-Dist: pyarrow>=14.0.0
16
- Requires-Dist: pydantic<3.0.0,>=2.0.0
17
- Description-Content-Type: text/markdown
18
-
19
- # Synthetic Mock Data 🔮
20
-
21
- [![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mostlyai-mock)
22
-
23
- Create data out of nothing. Prompt LLMs for Tabular Data.
24
-
25
- ## Installation
26
-
27
- The latest release of `mostlyai-mock` can be installed via pip:
28
-
29
- ```bash
30
- pip install -U mostlyai-mock
31
- ```
32
-
33
- Note: An API key to an LLM endpoint, with structured response, is required. It is recommended to set such a key as an environment variable (e.g. `OPENAI_API_KEY`, `GEMINI_API_KEY`, etc.). Alternatively, the key needs to be passed to every call to the library itself via the parameter `api_key`.
34
-
35
- ## Quick Start
36
-
37
- ### Single Table
38
-
39
- ```python
40
- from mostlyai import mock
41
-
42
- tables = {
43
- "guests": {
44
- "description": "Guests of an Alpine ski hotel in Austria",
45
- "columns": {
46
- "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
47
- "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
48
- "gender": {"dtype": "category", "values": ["male", "female"]},
49
- "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
50
- "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
51
- "checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
52
- "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
53
- "price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
54
- "room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
55
- },
56
- }
57
- }
58
- df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
59
- print(df)
60
- ```
61
-
62
- ### Multiple Tables
63
-
64
- ```python
65
- from mostlyai import mock
66
-
67
- tables = {
68
- "customers": {
69
- "description": "Customers of a hardware store",
70
- "columns": {
71
- "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
72
- "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
73
- },
74
- "primary_key": "customer_id",
75
- },
76
- "orders": {
77
- "description": "Orders of a Customer",
78
- "columns": {
79
- "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
80
- "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
81
- "text": {"prompt": "order text description", "dtype": "string"},
82
- "amount": {"prompt": "order amount in USD", "dtype": "float"},
83
- },
84
- "primary_key": "order_id",
85
- "foreign_keys": [
86
- {
87
- "column": "customer_id",
88
- "referenced_table": "customers",
89
- "description": "each customer has anywhere between 1 and 3 orders",
90
- }
91
- ],
92
- },
93
- "items": {
94
- "description": "Items in an Order",
95
- "columns": {
96
- "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
97
- "order_id": {"prompt": "the order id for that item", "dtype": "string"},
98
- "name": {"prompt": "the name of the item", "dtype": "string"},
99
- "price": {"prompt": "the price of the item in USD", "dtype": "float"},
100
- },
101
- "foreign_keys": [
102
- {
103
- "column": "order_id",
104
- "referenced_table": "orders",
105
- "description": "each order has between 2 and 5 items",
106
- }
107
- ],
108
- },
109
- }
110
- data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
111
- df_customers = data["customers"]
112
- df_orders = data["orders"]
113
- df_items = data["items"]
114
- print(df_customers)
115
- print(df_orders)
116
- print(df_items)
117
- ```
@@ -1,99 +0,0 @@
1
- # Synthetic Mock Data 🔮
2
-
3
- [![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mostlyai-mock)
4
-
5
- Create data out of nothing. Prompt LLMs for Tabular Data.
6
-
7
- ## Installation
8
-
9
- The latest release of `mostlyai-mock` can be installed via pip:
10
-
11
- ```bash
12
- pip install -U mostlyai-mock
13
- ```
14
-
15
- Note: An API key to an LLM endpoint, with structured response, is required. It is recommended to set such a key as an environment variable (e.g. `OPENAI_API_KEY`, `GEMINI_API_KEY`, etc.). Alternatively, the key needs to be passed to every call to the library itself via the parameter `api_key`.
16
-
17
- ## Quick Start
18
-
19
- ### Single Table
20
-
21
- ```python
22
- from mostlyai import mock
23
-
24
- tables = {
25
- "guests": {
26
- "description": "Guests of an Alpine ski hotel in Austria",
27
- "columns": {
28
- "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
29
- "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
30
- "gender": {"dtype": "category", "values": ["male", "female"]},
31
- "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
32
- "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
33
- "checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
34
- "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
35
- "price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
36
- "room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
37
- },
38
- }
39
- }
40
- df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
41
- print(df)
42
- ```
43
-
44
- ### Multiple Tables
45
-
46
- ```python
47
- from mostlyai import mock
48
-
49
- tables = {
50
- "customers": {
51
- "description": "Customers of a hardware store",
52
- "columns": {
53
- "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
54
- "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
55
- },
56
- "primary_key": "customer_id",
57
- },
58
- "orders": {
59
- "description": "Orders of a Customer",
60
- "columns": {
61
- "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
62
- "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
63
- "text": {"prompt": "order text description", "dtype": "string"},
64
- "amount": {"prompt": "order amount in USD", "dtype": "float"},
65
- },
66
- "primary_key": "order_id",
67
- "foreign_keys": [
68
- {
69
- "column": "customer_id",
70
- "referenced_table": "customers",
71
- "description": "each customer has anywhere between 1 and 3 orders",
72
- }
73
- ],
74
- },
75
- "items": {
76
- "description": "Items in an Order",
77
- "columns": {
78
- "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
79
- "order_id": {"prompt": "the order id for that item", "dtype": "string"},
80
- "name": {"prompt": "the name of the item", "dtype": "string"},
81
- "price": {"prompt": "the price of the item in USD", "dtype": "float"},
82
- },
83
- "foreign_keys": [
84
- {
85
- "column": "order_id",
86
- "referenced_table": "orders",
87
- "description": "each order has between 2 and 5 items",
88
- }
89
- ],
90
- },
91
- }
92
- data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
93
- df_customers = data["customers"]
94
- df_orders = data["orders"]
95
- df_items = data["items"]
96
- print(df_customers)
97
- print(df_orders)
98
- print(df_items)
99
- ```
File without changes
File without changes