mostlyai-mock 0.1.15__tar.gz → 0.1.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/PKG-INFO +21 -21
- {mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/README.md +20 -20
- {mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/mostlyai/mock/core.py +144 -24
- {mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/mostlyai/mock/mcp_server.py +1 -1
- {mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/pyproject.toml +1 -1
- {mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/.gitignore +0 -0
- {mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/LICENSE +0 -0
{mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.15
+Version: 0.1.17
 Summary: Synthetic Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock

@@ -95,7 +95,7 @@ tables = {
 df = mock.sample(
     tables=tables,   # provide table and column definitions
     sample_size=10,  # generate 10 records
-    model="openai/gpt-
+    model="openai/gpt-5-nano",  # select the LLM model (optional)
 )
 print(df)
 # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number

@@ -176,7 +176,7 @@ tables = {
 data = mock.sample(
     tables=tables,
     sample_size=2,
-    model="openai/gpt-
+    model="openai/gpt-5",
     n_workers=1,
 )
 print(data["customers"])

@@ -208,7 +208,7 @@ print(data["items"])
 # 9 B4-200510 B1-3010022 Bottled Spring Water (24 Pack) 34.95
 ```

-
+5. Create your first self-referencing mock table with auto-increment integer primary keys

 ```python
 from mostlyai import mock

@@ -217,9 +217,9 @@ tables = {
     "employees": {
         "prompt": "Employees of a company",
         "columns": {
-            "employee_id": {"
-            "name": {"prompt": "first name and last name of the
-            "boss_id": {"
+            "employee_id": {"dtype": "integer"},
+            "name": {"prompt": "first name and last name of the employee", "dtype": "string"},
+            "boss_id": {"dtype": "integer"},
             "role": {"prompt": "the role of the employee", "dtype": "string"},
         },
         "primary_key": "employee_id",

@@ -232,22 +232,22 @@ tables = {
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5", n_workers=1)
 print(df)
-# employee_id name
-# 0
-# 1
-# 2
-# 3
-# 4
-# 5
-# 6
-# 7
-# 8
-# 9
+#    employee_id              name  boss_id                   role
+# 0            1      Patricia Lee     <NA>              President
+# 1            2  Edward Rodriguez        1       VP of Operations
+# 2            3      Maria Cortez        1          VP of Finance
+# 3            4     Thomas Nguyen        1       VP of Technology
+# 4            5        Rachel Kim        2     Operations Manager
+# 5            6     Jeffrey Patel        2      Supply Chain Lead
+# 6            7      Olivia Smith        2  Facilities Supervisor
+# 7            8      Brian Carter        3     Accounting Manager
+# 8            9   Lauren Anderson        3      Financial Analyst
+# 9           10   Santiago Romero        3     Payroll Specialist
 ```

-
+6. Enrich existing data with additional columns

 ```python
 from mostlyai import mock

@@ -273,7 +273,7 @@ existing_guests = pd.DataFrame({
 df = mock.sample(
     tables=tables,
     existing_data={"guests": existing_guests},
-    model="openai/gpt-
+    model="openai/gpt-5-nano"
 )
 print(df)
 # guest_id name nationality gender age room_number is_vip
{mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/README.md

@@ -57,7 +57,7 @@ tables = {
 df = mock.sample(
     tables=tables,   # provide table and column definitions
     sample_size=10,  # generate 10 records
-    model="openai/gpt-
+    model="openai/gpt-5-nano",  # select the LLM model (optional)
 )
 print(df)
 # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number

@@ -138,7 +138,7 @@ tables = {
 data = mock.sample(
     tables=tables,
     sample_size=2,
-    model="openai/gpt-
+    model="openai/gpt-5",
     n_workers=1,
 )
 print(data["customers"])

@@ -170,7 +170,7 @@ print(data["items"])
 # 9 B4-200510 B1-3010022 Bottled Spring Water (24 Pack) 34.95
 ```

-
+5. Create your first self-referencing mock table with auto-increment integer primary keys

 ```python
 from mostlyai import mock

@@ -179,9 +179,9 @@ tables = {
     "employees": {
         "prompt": "Employees of a company",
         "columns": {
-            "employee_id": {"
-            "name": {"prompt": "first name and last name of the
-            "boss_id": {"
+            "employee_id": {"dtype": "integer"},
+            "name": {"prompt": "first name and last name of the employee", "dtype": "string"},
+            "boss_id": {"dtype": "integer"},
             "role": {"prompt": "the role of the employee", "dtype": "string"},
         },
         "primary_key": "employee_id",

@@ -194,22 +194,22 @@ tables = {
         ],
     }
 }
-df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-
+df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5", n_workers=1)
 print(df)
-# employee_id name
-# 0
-# 1
-# 2
-# 3
-# 4
-# 5
-# 6
-# 7
-# 8
-# 9
+#    employee_id              name  boss_id                   role
+# 0            1      Patricia Lee     <NA>              President
+# 1            2  Edward Rodriguez        1       VP of Operations
+# 2            3      Maria Cortez        1          VP of Finance
+# 3            4     Thomas Nguyen        1       VP of Technology
+# 4            5        Rachel Kim        2     Operations Manager
+# 5            6     Jeffrey Patel        2      Supply Chain Lead
+# 6            7      Olivia Smith        2  Facilities Supervisor
+# 7            8      Brian Carter        3     Accounting Manager
+# 8            9   Lauren Anderson        3      Financial Analyst
+# 9           10   Santiago Romero        3     Payroll Specialist
 ```

-
+6. Enrich existing data with additional columns

 ```python
 from mostlyai import mock

@@ -235,7 +235,7 @@ existing_guests = pd.DataFrame({
 df = mock.sample(
     tables=tables,
     existing_data={"guests": existing_guests},
-    model="openai/gpt-
+    model="openai/gpt-5-nano"
 )
 print(df)
 # guest_id name nationality gender age room_number is_vip
{mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/mostlyai/mock/core.py

@@ -124,14 +124,14 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
         return self

     @model_validator(mode="after")
-    def
+    def ensure_primary_key_is_string_or_integer_dtype(self) -> MockConfig:
         for table_name, table_config in self.root.items():
             if table_config.primary_key:
                 column_config = table_config.columns[table_config.primary_key]
-                if column_config.dtype not in [DType.STRING]:
+                if column_config.dtype not in [DType.STRING, DType.INTEGER]:
                     raise ValueError(
                         f"Primary key column '{table_config.primary_key}' in table '{table_name}' must be one of the following types:"
-                        f" {[DType.STRING.value]}"
+                        f" {[DType.STRING.value, DType.INTEGER.value]}"
                     )
         return self

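The hunk above relaxes the primary-key validator so that integer primary keys are accepted alongside string ones. A minimal standalone sketch of the same rule, using a hypothetical helper and plain dtype strings rather than the library's own `DType`/`MockConfig` types:

```python
# Standalone sketch of the relaxed primary-key check (hypothetical helper, plain dtype strings).
ALLOWED_PK_DTYPES = {"string", "integer"}  # 0.1.15 only allowed "string"


def validate_primary_key(table_name: str, primary_key: str, columns: dict[str, dict]) -> None:
    dtype = columns[primary_key].get("dtype")
    if dtype not in ALLOWED_PK_DTYPES:
        raise ValueError(
            f"Primary key column '{primary_key}' in table '{table_name}' "
            f"must be one of the following types: {sorted(ALLOWED_PK_DTYPES)}"
        )


# accepted as of 0.1.17; any other dtype would still raise
validate_primary_key("employees", "employee_id", {"employee_id": {"dtype": "integer"}})
```
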
@@ -248,6 +248,7 @@ async def _sample_table(
     non_context_size: int | None,
     n_workers: int,
     llm_config: LLMConfig,
+    config: MockConfig,
     progress_callback: Callable | None = None,
 ) -> pd.DataFrame:
     table_rows_generator = _create_table_rows_generator(

@@ -265,7 +266,13 @@ async def _sample_table(
         progress_callback=progress_callback,
     )
     table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
-    table_df = await _convert_table_rows_generator_to_df(
+    table_df = await _convert_table_rows_generator_to_df(
+        table_rows_generator=table_rows_generator,
+        columns=columns,
+        primary_key=primary_keys.get(name),
+        foreign_keys=foreign_keys,
+        config=config,
+    )
     return table_df


@@ -326,6 +333,15 @@ def _create_table_prompt(
         column_specifications = {
             column: spec for column, spec in column_specifications.items() if column not in existing_data.columns
         }
+    # ensure primary keys stay as string in the prompt, even if dtype is integer
+    if target_primary_key and target_primary_key in column_specifications:
+        if columns[target_primary_key].dtype == DType.INTEGER:
+            column_specifications[target_primary_key]["dtype"] = DType.STRING.value
+    # ensure foreign keys referencing integer primary keys also stay as string in the prompt
+    for fk in foreign_keys:
+        if fk.column in column_specifications:
+            if columns[fk.column].dtype == DType.INTEGER:
+                column_specifications[fk.column]["dtype"] = DType.STRING.value
     prompt += f"{json.dumps(column_specifications, indent=2)}\n\n"

     # add previous rows as context to help the LLM generate consistent data
@@ -565,11 +581,17 @@ async def _yield_rows_from_csv_chunks_stream(response: litellm.CustomStreamWrapp


 def _create_structured_output_schema(
-    columns: dict[str, ColumnConfig],
+    columns: dict[str, ColumnConfig],
+    existing_data: pd.DataFrame | None,
+    primary_key: str | None,
+    foreign_keys: list[ForeignKeyConfig],
 ) -> type[BaseModel]:
-    def create_annotation(column_config: ColumnConfig) -> type:
+    def create_annotation(column_config: ColumnConfig, is_int_pk_or_fk: bool = False) -> type:
         if column_config.values or column_config.dtype is DType.CATEGORY:
             return Literal[tuple(column_config.values)]  # type: ignore
+        # ensure integer primary keys and foreign keys are treated as strings
+        if is_int_pk_or_fk:
+            return str | None
         return {
             DType.INTEGER: int | None,
             DType.FLOAT: float | None,

@@ -585,7 +607,9 @@ def _create_structured_output_schema(
     for column_name, column_config in columns.items():
         if existing_data is not None and column_name in existing_data.columns:
             continue  # skip columns that already exist in existing data
-
+        is_int_pk = primary_key and column_name == primary_key and column_config.dtype == DType.INTEGER
+        is_int_fk = any(fk.column == column_name for fk in foreign_keys) and column_config.dtype == DType.INTEGER
+        annotation = create_annotation(column_config, is_int_pk or is_int_fk)
         fields[column_name] = (annotation, Field(...))
     TableRow = create_model("TableRow", **fields)
     TableRows = create_model("TableRows", rows=(list[TableRow], ...))
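Together, these two hunks thread the primary key and foreign keys into the structured-output schema so that integer key columns are requested from the LLM as strings. A simplified sketch of that idea, assuming pydantic v2 and plain dtype strings instead of the library's `ColumnConfig`/`DType` types:

```python
from pydantic import BaseModel, Field, create_model


def build_row_schema(columns: dict[str, str], string_coerced: set[str]) -> type[BaseModel]:
    """Build a per-row schema; columns in `string_coerced` are requested as strings."""
    type_map = {"integer": int | None, "float": float | None, "string": str | None, "boolean": bool | None}
    fields = {}
    for name, dtype in columns.items():
        # integer PK/FK columns are asked for as strings and remapped to integers later
        annotation = (str | None) if name in string_coerced else type_map.get(dtype, str | None)
        fields[name] = (annotation, Field(...))
    return create_model("TableRow", **fields)


schema = build_row_schema(
    {"employee_id": "integer", "name": "string", "boss_id": "integer"},
    string_coerced={"employee_id", "boss_id"},
)
print(schema.model_json_schema()["properties"]["employee_id"])  # anyOf string / null, not integer
```
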
@@ -632,8 +656,9 @@ async def _worker(
     # construct schema for Structured Outputs (applies to JSON LLMOutputFormat only)
     structured_output_schema = None
     if llm_output_format == LLMOutputFormat.JSON:
+        pk_col = primary_keys.get(name)
         structured_output_schema = _create_structured_output_schema(
-            columns=columns, existing_data=existing_batch
+            columns=columns, existing_data=existing_batch, primary_key=pk_col, foreign_keys=foreign_keys
         )

     # construct litellm kwargs

@@ -645,6 +670,24 @@ async def _worker(
         "stream": True,
     }

+    # support for openai reasoning models
+    model_only = llm_config.model.split("/")[-1] if "/" in llm_config.model else llm_config.model
+    reasoning_effort = (
+        "low"
+        if (model_only.startswith("o") and (model_only[1:].isdigit() or model_only[1:].split("-")[0].isdigit()))
+        else "minimal"
+        if (
+            model_only.startswith("gpt-")
+            and model_only.split("-")[1].isdigit()
+            and int(model_only.split("-")[1]) >= 5
+        )
+        else None
+    )
+
+    if reasoning_effort:
+        litellm_kwargs.pop("top_p")
+        litellm_kwargs["reasoning_effort"] = reasoning_effort
+
     # construct messages
     system_prompt = _create_system_prompt(llm_output_format)
     user_prompt = _create_table_prompt(
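This hunk routes reasoning-capable OpenAI models to an explicit `reasoning_effort` and drops `top_p` from the request for those models. A standalone sketch of the same model-name heuristic (hypothetical helper name):

```python
# Standalone sketch of the reasoning-effort heuristic (hypothetical helper name):
# o-series reasoning models get "low", gpt-5 and later get "minimal", everything else is untouched.
def pick_reasoning_effort(model: str) -> str | None:
    name = model.split("/")[-1]
    if name.startswith("o") and (name[1:].isdigit() or name[1:].split("-")[0].isdigit()):
        return "low"
    if name.startswith("gpt-") and name.split("-")[1].isdigit() and int(name.split("-")[1]) >= 5:
        return "minimal"
    return None


assert pick_reasoning_effort("openai/o3-mini") == "low"
assert pick_reasoning_effort("openai/gpt-5-nano") == "minimal"
assert pick_reasoning_effort("openai/gpt-4.1") is None  # keeps the default sampling parameters
```
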
@@ -956,14 +999,47 @@ def _align_series_dtypes_with_column_config(series: pd.Series, column_config: Co
     return series


+def _get_integer_pk_fk_columns(
+    columns: dict[str, ColumnConfig],
+    primary_key: str | None,
+    foreign_keys: list[ForeignKeyConfig],
+    config: MockConfig,
+) -> set[str]:
+    """determine which columns should be kept as strings (integer PKs and FKs that reference integer PKs)"""
+    skip_conversion = set()
+
+    # integer primary keys
+    if primary_key and primary_key in columns and columns[primary_key].dtype == DType.INTEGER:
+        skip_conversion.add(primary_key)
+
+    # foreign keys that reference integer primary keys
+    # note: FK dtype is guaranteed to match referenced PK dtype by config validation
+    for fk in foreign_keys:
+        if fk.column in columns and columns[fk.column].dtype == DType.INTEGER:
+            skip_conversion.add(fk.column)
+
+    return skip_conversion
+
+
 async def _convert_table_rows_generator_to_df(
     table_rows_generator: AsyncGenerator[dict],
     columns: dict[str, ColumnConfig],
+    primary_key: str | None = None,
+    foreign_keys: list[ForeignKeyConfig] | None = None,
+    config: MockConfig | None = None,
 ) -> pd.DataFrame:
     def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
         df = df.copy()
+        skip_int_conversion = (
+            _get_integer_pk_fk_columns(columns, primary_key, foreign_keys or [], config) if config else set()
+        )
+
         for column_name, column_config in columns.items():
-
+            # keep integer PKs and FKs as strings for now (post-processing will convert them)
+            if column_name in skip_int_conversion:
+                df[column_name] = df[column_name].astype("string[pyarrow]")
+            else:
+                df[column_name] = _align_series_dtypes_with_column_config(df[column_name], column_config)
         return df

     # consume entire generator
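The new `_get_integer_pk_fk_columns` helper decides which columns must not be cast to integers yet: integer primary keys and integer foreign keys stay as pyarrow strings until the post-processing pass remaps them. A tiny sketch of that rule with plain dtype strings (hypothetical standalone helper, simplified signature):

```python
# Sketch of the skip-set rule with plain dtype strings (hypothetical standalone helper).
def integer_pk_fk_columns(columns: dict[str, str], primary_key: str | None, fk_columns: list[str]) -> set[str]:
    skip = set()
    if primary_key and columns.get(primary_key) == "integer":
        skip.add(primary_key)
    for fk in fk_columns:
        if columns.get(fk) == "integer":
            skip.add(fk)
    return skip


columns = {"employee_id": "integer", "name": "string", "boss_id": "integer"}
print(integer_pk_fk_columns(columns, primary_key="employee_id", fk_columns=["boss_id"]))
# {'employee_id', 'boss_id'}  (set order may vary)
```
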
@@ -1007,11 +1083,6 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
         }
         column_configs = {**existing_column_configs, **column_configs}

-        # primary keys are always strings
-        primary_key = table_config.get("primary_key", None)
-        if primary_key is not None:
-            column_configs[primary_key]["dtype"] = DType.STRING
-
         table_config["columns"] = column_configs
     return tables


@@ -1111,12 +1182,51 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
     return execution_plan


+def _postprocess_table(
+    table_name: str,
+    df: pd.DataFrame,
+    table_config: TableConfig,
+    config: MockConfig,
+    pk_mappings: dict[str, dict[str, int]],
+) -> pd.DataFrame:
+    """convert integer PKs and FKs from strings to auto-incremented integers"""
+    df = df.copy()
+
+    # convert integer primary keys to 1, 2, 3, ... and build mapping
+    pk_col = table_config.primary_key
+    if pk_col and table_config.columns[pk_col].dtype == DType.INTEGER:
+        old_values = df[pk_col].tolist()
+        new_values = list(range(1, len(df) + 1))
+
+        # build mapping: old LLM-generated string values -> new auto-incremented integers
+        pk_mappings[table_name] = {str(old): new for old, new in zip(old_values, new_values)}
+
+        df[pk_col] = new_values
+
+    # convert foreign keys that reference integer primary keys
+    # note: FK dtype is guaranteed to match referenced PK dtype by config validation
+    for fk in table_config.foreign_keys:
+        # skip if not an integer FK (which means it doesn't reference an integer PK)
+        if table_config.columns[fk.column].dtype != DType.INTEGER:
+            continue
+        if fk.referenced_table not in pk_mappings:
+            continue
+
+        # map FK values from strings to integers
+        mapping = pk_mappings[fk.referenced_table]
+        df[fk.column] = (
+            df[fk.column].apply(lambda val: mapping.get(str(val)) if pd.notna(val) else None).astype("int64[pyarrow]")
+        )
+
+    return df
+
+
 async def _sample_common(
     *,
     tables: dict[str, dict],
     sample_size: int | dict[str, int] = 4,
     existing_data: dict[str, pd.DataFrame] | None = None,
-    model: str = "openai/gpt-
+    model: str = "openai/gpt-5-nano",
     api_key: str | None = None,
     temperature: float = 1.0,
     top_p: float = 0.95,
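`_postprocess_table` is where the string key values produced by the LLM become auto-incremented integers: the primary key is rewritten to 1..N, the old-to-new mapping is recorded, and foreign keys are remapped through the mapping of the table they reference. A condensed pandas sketch of that remapping, assuming pandas with pyarrow installed and made-up key values:

```python
import pandas as pd

# Whatever string key values the LLM produced for the employees example above (made up here).
employees = pd.DataFrame({
    "employee_id": ["emp-007", "emp-013", "emp-042"],
    "boss_id": [None, "emp-007", "emp-007"],
})

# primary key becomes 1..N; record the old -> new mapping for dependent tables
new_ids = list(range(1, len(employees) + 1))
mapping = {str(old): new for old, new in zip(employees["employee_id"], new_ids)}
employees["employee_id"] = new_ids

# foreign keys are remapped through the referenced table's mapping
employees["boss_id"] = (
    employees["boss_id"]
    .apply(lambda v: mapping.get(str(v)) if pd.notna(v) else None)
    .astype("int64[pyarrow]")
)
print(employees)  # boss_id column comes out as <NA>, 1, 1
```
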
@@ -1138,6 +1248,10 @@ async def _sample_common(

     data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}

+    # track mappings from old string PK values to new integer PK values
+    pk_mappings: dict[str, dict[str, int]] = {}
+
+    # first, generate all tables (without postprocessing)
     for table_name in execution_plan:
         table_config = config.root[table_name]
         df = await _sample_table(

@@ -1152,10 +1266,16 @@ async def _sample_common(
             non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
             n_workers=n_workers,
             llm_config=llm_config,
+            config=config,
             progress_callback=progress_callback,
         )
         data[table_name] = df

+    # then, postprocess all tables (convert integer PKs/FKs from strings to integers)
+    for table_name in execution_plan:
+        table_config = config.root[table_name]
+        data[table_name] = _postprocess_table(table_name, data[table_name], table_config, config, pk_mappings)
+
     return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data


@@ -1164,7 +1284,7 @@ def sample(
     tables: dict[str, dict],
     sample_size: int | dict[str, int] = 4,
     existing_data: dict[str, pd.DataFrame] | None = None,
-    model: str = "openai/gpt-
+    model: str = "openai/gpt-5-nano",
     api_key: str | None = None,
     temperature: float = 1.0,
     top_p: float = 0.95,

@@ -1194,9 +1314,9 @@ def sample(
             Default is None.
         model (str): The LiteLLM chat completion model to be used.
             Examples include:
-            - `openai/gpt-
-            - `openai/gpt-
-            - `openai/gpt-
+            - `openai/gpt-5-nano` (default; fast, and smart)
+            - `openai/gpt-5-mini` (slower, but smarter)
+            - `openai/gpt-5` (slowest, but smartest)
             - `gemini/gemini-2.0-flash`
             - `gemini/gemini-2.5-flash-preview-04-17`
             - 'groq/gemma2-9b-it`

@@ -1234,7 +1354,7 @@ def sample(
             },
         }
     }
-    df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-
+    df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5-nano")
     ```

     Example of generating mock data for multiple tables (with PK/FK relationships):

@@ -1297,7 +1417,7 @@ def sample(
         ],
         },
     }
-    data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-
+    data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-5")
     df_customers = data["customers"]
     df_warehouses = data["warehouses"]
     df_orders = data["orders"]

@@ -1326,7 +1446,7 @@ def sample(
     enriched_df = mock.sample(
         tables=tables,
         existing_data={"patients": existing_df},
-        model="openai/gpt-
+        model="openai/gpt-5-nano"
     )
     enriched_df
     ```

@@ -1381,7 +1501,7 @@ def sample(
             "customers": existing_customers,
             "orders": existing_orders,
         },
-        model="openai/gpt-
+        model="openai/gpt-5-nano"
     )
     df_customers = data["customers"]
     df_orders = data["orders"]

@@ -1413,7 +1533,7 @@ async def _asample(
     tables: dict[str, dict],
     sample_size: int | dict[str, int] = 4,
     existing_data: dict[str, pd.DataFrame] | None = None,
-    model: str = "openai/gpt-
+    model: str = "openai/gpt-5-nano",
     api_key: str | None = None,
     temperature: float = 1.0,
     top_p: float = 0.95,
{mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/.gitignore: file without changes
{mostlyai_mock-0.1.15 → mostlyai_mock-0.1.17}/LICENSE: file without changes