mostlyai-mock 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +117 -15
- {mostlyai_mock-0.1.16.dist-info → mostlyai_mock-0.1.18.dist-info}/METADATA +17 -17
- mostlyai_mock-0.1.18.dist-info/RECORD +8 -0
- mostlyai_mock-0.1.16.dist-info/RECORD +0 -8
- {mostlyai_mock-0.1.16.dist-info → mostlyai_mock-0.1.18.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.1.16.dist-info → mostlyai_mock-0.1.18.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.1.16.dist-info → mostlyai_mock-0.1.18.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
@@ -124,14 +124,14 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
|
|
124
124
|
return self
|
125
125
|
|
126
126
|
@model_validator(mode="after")
|
127
|
-
def
|
127
|
+
def ensure_primary_key_is_string_or_integer_dtype(self) -> MockConfig:
|
128
128
|
for table_name, table_config in self.root.items():
|
129
129
|
if table_config.primary_key:
|
130
130
|
column_config = table_config.columns[table_config.primary_key]
|
131
|
-
if column_config.dtype not in [DType.STRING]:
|
131
|
+
if column_config.dtype not in [DType.STRING, DType.INTEGER]:
|
132
132
|
raise ValueError(
|
133
133
|
f"Primary key column '{table_config.primary_key}' in table '{table_name}' must be one of the following types:"
|
134
|
-
f" {[DType.STRING.value]}"
|
134
|
+
f" {[DType.STRING.value, DType.INTEGER.value]}"
|
135
135
|
)
|
136
136
|
return self
|
137
137
|
|
@@ -248,6 +248,7 @@ async def _sample_table(
|
|
248
248
|
non_context_size: int | None,
|
249
249
|
n_workers: int,
|
250
250
|
llm_config: LLMConfig,
|
251
|
+
config: MockConfig,
|
251
252
|
progress_callback: Callable | None = None,
|
252
253
|
) -> pd.DataFrame:
|
253
254
|
table_rows_generator = _create_table_rows_generator(
|
@@ -265,7 +266,13 @@ async def _sample_table(
|
|
265
266
|
progress_callback=progress_callback,
|
266
267
|
)
|
267
268
|
table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
|
268
|
-
table_df = await _convert_table_rows_generator_to_df(
|
269
|
+
table_df = await _convert_table_rows_generator_to_df(
|
270
|
+
table_rows_generator=table_rows_generator,
|
271
|
+
columns=columns,
|
272
|
+
primary_key=primary_keys.get(name),
|
273
|
+
foreign_keys=foreign_keys,
|
274
|
+
config=config,
|
275
|
+
)
|
269
276
|
return table_df
|
270
277
|
|
271
278
|
|
@@ -326,6 +333,15 @@ def _create_table_prompt(
|
|
326
333
|
column_specifications = {
|
327
334
|
column: spec for column, spec in column_specifications.items() if column not in existing_data.columns
|
328
335
|
}
|
336
|
+
# ensure primary keys stay as string in the prompt, even if dtype is integer
|
337
|
+
if target_primary_key and target_primary_key in column_specifications:
|
338
|
+
if columns[target_primary_key].dtype == DType.INTEGER:
|
339
|
+
column_specifications[target_primary_key]["dtype"] = DType.STRING.value
|
340
|
+
# ensure foreign keys referencing integer primary keys also stay as string in the prompt
|
341
|
+
for fk in foreign_keys:
|
342
|
+
if fk.column in column_specifications:
|
343
|
+
if columns[fk.column].dtype == DType.INTEGER:
|
344
|
+
column_specifications[fk.column]["dtype"] = DType.STRING.value
|
329
345
|
prompt += f"{json.dumps(column_specifications, indent=2)}\n\n"
|
330
346
|
|
331
347
|
# add previous rows as context to help the LLM generate consistent data
|
@@ -565,11 +581,17 @@ async def _yield_rows_from_csv_chunks_stream(response: litellm.CustomStreamWrapp
|
|
565
581
|
|
566
582
|
|
567
583
|
def _create_structured_output_schema(
|
568
|
-
columns: dict[str, ColumnConfig],
|
584
|
+
columns: dict[str, ColumnConfig],
|
585
|
+
existing_data: pd.DataFrame | None,
|
586
|
+
primary_key: str | None,
|
587
|
+
foreign_keys: list[ForeignKeyConfig],
|
569
588
|
) -> type[BaseModel]:
|
570
|
-
def create_annotation(column_config: ColumnConfig) -> type:
|
589
|
+
def create_annotation(column_config: ColumnConfig, is_int_pk_or_fk: bool = False) -> type:
|
571
590
|
if column_config.values or column_config.dtype is DType.CATEGORY:
|
572
591
|
return Literal[tuple(column_config.values)] # type: ignore
|
592
|
+
# ensure integer primary keys and foreign keys are treated as strings
|
593
|
+
if is_int_pk_or_fk:
|
594
|
+
return str | None
|
573
595
|
return {
|
574
596
|
DType.INTEGER: int | None,
|
575
597
|
DType.FLOAT: float | None,
|
@@ -585,7 +607,9 @@ def _create_structured_output_schema(
|
|
585
607
|
for column_name, column_config in columns.items():
|
586
608
|
if existing_data is not None and column_name in existing_data.columns:
|
587
609
|
continue # skip columns that already exist in existing data
|
588
|
-
|
610
|
+
is_int_pk = primary_key and column_name == primary_key and column_config.dtype == DType.INTEGER
|
611
|
+
is_int_fk = any(fk.column == column_name for fk in foreign_keys) and column_config.dtype == DType.INTEGER
|
612
|
+
annotation = create_annotation(column_config, is_int_pk or is_int_fk)
|
589
613
|
fields[column_name] = (annotation, Field(...))
|
590
614
|
TableRow = create_model("TableRow", **fields)
|
591
615
|
TableRows = create_model("TableRows", rows=(list[TableRow], ...))
|
@@ -632,8 +656,9 @@ async def _worker(
|
|
632
656
|
# construct schema for Structured Outputs (applies to JSON LLMOutputFormat only)
|
633
657
|
structured_output_schema = None
|
634
658
|
if llm_output_format == LLMOutputFormat.JSON:
|
659
|
+
pk_col = primary_keys.get(name)
|
635
660
|
structured_output_schema = _create_structured_output_schema(
|
636
|
-
columns=columns, existing_data=existing_batch
|
661
|
+
columns=columns, existing_data=existing_batch, primary_key=pk_col, foreign_keys=foreign_keys
|
637
662
|
)
|
638
663
|
|
639
664
|
# construct litellm kwargs
|
@@ -974,14 +999,47 @@ def _align_series_dtypes_with_column_config(series: pd.Series, column_config: Co
|
|
974
999
|
return series
|
975
1000
|
|
976
1001
|
|
1002
|
+
def _get_integer_pk_fk_columns(
|
1003
|
+
columns: dict[str, ColumnConfig],
|
1004
|
+
primary_key: str | None,
|
1005
|
+
foreign_keys: list[ForeignKeyConfig],
|
1006
|
+
config: MockConfig,
|
1007
|
+
) -> set[str]:
|
1008
|
+
"""determine which columns should be kept as strings (integer PKs and FKs that reference integer PKs)"""
|
1009
|
+
skip_conversion = set()
|
1010
|
+
|
1011
|
+
# integer primary keys
|
1012
|
+
if primary_key and primary_key in columns and columns[primary_key].dtype == DType.INTEGER:
|
1013
|
+
skip_conversion.add(primary_key)
|
1014
|
+
|
1015
|
+
# foreign keys that reference integer primary keys
|
1016
|
+
# note: FK dtype is guaranteed to match referenced PK dtype by config validation
|
1017
|
+
for fk in foreign_keys:
|
1018
|
+
if fk.column in columns and columns[fk.column].dtype == DType.INTEGER:
|
1019
|
+
skip_conversion.add(fk.column)
|
1020
|
+
|
1021
|
+
return skip_conversion
|
1022
|
+
|
1023
|
+
|
977
1024
|
async def _convert_table_rows_generator_to_df(
|
978
1025
|
table_rows_generator: AsyncGenerator[dict],
|
979
1026
|
columns: dict[str, ColumnConfig],
|
1027
|
+
primary_key: str | None = None,
|
1028
|
+
foreign_keys: list[ForeignKeyConfig] | None = None,
|
1029
|
+
config: MockConfig | None = None,
|
980
1030
|
) -> pd.DataFrame:
|
981
1031
|
def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
|
982
1032
|
df = df.copy()
|
1033
|
+
skip_int_conversion = (
|
1034
|
+
_get_integer_pk_fk_columns(columns, primary_key, foreign_keys or [], config) if config else set()
|
1035
|
+
)
|
1036
|
+
|
983
1037
|
for column_name, column_config in columns.items():
|
984
|
-
|
1038
|
+
# keep integer PKs and FKs as strings for now (post-processing will convert them)
|
1039
|
+
if column_name in skip_int_conversion:
|
1040
|
+
df[column_name] = df[column_name].astype("string[pyarrow]")
|
1041
|
+
else:
|
1042
|
+
df[column_name] = _align_series_dtypes_with_column_config(df[column_name], column_config)
|
985
1043
|
return df
|
986
1044
|
|
987
1045
|
# consume entire generator
|
@@ -1025,11 +1083,6 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
|
|
1025
1083
|
}
|
1026
1084
|
column_configs = {**existing_column_configs, **column_configs}
|
1027
1085
|
|
1028
|
-
# primary keys are always strings
|
1029
|
-
primary_key = table_config.get("primary_key", None)
|
1030
|
-
if primary_key is not None:
|
1031
|
-
column_configs[primary_key]["dtype"] = DType.STRING
|
1032
|
-
|
1033
1086
|
table_config["columns"] = column_configs
|
1034
1087
|
return tables
|
1035
1088
|
|
@@ -1129,6 +1182,45 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
|
|
1129
1182
|
return execution_plan
|
1130
1183
|
|
1131
1184
|
|
1185
|
+
def _postprocess_table(
|
1186
|
+
table_name: str,
|
1187
|
+
df: pd.DataFrame,
|
1188
|
+
table_config: TableConfig,
|
1189
|
+
config: MockConfig,
|
1190
|
+
pk_mappings: dict[str, dict[str, int]],
|
1191
|
+
) -> pd.DataFrame:
|
1192
|
+
"""convert integer PKs and FKs from strings to auto-incremented integers"""
|
1193
|
+
df = df.copy()
|
1194
|
+
|
1195
|
+
# convert integer primary keys to 1, 2, 3, ... and build mapping
|
1196
|
+
pk_col = table_config.primary_key
|
1197
|
+
if pk_col and table_config.columns[pk_col].dtype == DType.INTEGER:
|
1198
|
+
old_values = df[pk_col].tolist()
|
1199
|
+
new_values = list(range(1, len(df) + 1))
|
1200
|
+
|
1201
|
+
# build mapping: old LLM-generated string values -> new auto-incremented integers
|
1202
|
+
pk_mappings[table_name] = {str(old): new for old, new in zip(old_values, new_values)}
|
1203
|
+
|
1204
|
+
df[pk_col] = new_values
|
1205
|
+
|
1206
|
+
# convert foreign keys that reference integer primary keys
|
1207
|
+
# note: FK dtype is guaranteed to match referenced PK dtype by config validation
|
1208
|
+
for fk in table_config.foreign_keys:
|
1209
|
+
# skip if not an integer FK (which means it doesn't reference an integer PK)
|
1210
|
+
if table_config.columns[fk.column].dtype != DType.INTEGER:
|
1211
|
+
continue
|
1212
|
+
if fk.referenced_table not in pk_mappings:
|
1213
|
+
continue
|
1214
|
+
|
1215
|
+
# map FK values from strings to integers
|
1216
|
+
mapping = pk_mappings[fk.referenced_table]
|
1217
|
+
df[fk.column] = (
|
1218
|
+
df[fk.column].apply(lambda val: mapping.get(str(val)) if pd.notna(val) else None).astype("int64[pyarrow]")
|
1219
|
+
)
|
1220
|
+
|
1221
|
+
return df
|
1222
|
+
|
1223
|
+
|
1132
1224
|
async def _sample_common(
|
1133
1225
|
*,
|
1134
1226
|
tables: dict[str, dict],
|
@@ -1156,6 +1248,10 @@ async def _sample_common(
|
|
1156
1248
|
|
1157
1249
|
data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
|
1158
1250
|
|
1251
|
+
# track mappings from old string PK values to new integer PK values
|
1252
|
+
pk_mappings: dict[str, dict[str, int]] = {}
|
1253
|
+
|
1254
|
+
# first, generate all tables (without postprocessing)
|
1159
1255
|
for table_name in execution_plan:
|
1160
1256
|
table_config = config.root[table_name]
|
1161
1257
|
df = await _sample_table(
|
@@ -1170,10 +1266,16 @@ async def _sample_common(
|
|
1170
1266
|
non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
|
1171
1267
|
n_workers=n_workers,
|
1172
1268
|
llm_config=llm_config,
|
1269
|
+
config=config,
|
1173
1270
|
progress_callback=progress_callback,
|
1174
1271
|
)
|
1175
1272
|
data[table_name] = df
|
1176
1273
|
|
1274
|
+
# then, postprocess all tables (convert integer PKs/FKs from strings to integers)
|
1275
|
+
for table_name in execution_plan:
|
1276
|
+
table_config = config.root[table_name]
|
1277
|
+
data[table_name] = _postprocess_table(table_name, data[table_name], table_config, config, pk_mappings)
|
1278
|
+
|
1177
1279
|
return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
|
1178
1280
|
|
1179
1281
|
|
@@ -1266,7 +1368,7 @@ def sample(
|
|
1266
1368
|
"customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
|
1267
1369
|
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
1268
1370
|
},
|
1269
|
-
"primary_key": "customer_id", #
|
1371
|
+
"primary_key": "customer_id", # no composite keys allowed;
|
1270
1372
|
},
|
1271
1373
|
"warehouses": {
|
1272
1374
|
"prompt": "Warehouses of a hardware store",
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mostlyai-mock
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.18
|
4
4
|
Summary: Synthetic Mock Data
|
5
5
|
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
6
|
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
@@ -208,7 +208,7 @@ print(data["items"])
|
|
208
208
|
# 9 B4-200510 B1-3010022 Bottled Spring Water (24 Pack) 34.95
|
209
209
|
```
|
210
210
|
|
211
|
-
|
211
|
+
5. Create your first self-referencing mock table with auto-increment integer primary keys
|
212
212
|
|
213
213
|
```python
|
214
214
|
from mostlyai import mock
|
@@ -217,9 +217,9 @@ tables = {
|
|
217
217
|
"employees": {
|
218
218
|
"prompt": "Employees of a company",
|
219
219
|
"columns": {
|
220
|
-
"employee_id": {"
|
221
|
-
"name": {"prompt": "first name and last name of the
|
222
|
-
"boss_id": {"
|
220
|
+
"employee_id": {"dtype": "integer"},
|
221
|
+
"name": {"prompt": "first name and last name of the employee", "dtype": "string"},
|
222
|
+
"boss_id": {"dtype": "integer"},
|
223
223
|
"role": {"prompt": "the role of the employee", "dtype": "string"},
|
224
224
|
},
|
225
225
|
"primary_key": "employee_id",
|
@@ -234,20 +234,20 @@ tables = {
|
|
234
234
|
}
|
235
235
|
df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5", n_workers=1)
|
236
236
|
print(df)
|
237
|
-
# employee_id name
|
238
|
-
# 0
|
239
|
-
# 1
|
240
|
-
# 2
|
241
|
-
# 3
|
242
|
-
# 4
|
243
|
-
# 5
|
244
|
-
# 6
|
245
|
-
# 7
|
246
|
-
# 8
|
247
|
-
# 9
|
237
|
+
# employee_id name boss_id role
|
238
|
+
# 0 1 Patricia Lee <NA> President
|
239
|
+
# 1 2 Edward Rodriguez 1 VP of Operations
|
240
|
+
# 2 3 Maria Cortez 1 VP of Finance
|
241
|
+
# 3 4 Thomas Nguyen 1 VP of Technology
|
242
|
+
# 4 5 Rachel Kim 2 Operations Manager
|
243
|
+
# 5 6 Jeffrey Patel 2 Supply Chain Lead
|
244
|
+
# 6 7 Olivia Smith 2 Facilities Supervisor
|
245
|
+
# 7 8 Brian Carter 3 Accounting Manager
|
246
|
+
# 8 9 Lauren Anderson 3 Financial Analyst
|
247
|
+
# 9 10 Santiago Romero 3 Payroll Specialist
|
248
248
|
```
|
249
249
|
|
250
|
-
|
250
|
+
6. Enrich existing data with additional columns
|
251
251
|
|
252
252
|
```python
|
253
253
|
from mostlyai import mock
|
@@ -0,0 +1,8 @@
|
|
1
|
+
mostlyai/mock/__init__.py,sha256=UKmnKlQ7fZVvB0ckh9_nXjojAE0JGa2Kd2mT0Ci8cDU,715
|
2
|
+
mostlyai/mock/core.py,sha256=oGSpIXINL7R1X7ZN5dtdwItaPXDD0mGvkakA0CEzmwI,66880
|
3
|
+
mostlyai/mock/mcp_server.py,sha256=uDLg0SeMPV2VZhXviM-F769W0xlmhGwlmQiQhY0Q-Ik,2365
|
4
|
+
mostlyai_mock-0.1.18.dist-info/METADATA,sha256=EmLjpo-D-wJefswHIMk3TCK9TvzLML-3Sjo0OEi9qAI,14257
|
5
|
+
mostlyai_mock-0.1.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
+
mostlyai_mock-0.1.18.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
+
mostlyai_mock-0.1.18.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
+
mostlyai_mock-0.1.18.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
mostlyai/mock/__init__.py,sha256=XEezyGjkXQBReW_ORi83H2WEVhLolDDLbGjxA2g2yEs,715
|
2
|
-
mostlyai/mock/core.py,sha256=FTF0BfJowxNHm_L0RpTk6BhS1mXzvjELP-3Z96aFVMQ,62454
|
3
|
-
mostlyai/mock/mcp_server.py,sha256=uDLg0SeMPV2VZhXviM-F769W0xlmhGwlmQiQhY0Q-Ik,2365
|
4
|
-
mostlyai_mock-0.1.16.dist-info/METADATA,sha256=CT6lcz2cAq5W-u3VjQLr_Dg8VbuEtU-JlvsXg5OsKTk,14297
|
5
|
-
mostlyai_mock-0.1.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
-
mostlyai_mock-0.1.16.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
-
mostlyai_mock-0.1.16.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
-
mostlyai_mock-0.1.16.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|