mostlyai-mock 0.1.16__tar.gz → 0.1.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.1.16
3
+ Version: 0.1.18
4
4
  Summary: Synthetic Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -208,7 +208,7 @@ print(data["items"])
208
208
  # 9 B4-200510 B1-3010022 Bottled Spring Water (24 Pack) 34.95
209
209
  ```
210
210
 
211
- 6. Create your first self-referencing mock table
211
+ 5. Create your first self-referencing mock table with auto-increment integer primary keys
212
212
 
213
213
  ```python
214
214
  from mostlyai import mock
@@ -217,9 +217,9 @@ tables = {
217
217
  "employees": {
218
218
  "prompt": "Employees of a company",
219
219
  "columns": {
220
- "employee_id": {"prompt": "the unique id of the employee; sequential", "dtype": "string"},
221
- "name": {"prompt": "first name and last name of the president", "dtype": "string"},
222
- "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "string"},
220
+ "employee_id": {"dtype": "integer"},
221
+ "name": {"prompt": "first name and last name of the employee", "dtype": "string"},
222
+ "boss_id": {"dtype": "integer"},
223
223
  "role": {"prompt": "the role of the employee", "dtype": "string"},
224
224
  },
225
225
  "primary_key": "employee_id",
@@ -234,20 +234,20 @@ tables = {
234
234
  }
235
235
  df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5", n_workers=1)
236
236
  print(df)
237
- # employee_id name boss_id role
238
- # 0 B0-1 Patricia Lee <NA> President
239
- # 1 B0-2 Edward Rodriguez B0-1 VP of Operations
240
- # 2 B0-3 Maria Cortez B0-1 VP of Finance
241
- # 3 B0-4 Thomas Nguyen B0-1 VP of Technology
242
- # 4 B0-5 Rachel Kim B0-2 Operations Manager
243
- # 5 B0-6 Jeffrey Patel B0-2 Supply Chain Lead
244
- # 6 B0-7 Olivia Smith B0-2 Facilities Supervisor
245
- # 7 B0-8 Brian Carter B0-3 Accounting Manager
246
- # 8 B0-9 Lauren Anderson B0-3 Financial Analyst
247
- # 9 B0-10 Santiago Romero B0-3 Payroll Specialist
237
+ # employee_id name boss_id role
238
+ # 0 1 Patricia Lee <NA> President
239
+ # 1 2 Edward Rodriguez 1 VP of Operations
240
+ # 2 3 Maria Cortez 1 VP of Finance
241
+ # 3 4 Thomas Nguyen 1 VP of Technology
242
+ # 4 5 Rachel Kim 2 Operations Manager
243
+ # 5 6 Jeffrey Patel 2 Supply Chain Lead
244
+ # 6 7 Olivia Smith 2 Facilities Supervisor
245
+ # 7 8 Brian Carter 3 Accounting Manager
246
+ # 8 9 Lauren Anderson 3 Financial Analyst
247
+ # 9 10 Santiago Romero 3 Payroll Specialist
248
248
  ```
249
249
 
250
- 7. Enrich existing data with additional columns
250
+ 6. Enrich existing data with additional columns
251
251
 
252
252
  ```python
253
253
  from mostlyai import mock
@@ -170,7 +170,7 @@ print(data["items"])
170
170
  # 9 B4-200510 B1-3010022 Bottled Spring Water (24 Pack) 34.95
171
171
  ```
172
172
 
173
- 6. Create your first self-referencing mock table
173
+ 5. Create your first self-referencing mock table with auto-increment integer primary keys
174
174
 
175
175
  ```python
176
176
  from mostlyai import mock
@@ -179,9 +179,9 @@ tables = {
179
179
  "employees": {
180
180
  "prompt": "Employees of a company",
181
181
  "columns": {
182
- "employee_id": {"prompt": "the unique id of the employee; sequential", "dtype": "string"},
183
- "name": {"prompt": "first name and last name of the president", "dtype": "string"},
184
- "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "string"},
182
+ "employee_id": {"dtype": "integer"},
183
+ "name": {"prompt": "first name and last name of the employee", "dtype": "string"},
184
+ "boss_id": {"dtype": "integer"},
185
185
  "role": {"prompt": "the role of the employee", "dtype": "string"},
186
186
  },
187
187
  "primary_key": "employee_id",
@@ -196,20 +196,20 @@ tables = {
196
196
  }
197
197
  df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5", n_workers=1)
198
198
  print(df)
199
- # employee_id name boss_id role
200
- # 0 B0-1 Patricia Lee <NA> President
201
- # 1 B0-2 Edward Rodriguez B0-1 VP of Operations
202
- # 2 B0-3 Maria Cortez B0-1 VP of Finance
203
- # 3 B0-4 Thomas Nguyen B0-1 VP of Technology
204
- # 4 B0-5 Rachel Kim B0-2 Operations Manager
205
- # 5 B0-6 Jeffrey Patel B0-2 Supply Chain Lead
206
- # 6 B0-7 Olivia Smith B0-2 Facilities Supervisor
207
- # 7 B0-8 Brian Carter B0-3 Accounting Manager
208
- # 8 B0-9 Lauren Anderson B0-3 Financial Analyst
209
- # 9 B0-10 Santiago Romero B0-3 Payroll Specialist
199
+ # employee_id name boss_id role
200
+ # 0 1 Patricia Lee <NA> President
201
+ # 1 2 Edward Rodriguez 1 VP of Operations
202
+ # 2 3 Maria Cortez 1 VP of Finance
203
+ # 3 4 Thomas Nguyen 1 VP of Technology
204
+ # 4 5 Rachel Kim 2 Operations Manager
205
+ # 5 6 Jeffrey Patel 2 Supply Chain Lead
206
+ # 6 7 Olivia Smith 2 Facilities Supervisor
207
+ # 7 8 Brian Carter 3 Accounting Manager
208
+ # 8 9 Lauren Anderson 3 Financial Analyst
209
+ # 9 10 Santiago Romero 3 Payroll Specialist
210
210
  ```
211
211
 
212
- 7. Enrich existing data with additional columns
212
+ 6. Enrich existing data with additional columns
213
213
 
214
214
  ```python
215
215
  from mostlyai import mock
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.1.16" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.1.18" # Do not set this manually. Use poetry version [params].
@@ -124,14 +124,14 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
124
124
  return self
125
125
 
126
126
  @model_validator(mode="after")
127
- def ensure_primary_key_is_string_dtype(self) -> MockConfig:
127
+ def ensure_primary_key_is_string_or_integer_dtype(self) -> MockConfig:
128
128
  for table_name, table_config in self.root.items():
129
129
  if table_config.primary_key:
130
130
  column_config = table_config.columns[table_config.primary_key]
131
- if column_config.dtype not in [DType.STRING]:
131
+ if column_config.dtype not in [DType.STRING, DType.INTEGER]:
132
132
  raise ValueError(
133
133
  f"Primary key column '{table_config.primary_key}' in table '{table_name}' must be one of the following types:"
134
- f" {[DType.STRING.value]}"
134
+ f" {[DType.STRING.value, DType.INTEGER.value]}"
135
135
  )
136
136
  return self
137
137
 
@@ -248,6 +248,7 @@ async def _sample_table(
248
248
  non_context_size: int | None,
249
249
  n_workers: int,
250
250
  llm_config: LLMConfig,
251
+ config: MockConfig,
251
252
  progress_callback: Callable | None = None,
252
253
  ) -> pd.DataFrame:
253
254
  table_rows_generator = _create_table_rows_generator(
@@ -265,7 +266,13 @@ async def _sample_table(
265
266
  progress_callback=progress_callback,
266
267
  )
267
268
  table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
268
- table_df = await _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
269
+ table_df = await _convert_table_rows_generator_to_df(
270
+ table_rows_generator=table_rows_generator,
271
+ columns=columns,
272
+ primary_key=primary_keys.get(name),
273
+ foreign_keys=foreign_keys,
274
+ config=config,
275
+ )
269
276
  return table_df
270
277
 
271
278
 
@@ -326,6 +333,15 @@ def _create_table_prompt(
326
333
  column_specifications = {
327
334
  column: spec for column, spec in column_specifications.items() if column not in existing_data.columns
328
335
  }
336
+ # ensure primary keys stay as string in the prompt, even if dtype is integer
337
+ if target_primary_key and target_primary_key in column_specifications:
338
+ if columns[target_primary_key].dtype == DType.INTEGER:
339
+ column_specifications[target_primary_key]["dtype"] = DType.STRING.value
340
+ # ensure foreign keys referencing integer primary keys also stay as string in the prompt
341
+ for fk in foreign_keys:
342
+ if fk.column in column_specifications:
343
+ if columns[fk.column].dtype == DType.INTEGER:
344
+ column_specifications[fk.column]["dtype"] = DType.STRING.value
329
345
  prompt += f"{json.dumps(column_specifications, indent=2)}\n\n"
330
346
 
331
347
  # add previous rows as context to help the LLM generate consistent data
@@ -565,11 +581,17 @@ async def _yield_rows_from_csv_chunks_stream(response: litellm.CustomStreamWrapp
565
581
 
566
582
 
567
583
  def _create_structured_output_schema(
568
- columns: dict[str, ColumnConfig], existing_data: pd.DataFrame | None
584
+ columns: dict[str, ColumnConfig],
585
+ existing_data: pd.DataFrame | None,
586
+ primary_key: str | None,
587
+ foreign_keys: list[ForeignKeyConfig],
569
588
  ) -> type[BaseModel]:
570
- def create_annotation(column_config: ColumnConfig) -> type:
589
+ def create_annotation(column_config: ColumnConfig, is_int_pk_or_fk: bool = False) -> type:
571
590
  if column_config.values or column_config.dtype is DType.CATEGORY:
572
591
  return Literal[tuple(column_config.values)] # type: ignore
592
+ # ensure integer primary keys and foreign keys are treated as strings
593
+ if is_int_pk_or_fk:
594
+ return str | None
573
595
  return {
574
596
  DType.INTEGER: int | None,
575
597
  DType.FLOAT: float | None,
@@ -585,7 +607,9 @@ def _create_structured_output_schema(
585
607
  for column_name, column_config in columns.items():
586
608
  if existing_data is not None and column_name in existing_data.columns:
587
609
  continue # skip columns that already exist in existing data
588
- annotation = create_annotation(column_config)
610
+ is_int_pk = primary_key and column_name == primary_key and column_config.dtype == DType.INTEGER
611
+ is_int_fk = any(fk.column == column_name for fk in foreign_keys) and column_config.dtype == DType.INTEGER
612
+ annotation = create_annotation(column_config, is_int_pk or is_int_fk)
589
613
  fields[column_name] = (annotation, Field(...))
590
614
  TableRow = create_model("TableRow", **fields)
591
615
  TableRows = create_model("TableRows", rows=(list[TableRow], ...))
@@ -632,8 +656,9 @@ async def _worker(
632
656
  # construct schema for Structured Outputs (applies to JSON LLMOutputFormat only)
633
657
  structured_output_schema = None
634
658
  if llm_output_format == LLMOutputFormat.JSON:
659
+ pk_col = primary_keys.get(name)
635
660
  structured_output_schema = _create_structured_output_schema(
636
- columns=columns, existing_data=existing_batch
661
+ columns=columns, existing_data=existing_batch, primary_key=pk_col, foreign_keys=foreign_keys
637
662
  )
638
663
 
639
664
  # construct litellm kwargs
@@ -974,14 +999,47 @@ def _align_series_dtypes_with_column_config(series: pd.Series, column_config: Co
974
999
  return series
975
1000
 
976
1001
 
1002
+ def _get_integer_pk_fk_columns(
1003
+ columns: dict[str, ColumnConfig],
1004
+ primary_key: str | None,
1005
+ foreign_keys: list[ForeignKeyConfig],
1006
+ config: MockConfig,
1007
+ ) -> set[str]:
1008
+ """determine which columns should be kept as strings (integer PKs and FKs that reference integer PKs)"""
1009
+ skip_conversion = set()
1010
+
1011
+ # integer primary keys
1012
+ if primary_key and primary_key in columns and columns[primary_key].dtype == DType.INTEGER:
1013
+ skip_conversion.add(primary_key)
1014
+
1015
+ # foreign keys that reference integer primary keys
1016
+ # note: FK dtype is guaranteed to match referenced PK dtype by config validation
1017
+ for fk in foreign_keys:
1018
+ if fk.column in columns and columns[fk.column].dtype == DType.INTEGER:
1019
+ skip_conversion.add(fk.column)
1020
+
1021
+ return skip_conversion
1022
+
1023
+
977
1024
  async def _convert_table_rows_generator_to_df(
978
1025
  table_rows_generator: AsyncGenerator[dict],
979
1026
  columns: dict[str, ColumnConfig],
1027
+ primary_key: str | None = None,
1028
+ foreign_keys: list[ForeignKeyConfig] | None = None,
1029
+ config: MockConfig | None = None,
980
1030
  ) -> pd.DataFrame:
981
1031
  def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
982
1032
  df = df.copy()
1033
+ skip_int_conversion = (
1034
+ _get_integer_pk_fk_columns(columns, primary_key, foreign_keys or [], config) if config else set()
1035
+ )
1036
+
983
1037
  for column_name, column_config in columns.items():
984
- df[column_name] = _align_series_dtypes_with_column_config(df[column_name], column_config)
1038
+ # keep integer PKs and FKs as strings for now (post-processing will convert them)
1039
+ if column_name in skip_int_conversion:
1040
+ df[column_name] = df[column_name].astype("string[pyarrow]")
1041
+ else:
1042
+ df[column_name] = _align_series_dtypes_with_column_config(df[column_name], column_config)
985
1043
  return df
986
1044
 
987
1045
  # consume entire generator
@@ -1025,11 +1083,6 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
1025
1083
  }
1026
1084
  column_configs = {**existing_column_configs, **column_configs}
1027
1085
 
1028
- # primary keys are always strings
1029
- primary_key = table_config.get("primary_key", None)
1030
- if primary_key is not None:
1031
- column_configs[primary_key]["dtype"] = DType.STRING
1032
-
1033
1086
  table_config["columns"] = column_configs
1034
1087
  return tables
1035
1088
 
@@ -1129,6 +1182,45 @@ def _build_execution_plan(config: MockConfig) -> list[str]:
1129
1182
  return execution_plan
1130
1183
 
1131
1184
 
1185
+ def _postprocess_table(
1186
+ table_name: str,
1187
+ df: pd.DataFrame,
1188
+ table_config: TableConfig,
1189
+ config: MockConfig,
1190
+ pk_mappings: dict[str, dict[str, int]],
1191
+ ) -> pd.DataFrame:
1192
+ """convert integer PKs and FKs from strings to auto-incremented integers"""
1193
+ df = df.copy()
1194
+
1195
+ # convert integer primary keys to 1, 2, 3, ... and build mapping
1196
+ pk_col = table_config.primary_key
1197
+ if pk_col and table_config.columns[pk_col].dtype == DType.INTEGER:
1198
+ old_values = df[pk_col].tolist()
1199
+ new_values = list(range(1, len(df) + 1))
1200
+
1201
+ # build mapping: old LLM-generated string values -> new auto-incremented integers
1202
+ pk_mappings[table_name] = {str(old): new for old, new in zip(old_values, new_values)}
1203
+
1204
+ df[pk_col] = new_values
1205
+
1206
+ # convert foreign keys that reference integer primary keys
1207
+ # note: FK dtype is guaranteed to match referenced PK dtype by config validation
1208
+ for fk in table_config.foreign_keys:
1209
+ # skip if not an integer FK (which means it doesn't reference an integer PK)
1210
+ if table_config.columns[fk.column].dtype != DType.INTEGER:
1211
+ continue
1212
+ if fk.referenced_table not in pk_mappings:
1213
+ continue
1214
+
1215
+ # map FK values from strings to integers
1216
+ mapping = pk_mappings[fk.referenced_table]
1217
+ df[fk.column] = (
1218
+ df[fk.column].apply(lambda val: mapping.get(str(val)) if pd.notna(val) else None).astype("int64[pyarrow]")
1219
+ )
1220
+
1221
+ return df
1222
+
1223
+
1132
1224
  async def _sample_common(
1133
1225
  *,
1134
1226
  tables: dict[str, dict],
@@ -1156,6 +1248,10 @@ async def _sample_common(
1156
1248
 
1157
1249
  data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
1158
1250
 
1251
+ # track mappings from old string PK values to new integer PK values
1252
+ pk_mappings: dict[str, dict[str, int]] = {}
1253
+
1254
+ # first, generate all tables (without postprocessing)
1159
1255
  for table_name in execution_plan:
1160
1256
  table_config = config.root[table_name]
1161
1257
  df = await _sample_table(
@@ -1170,10 +1266,16 @@ async def _sample_common(
1170
1266
  non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
1171
1267
  n_workers=n_workers,
1172
1268
  llm_config=llm_config,
1269
+ config=config,
1173
1270
  progress_callback=progress_callback,
1174
1271
  )
1175
1272
  data[table_name] = df
1176
1273
 
1274
+ # then, postprocess all tables (convert integer PKs/FKs from strings to integers)
1275
+ for table_name in execution_plan:
1276
+ table_config = config.root[table_name]
1277
+ data[table_name] = _postprocess_table(table_name, data[table_name], table_config, config, pk_mappings)
1278
+
1177
1279
  return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
1178
1280
 
1179
1281
 
@@ -1266,7 +1368,7 @@ def sample(
1266
1368
  "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
1267
1369
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
1268
1370
  },
1269
- "primary_key": "customer_id", # single string; no composite keys allowed; primary keys must have string dtype
1371
+ "primary_key": "customer_id", # no composite keys allowed; primary key must have string or integer dtype
1270
1372
  },
1271
1373
  "warehouses": {
1272
1374
  "prompt": "Warehouses of a hardware store",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mostlyai-mock"
3
- version = "0.1.16"
3
+ version = "0.1.18"
4
4
  description = "Synthetic Mock Data"
5
5
  authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
6
6
  requires-python = ">=3.10"
File without changes