mostlyai-mock 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +171 -77
- {mostlyai_mock-0.1.11.dist-info → mostlyai_mock-0.1.13.dist-info}/METADATA +59 -57
- mostlyai_mock-0.1.13.dist-info/RECORD +8 -0
- mostlyai_mock-0.1.11.dist-info/RECORD +0 -8
- {mostlyai_mock-0.1.11.dist-info → mostlyai_mock-0.1.13.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.1.11.dist-info → mostlyai_mock-0.1.13.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.1.11.dist-info → mostlyai_mock-0.1.13.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
@@ -81,8 +81,8 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
|
|
81
81
|
if fk_field.dtype != pk_field.dtype:
|
82
82
|
raise ValueError(
|
83
83
|
f"Foreign key violation in table '{table_name}': "
|
84
|
-
f"Column '{fk.column}' type '{fk_field.dtype}' does not match "
|
85
|
-
f"referenced primary key '{referenced_config.primary_key}' type '{pk_field.dtype}'"
|
84
|
+
f"Column '{fk.column}' type '{fk_field.dtype.value}' does not match "
|
85
|
+
f"referenced primary key '{referenced_config.primary_key}' type '{pk_field.dtype.value}'"
|
86
86
|
)
|
87
87
|
|
88
88
|
return tables
|
@@ -113,6 +113,49 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
|
|
113
113
|
|
114
114
|
return self
|
115
115
|
|
116
|
+
@model_validator(mode="after")
|
117
|
+
def ensure_values_are_not_provided_for_primary_key(self) -> MockConfig:
|
118
|
+
for table_name, table_config in self.root.items():
|
119
|
+
for column_name, column_config in table_config.columns.items():
|
120
|
+
if column_name == table_config.primary_key and column_config.values:
|
121
|
+
raise ValueError(
|
122
|
+
f"Values cannot be provided for primary key column '{column_name}' in table '{table_name}'"
|
123
|
+
)
|
124
|
+
return self
|
125
|
+
|
126
|
+
@model_validator(mode="after")
|
127
|
+
def ensure_primary_key_is_string_dtype(self) -> MockConfig:
|
128
|
+
for table_name, table_config in self.root.items():
|
129
|
+
if table_config.primary_key:
|
130
|
+
column_config = table_config.columns[table_config.primary_key]
|
131
|
+
if column_config.dtype not in [DType.STRING]:
|
132
|
+
raise ValueError(
|
133
|
+
f"Primary key column '{table_config.primary_key}' in table '{table_name}' must be one of the following types:"
|
134
|
+
f" {[DType.STRING.value]}"
|
135
|
+
)
|
136
|
+
return self
|
137
|
+
|
138
|
+
def get_dependency_mappings(self) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
|
139
|
+
child_to_parents = {}
|
140
|
+
parent_to_children = {}
|
141
|
+
|
142
|
+
for table_name in self.root:
|
143
|
+
child_to_parents[table_name] = set()
|
144
|
+
parent_to_children[table_name] = set()
|
145
|
+
|
146
|
+
for table_name, table_config in self.root.items():
|
147
|
+
if table_config.foreign_keys:
|
148
|
+
for fk in table_config.foreign_keys:
|
149
|
+
referenced_table = fk.referenced_table
|
150
|
+
child_to_parents[table_name].add(referenced_table)
|
151
|
+
parent_to_children[referenced_table].add(table_name)
|
152
|
+
|
153
|
+
root_tables = []
|
154
|
+
for table_name, parents in child_to_parents.items():
|
155
|
+
if not parents or parents == {table_name}: # no dependencies or only self-dependency
|
156
|
+
root_tables.append(table_name)
|
157
|
+
return child_to_parents, parent_to_children, root_tables
|
158
|
+
|
116
159
|
|
117
160
|
class TableConfig(BaseModel):
|
118
161
|
prompt: str = ""
|
@@ -200,7 +243,7 @@ async def _sample_table(
|
|
200
243
|
foreign_keys: list[ForeignKeyConfig],
|
201
244
|
primary_keys: dict[str, str],
|
202
245
|
data: dict[str, pd.DataFrame],
|
203
|
-
sample_size: int,
|
246
|
+
sample_size: int | None,
|
204
247
|
previous_rows_size: int,
|
205
248
|
non_context_size: int | None,
|
206
249
|
n_workers: int,
|
@@ -225,12 +268,7 @@ async def _sample_table(
|
|
225
268
|
|
226
269
|
|
227
270
|
def _sample_table_sync(*args, **kwargs) -> pd.DataFrame:
|
228
|
-
|
229
|
-
asyncio.set_event_loop(loop)
|
230
|
-
try:
|
231
|
-
return loop.run_until_complete(_sample_table(*args, **kwargs))
|
232
|
-
finally:
|
233
|
-
loop.close()
|
271
|
+
return asyncio.run(_sample_table(*args, **kwargs))
|
234
272
|
|
235
273
|
|
236
274
|
def _create_system_prompt(llm_output_format: LLMOutputFormat) -> str:
|
@@ -263,6 +301,7 @@ def _create_table_prompt(
|
|
263
301
|
prompt: str,
|
264
302
|
columns: dict[str, ColumnConfig],
|
265
303
|
primary_keys: dict[str, str],
|
304
|
+
batch_idx: int,
|
266
305
|
batch_size: int | None,
|
267
306
|
foreign_keys: list[ForeignKeyConfig],
|
268
307
|
existing_data: pd.DataFrame | None,
|
@@ -277,7 +316,8 @@ def _create_table_prompt(
|
|
277
316
|
# define table
|
278
317
|
prompt += f"## Target Table: `{name}`\n\n"
|
279
318
|
|
280
|
-
|
319
|
+
target_primary_key = primary_keys[name]
|
320
|
+
prompt += f"### Target Table Primary Key: `{target_primary_key}`\n\n"
|
281
321
|
|
282
322
|
# add columns specifications
|
283
323
|
prompt += "### Target Table Column Specifications:\n\n"
|
@@ -313,7 +353,7 @@ def _create_table_prompt(
|
|
313
353
|
has_self_referencing_foreign_keys_section = True
|
314
354
|
prompt += f"## Self Referencing Foreign Keys in Target Table `{name}`\n\n"
|
315
355
|
for fk in self_referencing_foreign_keys:
|
316
|
-
prompt += f"### Primary Key Column: `{
|
356
|
+
prompt += f"### Primary Key Column: `{target_primary_key}`\n\n"
|
317
357
|
|
318
358
|
prompt += f"### Foreign Key Column: `{fk.column}`\n\n"
|
319
359
|
|
@@ -374,6 +414,11 @@ def _create_table_prompt(
|
|
374
414
|
if n_rows is not None:
|
375
415
|
prompt += f"Number of data rows to {verb}: `{n_rows}`.\n\n"
|
376
416
|
|
417
|
+
if target_primary_key is not None:
|
418
|
+
prompt += f"Add prefix to all values of Target Table Primary Key. The prefix is 'B{batch_idx}-'."
|
419
|
+
prompt += " There is one exception: if primary keys are in existing data, don't add prefix to them."
|
420
|
+
prompt += "\n\n"
|
421
|
+
|
377
422
|
if has_context_table_section:
|
378
423
|
assert foreign_keys
|
379
424
|
prompt += f"Target Table Foreign Key column `{foreign_keys[0].column}` may only contain values from `Context Table Data`."
|
@@ -528,7 +573,7 @@ def _create_structured_output_schema(
|
|
528
573
|
) -> type[BaseModel]:
|
529
574
|
def create_annotation(column_config: ColumnConfig) -> type:
|
530
575
|
if column_config.values or column_config.dtype is DType.CATEGORY:
|
531
|
-
return Literal[tuple(column_config.values)]
|
576
|
+
return Literal[tuple(column_config.values)] # type: ignore
|
532
577
|
return {
|
533
578
|
DType.INTEGER: int | None,
|
534
579
|
DType.FLOAT: float | None,
|
@@ -610,8 +655,9 @@ async def _worker(
|
|
610
655
|
name=name,
|
611
656
|
prompt=prompt,
|
612
657
|
columns=columns,
|
613
|
-
|
658
|
+
batch_idx=batch_idx,
|
614
659
|
batch_size=batch_size,
|
660
|
+
primary_keys=primary_keys,
|
615
661
|
foreign_keys=foreign_keys,
|
616
662
|
existing_data=existing_batch,
|
617
663
|
context_data=context_batch,
|
@@ -715,7 +761,7 @@ async def _create_table_rows_generator(
|
|
715
761
|
foreign_keys: list[ForeignKeyConfig],
|
716
762
|
primary_keys: dict[str, str],
|
717
763
|
data: dict[str, pd.DataFrame],
|
718
|
-
sample_size: int,
|
764
|
+
sample_size: int | None,
|
719
765
|
previous_rows_size: int,
|
720
766
|
non_context_size: int | None,
|
721
767
|
n_workers: int,
|
@@ -762,6 +808,7 @@ async def _create_table_rows_generator(
|
|
762
808
|
non_context_data[non_context_table_name] = data[non_context_table_name]
|
763
809
|
|
764
810
|
# calculate batch_sizes
|
811
|
+
assert sample_size is not None, "sample_size should have been filled by this point"
|
765
812
|
n_total_batches = len(context_batches) if context_batches is not None else math.ceil(sample_size / batch_size)
|
766
813
|
batch_sizes = [batch_size] * n_total_batches
|
767
814
|
if context_batches is None:
|
@@ -873,6 +920,32 @@ async def _create_table_rows_generator(
|
|
873
920
|
await asyncio.gather(*workers)
|
874
921
|
|
875
922
|
|
923
|
+
def _align_series_dtypes_with_column_config(series: pd.Series, column_config: ColumnConfig) -> pd.Series:
|
924
|
+
series = series.copy()
|
925
|
+
if column_config.dtype in [DType.DATE, DType.DATETIME]:
|
926
|
+
|
927
|
+
def harmonize_datetime(x: Any):
|
928
|
+
try:
|
929
|
+
return dateutil.parser.parse(str(x))
|
930
|
+
except Exception:
|
931
|
+
return pd.NaT
|
932
|
+
|
933
|
+
series = pd.to_datetime(series.apply(harmonize_datetime), errors="coerce")
|
934
|
+
elif column_config.dtype is DType.INTEGER:
|
935
|
+
series = pd.to_numeric(series, errors="coerce", downcast="integer").astype("int64[pyarrow]")
|
936
|
+
elif column_config.dtype is DType.FLOAT:
|
937
|
+
series = pd.to_numeric(series, errors="coerce").astype("double[pyarrow]")
|
938
|
+
elif column_config.dtype is DType.BOOLEAN:
|
939
|
+
series = series.map(lambda x: True if str(x).lower() == "true" else x)
|
940
|
+
series = series.map(lambda x: False if str(x).lower() == "false" else x)
|
941
|
+
series = pd.to_numeric(series, errors="coerce").astype("boolean[pyarrow]")
|
942
|
+
elif column_config.dtype is DType.CATEGORY:
|
943
|
+
series = pd.Categorical(series, categories=column_config.values)
|
944
|
+
else:
|
945
|
+
series = series.astype("string[pyarrow]")
|
946
|
+
return series
|
947
|
+
|
948
|
+
|
876
949
|
async def _convert_table_rows_generator_to_df(
|
877
950
|
table_rows_generator: AsyncGenerator[dict],
|
878
951
|
columns: dict[str, ColumnConfig],
|
@@ -880,29 +953,7 @@ async def _convert_table_rows_generator_to_df(
|
|
880
953
|
def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
|
881
954
|
df = df.copy()
|
882
955
|
for column_name, column_config in columns.items():
|
883
|
-
|
884
|
-
|
885
|
-
def harmonize_datetime(x):
|
886
|
-
try:
|
887
|
-
return dateutil.parser.parse(x)
|
888
|
-
except Exception:
|
889
|
-
return pd.NaT
|
890
|
-
|
891
|
-
df[column_name] = pd.to_datetime(df[column_name].apply(harmonize_datetime), errors="coerce")
|
892
|
-
elif column_config.dtype is DType.INTEGER:
|
893
|
-
df[column_name] = pd.to_numeric(df[column_name], errors="coerce", downcast="integer").astype(
|
894
|
-
"int64[pyarrow]"
|
895
|
-
)
|
896
|
-
elif column_config.dtype is DType.FLOAT:
|
897
|
-
df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("double[pyarrow]")
|
898
|
-
elif column_config.dtype is DType.BOOLEAN:
|
899
|
-
df[column_name] = df[column_name].map(lambda x: True if str(x).lower() == "true" else x)
|
900
|
-
df[column_name] = df[column_name].map(lambda x: False if str(x).lower() == "false" else x)
|
901
|
-
df[column_name] = pd.to_numeric(df[column_name], errors="coerce").astype("boolean[pyarrow]")
|
902
|
-
elif column_config.dtype is DType.CATEGORY:
|
903
|
-
df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
|
904
|
-
else:
|
905
|
-
df[column_name] = df[column_name].astype("string[pyarrow]")
|
956
|
+
df[column_name] = _align_series_dtypes_with_column_config(df[column_name], column_config)
|
906
957
|
return df
|
907
958
|
|
908
959
|
# consume entire generator
|
@@ -912,6 +963,7 @@ async def _convert_table_rows_generator_to_df(
|
|
912
963
|
# extract rows and convert to DataFrame
|
913
964
|
rows = [item["row"] for item in items]
|
914
965
|
df = pd.DataFrame(rows)
|
966
|
+
# harmonize dtypes
|
915
967
|
df = align_df_dtypes_with_mock_dtypes(df, columns)
|
916
968
|
return df
|
917
969
|
|
@@ -935,6 +987,8 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
|
|
935
987
|
tables = tables.copy()
|
936
988
|
for table_name, existing_table in existing_data.items():
|
937
989
|
table_config = tables.setdefault(table_name, {})
|
990
|
+
|
991
|
+
# prepend column configs for existing data columns, that are not specified in the mock config
|
938
992
|
column_configs = table_config.setdefault("columns", {})
|
939
993
|
existing_column_configs = {
|
940
994
|
existing_column: {"dtype": _infer_dtype(existing_table[existing_column])}
|
@@ -942,42 +996,82 @@ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataF
|
|
942
996
|
if existing_column not in column_configs
|
943
997
|
}
|
944
998
|
column_configs = {**existing_column_configs, **column_configs}
|
999
|
+
|
1000
|
+
# primary keys are always strings
|
1001
|
+
primary_key = table_config.get("primary_key", None)
|
1002
|
+
if primary_key is not None:
|
1003
|
+
column_configs[primary_key]["dtype"] = DType.STRING
|
1004
|
+
|
945
1005
|
table_config["columns"] = column_configs
|
946
1006
|
return tables
|
947
1007
|
|
948
1008
|
|
949
1009
|
def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig) -> dict[str, int]:
|
1010
|
+
_, _, root_tables = config.get_dependency_mappings()
|
1011
|
+
|
950
1012
|
if isinstance(sample_size, int):
|
951
|
-
|
1013
|
+
sample_size = {table_name: sample_size for table_name in root_tables}
|
1014
|
+
|
1015
|
+
for table_name in root_tables:
|
1016
|
+
if table_name not in sample_size or sample_size[table_name] is None:
|
1017
|
+
# set default sample size for missing or None sample sizes
|
1018
|
+
sample_size[table_name] = 4
|
1019
|
+
# clamp sample_size to [1, inf)
|
1020
|
+
sample_size[table_name] = max(1, sample_size[table_name])
|
952
1021
|
|
953
|
-
if sample_size.keys() != config.root.keys():
|
954
|
-
raise ValueError(f"Sample size keys must match table names: {sample_size.keys()} != {config.root.keys()}")
|
955
1022
|
return sample_size
|
956
1023
|
|
957
1024
|
|
958
|
-
def
|
959
|
-
|
960
|
-
|
961
|
-
|
1025
|
+
def _harmonize_existing_data(
|
1026
|
+
existing_data: dict[str, pd.DataFrame] | None, mock_config: MockConfig
|
1027
|
+
) -> dict[str, pd.DataFrame]:
|
1028
|
+
if existing_data is None:
|
1029
|
+
return {}
|
962
1030
|
|
963
|
-
|
964
|
-
|
965
|
-
parent_to_children[table_name] = set()
|
1031
|
+
# by this point, mock config should have been validated, so we can assume that all tables in existing_data are defined in the mock config
|
1032
|
+
assert set(mock_config.root.keys()).issuperset(existing_data.keys())
|
966
1033
|
|
967
|
-
|
968
|
-
|
969
|
-
for fk in table_config.foreign_keys:
|
970
|
-
referenced_table = fk.referenced_table
|
971
|
-
child_to_parents[table_name].add(referenced_table)
|
972
|
-
parent_to_children[referenced_table].add(table_name)
|
1034
|
+
for existing_table_name, existing_table in existing_data.items():
|
1035
|
+
existing_table_config = mock_config.root[existing_table_name]
|
973
1036
|
|
974
|
-
|
975
|
-
|
976
|
-
if not parents or parents == {table_name}: # no dependencies or only self-dependency
|
977
|
-
root_tables.append(table_name)
|
978
|
-
return child_to_parents, parent_to_children, root_tables
|
1037
|
+
for existing_column in existing_table.columns:
|
1038
|
+
existing_column_config = existing_table_config.columns[existing_column]
|
979
1039
|
|
980
|
-
|
1040
|
+
# ensure that the existing data has compatible dtypes with the column config
|
1041
|
+
original_series = existing_table[existing_column]
|
1042
|
+
coerced_series = _align_series_dtypes_with_column_config(original_series, existing_column_config)
|
1043
|
+
n_original_na = original_series.isna().sum()
|
1044
|
+
n_coerced_na = coerced_series.isna().sum()
|
1045
|
+
if n_original_na != n_coerced_na:
|
1046
|
+
raise ValueError(
|
1047
|
+
f"Coercion of existing data column '{existing_column}' in table '{existing_table_name}' resulted in data loss. "
|
1048
|
+
f"Ensure that the existing data is consistent with the mock configuration."
|
1049
|
+
)
|
1050
|
+
|
1051
|
+
# ensure that the existing data has values allowed by the column config
|
1052
|
+
if existing_column_config.values:
|
1053
|
+
if not set(existing_table[existing_column].unique()).issubset(existing_column_config.values):
|
1054
|
+
raise ValueError(
|
1055
|
+
f"Existing data column '{existing_column}' in table '{existing_table_name}' has values disallowed by the column config. "
|
1056
|
+
f"Ensure that the existing data is consistent with the mock configuration."
|
1057
|
+
)
|
1058
|
+
|
1059
|
+
# ensure that the existing data has unique primary keys
|
1060
|
+
existing_table_primary_key = existing_table_config.primary_key
|
1061
|
+
if existing_table_primary_key is not None:
|
1062
|
+
if not existing_table[existing_table_primary_key].is_unique:
|
1063
|
+
raise ValueError(
|
1064
|
+
f"Existing data table '{existing_table_name}' has non-unique primary key column '{existing_table_primary_key}'. "
|
1065
|
+
f"Ensure that the primary key is unique."
|
1066
|
+
)
|
1067
|
+
|
1068
|
+
existing_table[existing_column] = coerced_series
|
1069
|
+
|
1070
|
+
return existing_data
|
1071
|
+
|
1072
|
+
|
1073
|
+
def _build_execution_plan(config: MockConfig) -> list[str]:
|
1074
|
+
child_to_parents, parent_to_children, root_tables = config.get_dependency_mappings()
|
981
1075
|
|
982
1076
|
execution_plan = []
|
983
1077
|
bfs_queue = list(root_tables)
|
@@ -1035,7 +1129,7 @@ def sample(
|
|
1035
1129
|
sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
|
1036
1130
|
If a single integer is provided, the same number of rows will be generated for each subject table.
|
1037
1131
|
If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
|
1038
|
-
Default is 4. Ignored if existing_data is provided.
|
1132
|
+
Default is 4. Ignored if existing_data is provided. Ignored for non-root tables.
|
1039
1133
|
If a table has a foreign key, the sample size is determined by the corresponding foreign key prompt. If nothing specified, a few rows per parent record are generated.
|
1040
1134
|
existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
|
1041
1135
|
Default is None.
|
@@ -1092,15 +1186,15 @@ def sample(
|
|
1092
1186
|
"customers": {
|
1093
1187
|
"prompt": "Customers of a hardware store",
|
1094
1188
|
"columns": {
|
1095
|
-
"customer_id": {"prompt": "the unique id of the customer", "dtype": "
|
1189
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
|
1096
1190
|
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
1097
1191
|
},
|
1098
|
-
"primary_key": "customer_id", # single string; no composite keys allowed
|
1192
|
+
"primary_key": "customer_id", # single string; no composite keys allowed; primary keys must have string dtype
|
1099
1193
|
},
|
1100
1194
|
"warehouses": {
|
1101
1195
|
"prompt": "Warehouses of a hardware store",
|
1102
1196
|
"columns": {
|
1103
|
-
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "
|
1197
|
+
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "string"},
|
1104
1198
|
"name": {"prompt": "the name of the warehouse", "dtype": "string"},
|
1105
1199
|
},
|
1106
1200
|
"primary_key": "warehouse_id",
|
@@ -1108,8 +1202,8 @@ def sample(
|
|
1108
1202
|
"orders": {
|
1109
1203
|
"prompt": "Orders of a Customer",
|
1110
1204
|
"columns": {
|
1111
|
-
"customer_id": {"prompt": "the customer id for that order", "dtype": "
|
1112
|
-
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "
|
1205
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
|
1206
|
+
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "string"},
|
1113
1207
|
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
1114
1208
|
"text": {"prompt": "order text description", "dtype": "string"},
|
1115
1209
|
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
@@ -1187,7 +1281,7 @@ def sample(
|
|
1187
1281
|
"customers": {
|
1188
1282
|
"prompt": "Customers of a hardware store",
|
1189
1283
|
"columns": {
|
1190
|
-
"customer_id": {"prompt": "the unique id of the customer", "dtype": "
|
1284
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
|
1191
1285
|
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
1192
1286
|
"email": {"prompt": "email address of the customer", "dtype": "string"},
|
1193
1287
|
"phone": {"prompt": "phone number of the customer", "dtype": "string"},
|
@@ -1199,7 +1293,7 @@ def sample(
|
|
1199
1293
|
"prompt": "Orders of a Customer",
|
1200
1294
|
"columns": {
|
1201
1295
|
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
1202
|
-
"customer_id": {"prompt": "the customer id for that order", "dtype": "
|
1296
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
|
1203
1297
|
"order_date": {"prompt": "the date when the order was placed", "dtype": "date"},
|
1204
1298
|
"total_amount": {"prompt": "order amount in USD", "dtype": "float"},
|
1205
1299
|
"status": {"dtype": "category", "values": ["pending", "shipped", "delivered", "cancelled"]},
|
@@ -1247,15 +1341,15 @@ def sample(
|
|
1247
1341
|
|
1248
1342
|
execution_plan: list[str] = _build_execution_plan(config)
|
1249
1343
|
|
1250
|
-
data: dict[str, pd.DataFrame] = existing_data or {}
|
1251
|
-
|
1252
|
-
for table_name in execution_plan:
|
1253
|
-
table_config = config.root[table_name]
|
1344
|
+
data: dict[str, pd.DataFrame] = _harmonize_existing_data(existing_data, config) or {}
|
1254
1345
|
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1346
|
+
# synchronous `sample` function makes independent calls to asynchronous `_sample_table` function
|
1347
|
+
# in order to avoid conflicts with potentially existing event loop (e.g. in Jupyter environment),
|
1348
|
+
# a new thread is spawned for each call to `_sample_table`
|
1349
|
+
# NOTE: initialize executor only once, doing that inside the loop might lead to deadlocks
|
1350
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
1351
|
+
for table_name in execution_plan:
|
1352
|
+
table_config = config.root[table_name]
|
1259
1353
|
future = executor.submit(
|
1260
1354
|
_sample_table_sync,
|
1261
1355
|
name=table_name,
|
@@ -1264,13 +1358,13 @@ def sample(
|
|
1264
1358
|
foreign_keys=table_config.foreign_keys,
|
1265
1359
|
primary_keys=primary_keys,
|
1266
1360
|
data=data,
|
1267
|
-
sample_size=sample_size
|
1361
|
+
sample_size=sample_size.get(table_name),
|
1268
1362
|
previous_rows_size=10, # present 10 previously generated rows to the LLM
|
1269
1363
|
non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
|
1270
1364
|
n_workers=n_workers,
|
1271
1365
|
llm_config=llm_config,
|
1272
1366
|
)
|
1273
1367
|
df = future.result()
|
1274
|
-
|
1368
|
+
data[table_name] = df
|
1275
1369
|
|
1276
1370
|
return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mostlyai-mock
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.13
|
4
4
|
Summary: Synthetic Mock Data
|
5
5
|
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
6
|
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
@@ -95,17 +95,17 @@ df = mock.sample(
|
|
95
95
|
model="openai/gpt-4.1-nano", # select the LLM model (optional)
|
96
96
|
)
|
97
97
|
print(df)
|
98
|
-
# nationality
|
99
|
-
# 0
|
100
|
-
# 1 DE
|
101
|
-
# 2
|
102
|
-
# 3
|
103
|
-
# 4
|
104
|
-
# 5
|
105
|
-
# 6
|
106
|
-
# 7
|
107
|
-
# 8
|
108
|
-
# 9
|
98
|
+
# nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
|
99
|
+
# 0 FR Jean Dupont male 29 1994-03-15 2025-01-10 14:30:00 False 150.0 101
|
100
|
+
# 1 DE Anna Schmidt female 34 1989-07-22 2025-01-11 16:45:00 True 200.0 201
|
101
|
+
# 2 IT Marco Rossi male 45 1979-11-05 2025-01-09 10:15:00 False 180.0 102
|
102
|
+
# 3 AT Laura Gruber female 28 1996-02-19 2025-01-12 09:00:00 False 165.0 202
|
103
|
+
# 4 CH David Müller male 37 1987-08-30 2025-01-08 17:20:00 True 210.0 203
|
104
|
+
# 5 NL Sophie van den Berg female 22 2002-04-12 2025-01-10 12:00:00 False 140.0 103
|
105
|
+
# 6 GB James Carter male 31 1992-09-10 2025-01-11 11:30:00 False 155.0 204
|
106
|
+
# 7 BE Lotte Peeters female 26 1998-05-25 2025-01-09 15:45:00 False 160.0 201
|
107
|
+
# 8 DK Anders Jensen male 33 1990-12-03 2025-01-12 08:15:00 True 220.0 202
|
108
|
+
# 9 ES Carlos Lopez male 38 1985-06-14 2025-01-10 18:00:00 False 170.0 203
|
109
109
|
```
|
110
110
|
|
111
111
|
4. Create your first multi-table mock dataset
|
@@ -117,7 +117,7 @@ tables = {
|
|
117
117
|
"customers": {
|
118
118
|
"prompt": "Customers of a hardware store",
|
119
119
|
"columns": {
|
120
|
-
"customer_id": {"prompt": "the unique id of the customer", "dtype": "
|
120
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
|
121
121
|
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
122
122
|
},
|
123
123
|
"primary_key": "customer_id",
|
@@ -125,7 +125,7 @@ tables = {
|
|
125
125
|
"warehouses": {
|
126
126
|
"prompt": "Warehouses of a hardware store",
|
127
127
|
"columns": {
|
128
|
-
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "
|
128
|
+
"warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "string"},
|
129
129
|
"name": {"prompt": "the name of the warehouse", "dtype": "string"},
|
130
130
|
},
|
131
131
|
"primary_key": "warehouse_id",
|
@@ -133,8 +133,8 @@ tables = {
|
|
133
133
|
"orders": {
|
134
134
|
"prompt": "Orders of a Customer",
|
135
135
|
"columns": {
|
136
|
-
"customer_id": {"prompt": "the customer id for that order", "dtype": "
|
137
|
-
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "
|
136
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "string"},
|
137
|
+
"warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "string"},
|
138
138
|
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
139
139
|
"text": {"prompt": "order text description", "dtype": "string"},
|
140
140
|
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
@@ -167,40 +167,42 @@ tables = {
|
|
167
167
|
"prompt": "each order has between 1 and 2 items",
|
168
168
|
}
|
169
169
|
],
|
170
|
+
"primary_key": "item_id",
|
170
171
|
},
|
171
172
|
}
|
172
173
|
data = mock.sample(
|
173
174
|
tables=tables,
|
174
175
|
sample_size=2,
|
175
|
-
model="openai/gpt-4.1"
|
176
|
+
model="openai/gpt-4.1",
|
177
|
+
n_workers=1,
|
176
178
|
)
|
177
179
|
print(data["customers"])
|
178
|
-
#
|
179
|
-
# 0
|
180
|
-
# 1
|
180
|
+
# customer_id name
|
181
|
+
# 0 B0-100235 Danielle Rogers
|
182
|
+
# 1 B0-100236 Edward Kim
|
181
183
|
print(data["warehouses"])
|
182
|
-
#
|
183
|
-
# 0
|
184
|
-
# 1
|
184
|
+
# warehouse_id name
|
185
|
+
# 0 B0-001 Downtown Distribution Center
|
186
|
+
# 1 B0-002 Westside Storage Facility
|
185
187
|
print(data["orders"])
|
186
|
-
#
|
187
|
-
# 0
|
188
|
-
# 1
|
189
|
-
# 2
|
190
|
-
# 3
|
191
|
-
# 4
|
188
|
+
# customer_id warehouse_id order_id text amount
|
189
|
+
# 0 B0-100235 B0-002 B0-3010021 Office furniture replenishment - desks, chairs... 1268.35
|
190
|
+
# 1 B0-100235 B0-001 B0-3010022 Bulk stationery order: printer paper, notebook... 449.9
|
191
|
+
# 2 B0-100235 B0-001 B0-3010023 Electronics restock: monitors and wireless key... 877.6
|
192
|
+
# 3 B0-100236 B0-001 B1-3010021 Monthly cleaning supplies: disinfectant, trash... 314.75
|
193
|
+
# 4 B0-100236 B0-002 B1-3010022 Breakroom essentials restock: coffee, tea, and... 182.45
|
192
194
|
print(data["items"])
|
193
|
-
# item_id
|
194
|
-
# 0
|
195
|
-
# 1
|
196
|
-
# 2
|
197
|
-
# 3
|
198
|
-
# 4
|
199
|
-
# 5
|
200
|
-
# 6
|
201
|
-
# 7
|
202
|
-
# 8
|
203
|
-
# 9
|
195
|
+
# item_id order_id name price
|
196
|
+
# 0 B0-200501 B0-3010021 Ergonomic Office Desk 545.99
|
197
|
+
# 1 B0-200502 B0-3010021 Mesh Back Executive Chair 399.5
|
198
|
+
# 2 B1-200503 B0-3010022 Multipack Printer Paper (500 sheets) 129.95
|
199
|
+
# 3 B1-200504 B0-3010022 Spiral Notebooks - 12 Pack 59.99
|
200
|
+
# 4 B2-200505 B0-3010023 27" LED Computer Monitor 489.95
|
201
|
+
# 5 B2-200506 B0-3010023 Wireless Ergonomic Keyboard 387.65
|
202
|
+
# 6 B3-200507 B1-3010021 Industrial Disinfectant Solution (5L) 148.95
|
203
|
+
# 7 B3-200508 B1-3010021 Commercial Trash Liners - Case of 100 84.5
|
204
|
+
# 8 B4-200509 B1-3010022 Premium Ground Coffee (2lb Bag) 74.99
|
205
|
+
# 9 B4-200510 B1-3010022 Bottled Spring Water (24 Pack) 34.95
|
204
206
|
```
|
205
207
|
|
206
208
|
6. Create your first self-referencing mock table
|
@@ -212,9 +214,9 @@ tables = {
|
|
212
214
|
"employees": {
|
213
215
|
"prompt": "Employees of a company",
|
214
216
|
"columns": {
|
215
|
-
"employee_id": {"prompt": "the unique id of the employee", "dtype": "
|
217
|
+
"employee_id": {"prompt": "the unique id of the employee; sequential", "dtype": "string"},
|
216
218
|
"name": {"prompt": "first name and last name of the president", "dtype": "string"},
|
217
|
-
"boss_id": {"prompt": "the id of the boss of the employee", "dtype": "
|
219
|
+
"boss_id": {"prompt": "the id of the boss of the employee", "dtype": "string"},
|
218
220
|
"role": {"prompt": "the role of the employee", "dtype": "string"},
|
219
221
|
},
|
220
222
|
"primary_key": "employee_id",
|
@@ -229,17 +231,17 @@ tables = {
|
|
229
231
|
}
|
230
232
|
df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
|
231
233
|
print(df)
|
232
|
-
#
|
233
|
-
# 0
|
234
|
-
# 1
|
235
|
-
# 2
|
236
|
-
# 3
|
237
|
-
# 4
|
238
|
-
# 5
|
239
|
-
# 6
|
240
|
-
# 7
|
241
|
-
# 8
|
242
|
-
# 9
|
234
|
+
# employee_id name boss_id role
|
235
|
+
# 0 B0-1 Patricia Lee <NA> President
|
236
|
+
# 1 B0-2 Edward Rodriguez B0-1 VP of Operations
|
237
|
+
# 2 B0-3 Maria Cortez B0-1 VP of Finance
|
238
|
+
# 3 B0-4 Thomas Nguyen B0-1 VP of Technology
|
239
|
+
# 4 B0-5 Rachel Kim B0-2 Operations Manager
|
240
|
+
# 5 B0-6 Jeffrey Patel B0-2 Supply Chain Lead
|
241
|
+
# 6 B0-7 Olivia Smith B0-2 Facilities Supervisor
|
242
|
+
# 7 B0-8 Brian Carter B0-3 Accounting Manager
|
243
|
+
# 8 B0-9 Lauren Anderson B0-3 Financial Analyst
|
244
|
+
# 9 B0-10 Santiago Romero B0-3 Payroll Specialist
|
243
245
|
```
|
244
246
|
|
245
247
|
7. Enrich existing data with additional columns
|
@@ -271,10 +273,10 @@ df = mock.sample(
|
|
271
273
|
model="openai/gpt-4.1-nano"
|
272
274
|
)
|
273
275
|
print(df)
|
274
|
-
#
|
275
|
-
# 0
|
276
|
-
# 1
|
277
|
-
# 2
|
276
|
+
# guest_id name nationality gender age room_number is_vip
|
277
|
+
# 0 1 Anna Schmidt DE female 30 102 False
|
278
|
+
# 1 2 Marco Rossi IT male 27 215 True
|
279
|
+
# 2 3 Sophie Dupont FR female 22 108 False
|
278
280
|
```
|
279
281
|
|
280
282
|
## MCP Server
|
@@ -0,0 +1,8 @@
|
|
1
|
+
mostlyai/mock/__init__.py,sha256=r4GBSmJmB1SGtviYtZwY5b3GBzhK_mt8czzk-py4flo,715
|
2
|
+
mostlyai/mock/core.py,sha256=nu0PSX3Xt8l6_95cIrJ7Wt0SbJvfrLD3t0CFIidOLcM,59573
|
3
|
+
mostlyai/mock/mcp_server.py,sha256=MrVUrIsAZsFzjK1suwNl1fxS1ES-wpc-YSM8cS8Fqcw,2259
|
4
|
+
mostlyai_mock-0.1.13.dist-info/METADATA,sha256=un3lLINiMi8HkVcmsIr64U-OQQiqT5LsgiGam1aNTj4,14110
|
5
|
+
mostlyai_mock-0.1.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
+
mostlyai_mock-0.1.13.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
+
mostlyai_mock-0.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
+
mostlyai_mock-0.1.13.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
mostlyai/mock/__init__.py,sha256=TijW-xX5zpSpPThkVonR3iwZThtry7MlDSg03EWcZLc,715
|
2
|
-
mostlyai/mock/core.py,sha256=K3f5i4C7Q50xF-o2OAwzTNKxiUW7qzLW3z5rGDlPbS4,54873
|
3
|
-
mostlyai/mock/mcp_server.py,sha256=MrVUrIsAZsFzjK1suwNl1fxS1ES-wpc-YSM8cS8Fqcw,2259
|
4
|
-
mostlyai_mock-0.1.11.dist-info/METADATA,sha256=qqva72RqPxlajbDMDxLUqCm1n06-lgL04Wj8A-3WDbw,14100
|
5
|
-
mostlyai_mock-0.1.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
-
mostlyai_mock-0.1.11.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
-
mostlyai_mock-0.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
-
mostlyai_mock-0.1.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|