mostlyai-mock 0.0.4__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.5}/PKG-INFO +39 -20
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.5}/README.md +38 -19
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.5}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.5}/mostlyai/mock/core.py +118 -30
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.5}/pyproject.toml +1 -1
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.5}/.gitignore +0 -0
- {mostlyai_mock-0.0.4 → mostlyai_mock-0.0.5}/LICENSE +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mostlyai-mock
|
3
|
-
Version: 0.0.4
|
3
|
+
Version: 0.0.5
|
4
4
|
Summary: Synthetic Mock Data
|
5
5
|
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
6
|
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
@@ -65,34 +65,53 @@ print(df)
|
|
65
65
|
from mostlyai import mock
|
66
66
|
|
67
67
|
tables = {
|
68
|
-
"
|
69
|
-
"description": "
|
68
|
+
"customers": {
|
69
|
+
"description": "Customers of a hardware store",
|
70
70
|
"columns": {
|
71
|
-
"
|
72
|
-
"name": {"prompt": "first name and last name of the
|
71
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
72
|
+
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
73
|
+
},
|
74
|
+
"primary_key": "customer_id",
|
75
|
+
},
|
76
|
+
"orders": {
|
77
|
+
"description": "Orders of a Customer",
|
78
|
+
"columns": {
|
79
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
80
|
+
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
81
|
+
"text": {"prompt": "order text description", "dtype": "string"},
|
82
|
+
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
73
83
|
},
|
74
|
-
"primary_key": "
|
84
|
+
"primary_key": "order_id",
|
85
|
+
"foreign_keys": [
|
86
|
+
{
|
87
|
+
"column": "customer_id",
|
88
|
+
"referenced_table": "customers",
|
89
|
+
"description": "each customer has anywhere between 1 and 3 orders",
|
90
|
+
}
|
91
|
+
],
|
75
92
|
},
|
76
|
-
"
|
77
|
-
"description": "
|
93
|
+
"items": {
|
94
|
+
"description": "Items in an Order",
|
78
95
|
"columns": {
|
79
|
-
"
|
80
|
-
"
|
81
|
-
"
|
82
|
-
"
|
96
|
+
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
97
|
+
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
98
|
+
"name": {"prompt": "the name of the item", "dtype": "string"},
|
99
|
+
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
83
100
|
},
|
84
101
|
"foreign_keys": [
|
85
102
|
{
|
86
|
-
"column": "
|
87
|
-
"referenced_table": "
|
88
|
-
"description": "each
|
103
|
+
"column": "order_id",
|
104
|
+
"referenced_table": "orders",
|
105
|
+
"description": "each order has between 2 and 5 items",
|
89
106
|
}
|
90
107
|
],
|
91
108
|
},
|
92
109
|
}
|
93
|
-
data = mock.sample(tables=tables, sample_size=
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
print(
|
110
|
+
data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
|
111
|
+
df_customers = data["customers"]
|
112
|
+
df_orders = data["orders"]
|
113
|
+
df_items = data["items"]
|
114
|
+
print(df_customers)
|
115
|
+
print(df_orders)
|
116
|
+
print(df_items)
|
98
117
|
```
|
@@ -47,34 +47,53 @@ print(df)
|
|
47
47
|
from mostlyai import mock
|
48
48
|
|
49
49
|
tables = {
|
50
|
-
"
|
51
|
-
"description": "
|
50
|
+
"customers": {
|
51
|
+
"description": "Customers of a hardware store",
|
52
52
|
"columns": {
|
53
|
-
"
|
54
|
-
"name": {"prompt": "first name and last name of the
|
53
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
54
|
+
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
55
|
+
},
|
56
|
+
"primary_key": "customer_id",
|
57
|
+
},
|
58
|
+
"orders": {
|
59
|
+
"description": "Orders of a Customer",
|
60
|
+
"columns": {
|
61
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
62
|
+
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
63
|
+
"text": {"prompt": "order text description", "dtype": "string"},
|
64
|
+
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
55
65
|
},
|
56
|
-
"primary_key": "
|
66
|
+
"primary_key": "order_id",
|
67
|
+
"foreign_keys": [
|
68
|
+
{
|
69
|
+
"column": "customer_id",
|
70
|
+
"referenced_table": "customers",
|
71
|
+
"description": "each customer has anywhere between 1 and 3 orders",
|
72
|
+
}
|
73
|
+
],
|
57
74
|
},
|
58
|
-
"
|
59
|
-
"description": "
|
75
|
+
"items": {
|
76
|
+
"description": "Items in an Order",
|
60
77
|
"columns": {
|
61
|
-
"
|
62
|
-
"
|
63
|
-
"
|
64
|
-
"
|
78
|
+
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
79
|
+
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
80
|
+
"name": {"prompt": "the name of the item", "dtype": "string"},
|
81
|
+
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
65
82
|
},
|
66
83
|
"foreign_keys": [
|
67
84
|
{
|
68
|
-
"column": "
|
69
|
-
"referenced_table": "
|
70
|
-
"description": "each
|
85
|
+
"column": "order_id",
|
86
|
+
"referenced_table": "orders",
|
87
|
+
"description": "each order has between 2 and 5 items",
|
71
88
|
}
|
72
89
|
],
|
73
90
|
},
|
74
91
|
}
|
75
|
-
data = mock.sample(tables=tables, sample_size=
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
print(
|
92
|
+
data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
|
93
|
+
df_customers = data["customers"]
|
94
|
+
df_orders = data["orders"]
|
95
|
+
df_items = data["items"]
|
96
|
+
print(df_customers)
|
97
|
+
print(df_orders)
|
98
|
+
print(df_items)
|
80
99
|
```
|
@@ -89,6 +89,31 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
|
|
89
89
|
|
90
90
|
return tables
|
91
91
|
|
92
|
+
@model_validator(mode="after")
def validate_no_circular_dependencies(self) -> MockConfig:
    """Reject configurations whose foreign keys form a cycle between tables.

    Builds a child -> parents mapping from every table's ``foreign_keys``
    and walks it depth-first starting from each table.

    Returns:
        The validated configuration (``self``), unchanged.

    Raises:
        ValueError: if a table can reach itself by following foreign keys;
            the message spells out the offending path, e.g. ``a -> b -> a``.
    """
    parents_of = {
        name: [fk.referenced_table for fk in cfg.foreign_keys]
        for name, cfg in self.root.items()
    }
    # tables already entered once; re-entering them cannot reveal a new cycle
    explored = set()

    def walk(current: str, trail: list[str]) -> None:
        # `trail` holds the chain of tables currently being followed, so
        # meeting `current` on it again means the chain loops back on itself
        if current in trail:
            loop = trail[trail.index(current):] + [current]
            raise ValueError(f"Circular dependency detected: {' -> '.join(loop)}")
        if current in explored:
            return
        explored.add(current)
        trail.append(current)
        for parent in parents_of[current]:
            walk(parent, trail)
        trail.pop()

    for name in parents_of:
        walk(name, [])

    return self
|
116
|
+
|
92
117
|
|
93
118
|
class TableConfig(BaseModel):
|
94
119
|
description: str = ""
|
@@ -234,7 +259,7 @@ def _create_table_prompt(
|
|
234
259
|
# add previous rows as context to help the LLM generate consistent data
|
235
260
|
if previous_rows:
|
236
261
|
prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
|
237
|
-
prompt += json.dumps(previous_rows, indent=2)
|
262
|
+
prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
|
238
263
|
|
239
264
|
# add context table name, primary key and data
|
240
265
|
if context_data is not None:
|
@@ -252,12 +277,14 @@ def _create_table_prompt(
|
|
252
277
|
prompt += f"Generate {batch_size} rows for the `{table_name}` table.\n\n"
|
253
278
|
else:
|
254
279
|
prompt += (
|
255
|
-
f"Generate
|
256
|
-
f"The Foreign Key column may only contain values from Context Table Data
|
280
|
+
f"Generate data for the `{table_name}` table. "
|
281
|
+
f"The Foreign Key column may only contain values from Context Table Data. "
|
282
|
+
f"Pay attention to description of the Foreign Key column to understand the relationship.\n\n"
|
257
283
|
)
|
258
284
|
if previous_rows:
|
259
285
|
prompt += (
|
260
286
|
"Generate new rows that maintain consistency with the previous rows where appropriate. "
|
287
|
+
"Don't copy previous rows in the output. "
|
261
288
|
"Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
|
262
289
|
)
|
263
290
|
prompt += f"Do not use code to generate the data.\n\n"
|
@@ -426,6 +453,44 @@ def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig
|
|
426
453
|
return sample_size
|
427
454
|
|
428
455
|
|
456
|
+
def _build_dependency_graph(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
    """Derive the foreign-key dependency graph of all configured tables.

    Args:
        config: configuration whose ``root`` maps table names to table
            configs exposing ``foreign_keys`` (each with ``referenced_table``).

    Returns:
        A 3-tuple ``(child_to_parents, parent_to_children, subject_tables)``:
        the tables each table references, the tables referencing each table,
        and the names of tables that reference no other table. Every
        configured table is a key in both mappings, and all three follow the
        iteration order of ``config.root``.
    """
    parents_of = {name: [] for name in config.root}
    children_of = {name: [] for name in config.root}

    for child, table in config.root.items():
        # a falsy `foreign_keys` (empty or None) contributes no edges
        for fk in table.foreign_keys or ():
            parent = fk.referenced_table
            parents_of[child].append(parent)
            children_of[parent].append(child)

    roots = [name for name, parents in parents_of.items() if not parents]
    return parents_of, children_of, roots
|
473
|
+
|
474
|
+
|
475
|
+
def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tables: list[str]) -> list[str]:
    """Order tables so that every table comes after all of its parents.

    A breadth-first walk that schedules a child as soon as *any* parent has
    been processed can place a table before one of its other parents (e.g.
    ``a -> b -> c -> d`` together with ``a -> d`` yields ``a, b, d, c``).
    This implementation instead counts the outstanding parents of each table
    and only schedules a table once that count reaches zero, which gives a
    proper topological order while keeping the breadth-first ordering for
    simple chains and trees.

    Args:
        parent_to_children: for each table, the tables that reference it.
            Every table that appears as a child must also be a key.
        subject_tables: tables without parents; they seed the plan in the
            given order.

    Returns:
        Table names in generation order. Tables that are neither seeds nor
        have all their parents reachable from the seeds are not included.
    """
    # local import keeps this block self-contained
    from collections import deque

    # number of not-yet-scheduled parent edges per table; duplicate edges
    # (several foreign keys to the same parent) are counted and released
    # the same number of times, so they stay consistent
    pending_parents: dict[str, int] = {}
    for children in parent_to_children.values():
        for child in children:
            pending_parents[child] = pending_parents.get(child, 0) + 1

    execution_plan: list[str] = []
    processed: set[str] = set()
    queue = deque(subject_tables)

    while queue:
        table_name = queue.popleft()
        if table_name in processed:
            # guards against duplicated seeds
            continue

        execution_plan.append(table_name)
        processed.add(table_name)

        for child in parent_to_children[table_name]:
            pending_parents[child] -= 1
            if pending_parents[child] == 0:
                # all parents are scheduled, the child may follow
                queue.append(child)

    return execution_plan
|
492
|
+
|
493
|
+
|
429
494
|
def sample(
|
430
495
|
*,
|
431
496
|
tables: dict[str, dict],
|
@@ -491,34 +556,52 @@ def sample(
|
|
491
556
|
from mostlyai import mock
|
492
557
|
|
493
558
|
tables = {
|
494
|
-
"
|
495
|
-
"description": "
|
559
|
+
"customers": {
|
560
|
+
"description": "Customers of a hardware store",
|
496
561
|
"columns": {
|
497
|
-
"
|
498
|
-
"name": {"prompt": "first name and last name of the
|
562
|
+
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
563
|
+
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
564
|
+
},
|
565
|
+
"primary_key": "customer_id",
|
566
|
+
},
|
567
|
+
"orders": {
|
568
|
+
"description": "Orders of a Customer",
|
569
|
+
"columns": {
|
570
|
+
"customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
|
571
|
+
"order_id": {"prompt": "the unique id of the order", "dtype": "string"},
|
572
|
+
"text": {"prompt": "order text description", "dtype": "string"},
|
573
|
+
"amount": {"prompt": "order amount in USD", "dtype": "float"},
|
499
574
|
},
|
500
|
-
"primary_key": "
|
575
|
+
"primary_key": "order_id",
|
576
|
+
"foreign_keys": [
|
577
|
+
{
|
578
|
+
"column": "customer_id",
|
579
|
+
"referenced_table": "customers",
|
580
|
+
"description": "each customer has anywhere between 1 and 3 orders",
|
581
|
+
}
|
582
|
+
],
|
501
583
|
},
|
502
|
-
"
|
503
|
-
"description": "
|
584
|
+
"items": {
|
585
|
+
"description": "Items in an Order",
|
504
586
|
"columns": {
|
505
|
-
"
|
506
|
-
"
|
507
|
-
"
|
508
|
-
"
|
587
|
+
"item_id": {"prompt": "the unique id of the item", "dtype": "string"},
|
588
|
+
"order_id": {"prompt": "the order id for that item", "dtype": "string"},
|
589
|
+
"name": {"prompt": "the name of the item", "dtype": "string"},
|
590
|
+
"price": {"prompt": "the price of the item in USD", "dtype": "float"},
|
509
591
|
},
|
510
592
|
"foreign_keys": [
|
511
593
|
{
|
512
|
-
"column": "
|
513
|
-
"referenced_table": "
|
514
|
-
"description": "each
|
594
|
+
"column": "order_id",
|
595
|
+
"referenced_table": "orders",
|
596
|
+
"description": "each order has between 2 and 5 items",
|
515
597
|
}
|
516
598
|
],
|
517
599
|
},
|
518
600
|
}
|
519
|
-
data = mock.sample(tables=tables, sample_size=
|
520
|
-
|
521
|
-
|
601
|
+
data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
|
602
|
+
df_customers = data["customers"]
|
603
|
+
df_orders = data["orders"]
|
604
|
+
df_items = data["items"]
|
522
605
|
```
|
523
606
|
"""
|
524
607
|
|
@@ -526,9 +609,15 @@ def sample(
|
|
526
609
|
|
527
610
|
sample_size = _harmonize_sample_size(sample_size, config)
|
528
611
|
primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
|
529
|
-
|
530
|
-
|
531
|
-
|
612
|
+
|
613
|
+
child_to_parents, parent_to_children, subject_tables = _build_dependency_graph(config)
|
614
|
+
execution_plan: list[str] = _build_execution_plan(parent_to_children, subject_tables)
|
615
|
+
|
616
|
+
results: dict[str, pd.DataFrame] = {}
|
617
|
+
|
618
|
+
for table_name in execution_plan:
|
619
|
+
table_config = config.root[table_name]
|
620
|
+
if not child_to_parents[table_name]:
|
532
621
|
# subject table
|
533
622
|
df = _sample_table(
|
534
623
|
table_name=table_name,
|
@@ -542,22 +631,21 @@ def sample(
|
|
542
631
|
previous_rows_size=5,
|
543
632
|
llm_config=LLMConfig(model=model, api_key=api_key),
|
544
633
|
)
|
545
|
-
|
546
|
-
#
|
634
|
+
else:
|
635
|
+
# sequential table
|
636
|
+
referenced_table = table_config.foreign_keys[0].referenced_table
|
547
637
|
df = _sample_table(
|
548
638
|
table_name=table_name,
|
549
639
|
table_config=table_config,
|
550
640
|
primary_keys=primary_keys,
|
551
641
|
sample_size=None,
|
552
|
-
context_data=
|
642
|
+
context_data=results[referenced_table],
|
553
643
|
temperature=temperature,
|
554
644
|
top_p=top_p,
|
555
645
|
batch_size=1, # generate one sequence at a time
|
556
646
|
previous_rows_size=5,
|
557
647
|
llm_config=LLMConfig(model=model, api_key=api_key),
|
558
648
|
)
|
559
|
-
|
560
|
-
raise RuntimeError("Only 1 or 2 table setups are supported for now")
|
561
|
-
dfs[table_name] = df
|
649
|
+
results[table_name] = df
|
562
650
|
|
563
|
-
return
|
651
|
+
return results if len(results) > 1 else next(iter(results.values()))
|
File without changes
|
File without changes
|