mostlyai-mock 0.0.4__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.0.4
3
+ Version: 0.0.5
4
4
  Summary: Synthetic Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -65,34 +65,53 @@ print(df)
65
65
  from mostlyai import mock
66
66
 
67
67
  tables = {
68
- "guests": {
69
- "description": "Guests of an Alpine ski hotel in Austria",
68
+ "customers": {
69
+ "description": "Customers of a hardware store",
70
70
  "columns": {
71
- "id": {"prompt": "the unique id of the guest", "dtype": "integer"},
72
- "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
71
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
72
+ "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
73
+ },
74
+ "primary_key": "customer_id",
75
+ },
76
+ "orders": {
77
+ "description": "Orders of a Customer",
78
+ "columns": {
79
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
80
+ "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
81
+ "text": {"prompt": "order text description", "dtype": "string"},
82
+ "amount": {"prompt": "order amount in USD", "dtype": "float"},
73
83
  },
74
- "primary_key": "id",
84
+ "primary_key": "order_id",
85
+ "foreign_keys": [
86
+ {
87
+ "column": "customer_id",
88
+ "referenced_table": "customers",
89
+ "description": "each customer has anywhere between 1 and 3 orders",
90
+ }
91
+ ],
75
92
  },
76
- "purchases": {
77
- "description": "Purchases of a Guest during their stay",
93
+ "items": {
94
+ "description": "Items in an Order",
78
95
  "columns": {
79
- "guest_id": {"prompt": "the guest id for that purchase", "dtype": "integer"},
80
- "purchase_id": {"prompt": "the unique id of the purchase", "dtype": "string"},
81
- "text": {"prompt": "purchase text description", "dtype": "string"},
82
- "amount": {"prompt": "purchase amount in EUR", "dtype": "float"},
96
+ "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
97
+ "order_id": {"prompt": "the order id for that item", "dtype": "string"},
98
+ "name": {"prompt": "the name of the item", "dtype": "string"},
99
+ "price": {"prompt": "the price of the item in USD", "dtype": "float"},
83
100
  },
84
101
  "foreign_keys": [
85
102
  {
86
- "column": "guest_id",
87
- "referenced_table": "guests",
88
- "description": "each guest has anywhere between 1 and 10 purchases",
103
+ "column": "order_id",
104
+ "referenced_table": "orders",
105
+ "description": "each order has between 2 and 5 items",
89
106
  }
90
107
  ],
91
108
  },
92
109
  }
93
- data = mock.sample(tables=tables, sample_size=5, model="openai/gpt-4.1-nano")
94
- df_guests = data["guests"]
95
- df_purchases = data["purchases"]
96
- print(df_guests)
97
- print(df_purchases)
110
+ data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
111
+ df_customers = data["customers"]
112
+ df_orders = data["orders"]
113
+ df_items = data["items"]
114
+ print(df_customers)
115
+ print(df_orders)
116
+ print(df_items)
98
117
  ```
@@ -47,34 +47,53 @@ print(df)
47
47
  from mostlyai import mock
48
48
 
49
49
  tables = {
50
- "guests": {
51
- "description": "Guests of an Alpine ski hotel in Austria",
50
+ "customers": {
51
+ "description": "Customers of a hardware store",
52
52
  "columns": {
53
- "id": {"prompt": "the unique id of the guest", "dtype": "integer"},
54
- "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
53
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
54
+ "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
55
+ },
56
+ "primary_key": "customer_id",
57
+ },
58
+ "orders": {
59
+ "description": "Orders of a Customer",
60
+ "columns": {
61
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
62
+ "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
63
+ "text": {"prompt": "order text description", "dtype": "string"},
64
+ "amount": {"prompt": "order amount in USD", "dtype": "float"},
55
65
  },
56
- "primary_key": "id",
66
+ "primary_key": "order_id",
67
+ "foreign_keys": [
68
+ {
69
+ "column": "customer_id",
70
+ "referenced_table": "customers",
71
+ "description": "each customer has anywhere between 1 and 3 orders",
72
+ }
73
+ ],
57
74
  },
58
- "purchases": {
59
- "description": "Purchases of a Guest during their stay",
75
+ "items": {
76
+ "description": "Items in an Order",
60
77
  "columns": {
61
- "guest_id": {"prompt": "the guest id for that purchase", "dtype": "integer"},
62
- "purchase_id": {"prompt": "the unique id of the purchase", "dtype": "string"},
63
- "text": {"prompt": "purchase text description", "dtype": "string"},
64
- "amount": {"prompt": "purchase amount in EUR", "dtype": "float"},
78
+ "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
79
+ "order_id": {"prompt": "the order id for that item", "dtype": "string"},
80
+ "name": {"prompt": "the name of the item", "dtype": "string"},
81
+ "price": {"prompt": "the price of the item in USD", "dtype": "float"},
65
82
  },
66
83
  "foreign_keys": [
67
84
  {
68
- "column": "guest_id",
69
- "referenced_table": "guests",
70
- "description": "each guest has anywhere between 1 and 10 purchases",
85
+ "column": "order_id",
86
+ "referenced_table": "orders",
87
+ "description": "each order has between 2 and 5 items",
71
88
  }
72
89
  ],
73
90
  },
74
91
  }
75
- data = mock.sample(tables=tables, sample_size=5, model="openai/gpt-4.1-nano")
76
- df_guests = data["guests"]
77
- df_purchases = data["purchases"]
78
- print(df_guests)
79
- print(df_purchases)
92
+ data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
93
+ df_customers = data["customers"]
94
+ df_orders = data["orders"]
95
+ df_items = data["items"]
96
+ print(df_customers)
97
+ print(df_orders)
98
+ print(df_items)
80
99
  ```
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.0.4" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.0.5" # Do not set this manually. Use poetry version [params].
@@ -89,6 +89,31 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
89
89
 
90
90
  return tables
91
91
 
92
@model_validator(mode="after")
def validate_no_circular_dependencies(self) -> MockConfig:
    """Reject configurations whose foreign keys form a cycle.

    Builds a table -> referenced-tables mapping from every table's
    ``foreign_keys`` and walks it depth-first from each table.

    Returns:
        The validated config itself (unchanged).

    Raises:
        ValueError: if following foreign keys leads back to a table that is
            already on the current walk; the message lists the cycle.
    """
    parents_of = {
        name: [fk.referenced_table for fk in cfg.foreign_keys]
        for name, cfg in self.root.items()
    }
    # tables whose ancestors have already been walked once; no need to repeat
    explored = set()

    def walk(node: str, trail: list[str]) -> None:
        # meeting a table that is still on the current trail closes a loop
        if node in trail:
            loop = trail[trail.index(node):] + [node]
            raise ValueError(f"Circular dependency detected: {' -> '.join(loop)}")
        if node in explored:
            return
        explored.add(node)
        trail.append(node)
        for parent in parents_of[node]:
            walk(parent, trail)
        trail.pop()

    for name in parents_of:
        walk(name, [])

    return self
116
+
92
117
 
93
118
  class TableConfig(BaseModel):
94
119
  description: str = ""
@@ -234,7 +259,7 @@ def _create_table_prompt(
234
259
  # add previous rows as context to help the LLM generate consistent data
235
260
  if previous_rows:
236
261
  prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
237
- prompt += json.dumps(previous_rows, indent=2)
262
+ prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
238
263
 
239
264
  # add context table name, primary key and data
240
265
  if context_data is not None:
@@ -252,12 +277,14 @@ def _create_table_prompt(
252
277
  prompt += f"Generate {batch_size} rows for the `{table_name}` table.\n\n"
253
278
  else:
254
279
  prompt += (
255
- f"Generate rows for the `{table_name}` table. "
256
- f"The Foreign Key column may only contain values from Context Table Data.\n\n"
280
+ f"Generate data for the `{table_name}` table. "
281
+ f"The Foreign Key column may only contain values from Context Table Data. "
282
+ f"Pay attention to description of the Foreign Key column to understand the relationship.\n\n"
257
283
  )
258
284
  if previous_rows:
259
285
  prompt += (
260
286
  "Generate new rows that maintain consistency with the previous rows where appropriate. "
287
+ "Don't copy previous rows in the output. "
261
288
  "Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
262
289
  )
263
290
  prompt += f"Do not use code to generate the data.\n\n"
@@ -426,6 +453,44 @@ def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig
426
453
  return sample_size
427
454
 
428
455
 
456
def _build_dependency_graph(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
    """Derive the foreign-key dependency graph of the configured tables.

    Returns:
        A 3-tuple of
        - child -> parents: for each table, the tables its foreign keys reference,
        - parent -> children: for each table, the tables referencing it,
        - subject tables: the tables that reference no other table.

    Raises:
        KeyError: if a foreign key references a table that is not in ``config.root``.
    """
    parents_of: dict[str, list[str]] = {name: [] for name in config.root}
    children_of: dict[str, list[str]] = {name: [] for name in config.root}

    for name, cfg in config.root.items():
        # a falsy ``foreign_keys`` (empty or missing) contributes no edges
        for fk in cfg.foreign_keys or ():
            parents_of[name].append(fk.referenced_table)
            children_of[fk.referenced_table].append(name)

    roots = [name for name in parents_of if not parents_of[name]]
    return parents_of, children_of, roots
473
+
474
+
475
def _build_execution_plan(parent_to_children: dict[str, list[str]], subject_tables: list[str]) -> list[str]:
    """Order tables so that each one comes after every table it references.

    A table is scheduled only once all of its parents have been scheduled
    (Kahn's topological ordering). A plain breadth-first walk is not enough:
    a table with parents at different depths could be emitted before its
    deeper parent, and its context data would then not exist yet.

    Args:
        parent_to_children: for each table, the tables that reference it.
        subject_tables: the tables without parents; they seed the plan in order.

    Returns:
        Table names in generation order. Tables that cannot be reached with
        all parents satisfied (e.g. part of a cycle) are left out.

    Raises:
        KeyError: if a scheduled table is missing from ``parent_to_children``.
    """
    from collections import deque  # local import keeps the block self-contained

    # number of incoming foreign-key edges per table that are not yet satisfied
    pending_parents: dict[str, int] = {}
    for children in parent_to_children.values():
        for child in children:
            pending_parents[child] = pending_parents.get(child, 0) + 1

    execution_plan: list[str] = []
    processed: set[str] = set()
    queue = deque(subject_tables)

    while queue:
        table_name = queue.popleft()
        if table_name in processed:
            continue

        execution_plan.append(table_name)
        processed.add(table_name)

        for child in parent_to_children[table_name]:
            pending_parents[child] -= 1
            # enqueue the child only when its last outstanding parent is done
            if pending_parents[child] == 0:
                queue.append(child)
    return execution_plan
492
+
493
+
429
494
  def sample(
430
495
  *,
431
496
  tables: dict[str, dict],
@@ -491,34 +556,52 @@ def sample(
491
556
  from mostlyai import mock
492
557
 
493
558
  tables = {
494
- "guests": {
495
- "description": "Guests of an Alpine ski hotel in Austria",
559
+ "customers": {
560
+ "description": "Customers of a hardware store",
496
561
  "columns": {
497
- "id": {"prompt": "the unique id of the guest", "dtype": "integer"},
498
- "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
562
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
563
+ "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
564
+ },
565
+ "primary_key": "customer_id",
566
+ },
567
+ "orders": {
568
+ "description": "Orders of a Customer",
569
+ "columns": {
570
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
571
+ "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
572
+ "text": {"prompt": "order text description", "dtype": "string"},
573
+ "amount": {"prompt": "order amount in USD", "dtype": "float"},
499
574
  },
500
- "primary_key": "id",
575
+ "primary_key": "order_id",
576
+ "foreign_keys": [
577
+ {
578
+ "column": "customer_id",
579
+ "referenced_table": "customers",
580
+ "description": "each customer has anywhere between 1 and 3 orders",
581
+ }
582
+ ],
501
583
  },
502
- "purchases": {
503
- "description": "Purchases of a Guest during their stay",
584
+ "items": {
585
+ "description": "Items in an Order",
504
586
  "columns": {
505
- "guest_id": {"prompt": "the guest id for that purchase", "dtype": "integer"},
506
- "purchase_id": {"prompt": "the unique id of the purchase", "dtype": "string"},
507
- "text": {"prompt": "purchase text description", "dtype": "string"},
508
- "amount": {"prompt": "purchase amount in EUR", "dtype": "float"},
587
+ "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
588
+ "order_id": {"prompt": "the order id for that item", "dtype": "string"},
589
+ "name": {"prompt": "the name of the item", "dtype": "string"},
590
+ "price": {"prompt": "the price of the item in USD", "dtype": "float"},
509
591
  },
510
592
  "foreign_keys": [
511
593
  {
512
- "column": "guest_id",
513
- "referenced_table": "guests",
514
- "description": "each guest has anywhere between 1 and 10 purchases",
594
+ "column": "order_id",
595
+ "referenced_table": "orders",
596
+ "description": "each order has between 2 and 5 items",
515
597
  }
516
598
  ],
517
599
  },
518
600
  }
519
- data = mock.sample(tables=tables, sample_size=5, model="openai/gpt-4.1-nano")
520
- df_guests = data["guests"]
521
- df_purchases = data["purchases"]
601
+ data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
602
+ df_customers = data["customers"]
603
+ df_orders = data["orders"]
604
+ df_items = data["items"]
522
605
  ```
523
606
  """
524
607
 
@@ -526,9 +609,15 @@ def sample(
526
609
 
527
610
  sample_size = _harmonize_sample_size(sample_size, config)
528
611
  primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
529
- dfs = {}
530
- for table_name, table_config in config.root.items():
531
- if len(dfs) == 0:
612
+
613
+ child_to_parents, parent_to_children, subject_tables = _build_dependency_graph(config)
614
+ execution_plan: list[str] = _build_execution_plan(parent_to_children, subject_tables)
615
+
616
+ results: dict[str, pd.DataFrame] = {}
617
+
618
+ for table_name in execution_plan:
619
+ table_config = config.root[table_name]
620
+ if not child_to_parents[table_name]:
532
621
  # subject table
533
622
  df = _sample_table(
534
623
  table_name=table_name,
@@ -542,22 +631,21 @@ def sample(
542
631
  previous_rows_size=5,
543
632
  llm_config=LLMConfig(model=model, api_key=api_key),
544
633
  )
545
- elif len(dfs) == 1:
546
- # sequence table
634
+ else:
635
+ # sequential table
636
+ referenced_table = table_config.foreign_keys[0].referenced_table
547
637
  df = _sample_table(
548
638
  table_name=table_name,
549
639
  table_config=table_config,
550
640
  primary_keys=primary_keys,
551
641
  sample_size=None,
552
- context_data=next(iter(dfs.values())),
642
+ context_data=results[referenced_table],
553
643
  temperature=temperature,
554
644
  top_p=top_p,
555
645
  batch_size=1, # generate one sequence at a time
556
646
  previous_rows_size=5,
557
647
  llm_config=LLMConfig(model=model, api_key=api_key),
558
648
  )
559
- else:
560
- raise RuntimeError("Only 1 or 2 table setups are supported for now")
561
- dfs[table_name] = df
649
+ results[table_name] = df
562
650
 
563
- return dfs if len(dfs) > 1 else next(iter(dfs.values()))
651
+ return results if len(results) > 1 else next(iter(results.values()))
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mostlyai-mock"
3
- version = "0.0.4"
3
+ version = "0.0.5"
4
4
  description = "Synthetic Mock Data"
5
5
  authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
6
6
  requires-python = ">=3.10"
File without changes
File without changes