mostlyai-mock 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.0.7" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.0.8" # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -44,8 +44,10 @@ across tables.
44
44
 
45
45
 
46
46
  class LLMConfig(BaseModel):
47
- model: str
47
+ model: str = "openai/gpt-4.1-nano"
48
48
  api_key: str | None = None
49
+ temperature: float = 1.0
50
+ top_p: float = 0.95
49
51
 
50
52
 
51
53
  class MockConfig(RootModel[dict[str, "TableConfig"]]):
@@ -100,10 +102,8 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
100
102
  if table_name in path:
101
103
  cycle_start = path.index(table_name)
102
104
  cycle = path[cycle_start:] + [table_name]
103
- msg = f"Circular dependency detected: {' -> '.join(cycle)}."
104
- if len(cycle) == 2:
105
- msg += " Self-referencing tables are not yet supported."
106
- raise ValueError(msg)
105
+ if len(cycle) > 2: # len(cycle) == 2 means self-referencing table, which is allowed
106
+ raise ValueError(f"Circular dependency detected: {' -> '.join(cycle)}.")
107
107
  if table_name in visited:
108
108
  return
109
109
  visited.add(table_name)
@@ -119,7 +119,7 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
119
119
 
120
120
 
121
121
  class TableConfig(BaseModel):
122
- description: str = ""
122
+ prompt: str = ""
123
123
  columns: dict[str, ColumnConfig] = Field(..., min_items=1)
124
124
  primary_key: str | None = None
125
125
  foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
@@ -187,83 +187,78 @@ class DType(str, Enum):
187
187
  class ForeignKeyConfig(BaseModel):
188
188
  column: str
189
189
  referenced_table: str
190
- description: str | None = None
190
+ prompt: str | None = None
191
191
 
192
192
 
193
193
  def _sample_table(
194
194
  *,
195
- table_name: str,
196
- table_config: TableConfig,
195
+ name: str,
196
+ prompt: str,
197
+ columns: dict[str, ColumnConfig],
198
+ foreign_keys: list[ForeignKeyConfig] | None,
197
199
  primary_keys: dict[str, str] | None,
198
- sample_size: int | None,
199
200
  generated_data: dict[str, pd.DataFrame] | None,
200
- temperature: float,
201
- top_p: float,
201
+ sample_size: int,
202
202
  batch_size: int,
203
203
  previous_rows_size: int,
204
204
  non_context_size: int | None,
205
205
  llm_config: LLMConfig,
206
206
  ) -> pd.DataFrame:
207
207
  table_rows_generator = _create_table_rows_generator(
208
- table_name=table_name,
209
- table_config=table_config,
208
+ name=name,
209
+ prompt=prompt,
210
+ columns=columns,
210
211
  primary_keys=primary_keys,
211
- sample_size=sample_size,
212
+ foreign_keys=foreign_keys,
212
213
  generated_data=generated_data,
213
- temperature=temperature,
214
- top_p=top_p,
214
+ sample_size=sample_size,
215
215
  batch_size=batch_size,
216
216
  previous_rows_size=previous_rows_size,
217
217
  non_context_size=non_context_size,
218
218
  llm_config=llm_config,
219
219
  )
220
- table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{table_name}`".ljust(45))
221
- table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, table_config=table_config)
220
+ table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
221
+ table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
222
222
  return table_df
223
223
 
224
224
 
225
225
  def _create_table_prompt(
226
226
  *,
227
- table_name: str,
228
- table_description: str,
227
+ name: str,
228
+ prompt: str,
229
229
  columns: dict[str, ColumnConfig],
230
230
  primary_keys: dict[str, str] | None,
231
231
  batch_size: int | None,
232
232
  foreign_keys: list[ForeignKeyConfig] | None,
233
233
  context_data: pd.DataFrame | None,
234
- non_context_data: dict[str, pd.DataFrame],
235
- previous_rows: list[dict],
234
+ non_context_data: dict[str, pd.DataFrame] | None,
235
+ previous_rows: list[dict] | None,
236
236
  ) -> str:
237
- if batch_size is not None:
238
- assert foreign_keys is None
239
- assert context_data is None
240
- else:
241
- assert foreign_keys is not None
242
- assert context_data is not None
243
- assert primary_keys is not None
244
-
245
- # add description
246
- prompt = f"# {table_description}\n\n"
237
+ # add table prompt
238
+ prompt = f"# {prompt}\n\n"
247
239
 
248
240
  # define table
249
- prompt += f"## Table: {table_name}\n\n"
241
+ prompt += f"## Table: {name}\n\n"
242
+
243
+ prompt += f"## Table Primary Key: `{primary_keys[name]}`\n\n"
250
244
 
251
245
  # add columns specifications
252
246
  prompt += "## Columns Specifications:\n\n"
253
247
  prompt += f"{json.dumps({name: config.model_dump() for name, config in columns.items()}, indent=2)}\n\n"
254
248
 
255
- # define foreign keys
256
- if foreign_keys is not None:
257
- prompt += "## Foreign Keys:\n\n"
258
- prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
259
-
260
249
  # add previous rows as context to help the LLM generate consistent data
261
250
  if previous_rows:
262
251
  prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
263
252
  prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
264
253
 
254
+ # define foreign keys
255
+ if foreign_keys:
256
+ prompt += "## Foreign Keys:\n\n"
257
+ prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
258
+
265
259
  # add context table name, primary key and data
266
- if context_data is not None:
260
+ if foreign_keys and foreign_keys[0].referenced_table != name: # self-dependency is not considered as context
261
+ assert context_data is not None
267
262
  fk = foreign_keys[0]
268
263
  prompt += f"## Context Table: `{fk.referenced_table}`\n\n"
269
264
 
@@ -273,8 +268,12 @@ def _create_table_prompt(
273
268
  prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"
274
269
 
275
270
  # add non-context table names, primary keys and data
276
- if non_context_data:
271
+ if foreign_keys and len(foreign_keys) > 1:
277
272
  for fk in foreign_keys[1:]:
273
+ if fk.referenced_table == name: # self-dependency is not considered as non-context
274
+ continue
275
+ assert non_context_data is not None
276
+ assert fk.referenced_table in non_context_data
278
277
  prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
279
278
 
280
279
  prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
@@ -284,15 +283,17 @@ def _create_table_prompt(
284
283
 
285
284
  # add instructions
286
285
  prompt += "\n## Instructions:\n\n"
287
- if batch_size is not None:
288
- prompt += f"Generate {batch_size} rows for the `{table_name}` table.\n\n"
289
-
290
- if context_data is not None:
286
+ if not foreign_keys:
287
+ assert batch_size is not None
288
+ prompt += f"Generate {batch_size} rows for the `{name}` table.\n\n"
289
+ else:
291
290
  prompt += (
292
- f"Generate data for the `{table_name}` table. "
291
+ f"Generate data for the `{name}` table. "
293
292
  f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
294
- f"The second Foreign Key column from Foreign Keys section (if exists) may only contain values from Non-Context Table Data. "
295
- f"Pay attention to description of the Foreign Key column to understand the relationship.\n\n"
293
+ f"The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
294
+ f"If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
295
+ f"In this case, ensure that the generated foreign keys are consistent with generated primary keys of the table. "
296
+ f"Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
296
297
  )
297
298
 
298
299
  if previous_rows:
@@ -309,13 +310,13 @@ def _create_table_prompt(
309
310
 
310
311
  def _create_table_rows_generator(
311
312
  *,
312
- table_name: str,
313
- table_config: TableConfig,
313
+ name: str,
314
+ prompt: str,
315
+ columns: dict[str, ColumnConfig],
316
+ foreign_keys: list[ForeignKeyConfig] | None,
314
317
  primary_keys: dict[str, str] | None,
315
- sample_size: int | None,
316
318
  generated_data: dict[str, pd.DataFrame] | None,
317
- temperature: float,
318
- top_p: float,
319
+ sample_size: int,
319
320
  batch_size: int,
320
321
  previous_rows_size: int,
321
322
  non_context_size: int | None,
@@ -383,37 +384,38 @@ def _create_table_rows_generator(
383
384
  for i in range(0, len(data), batch_size):
384
385
  yield data.iloc[i : i + batch_size]
385
386
 
387
+ # ensure model supports response_format and json schema
388
+ supported_params = litellm.get_supported_openai_params(model=llm_config.model)
389
+ assert "response_format" in supported_params
390
+ assert litellm.supports_response_schema(llm_config.model), (
391
+ "The model does not support structured output / JSON mode."
392
+ )
393
+
386
394
  # derive context data (if first foreign key is present) and harmonize sample size accordingly
387
395
  context_data: pd.DataFrame | None = None
388
- if table_config.foreign_keys:
389
- context_table_name = table_config.foreign_keys[0].referenced_table
396
+ if foreign_keys and foreign_keys[0].referenced_table != name: # self-dependency is not considered as context
397
+ context_table_name = foreign_keys[0].referenced_table
390
398
  assert generated_data is not None
391
399
  assert context_table_name in generated_data
392
400
  context_data = generated_data[context_table_name]
393
401
  sample_size = len(context_data)
394
- assert sample_size is not None
395
402
 
396
403
  # derive non-context data (if more than one foreign key is present)
397
404
  non_context_data: dict[str, pd.DataFrame] = {}
398
- if table_config.foreign_keys and len(table_config.foreign_keys) > 1:
405
+ if foreign_keys and len(foreign_keys) > 1:
399
406
  assert generated_data is not None
400
407
  assert non_context_size is not None
401
- for fk in table_config.foreign_keys[1:]:
408
+ for fk in foreign_keys[1:]:
409
+ if fk.referenced_table == name: # self-dependency is not considered as non-context
410
+ continue
402
411
  non_context_table_name = fk.referenced_table
403
412
  assert non_context_table_name in generated_data
404
413
  non_context_data[non_context_table_name] = generated_data[non_context_table_name]
405
414
 
406
- # ensure model supports response_format and json schema
407
- supported_params = litellm.get_supported_openai_params(model=llm_config.model)
408
- assert "response_format" in supported_params
409
- assert litellm.supports_response_schema(llm_config.model), (
410
- "The model does not support structured output / JSON mode."
411
- )
412
-
413
415
  litellm_kwargs = {
414
- "response_format": create_table_response_format(columns=table_config.columns),
415
- "temperature": temperature,
416
- "top_p": top_p,
416
+ "response_format": create_table_response_format(columns=columns),
417
+ "temperature": llm_config.temperature,
418
+ "top_p": llm_config.top_p,
417
419
  "model": llm_config.model,
418
420
  "api_key": llm_config.api_key,
419
421
  "stream": True,
@@ -427,18 +429,17 @@ def _create_table_rows_generator(
427
429
  if non_context_data
428
430
  else None
429
431
  )
430
- prompt_kwargs = {
431
- "table_name": table_name,
432
- "table_description": table_config.description,
433
- "columns": table_config.columns,
434
- "primary_keys": primary_keys,
435
- "batch_size": batch_size if context_batch is None else None,
436
- "foreign_keys": table_config.foreign_keys if context_batch is not None else None,
437
- "context_data": context_batch if context_batch is not None else None,
438
- "non_context_data": non_context_batch if non_context_batch else None,
439
- "previous_rows": list(previous_rows),
440
- }
441
- prompt = _create_table_prompt(**prompt_kwargs)
432
+ prompt = _create_table_prompt(
433
+ name=name,
434
+ prompt=prompt,
435
+ columns=columns,
436
+ primary_keys=primary_keys,
437
+ batch_size=batch_size,
438
+ foreign_keys=foreign_keys,
439
+ context_data=context_batch,
440
+ non_context_data=non_context_batch,
441
+ previous_rows=list(previous_rows),
442
+ )
442
443
  messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}]
443
444
 
444
445
  response = litellm.completion(messages=messages, **litellm_kwargs)
@@ -464,7 +465,8 @@ def _create_table_rows_generator(
464
465
 
465
466
 
466
467
  def _convert_table_rows_generator_to_df(
467
- table_rows_generator: Generator[dict], table_config: TableConfig
468
+ table_rows_generator: Generator[dict],
469
+ columns: dict[str, ColumnConfig],
468
470
  ) -> pd.DataFrame:
469
471
  def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
470
472
  for column_name, column_config in columns.items():
@@ -485,7 +487,7 @@ def _convert_table_rows_generator_to_df(
485
487
  return df
486
488
 
487
489
  df = pd.DataFrame(list(table_rows_generator))
488
- df = align_df_dtypes_with_mock_dtypes(df, table_config.columns)
490
+ df = align_df_dtypes_with_mock_dtypes(df, columns)
489
491
  return df
490
492
 
491
493
 
@@ -498,30 +500,32 @@ def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig
498
500
  return sample_size
499
501
 
500
502
 
501
- def _build_dependency_graph(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
502
- child_to_parents = {}
503
- parent_to_children = {}
503
+ def _build_execution_plan(config: MockConfig) -> list[str]:
504
+ def build_dependency_mappings(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
505
+ child_to_parents = {}
506
+ parent_to_children = {}
504
507
 
505
- for table_name in config.root:
506
- child_to_parents[table_name] = []
507
- parent_to_children[table_name] = []
508
+ for table_name in config.root:
509
+ child_to_parents[table_name] = set()
510
+ parent_to_children[table_name] = set()
508
511
 
509
- for table_name, table_config in config.root.items():
510
- if table_config.foreign_keys:
511
- for fk in table_config.foreign_keys:
512
- referenced_table = fk.referenced_table
513
- child_to_parents[table_name].append(referenced_table)
514
- parent_to_children[referenced_table].append(table_name)
512
+ for table_name, table_config in config.root.items():
513
+ if table_config.foreign_keys:
514
+ for fk in table_config.foreign_keys:
515
+ referenced_table = fk.referenced_table
516
+ child_to_parents[table_name].add(referenced_table)
517
+ parent_to_children[referenced_table].add(table_name)
515
518
 
516
- subject_tables = [table_name for table_name, deps in child_to_parents.items() if not deps]
517
- return child_to_parents, parent_to_children, subject_tables
519
+ root_tables = []
520
+ for table_name, parents in child_to_parents.items():
521
+ if not parents or parents == {table_name}: # no dependencies or only self-dependency
522
+ root_tables.append(table_name)
523
+ return child_to_parents, parent_to_children, root_tables
518
524
 
525
+ child_to_parents, parent_to_children, root_tables = build_dependency_mappings(config)
519
526
 
520
- def _build_execution_plan(
521
- parent_to_children: dict[str, list[str]], child_to_parents: dict[str, list[str]], subject_tables: list[str]
522
- ) -> list[str]:
523
527
  execution_plan = []
524
- bfs_queue = list(subject_tables)
528
+ bfs_queue = list(root_tables)
525
529
  processed = set()
526
530
 
527
531
  while bfs_queue:
@@ -530,7 +534,10 @@ def _build_execution_plan(
530
534
  continue
531
535
 
532
536
  # ensure all parents are processed before processing this table
533
- unprocessed_parents = [p for p in child_to_parents[table_name] if p not in processed]
537
+ unprocessed_parents = []
538
+ for parent in child_to_parents[table_name]:
539
+ if parent not in processed and parent != table_name: # exclude self-dependency
540
+ unprocessed_parents.append(parent)
534
541
  if unprocessed_parents:
535
542
  bfs_queue.extend(unprocessed_parents)
536
543
  bfs_queue.append(table_name)
@@ -553,6 +560,7 @@ def sample(
553
560
  api_key: str | None = None,
554
561
  temperature: float = 1.0,
555
562
  top_p: float = 0.95,
563
+ return_type: Literal["auto", "dict"] = "auto",
556
564
  ) -> pd.DataFrame | dict[str, pd.DataFrame]:
557
565
  """
558
566
  Generate mock data by prompting an LLM.
@@ -577,6 +585,7 @@ def sample(
577
585
  api_key (str | None): The API key to use for the LLM. If not provided, LiteLLM will take it from the environment variables.
578
586
  temperature (float): The temperature to use for the LLM. Default is 1.0.
579
587
  top_p (float): The top-p value to use for the LLM. Default is 0.95.
588
+ return_type (Literal["auto", "dict"]): The format of the returned data. Default is "auto".
580
589
 
581
590
  Returns:
582
591
  - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
@@ -588,7 +597,7 @@ def sample(
588
597
 
589
598
  tables = {
590
599
  "guests": {
591
- "description": "Guests of an Alpine ski hotel in Austria",
600
+ "prompt": "Guests of an Alpine ski hotel in Austria",
592
601
  "columns": {
593
602
  "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
594
603
  "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -611,7 +620,7 @@ def sample(
611
620
 
612
621
  tables = {
613
622
  "customers": {
614
- "description": "Customers of a hardware store",
623
+ "prompt": "Customers of a hardware store",
615
624
  "columns": {
616
625
  "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
617
626
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
@@ -619,7 +628,7 @@ def sample(
619
628
  "primary_key": "customer_id",
620
629
  },
621
630
  "warehouses": {
622
- "description": "Warehouses of a hardware store",
631
+ "prompt": "Warehouses of a hardware store",
623
632
  "columns": {
624
633
  "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
625
634
  "name": {"prompt": "the name of the warehouse", "dtype": "string"},
@@ -627,7 +636,7 @@ def sample(
627
636
  "primary_key": "warehouse_id",
628
637
  },
629
638
  "orders": {
630
- "description": "Orders of a Customer",
639
+ "prompt": "Orders of a Customer",
631
640
  "columns": {
632
641
  "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
633
642
  "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
@@ -640,7 +649,7 @@ def sample(
640
649
  {
641
650
  "column": "customer_id",
642
651
  "referenced_table": "customers",
643
- "description": "each customer has anywhere between 2 and 3 orders",
652
+ "prompt": "each customer has anywhere between 2 and 3 orders",
644
653
  },
645
654
  {
646
655
  "column": "warehouse_id",
@@ -649,7 +658,7 @@ def sample(
649
658
  ],
650
659
  },
651
660
  "items": {
652
- "description": "Items in an Order",
661
+ "prompt": "Items in an Order",
653
662
  "columns": {
654
663
  "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
655
664
  "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -660,7 +669,7 @@ def sample(
660
669
  {
661
670
  "column": "order_id",
662
671
  "referenced_table": "orders",
663
- "description": "each order has between 1 and 2 items",
672
+ "prompt": "each order has between 1 and 2 items",
664
673
  }
665
674
  ],
666
675
  },
@@ -674,47 +683,30 @@ def sample(
674
683
  """
675
684
 
676
685
  config = MockConfig(tables)
686
+ llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
677
687
 
678
688
  sample_size = _harmonize_sample_size(sample_size, config)
679
689
  primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
680
690
 
681
- child_to_parents, parent_to_children, subject_tables = _build_dependency_graph(config)
682
- execution_plan: list[str] = _build_execution_plan(parent_to_children, child_to_parents, subject_tables)
691
+ execution_plan: list[str] = _build_execution_plan(config)
683
692
 
684
- results: dict[str, pd.DataFrame] = {}
693
+ data: dict[str, pd.DataFrame] = {}
685
694
 
686
695
  for table_name in execution_plan:
687
696
  table_config = config.root[table_name]
688
- if not child_to_parents[table_name]:
689
- # subject table
690
- df = _sample_table(
691
- table_name=table_name,
692
- table_config=table_config,
693
- primary_keys=None,
694
- sample_size=sample_size[table_name],
695
- generated_data=None,
696
- temperature=temperature,
697
- top_p=top_p,
698
- batch_size=30, # generate 30 subjects at a time
699
- previous_rows_size=10, # present 10 previously generated rows to the LLM
700
- non_context_size=None,
701
- llm_config=LLMConfig(model=model, api_key=api_key),
702
- )
703
- else:
704
- # sequencial table
705
- df = _sample_table(
706
- table_name=table_name,
707
- table_config=table_config,
708
- primary_keys=primary_keys,
709
- sample_size=None,
710
- generated_data=results,
711
- temperature=temperature,
712
- top_p=top_p,
713
- batch_size=1, # generate one sequence at a time
714
- previous_rows_size=10, # present 10 previously generated rows to the LLM
715
- non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
716
- llm_config=LLMConfig(model=model, api_key=api_key),
717
- )
718
- results[table_name] = df
719
-
720
- return results if len(results) > 1 else next(iter(results.values()))
697
+ df = _sample_table(
698
+ name=table_name,
699
+ prompt=table_config.prompt,
700
+ columns=table_config.columns,
701
+ foreign_keys=table_config.foreign_keys,
702
+ primary_keys=primary_keys,
703
+ generated_data=data,
704
+ sample_size=sample_size[table_name],
705
+ batch_size=30, # generate 30 root table rows at a time
706
+ previous_rows_size=10, # present 10 previously generated rows to the LLM
707
+ non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
708
+ llm_config=llm_config,
709
+ )
710
+ data[table_name] = df
711
+
712
+ return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
mostlyai/mock/mcp.py ADDED
@@ -0,0 +1,46 @@
1
+ import json
2
+
3
+ import pandas as pd
4
+ from fastmcp import Context, FastMCP
5
+
6
+ from mostlyai import mock
7
+
8
+ mcp = FastMCP(name="MostlyAI Mock MCP Server")
9
+
10
+
11
+ @mcp.tool(description=mock.sample.__doc__)
12
+ def sample_mock_data(
13
+ *,
14
+ tables: dict[str, dict],
15
+ sample_size: int,
16
+ model: str = "openai/gpt-4.1-nano",
17
+ api_key: str | None = None,
18
+ temperature: float = 1.0,
19
+ top_p: float = 0.95,
20
+ ctx: Context,
21
+ ) -> str:
22
+ # Notes:
23
+ # 1. Returning DataFrames directly results in converting them into a truncated string.
24
+ # 2. The logs / progress bars are not propagated to the MCP Client. There is a dedicated API to do that (e.g. `ctx.info(...)`)
25
+ # 3. MCP Server inherits only selected environment variables (PATH, USER...); one way to pass LLM keys is through client configuration (`mcpServers->env`)
26
+ # 4. Some MCP Clients, e.g. Cursor, do not like Unions or Optionals in type hints
27
+ ctx.info(f"Generating mock data for `{len(tables)}` tables")
28
+ data = mock.sample(
29
+ tables=tables,
30
+ sample_size=sample_size,
31
+ model=model,
32
+ api_key=api_key,
33
+ temperature=temperature,
34
+ top_p=top_p,
35
+ return_type="dict",
36
+ )
37
+ ctx.info(f"Generated mock data for `{len(tables)}` tables")
38
+ return {k: v.to_dict(orient="records") for k, v in data.items()}
39
+
40
+
41
+ def main():
42
+ mcp.run(transport="stdio")
43
+
44
+
45
+ if __name__ == "__main__":
46
+ main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.0.7
3
+ Version: 0.0.8
4
4
  Summary: Synthetic Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -24,11 +24,13 @@ Classifier: Programming Language :: Python :: 3.13
24
24
  Classifier: Topic :: Software Development :: Libraries
25
25
  Classifier: Typing :: Typed
26
26
  Requires-Python: >=3.10
27
+ Requires-Dist: fastmcp<3.0.0,>=2.0.0
27
28
  Requires-Dist: litellm>=1.67.0
28
29
  Requires-Dist: numpy>=1.26.3
29
30
  Requires-Dist: pandas>=2.0.0
30
31
  Requires-Dist: pyarrow>=14.0.0
31
32
  Requires-Dist: pydantic<3.0.0,>=2.0.0
33
+ Requires-Dist: typer<1.0.0,>=0.9.0
32
34
  Description-Content-Type: text/markdown
33
35
 
34
36
  # Synthetic Mock Data 🔮
@@ -72,7 +74,7 @@ from mostlyai import mock
72
74
 
73
75
  tables = {
74
76
  "guests": {
75
- "description": "Guests of an Alpine ski hotel in Austria",
77
+ "prompt": "Guests of an Alpine ski hotel in Austria",
76
78
  "columns": {
77
79
  "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
78
80
  "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -112,7 +114,7 @@ from mostlyai import mock
112
114
 
113
115
  tables = {
114
116
  "customers": {
115
- "description": "Customers of a hardware store",
117
+ "prompt": "Customers of a hardware store",
116
118
  "columns": {
117
119
  "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
118
120
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
@@ -120,7 +122,7 @@ tables = {
120
122
  "primary_key": "customer_id",
121
123
  },
122
124
  "warehouses": {
123
- "description": "Warehouses of a hardware store",
125
+ "prompt": "Warehouses of a hardware store",
124
126
  "columns": {
125
127
  "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
126
128
  "name": {"prompt": "the name of the warehouse", "dtype": "string"},
@@ -128,7 +130,7 @@ tables = {
128
130
  "primary_key": "warehouse_id",
129
131
  },
130
132
  "orders": {
131
- "description": "Orders of a Customer",
133
+ "prompt": "Orders of a Customer",
132
134
  "columns": {
133
135
  "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
134
136
  "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
@@ -141,7 +143,7 @@ tables = {
141
143
  {
142
144
  "column": "customer_id",
143
145
  "referenced_table": "customers",
144
- "description": "each customer has anywhere between 2 and 3 orders",
146
+ "prompt": "each customer has anywhere between 2 and 3 orders",
145
147
  },
146
148
  {
147
149
  "column": "warehouse_id",
@@ -150,7 +152,7 @@ tables = {
150
152
  ],
151
153
  },
152
154
  "items": {
153
- "description": "Items in an Order",
155
+ "prompt": "Items in an Order",
154
156
  "columns": {
155
157
  "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
156
158
  "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -161,7 +163,7 @@ tables = {
161
163
  {
162
164
  "column": "order_id",
163
165
  "referenced_table": "orders",
164
- "description": "each order has between 1 and 2 items",
166
+ "prompt": "each order has between 1 and 2 items",
165
167
  }
166
168
  ],
167
169
  },
@@ -199,3 +201,42 @@ print(data["items"])
199
201
  # 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
200
202
  # 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
201
203
  ```
204
+
205
+ 6. Create your first self-referencing synthetic table
206
+
207
+ ```python
208
+ from mostlyai import mock
209
+
210
+ tables = {
211
+ "employees": {
212
+ "prompt": "Employees of a company",
213
+ "columns": {
214
+ "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
215
+ "name": {"prompt": "first name and last name of the employee", "dtype": "string"},
216
+ "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
217
+ "role": {"prompt": "the role of the employee", "dtype": "string"},
218
+ },
219
+ "primary_key": "employee_id",
220
+ "foreign_keys": [
221
+ {
222
+ "column": "boss_id",
223
+ "referenced_table": "employees",
224
+ "prompt": "each boss has at most 3 employees",
225
+ },
226
+ ],
227
+ }
228
+ }
229
+ df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
230
+ print(df)
231
+ # employee_id name boss_id role
232
+ # 0 1 Sandra Phillips <NA> President
233
+ # 1 2 Marcus Tran 1 Chief Financial Officer
234
+ # 2 3 Ava Whittaker 1 Chief Technology Officer
235
+ # 3 4 Sophie Martin 1 Chief Operations Officer
236
+ # 4 5 Chad Nelson 2 Finance Manager
237
+ # 5 6 Ethan Glover 2 Senior Accountant
238
+ # 6 7 Kimberly Ortiz 2 Junior Accountant
239
+ # 7 8 Lucas Romero 3 IT Manager
240
+ # 8 9 Priya Desai 3 Lead Software Engineer
241
+ # 9 10 Felix Bennett 3 Senior Systems Analyst
242
+ ```
@@ -0,0 +1,8 @@
1
+ mostlyai/mock/__init__.py,sha256=APjm2I3FljM9lFH6sdbIYyh5eC-8KTOuDKG_KXGXaHQ,714
2
+ mostlyai/mock/core.py,sha256=p5VAsRppzAc4P8FqKEunfQ3cPjImUU2cEc6yqHJVhMg,29884
3
+ mostlyai/mock/mcp.py,sha256=Ryp9aXBzGzHRFVwNt1gftrYlhYcQifCNhjRgn-iX7zc,1400
4
+ mostlyai_mock-0.0.8.dist-info/METADATA,sha256=DCO1oHuu6tOHGog--wEloM31HP0tGbztZP4F1YyzKzk,11415
5
+ mostlyai_mock-0.0.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ mostlyai_mock-0.0.8.dist-info/entry_points.txt,sha256=77Swgr1rLuEmeCNOiwoUyKd128_c1Q5l88pj_7JVKaY,54
7
+ mostlyai_mock-0.0.8.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
+ mostlyai_mock-0.0.8.dist-info/RECORD,,
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ mcp-server = mostlyai.mock.mcp:main
@@ -1,6 +0,0 @@
1
- mostlyai/mock/__init__.py,sha256=b9OE7NhBo32iINB3V7gU6jK_auwGB99AUlAZ_eul-eo,714
2
- mostlyai/mock/core.py,sha256=6xy0qzocyLh8kw3WvckOuFnCZx2LKpWrsJaHaF3ISCE,29901
3
- mostlyai_mock-0.0.7.dist-info/METADATA,sha256=8d-XpBFxaGkggPZLnH56raX2ysy2rWjL6bnI70JCSiU,9719
4
- mostlyai_mock-0.0.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
5
- mostlyai_mock-0.0.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
6
- mostlyai_mock-0.0.7.dist-info/RECORD,,