mostlyai-mock 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.0.7" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.0.9" # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -44,8 +44,10 @@ across tables.
44
44
 
45
45
 
46
46
  class LLMConfig(BaseModel):
47
- model: str
47
+ model: str = "openai/gpt-4.1-nano"
48
48
  api_key: str | None = None
49
+ temperature: float = 1.0
50
+ top_p: float = 0.95
49
51
 
50
52
 
51
53
  class MockConfig(RootModel[dict[str, "TableConfig"]]):
@@ -100,10 +102,8 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
100
102
  if table_name in path:
101
103
  cycle_start = path.index(table_name)
102
104
  cycle = path[cycle_start:] + [table_name]
103
- msg = f"Circular dependency detected: {' -> '.join(cycle)}."
104
- if len(cycle) == 2:
105
- msg += " Self-referencing tables are not yet supported."
106
- raise ValueError(msg)
105
+ if len(cycle) > 2: # len(cycle) == 2 means self-referencing table, which is allowed
106
+ raise ValueError(f"Circular dependency detected: {' -> '.join(cycle)}.")
107
107
  if table_name in visited:
108
108
  return
109
109
  visited.add(table_name)
@@ -119,7 +119,7 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
119
119
 
120
120
 
121
121
  class TableConfig(BaseModel):
122
- description: str = ""
122
+ prompt: str = ""
123
123
  columns: dict[str, ColumnConfig] = Field(..., min_items=1)
124
124
  primary_key: str | None = None
125
125
  foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
@@ -187,83 +187,78 @@ class DType(str, Enum):
187
187
  class ForeignKeyConfig(BaseModel):
188
188
  column: str
189
189
  referenced_table: str
190
- description: str | None = None
190
+ prompt: str | None = None
191
191
 
192
192
 
193
193
  def _sample_table(
194
194
  *,
195
- table_name: str,
196
- table_config: TableConfig,
195
+ name: str,
196
+ prompt: str,
197
+ columns: dict[str, ColumnConfig],
198
+ foreign_keys: list[ForeignKeyConfig] | None,
197
199
  primary_keys: dict[str, str] | None,
198
- sample_size: int | None,
199
200
  generated_data: dict[str, pd.DataFrame] | None,
200
- temperature: float,
201
- top_p: float,
201
+ sample_size: int,
202
202
  batch_size: int,
203
203
  previous_rows_size: int,
204
204
  non_context_size: int | None,
205
205
  llm_config: LLMConfig,
206
206
  ) -> pd.DataFrame:
207
207
  table_rows_generator = _create_table_rows_generator(
208
- table_name=table_name,
209
- table_config=table_config,
208
+ name=name,
209
+ prompt=prompt,
210
+ columns=columns,
210
211
  primary_keys=primary_keys,
211
- sample_size=sample_size,
212
+ foreign_keys=foreign_keys,
212
213
  generated_data=generated_data,
213
- temperature=temperature,
214
- top_p=top_p,
214
+ sample_size=sample_size,
215
215
  batch_size=batch_size,
216
216
  previous_rows_size=previous_rows_size,
217
217
  non_context_size=non_context_size,
218
218
  llm_config=llm_config,
219
219
  )
220
- table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{table_name}`".ljust(45))
221
- table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, table_config=table_config)
220
+ table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
221
+ table_df = _convert_table_rows_generator_to_df(table_rows_generator=table_rows_generator, columns=columns)
222
222
  return table_df
223
223
 
224
224
 
225
225
  def _create_table_prompt(
226
226
  *,
227
- table_name: str,
228
- table_description: str,
227
+ name: str,
228
+ prompt: str,
229
229
  columns: dict[str, ColumnConfig],
230
230
  primary_keys: dict[str, str] | None,
231
231
  batch_size: int | None,
232
232
  foreign_keys: list[ForeignKeyConfig] | None,
233
233
  context_data: pd.DataFrame | None,
234
- non_context_data: dict[str, pd.DataFrame],
235
- previous_rows: list[dict],
234
+ non_context_data: dict[str, pd.DataFrame] | None,
235
+ previous_rows: list[dict] | None,
236
236
  ) -> str:
237
- if batch_size is not None:
238
- assert foreign_keys is None
239
- assert context_data is None
240
- else:
241
- assert foreign_keys is not None
242
- assert context_data is not None
243
- assert primary_keys is not None
244
-
245
- # add description
246
- prompt = f"# {table_description}\n\n"
237
+ # add table prompt
238
+ prompt = f"# {prompt}\n\n"
247
239
 
248
240
  # define table
249
- prompt += f"## Table: {table_name}\n\n"
241
+ prompt += f"## Table: {name}\n\n"
242
+
243
+ prompt += f"## Table Primary Key: `{primary_keys[name]}`\n\n"
250
244
 
251
245
  # add columns specifications
252
246
  prompt += "## Columns Specifications:\n\n"
253
247
  prompt += f"{json.dumps({name: config.model_dump() for name, config in columns.items()}, indent=2)}\n\n"
254
248
 
255
- # define foreign keys
256
- if foreign_keys is not None:
257
- prompt += "## Foreign Keys:\n\n"
258
- prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
259
-
260
249
  # add previous rows as context to help the LLM generate consistent data
261
250
  if previous_rows:
262
251
  prompt += f"\n## Previous {len(previous_rows)} Rows:\n\n"
263
252
  prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
264
253
 
254
+ # define foreign keys
255
+ if foreign_keys:
256
+ prompt += "## Foreign Keys:\n\n"
257
+ prompt += f"{json.dumps([fk.model_dump() for fk in foreign_keys], indent=2)}\n\n"
258
+
265
259
  # add context table name, primary key and data
266
- if context_data is not None:
260
+ if foreign_keys and foreign_keys[0].referenced_table != name: # self-dependency is not considered as context
261
+ assert context_data is not None
267
262
  fk = foreign_keys[0]
268
263
  prompt += f"## Context Table: `{fk.referenced_table}`\n\n"
269
264
 
@@ -273,8 +268,12 @@ def _create_table_prompt(
273
268
  prompt += f"{context_data.to_json(orient='records', indent=2)}\n\n"
274
269
 
275
270
  # add non-context table names, primary keys and data
276
- if non_context_data:
271
+ if foreign_keys and len(foreign_keys) > 1:
277
272
  for fk in foreign_keys[1:]:
273
+ if fk.referenced_table == name: # self-dependency is not considered as non-context
274
+ continue
275
+ assert non_context_data is not None
276
+ assert fk.referenced_table in non_context_data
278
277
  prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
279
278
 
280
279
  prompt += f"## Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
@@ -284,15 +283,17 @@ def _create_table_prompt(
284
283
 
285
284
  # add instructions
286
285
  prompt += "\n## Instructions:\n\n"
287
- if batch_size is not None:
288
- prompt += f"Generate {batch_size} rows for the `{table_name}` table.\n\n"
289
-
290
- if context_data is not None:
286
+ if not foreign_keys:
287
+ assert batch_size is not None
288
+ prompt += f"Generate {batch_size} rows for the `{name}` table.\n\n"
289
+ else:
291
290
  prompt += (
292
- f"Generate data for the `{table_name}` table. "
291
+ f"Generate data for the `{name}` table. "
293
292
  f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
294
- f"The second Foreign Key column from Foreign Keys section (if exists) may only contain values from Non-Context Table Data. "
295
- f"Pay attention to description of the Foreign Key column to understand the relationship.\n\n"
293
+ f"The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
294
+ f"If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
295
+ f"In this case, ensure that the generated foreign keys are consistent with generated primary keys of the table. "
296
+ f"Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
296
297
  )
297
298
 
298
299
  if previous_rows:
@@ -309,13 +310,13 @@ def _create_table_prompt(
309
310
 
310
311
  def _create_table_rows_generator(
311
312
  *,
312
- table_name: str,
313
- table_config: TableConfig,
313
+ name: str,
314
+ prompt: str,
315
+ columns: dict[str, ColumnConfig],
316
+ foreign_keys: list[ForeignKeyConfig] | None,
314
317
  primary_keys: dict[str, str] | None,
315
- sample_size: int | None,
316
318
  generated_data: dict[str, pd.DataFrame] | None,
317
- temperature: float,
318
- top_p: float,
319
+ sample_size: int,
319
320
  batch_size: int,
320
321
  previous_rows_size: int,
321
322
  non_context_size: int | None,
@@ -383,37 +384,38 @@ def _create_table_rows_generator(
383
384
  for i in range(0, len(data), batch_size):
384
385
  yield data.iloc[i : i + batch_size]
385
386
 
387
+ # ensure model supports response_format and json schema
388
+ supported_params = litellm.get_supported_openai_params(model=llm_config.model)
389
+ assert "response_format" in supported_params
390
+ assert litellm.supports_response_schema(llm_config.model), (
391
+ "The model does not support structured output / JSON mode."
392
+ )
393
+
386
394
  # derive context data (if first foreign key is present) and harmonize sample size accordingly
387
395
  context_data: pd.DataFrame | None = None
388
- if table_config.foreign_keys:
389
- context_table_name = table_config.foreign_keys[0].referenced_table
396
+ if foreign_keys and foreign_keys[0].referenced_table != name: # self-dependency is not considered as context
397
+ context_table_name = foreign_keys[0].referenced_table
390
398
  assert generated_data is not None
391
399
  assert context_table_name in generated_data
392
400
  context_data = generated_data[context_table_name]
393
401
  sample_size = len(context_data)
394
- assert sample_size is not None
395
402
 
396
403
  # derive non-context data (if more than one foreign key is present)
397
404
  non_context_data: dict[str, pd.DataFrame] = {}
398
- if table_config.foreign_keys and len(table_config.foreign_keys) > 1:
405
+ if foreign_keys and len(foreign_keys) > 1:
399
406
  assert generated_data is not None
400
407
  assert non_context_size is not None
401
- for fk in table_config.foreign_keys[1:]:
408
+ for fk in foreign_keys[1:]:
409
+ if fk.referenced_table == name: # self-dependency is not considered as non-context
410
+ continue
402
411
  non_context_table_name = fk.referenced_table
403
412
  assert non_context_table_name in generated_data
404
413
  non_context_data[non_context_table_name] = generated_data[non_context_table_name]
405
414
 
406
- # ensure model supports response_format and json schema
407
- supported_params = litellm.get_supported_openai_params(model=llm_config.model)
408
- assert "response_format" in supported_params
409
- assert litellm.supports_response_schema(llm_config.model), (
410
- "The model does not support structured output / JSON mode."
411
- )
412
-
413
415
  litellm_kwargs = {
414
- "response_format": create_table_response_format(columns=table_config.columns),
415
- "temperature": temperature,
416
- "top_p": top_p,
416
+ "response_format": create_table_response_format(columns=columns),
417
+ "temperature": llm_config.temperature,
418
+ "top_p": llm_config.top_p,
417
419
  "model": llm_config.model,
418
420
  "api_key": llm_config.api_key,
419
421
  "stream": True,
@@ -427,18 +429,17 @@ def _create_table_rows_generator(
427
429
  if non_context_data
428
430
  else None
429
431
  )
430
- prompt_kwargs = {
431
- "table_name": table_name,
432
- "table_description": table_config.description,
433
- "columns": table_config.columns,
434
- "primary_keys": primary_keys,
435
- "batch_size": batch_size if context_batch is None else None,
436
- "foreign_keys": table_config.foreign_keys if context_batch is not None else None,
437
- "context_data": context_batch if context_batch is not None else None,
438
- "non_context_data": non_context_batch if non_context_batch else None,
439
- "previous_rows": list(previous_rows),
440
- }
441
- prompt = _create_table_prompt(**prompt_kwargs)
432
+ prompt = _create_table_prompt(
433
+ name=name,
434
+ prompt=prompt,
435
+ columns=columns,
436
+ primary_keys=primary_keys,
437
+ batch_size=batch_size,
438
+ foreign_keys=foreign_keys,
439
+ context_data=context_batch,
440
+ non_context_data=non_context_batch,
441
+ previous_rows=list(previous_rows),
442
+ )
442
443
  messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}]
443
444
 
444
445
  response = litellm.completion(messages=messages, **litellm_kwargs)
@@ -464,7 +465,8 @@ def _create_table_rows_generator(
464
465
 
465
466
 
466
467
  def _convert_table_rows_generator_to_df(
467
- table_rows_generator: Generator[dict], table_config: TableConfig
468
+ table_rows_generator: Generator[dict],
469
+ columns: dict[str, ColumnConfig],
468
470
  ) -> pd.DataFrame:
469
471
  def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
470
472
  for column_name, column_config in columns.items():
@@ -485,7 +487,7 @@ def _convert_table_rows_generator_to_df(
485
487
  return df
486
488
 
487
489
  df = pd.DataFrame(list(table_rows_generator))
488
- df = align_df_dtypes_with_mock_dtypes(df, table_config.columns)
490
+ df = align_df_dtypes_with_mock_dtypes(df, columns)
489
491
  return df
490
492
 
491
493
 
@@ -498,30 +500,32 @@ def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig
498
500
  return sample_size
499
501
 
500
502
 
501
- def _build_dependency_graph(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
502
- child_to_parents = {}
503
- parent_to_children = {}
503
+ def _build_execution_plan(config: MockConfig) -> list[str]:
504
+ def build_dependency_mappings(config: MockConfig) -> tuple[dict[str, list[str]], dict[str, list[str]], list[str]]:
505
+ child_to_parents = {}
506
+ parent_to_children = {}
504
507
 
505
- for table_name in config.root:
506
- child_to_parents[table_name] = []
507
- parent_to_children[table_name] = []
508
+ for table_name in config.root:
509
+ child_to_parents[table_name] = set()
510
+ parent_to_children[table_name] = set()
508
511
 
509
- for table_name, table_config in config.root.items():
510
- if table_config.foreign_keys:
511
- for fk in table_config.foreign_keys:
512
- referenced_table = fk.referenced_table
513
- child_to_parents[table_name].append(referenced_table)
514
- parent_to_children[referenced_table].append(table_name)
512
+ for table_name, table_config in config.root.items():
513
+ if table_config.foreign_keys:
514
+ for fk in table_config.foreign_keys:
515
+ referenced_table = fk.referenced_table
516
+ child_to_parents[table_name].add(referenced_table)
517
+ parent_to_children[referenced_table].add(table_name)
515
518
 
516
- subject_tables = [table_name for table_name, deps in child_to_parents.items() if not deps]
517
- return child_to_parents, parent_to_children, subject_tables
519
+ root_tables = []
520
+ for table_name, parents in child_to_parents.items():
521
+ if not parents or parents == {table_name}: # no dependencies or only self-dependency
522
+ root_tables.append(table_name)
523
+ return child_to_parents, parent_to_children, root_tables
518
524
 
525
+ child_to_parents, parent_to_children, root_tables = build_dependency_mappings(config)
519
526
 
520
- def _build_execution_plan(
521
- parent_to_children: dict[str, list[str]], child_to_parents: dict[str, list[str]], subject_tables: list[str]
522
- ) -> list[str]:
523
527
  execution_plan = []
524
- bfs_queue = list(subject_tables)
528
+ bfs_queue = list(root_tables)
525
529
  processed = set()
526
530
 
527
531
  while bfs_queue:
@@ -530,7 +534,10 @@ def _build_execution_plan(
530
534
  continue
531
535
 
532
536
  # ensure all parents are processed before processing this table
533
- unprocessed_parents = [p for p in child_to_parents[table_name] if p not in processed]
537
+ unprocessed_parents = []
538
+ for parent in child_to_parents[table_name]:
539
+ if parent not in processed and parent != table_name: # exclude self-dependency
540
+ unprocessed_parents.append(parent)
534
541
  if unprocessed_parents:
535
542
  bfs_queue.extend(unprocessed_parents)
536
543
  bfs_queue.append(table_name)
@@ -553,6 +560,7 @@ def sample(
553
560
  api_key: str | None = None,
554
561
  temperature: float = 1.0,
555
562
  top_p: float = 0.95,
563
+ return_type: Literal["auto", "dict"] = "auto",
556
564
  ) -> pd.DataFrame | dict[str, pd.DataFrame]:
557
565
  """
558
566
  Generate mock data by prompting an LLM.
@@ -577,6 +585,7 @@ def sample(
577
585
  api_key (str | None): The API key to use for the LLM. If not provided, LiteLLM will take it from the environment variables.
578
586
  temperature (float): The temperature to use for the LLM. Default is 1.0.
579
587
  top_p (float): The top-p value to use for the LLM. Default is 0.95.
588
+ return_type (Literal["auto", "dict"]): The format of the returned data. Default is "auto".
580
589
 
581
590
  Returns:
582
591
  - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
@@ -588,7 +597,7 @@ def sample(
588
597
 
589
598
  tables = {
590
599
  "guests": {
591
- "description": "Guests of an Alpine ski hotel in Austria",
600
+ "prompt": "Guests of an Alpine ski hotel in Austria",
592
601
  "columns": {
593
602
  "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
594
603
  "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -611,7 +620,7 @@ def sample(
611
620
 
612
621
  tables = {
613
622
  "customers": {
614
- "description": "Customers of a hardware store",
623
+ "prompt": "Customers of a hardware store",
615
624
  "columns": {
616
625
  "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
617
626
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
@@ -619,7 +628,7 @@ def sample(
619
628
  "primary_key": "customer_id",
620
629
  },
621
630
  "warehouses": {
622
- "description": "Warehouses of a hardware store",
631
+ "prompt": "Warehouses of a hardware store",
623
632
  "columns": {
624
633
  "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
625
634
  "name": {"prompt": "the name of the warehouse", "dtype": "string"},
@@ -627,7 +636,7 @@ def sample(
627
636
  "primary_key": "warehouse_id",
628
637
  },
629
638
  "orders": {
630
- "description": "Orders of a Customer",
639
+ "prompt": "Orders of a Customer",
631
640
  "columns": {
632
641
  "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
633
642
  "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
@@ -640,7 +649,7 @@ def sample(
640
649
  {
641
650
  "column": "customer_id",
642
651
  "referenced_table": "customers",
643
- "description": "each customer has anywhere between 2 and 3 orders",
652
+ "prompt": "each customer has anywhere between 2 and 3 orders",
644
653
  },
645
654
  {
646
655
  "column": "warehouse_id",
@@ -649,7 +658,7 @@ def sample(
649
658
  ],
650
659
  },
651
660
  "items": {
652
- "description": "Items in an Order",
661
+ "prompt": "Items in an Order",
653
662
  "columns": {
654
663
  "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
655
664
  "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -660,7 +669,7 @@ def sample(
660
669
  {
661
670
  "column": "order_id",
662
671
  "referenced_table": "orders",
663
- "description": "each order has between 1 and 2 items",
672
+ "prompt": "each order has between 1 and 2 items",
664
673
  }
665
674
  ],
666
675
  },
@@ -674,47 +683,30 @@ def sample(
674
683
  """
675
684
 
676
685
  config = MockConfig(tables)
686
+ llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
677
687
 
678
688
  sample_size = _harmonize_sample_size(sample_size, config)
679
689
  primary_keys = {table_name: table_config.primary_key for table_name, table_config in config.root.items()}
680
690
 
681
- child_to_parents, parent_to_children, subject_tables = _build_dependency_graph(config)
682
- execution_plan: list[str] = _build_execution_plan(parent_to_children, child_to_parents, subject_tables)
691
+ execution_plan: list[str] = _build_execution_plan(config)
683
692
 
684
- results: dict[str, pd.DataFrame] = {}
693
+ data: dict[str, pd.DataFrame] = {}
685
694
 
686
695
  for table_name in execution_plan:
687
696
  table_config = config.root[table_name]
688
- if not child_to_parents[table_name]:
689
- # subject table
690
- df = _sample_table(
691
- table_name=table_name,
692
- table_config=table_config,
693
- primary_keys=None,
694
- sample_size=sample_size[table_name],
695
- generated_data=None,
696
- temperature=temperature,
697
- top_p=top_p,
698
- batch_size=30, # generate 30 subjects at a time
699
- previous_rows_size=10, # present 10 previously generated rows to the LLM
700
- non_context_size=None,
701
- llm_config=LLMConfig(model=model, api_key=api_key),
702
- )
703
- else:
704
- # sequencial table
705
- df = _sample_table(
706
- table_name=table_name,
707
- table_config=table_config,
708
- primary_keys=primary_keys,
709
- sample_size=None,
710
- generated_data=results,
711
- temperature=temperature,
712
- top_p=top_p,
713
- batch_size=1, # generate one sequence at a time
714
- previous_rows_size=10, # present 10 previously generated rows to the LLM
715
- non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
716
- llm_config=LLMConfig(model=model, api_key=api_key),
717
- )
718
- results[table_name] = df
719
-
720
- return results if len(results) > 1 else next(iter(results.values()))
697
+ df = _sample_table(
698
+ name=table_name,
699
+ prompt=table_config.prompt,
700
+ columns=table_config.columns,
701
+ foreign_keys=table_config.foreign_keys,
702
+ primary_keys=primary_keys,
703
+ generated_data=data,
704
+ sample_size=sample_size[table_name],
705
+ batch_size=30, # generate 30 root table rows at a time
706
+ previous_rows_size=10, # present 10 previously generated rows to the LLM
707
+ non_context_size=10, # pick 10 rows to choose from for each non-context foreign key
708
+ llm_config=llm_config,
709
+ )
710
+ data[table_name] = df
711
+
712
+ return next(iter(data.values())) if len(data) == 1 and return_type == "auto" else data
@@ -0,0 +1,85 @@
1
+ import os
2
+ import tempfile
3
+ import zipfile
4
+
5
+ import requests
6
+ from fastmcp import Context, FastMCP
7
+
8
+ from mostlyai import mock
9
+
10
+ SAMPLE_MOCK_TOOL_DESCRIPTION = f"""
11
+ It is proxy to the `mostlyai.mock.sample` function.
12
+
13
+ This function returns an URL to the generated CSV bundle (as ZIP file).
14
+ Print this URL in Markdown format, so user can easily download the data.
15
+
16
+ What comes after the `=============================` is the documentation of the `mostlyai.mock.sample` function.
17
+
18
+ =============================
19
+ {mock.sample.__doc__}
20
+ """
21
+
22
+ mcp = FastMCP(name="MostlyAI Mock MCP Server")
23
+
24
+
25
+ def _upload_to_0x0st(data: dict) -> str:
26
+ with tempfile.TemporaryDirectory() as temp_dir:
27
+ zip_path = os.path.join(temp_dir, "mock_data.zip")
28
+ with zipfile.ZipFile(zip_path, "w") as zip_file:
29
+ for table_name, df in data.items():
30
+ csv_path = os.path.join(temp_dir, f"{table_name}.csv")
31
+ df.to_csv(csv_path, index=False)
32
+ zip_file.write(csv_path, arcname=f"{table_name}.csv")
33
+
34
+ with open(zip_path, "rb") as f:
35
+ response = requests.post(
36
+ "https://0x0.st",
37
+ files={"file": f},
38
+ data={"expires": "24", "secret": ""},
39
+ headers={"User-Agent": "MockData/1.0"},
40
+ )
41
+
42
+ if response.status_code == 200:
43
+ url = response.text.strip()
44
+ return url
45
+ else:
46
+ raise Exception(f"Failed to upload ZIP: HTTP {response.status_code}")
47
+
48
+
49
+ @mcp.tool(description=SAMPLE_MOCK_TOOL_DESCRIPTION)
50
+ def sample_mock_data(
51
+ *,
52
+ tables: dict[str, dict],
53
+ sample_size: int,
54
+ model: str = "openai/gpt-4.1-nano",
55
+ api_key: str | None = None,
56
+ temperature: float = 1.0,
57
+ top_p: float = 0.95,
58
+ ctx: Context,
59
+ ) -> str:
60
+ # Notes:
61
+ # 1. Returning DataFrames directly results in converting them into truncated string.
62
+ # 2. The logs / progress bars are not propagated to the MCP Client. There is a dedicated API to do that (e.g. `ctx.info(...)`)
63
+ # 3. MCP Server inherits only selected environment variables (PATH, USER...); one way to pass LLM keys is through client configuration (`mcpServers->env`)
64
+ # 4. Some MCP Clients, e.g. Cursor, do not like Unions or Optionals in type hints
65
+ ctx.info(f"Generating mock data for `{len(tables)}` tables")
66
+ data = mock.sample(
67
+ tables=tables,
68
+ sample_size=sample_size,
69
+ model=model,
70
+ api_key=api_key,
71
+ temperature=temperature,
72
+ top_p=top_p,
73
+ return_type="dict",
74
+ )
75
+ ctx.info(f"Generated mock data for `{len(tables)}` tables")
76
+ url = _upload_to_0x0st(data)
77
+ return url
78
+
79
+
80
+ def main():
81
+ mcp.run(transport="stdio")
82
+
83
+
84
+ if __name__ == "__main__":
85
+ main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: Synthetic Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -24,6 +24,7 @@ Classifier: Programming Language :: Python :: 3.13
24
24
  Classifier: Topic :: Software Development :: Libraries
25
25
  Classifier: Typing :: Typed
26
26
  Requires-Python: >=3.10
27
+ Requires-Dist: fastmcp<3.0.0,>=2.0.0
27
28
  Requires-Dist: litellm>=1.67.0
28
29
  Requires-Dist: numpy>=1.26.3
29
30
  Requires-Dist: pandas>=2.0.0
@@ -72,7 +73,7 @@ from mostlyai import mock
72
73
 
73
74
  tables = {
74
75
  "guests": {
75
- "description": "Guests of an Alpine ski hotel in Austria",
76
+ "prompt": "Guests of an Alpine ski hotel in Austria",
76
77
  "columns": {
77
78
  "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
78
79
  "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
@@ -112,7 +113,7 @@ from mostlyai import mock
112
113
 
113
114
  tables = {
114
115
  "customers": {
115
- "description": "Customers of a hardware store",
116
+ "prompt": "Customers of a hardware store",
116
117
  "columns": {
117
118
  "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
118
119
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
@@ -120,7 +121,7 @@ tables = {
120
121
  "primary_key": "customer_id",
121
122
  },
122
123
  "warehouses": {
123
- "description": "Warehouses of a hardware store",
124
+ "prompt": "Warehouses of a hardware store",
124
125
  "columns": {
125
126
  "warehouse_id": {"prompt": "the unique id of the warehouse", "dtype": "integer"},
126
127
  "name": {"prompt": "the name of the warehouse", "dtype": "string"},
@@ -128,7 +129,7 @@ tables = {
128
129
  "primary_key": "warehouse_id",
129
130
  },
130
131
  "orders": {
131
- "description": "Orders of a Customer",
132
+ "prompt": "Orders of a Customer",
132
133
  "columns": {
133
134
  "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
134
135
  "warehouse_id": {"prompt": "the warehouse id for that order", "dtype": "integer"},
@@ -141,7 +142,7 @@ tables = {
141
142
  {
142
143
  "column": "customer_id",
143
144
  "referenced_table": "customers",
144
- "description": "each customer has anywhere between 2 and 3 orders",
145
+ "prompt": "each customer has anywhere between 2 and 3 orders",
145
146
  },
146
147
  {
147
148
  "column": "warehouse_id",
@@ -150,7 +151,7 @@ tables = {
150
151
  ],
151
152
  },
152
153
  "items": {
153
- "description": "Items in an Order",
154
+ "prompt": "Items in an Order",
154
155
  "columns": {
155
156
  "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
156
157
  "order_id": {"prompt": "the order id for that item", "dtype": "string"},
@@ -161,7 +162,7 @@ tables = {
161
162
  {
162
163
  "column": "order_id",
163
164
  "referenced_table": "orders",
164
- "description": "each order has between 1 and 2 items",
165
+ "prompt": "each order has between 1 and 2 items",
165
166
  }
166
167
  ],
167
168
  },
@@ -199,3 +200,42 @@ print(data["items"])
199
200
  # 8 ITM-84311 ORD-11385 Ergonomic Task Chair, Black Mesh 359.25
200
201
  # 9 ITM-84312 ORD-11385 Standard Delivery Service 48.5
201
202
  ```
203
+
204
+ 6. Create your first self-referencing synthetic table
205
+
206
+ ```python
207
+ from mostlyai import mock
208
+
209
+ tables = {
210
+ "employees": {
211
+ "prompt": "Employees of a company",
212
+ "columns": {
213
+ "employee_id": {"prompt": "the unique id of the employee", "dtype": "integer"},
214
+ "name": {"prompt": "first name and last name of the employee", "dtype": "string"},
215
+ "boss_id": {"prompt": "the id of the boss of the employee", "dtype": "integer"},
216
+ "role": {"prompt": "the role of the employee", "dtype": "string"},
217
+ },
218
+ "primary_key": "employee_id",
219
+ "foreign_keys": [
220
+ {
221
+ "column": "boss_id",
222
+ "referenced_table": "employees",
223
+ "prompt": "each boss has at most 3 employees",
224
+ },
225
+ ],
226
+ }
227
+ }
228
+ df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1")
229
+ print(df)
230
+ # employee_id name boss_id role
231
+ # 0 1 Sandra Phillips <NA> President
232
+ # 1 2 Marcus Tran 1 Chief Financial Officer
233
+ # 2 3 Ava Whittaker 1 Chief Technology Officer
234
+ # 3 4 Sophie Martin 1 Chief Operations Officer
235
+ # 4 5 Chad Nelson 2 Finance Manager
236
+ # 5 6 Ethan Glover 2 Senior Accountant
237
+ # 6 7 Kimberly Ortiz 2 Junior Accountant
238
+ # 7 8 Lucas Romero 3 IT Manager
239
+ # 8 9 Priya Desai 3 Lead Software Engineer
240
+ # 9 10 Felix Bennett 3 Senior Systems Analyst
241
+ ```
@@ -0,0 +1,8 @@
1
+ mostlyai/mock/__init__.py,sha256=jObvPbThXtHSUyMozHQZdSsgvR_fiii7gcPNjnBx0WM,714
2
+ mostlyai/mock/core.py,sha256=p5VAsRppzAc4P8FqKEunfQ3cPjImUU2cEc6yqHJVhMg,29884
3
+ mostlyai/mock/mcp_server.py,sha256=0oyv-7M2Cm9a7JdrMKwHSb3nucPW1J2N6YiYASboAbM,2741
4
+ mostlyai_mock-0.0.9.dist-info/METADATA,sha256=plPb0H5ilUw9_wzB8i5s0Rmw4W0VXWjmx7NiaS7hEGk,11380
5
+ mostlyai_mock-0.0.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ mostlyai_mock-0.0.9.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
7
+ mostlyai_mock-0.0.9.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
+ mostlyai_mock-0.0.9.dist-info/RECORD,,
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ mcp-server = mostlyai.mock.mcp_server:main
@@ -1,6 +0,0 @@
1
- mostlyai/mock/__init__.py,sha256=b9OE7NhBo32iINB3V7gU6jK_auwGB99AUlAZ_eul-eo,714
2
- mostlyai/mock/core.py,sha256=6xy0qzocyLh8kw3WvckOuFnCZx2LKpWrsJaHaF3ISCE,29901
3
- mostlyai_mock-0.0.7.dist-info/METADATA,sha256=8d-XpBFxaGkggPZLnH56raX2ysy2rWjL6bnI70JCSiU,9719
4
- mostlyai_mock-0.0.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
5
- mostlyai_mock-0.0.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
6
- mostlyai_mock-0.0.7.dist-info/RECORD,,