mostlyai-mock 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +85 -19
- mostlyai/mock/mcp_server.py +14 -0
- {mostlyai_mock-0.1.3.dist-info → mostlyai_mock-0.1.5.dist-info}/METADATA +2 -5
- mostlyai_mock-0.1.5.dist-info/RECORD +8 -0
- mostlyai_mock-0.1.3.dist-info/RECORD +0 -8
- {mostlyai_mock-0.1.3.dist-info → mostlyai_mock-0.1.5.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.1.3.dist-info → mostlyai_mock-0.1.5.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.1.3.dist-info → mostlyai_mock-0.1.5.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
@@ -18,7 +18,7 @@ import json
 from collections import deque
 from collections.abc import Generator
 from enum import Enum
-from typing import Any, Literal
+from typing import Any, Literal
 
 import litellm
 import pandas as pd
@@ -28,7 +28,7 @@ from tqdm import tqdm
 litellm.suppress_debug_info = True
 
 SYSTEM_PROMPT = """
-You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
+You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
 
 Your task is to:
 
@@ -58,7 +58,7 @@ class LLMConfig(BaseModel):
 
 
 class MockConfig(RootModel[dict[str, "TableConfig"]]):
-    root: dict[str, TableConfig] = Field(...,
+    root: dict[str, TableConfig] = Field(..., min_length=1)
 
     @field_validator("root")
     @classmethod
@@ -127,7 +127,7 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
 
 class TableConfig(BaseModel):
     prompt: str = ""
-    columns: dict[str, ColumnConfig] = Field(...,
+    columns: dict[str, ColumnConfig] = Field(..., min_length=1)
     primary_key: str | None = None
     foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
 
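The tightened `Field` constraints above mean that an empty `tables` spec, or a table defined without any columns, is now rejected at validation time rather than failing later during generation. A minimal standalone sketch of the same pattern (hypothetical class names, assuming pydantic v2, which the package already uses):

```python
# Minimal sketch (not the package's own classes): min_length=1 on dict-valued
# fields rejects empty specs up front with pydantic v2.
from pydantic import BaseModel, Field, RootModel, ValidationError


class ColumnSpec(BaseModel):  # hypothetical stand-in for ColumnConfig
    prompt: str = ""


class TableSpec(BaseModel):  # hypothetical stand-in for TableConfig
    prompt: str = ""
    columns: dict[str, ColumnSpec] = Field(..., min_length=1)


class Spec(RootModel[dict[str, TableSpec]]):  # hypothetical stand-in for MockConfig
    root: dict[str, TableSpec] = Field(..., min_length=1)


try:
    Spec({})  # an empty tables spec now fails validation
except ValidationError as e:
    print(e.error_count(), "validation error(s)")
```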
@@ -261,7 +261,7 @@ def _create_table_prompt(
 
     # add existing data to augment
     if existing_data is not None:
-        prompt +=
+        prompt += "\n## Existing Data to Augment:\n\n"
         prompt += f"{existing_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
 
     # define foreign keys
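The new existing-data section is embedded into the prompt via pandas `to_json`. A standalone sketch of what that rendering looks like (illustrative data, not taken from the package):

```python
# Sketch: roughly what to_json(orient="records", date_format="iso", indent=2)
# produces for the existing data that gets pasted into the LLM prompt.
import pandas as pd

existing_data = pd.DataFrame(
    {"age": [25, 30], "joined": pd.to_datetime(["2024-01-05", "2024-03-17"])}
)
print(existing_data.to_json(orient="records", date_format="iso", indent=2))
# Roughly:
# [
#   {
#     "age":25,
#     "joined":"2024-01-05T00:00:00.000"
#   },
#   ...
# ]
```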
@@ -314,11 +314,11 @@ def _create_table_prompt(
 
     if foreign_keys:
         prompt += (
-
-
-
-
-
+            "The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
+            "The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
+            "If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
+            "In this case, ensure that the foreign keys are consistent with primary keys of the table. "
+            "Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
         )
 
     if existing_data is not None:
@@ -358,7 +358,7 @@ def _create_table_rows_generator(
     llm_config: LLMConfig,
 ) -> Generator[dict]:
     def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
-        def create_annotation(column_config: ColumnConfig) ->
+        def create_annotation(column_config: ColumnConfig) -> type:
             if column_config.values or column_config.dtype is DType.CATEGORY:
                 return Literal[tuple(column_config.values)]
             return {
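The completed return annotation belongs to the helper that builds the structured-output response format: for columns with fixed `values` (or a category dtype) the annotation is a `Literal` over those values. A standalone sketch of that pattern (hypothetical model and field names, not the package's own code):

```python
# Sketch: turning a list of allowed values into a Literal annotation and using
# it in a dynamically created pydantic model, mirroring the pattern in the diff.
from typing import Literal

from pydantic import ValidationError, create_model

values = ["male", "female"]
GenderLiteral = Literal[tuple(values)]  # same as Literal["male", "female"]

Row = create_model("Row", gender=(GenderLiteral, ...))  # hypothetical row model

print(Row(gender="female"))  # Row(gender='female')
try:
    Row(gender="other")
except ValidationError:
    print("value outside the allowed set is rejected")
```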
@@ -488,6 +488,12 @@ def _create_table_rows_generator(
             table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()
         }
 
+        if context_batch is None:
+            # for root tables, scale down batch size in order to prevent excessive generations
+            remaining_rows = sample_size - yielded_sequences
+            if batch_size >= remaining_rows:
+                batch_size = remaining_rows + 2  # +2 because LLM may not always count the rows correctly
+
         llm_prompt = _create_table_prompt(
             name=name,
             prompt=prompt,
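The new guard only applies to root tables (`context_batch is None`) and caps the final batch near the number of rows still needed. A standalone sketch with worked numbers (hypothetical helper wrapping the same arithmetic as the diff):

```python
# Sketch: the last batch is capped near the remaining row count instead of
# requesting a full-sized batch from the LLM.
def cap_batch_size(batch_size: int, sample_size: int, yielded_sequences: int) -> int:
    remaining_rows = sample_size - yielded_sequences
    if batch_size >= remaining_rows:
        batch_size = remaining_rows + 2  # +2 slack because the LLM may miscount rows
    return batch_size

print(cap_batch_size(batch_size=20, sample_size=10, yielded_sequences=0))  # 12
print(cap_batch_size(batch_size=20, sample_size=10, yielded_sequences=8))  # 4
```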
@@ -553,6 +559,36 @@ def _convert_table_rows_generator_to_df(
     return df
 
 
+def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataFrame] | None) -> dict[str, dict]:
+    def _infer_dtype(series: pd.Series) -> DType:
+        if pd.api.types.is_integer_dtype(series):
+            return DType.INTEGER
+        elif pd.api.types.is_float_dtype(series):
+            return DType.FLOAT
+        elif pd.api.types.is_datetime64_dtype(series):
+            return DType.DATETIME
+        elif pd.api.types.is_bool_dtype(series):
+            return DType.BOOLEAN
+        else:
+            return DType.STRING
+
+    if existing_data is None:
+        return tables
+
+    tables = tables.copy()
+    for table_name, existing_table in existing_data.items():
+        table_config = tables.setdefault(table_name, {})
+        column_configs = table_config.setdefault("columns", {})
+        existing_column_configs = {
+            existing_column: {"dtype": _infer_dtype(existing_table[existing_column])}
+            for existing_column in existing_table.columns
+            if existing_column not in column_configs
+        }
+        column_configs = {**existing_column_configs, **column_configs}
+        table_config["columns"] = column_configs
+    return tables
+
+
 def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig) -> dict[str, int]:
     if isinstance(sample_size, int):
         return {table_name: sample_size for table_name in config.root}
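Based on the added helper above, enrichment works by inserting inferred column configs for columns that already exist in the data, placed ahead of the user-defined ones. A sketch of the intended behaviour (assumes importing the private helper from `mostlyai.mock.core`; internal APIs may change):

```python
# Sketch: _harmonize_tables adds configs (with inferred dtypes) for columns
# present in the existing data but missing from the user-provided spec.
import pandas as pd
from mostlyai.mock.core import _harmonize_tables

tables = {
    "patients": {"prompt": "Hospital patients", "columns": {"full_name": {"dtype": "string"}}}
}
existing = {"patients": pd.DataFrame({"age": [25, 30], "gender": ["male", "female"]})}

harmonized = _harmonize_tables(tables, existing)
# Existing columns come first with inferred dtypes; user-defined configs are kept as-is.
print(list(harmonized["patients"]["columns"]))  # ['age', 'gender', 'full_name']
```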
@@ -626,24 +662,25 @@ def sample(
     return_type: Literal["auto", "dict"] = "auto",
 ) -> pd.DataFrame | dict[str, pd.DataFrame]:
     """
-    Generate mock data by prompting an LLM.
+    Generate mock data from scratch or enrich existing data by prompting an LLM.
 
     Args:
         tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
         sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
             If a single integer is provided, the same number of rows will be generated for each subject table.
-            If a dictionary is provided, the number of rows to generate for each subject table can be specified
-            individually.
+            If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
             Default is 10. Ignored if existing_data is provided.
+            If a table has a foreign key, the sample size is determined by the corresponding foreign key prompt. If nothing specified, a few rows per parent record are generated.
         existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
             Default is None.
-        model (str): The LiteLLM chat completion model to be used.
+        model (str): The LiteLLM chat completion model to be used. Model needs to support structured output / JSON mode.
             Examples include:
-            - `openai/gpt-4.1-nano` (default;
+            - `openai/gpt-4.1-nano` (default; fast, and smart)
             - `openai/gpt-4.1-mini` (slower, but smarter)
             - `openai/gpt-4.1` (slowest, but smartest)
             - `gemini/gemini-2.0-flash`
             - `gemini/gemini-2.5-flash-preview-04-17`
+            - `groq/gemma2-9b-it`
             - `groq/llama-3.3-70b-versatile`
             - `anthropic/claude-3-7-sonnet-latest`
             See https://docs.litellm.ai/docs/providers/ for more options.
@@ -656,7 +693,7 @@ def sample(
         - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
         - dict[str, pd.DataFrame]: A dictionary containing the generated mock data for each table, if multiple tables are provided.
 
-    Example of single table (without PK):
+    Example of generating mock data for a single table (without PK):
     ```python
     from mostlyai import mock
 
@@ -679,7 +716,7 @@ def sample(
     df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
     ```
 
-    Example of multiple tables (with PK/FK relationships):
+    Example of generating mock data for multiple tables (with PK/FK relationships):
     ```python
     from mostlyai import mock
 
@@ -746,7 +783,34 @@ def sample(
     df_items = data["items"]
     ```
 
-    Example of
+    Example of enriching a single dataframe:
+    ```python
+    from mostlyai import mock
+    import pandas as pd
+
+    tables = {
+        "patients": {
+            "prompt": "Patients of a hospital in Finland",
+            "columns": {
+                "full_name": {"prompt": "first name and last name of the patient", "dtype": "string"},
+                "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
+                "place_of_birth": {"prompt": "place of birth", "dtype": "string"},
+            },
+        },
+    }
+    existing_df = pd.DataFrame({
+        "age": [25, 30, 35, 40],
+        "gender": ["male", "male", "female", "female"],
+    })
+    enriched_df = mock.sample(
+        tables=tables,
+        existing_data={"patients": existing_df},
+        model="openai/gpt-4.1-nano"
+    )
+    enriched_df
+    ```
+
+    Example of enriching / augmenting an existing dataset:
     ```python
     from mostlyai import mock
     import pandas as pd
@@ -803,7 +867,9 @@ def sample(
     ```
     """
 
+    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
     config = MockConfig(tables)
+
     llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
 
     sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
mostlyai/mock/mcp_server.py
CHANGED
@@ -1,3 +1,17 @@
+# Copyright 2025 MOSTLY AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tempfile
 
{mostlyai_mock-0.1.3.dist-info → mostlyai_mock-0.1.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.3
+Version: 0.1.5
 Summary: LLM-generated Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -169,7 +169,7 @@ tables = {
 }
 data = mock.sample(
     tables=tables,
-    sample_size=2,
+    sample_size=2,
     model="openai/gpt-4.1"
 )
 print(data["customers"])
@@ -250,9 +250,6 @@ tables = {
     "guests": {
         "prompt": "Guests of an Alpine ski hotel in Austria",
         "columns": {
-            "guest_id": {"prompt": "the unique id of the guest", "dtype": "integer"},
-            "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
-            "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
             "gender": {"dtype": "category", "values": ["male", "female"]},
             "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
             "room_number": {"prompt": "room number", "dtype": "integer"},
mostlyai_mock-0.1.5.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=-bfsVZJQ0OkN5b3IRP3F9aUCiA8Eq1-RmAqBmTg0O0g,714
+mostlyai/mock/core.py,sha256=V7KG7nOQPU95v6lRoSIfJuYivS0pNZ3rbiNC6SqDZSc,38075
+mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
+mostlyai_mock-0.1.5.dist-info/METADATA,sha256=LfugCsu7ANDZk2ozNFHDxgCqY42etJIdkXcfc-S-cUE,13887
+mostlyai_mock-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.5.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.5.dist-info/RECORD,,
mostlyai_mock-0.1.3.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=38sp2aKJVtPa3koRxanlBS6fe_ccVQvIieILlKb-xuw,714
-mostlyai/mock/core.py,sha256=lO5OzuOz7bvjaLHpfiN-wyjFBPD0oSHSqEA4v8q436Y,35318
-mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
-mostlyai_mock-0.1.3.dist-info/METADATA,sha256=rkHeGDlNUM2cqSxWY_R47FWXsOLktpdl_COja8zYz28,14161
-mostlyai_mock-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.3.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.3.dist-info/RECORD,,
{mostlyai_mock-0.1.3.dist-info → mostlyai_mock-0.1.5.dist-info}/WHEEL
File without changes
{mostlyai_mock-0.1.3.dist-info → mostlyai_mock-0.1.5.dist-info}/entry_points.txt
File without changes
{mostlyai_mock-0.1.3.dist-info → mostlyai_mock-0.1.5.dist-info}/licenses/LICENSE
File without changes