mostlyai-mock 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +43 -11
- {mostlyai_mock-0.1.2.dist-info → mostlyai_mock-0.1.4.dist-info}/METADATA +1 -1
- mostlyai_mock-0.1.4.dist-info/RECORD +8 -0
- mostlyai_mock-0.1.2.dist-info/RECORD +0 -8
- {mostlyai_mock-0.1.2.dist-info → mostlyai_mock-0.1.4.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.1.2.dist-info → mostlyai_mock-0.1.4.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.1.2.dist-info → mostlyai_mock-0.1.4.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
@@ -25,6 +25,8 @@ import pandas as pd
|
|
25
25
|
from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
|
26
26
|
from tqdm import tqdm
|
27
27
|
|
28
|
+
litellm.suppress_debug_info = True
|
29
|
+
|
28
30
|
SYSTEM_PROMPT = """
|
29
31
|
You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
|
30
32
|
|
@@ -624,24 +626,25 @@ def sample(
|
|
624
626
|
return_type: Literal["auto", "dict"] = "auto",
|
625
627
|
) -> pd.DataFrame | dict[str, pd.DataFrame]:
|
626
628
|
"""
|
627
|
-
Generate mock data by prompting an LLM.
|
629
|
+
Generate mock data from scratch or enrich existing data by prompting an LLM.
|
628
630
|
|
629
631
|
Args:
|
630
632
|
tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
|
631
633
|
sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
|
632
634
|
If a single integer is provided, the same number of rows will be generated for each subject table.
|
633
|
-
If a dictionary is provided, the number of rows to generate for each subject table can be specified
|
634
|
-
individually.
|
635
|
+
If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
|
635
636
|
Default is 10. Ignored if existing_data is provided.
|
637
|
+
If a table has a foreign key, the sample size is determined by the corresponding foreign key prompt. If nothing is specified, a few rows per parent record are generated.
|
636
638
|
existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
|
637
639
|
Default is None.
|
638
|
-
model (str): The LiteLLM chat completion model to be used.
|
640
|
+
model (str): The LiteLLM chat completion model to be used. Model needs to support structured output / JSON mode.
|
639
641
|
Examples include:
|
640
|
-
- `openai/gpt-4.1-nano` (default)
|
641
|
-
- `openai/gpt-4.1-mini`
|
642
|
-
- `openai/gpt-4.1`
|
642
|
+
- `openai/gpt-4.1-nano` (default; fast and smart)
|
643
|
+
- `openai/gpt-4.1-mini` (slower, but smarter)
|
644
|
+
- `openai/gpt-4.1` (slowest, but smartest)
|
643
645
|
- `gemini/gemini-2.0-flash`
|
644
646
|
- `gemini/gemini-2.5-flash-preview-04-17`
|
647
|
+
- `groq/gemma2-9b-it`
|
645
648
|
- `groq/llama-3.3-70b-versatile`
|
646
649
|
- `anthropic/claude-3-7-sonnet-latest`
|
647
650
|
See https://docs.litellm.ai/docs/providers/ for more options.
|
@@ -654,7 +657,7 @@ def sample(
|
|
654
657
|
- pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
|
655
658
|
- dict[str, pd.DataFrame]: A dictionary containing the generated mock data for each table, if multiple tables are provided.
|
656
659
|
|
657
|
-
Example of single table (without PK):
|
660
|
+
Example of generating mock data for a single table (without PK):
|
658
661
|
```python
|
659
662
|
from mostlyai import mock
|
660
663
|
|
@@ -677,7 +680,7 @@ def sample(
|
|
677
680
|
df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
|
678
681
|
```
|
679
682
|
|
680
|
-
Example of multiple tables (with PK/FK relationships):
|
683
|
+
Example of generating mock data for multiple tables (with PK/FK relationships):
|
681
684
|
```python
|
682
685
|
from mostlyai import mock
|
683
686
|
|
@@ -688,7 +691,7 @@ def sample(
|
|
688
691
|
"customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
|
689
692
|
"name": {"prompt": "first name and last name of the customer", "dtype": "string"},
|
690
693
|
},
|
691
|
-
"primary_key": "customer_id",
|
694
|
+
"primary_key": "customer_id", # single string; no composite keys allowed
|
692
695
|
},
|
693
696
|
"warehouses": {
|
694
697
|
"prompt": "Warehouses of a hardware store",
|
@@ -744,7 +747,36 @@ def sample(
|
|
744
747
|
df_items = data["items"]
|
745
748
|
```
|
746
749
|
|
747
|
-
Example of
|
750
|
+
Example of enriching a single dataframe:
|
751
|
+
```python
|
752
|
+
from mostlyai import mock
|
753
|
+
import pandas as pd
|
754
|
+
|
755
|
+
tables = {
|
756
|
+
"patients": {
|
757
|
+
"prompt": "Patients of a hospital in Finland",
|
758
|
+
"columns": {
|
759
|
+
"age": {},
|
760
|
+
"gender": {},
|
761
|
+
"full_name": {"prompt": "first name and last name of the patient", "dtype": "string"},
|
762
|
+
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
763
|
+
"place_of_birth": {"prompt": "place of birth", "dtype": "string"},
|
764
|
+
},
|
765
|
+
},
|
766
|
+
}
|
767
|
+
existing_df = pd.DataFrame({
|
768
|
+
"age": [25, 30, 35, 40],
|
769
|
+
"gender": ["male", "male", "female", "female"],
|
770
|
+
})
|
771
|
+
enriched_df = mock.sample(
|
772
|
+
tables=tables,
|
773
|
+
existing_data={"patients": existing_df},
|
774
|
+
model="openai/gpt-4.1-nano"
|
775
|
+
)
|
776
|
+
enriched_df
|
777
|
+
```
|
778
|
+
|
779
|
+
Example of enriching / augmenting an existing dataset:
|
748
780
|
```python
|
749
781
|
from mostlyai import mock
|
750
782
|
import pandas as pd
|
@@ -0,0 +1,8 @@
|
|
1
|
+
mostlyai/mock/__init__.py,sha256=EvV_Tp6ExzQPq4apGq_8F25qw_paNTcQEC94nIVOEog,714
|
2
|
+
mostlyai/mock/core.py,sha256=ubarMA3VUlXdjUsCXQK_mD_kWPkTMOYvLz9G4OughGk,36532
|
3
|
+
mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
|
4
|
+
mostlyai_mock-0.1.4.dist-info/METADATA,sha256=jibPe0pKcwqyPBoyc7H98LPd72vkGZBStdw_yMNVvJI,14161
|
5
|
+
mostlyai_mock-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
+
mostlyai_mock-0.1.4.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
+
mostlyai_mock-0.1.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
+
mostlyai_mock-0.1.4.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
mostlyai/mock/__init__.py,sha256=eeJjZ-XPDr-e4iE44SBNNt_xYQKnT6OVm75Xr52CYWc,714
|
2
|
-
mostlyai/mock/core.py,sha256=buDLbuCIGbNP91TtCnN-dg3wlHLtRcvzzlGQWm-7j8k,35183
|
3
|
-
mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
|
4
|
-
mostlyai_mock-0.1.2.dist-info/METADATA,sha256=pSjJ6D5ckyBdvpk_-5SjfF9-c6PXrEOjEc4oW5IE-g4,14161
|
5
|
-
mostlyai_mock-0.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
6
|
-
mostlyai_mock-0.1.2.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
|
7
|
-
mostlyai_mock-0.1.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
8
|
-
mostlyai_mock-0.1.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|