mostlyai-mock 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.1.2" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.1.4" # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -25,6 +25,8 @@ import pandas as pd
25
25
  from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
26
26
  from tqdm import tqdm
27
27
 
28
+ litellm.suppress_debug_info = True
29
+
28
30
  SYSTEM_PROMPT = """
29
31
  You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
30
32
 
@@ -624,24 +626,25 @@ def sample(
624
626
  return_type: Literal["auto", "dict"] = "auto",
625
627
  ) -> pd.DataFrame | dict[str, pd.DataFrame]:
626
628
  """
627
- Generate mock data by prompting an LLM.
629
+ Generate mock data from scratch or enrich existing data by prompting an LLM.
628
630
 
629
631
  Args:
630
632
  tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
631
633
  sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
632
634
  If a single integer is provided, the same number of rows will be generated for each subject table.
633
- If a dictionary is provided, the number of rows to generate for each subject table can be specified
634
- individually.
635
+ If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
635
636
  Default is 10. Ignored if existing_data is provided.
637
+ If a table has a foreign key, the sample size is determined by the corresponding foreign key prompt. If nothing is specified, a few rows per parent record are generated.
636
638
  existing_data (dict[str, pd.DataFrame] | None): Existing data to augment. If provided, the sample_size argument is ignored.
637
639
  Default is None.
638
- model (str): The LiteLLM chat completion model to be used. Requires support for structured output / JSON mode.
640
+ model (str): The LiteLLM chat completion model to be used. Model needs to support structured output / JSON mode.
639
641
  Examples include:
640
- - `openai/gpt-4.1-nano` (default)
641
- - `openai/gpt-4.1-mini`
642
- - `openai/gpt-4.1`
642
+ - `openai/gpt-4.1-nano` (default; fast and smart)
643
+ - `openai/gpt-4.1-mini` (slower, but smarter)
644
+ - `openai/gpt-4.1` (slowest, but smartest)
643
645
  - `gemini/gemini-2.0-flash`
644
646
  - `gemini/gemini-2.5-flash-preview-04-17`
647
+ - `groq/gemma2-9b-it`
645
648
  - `groq/llama-3.3-70b-versatile`
646
649
  - `anthropic/claude-3-7-sonnet-latest`
647
650
  See https://docs.litellm.ai/docs/providers/ for more options.
@@ -654,7 +657,7 @@ def sample(
654
657
  - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
655
658
  - dict[str, pd.DataFrame]: A dictionary containing the generated mock data for each table, if multiple tables are provided.
656
659
 
657
- Example of single table (without PK):
660
+ Example of generating mock data for a single table (without PK):
658
661
  ```python
659
662
  from mostlyai import mock
660
663
 
@@ -677,7 +680,7 @@ def sample(
677
680
  df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
678
681
  ```
679
682
 
680
- Example of multiple tables (with PK/FK relationships):
683
+ Example of generating mock data for multiple tables (with PK/FK relationships):
681
684
  ```python
682
685
  from mostlyai import mock
683
686
 
@@ -688,7 +691,7 @@ def sample(
688
691
  "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
689
692
  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
690
693
  },
691
- "primary_key": "customer_id",
694
+ "primary_key": "customer_id", # single string; no composite keys allowed
692
695
  },
693
696
  "warehouses": {
694
697
  "prompt": "Warehouses of a hardware store",
@@ -744,7 +747,36 @@ def sample(
744
747
  df_items = data["items"]
745
748
  ```
746
749
 
747
- Example of data augmentation:
750
+ Example of enriching a single dataframe:
751
+ ```python
752
+ from mostlyai import mock
753
+ import pandas as pd
754
+
755
+ tables = {
756
+ "patients": {
757
+ "prompt": "Patients of a hospital in Finland",
758
+ "columns": {
759
+ "age": {},
760
+ "gender": {},
761
+ "full_name": {"prompt": "first name and last name of the patient", "dtype": "string"},
762
+ "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
763
+ "place_of_birth": {"prompt": "place of birth", "dtype": "string"},
764
+ },
765
+ },
766
+ }
767
+ existing_df = pd.DataFrame({
768
+ "age": [25, 30, 35, 40],
769
+ "gender": ["male", "male", "female", "female"],
770
+ })
771
+ enriched_df = mock.sample(
772
+ tables=tables,
773
+ existing_data={"patients": existing_df},
774
+ model="openai/gpt-4.1-nano"
775
+ )
776
+ enriched_df
777
+ ```
778
+
779
+ Example of enriching / augmenting an existing dataset:
748
780
  ```python
749
781
  from mostlyai import mock
750
782
  import pandas as pd
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: LLM-generated Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -0,0 +1,8 @@
1
+ mostlyai/mock/__init__.py,sha256=EvV_Tp6ExzQPq4apGq_8F25qw_paNTcQEC94nIVOEog,714
2
+ mostlyai/mock/core.py,sha256=ubarMA3VUlXdjUsCXQK_mD_kWPkTMOYvLz9G4OughGk,36532
3
+ mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
4
+ mostlyai_mock-0.1.4.dist-info/METADATA,sha256=jibPe0pKcwqyPBoyc7H98LPd72vkGZBStdw_yMNVvJI,14161
5
+ mostlyai_mock-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ mostlyai_mock-0.1.4.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
7
+ mostlyai_mock-0.1.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
+ mostlyai_mock-0.1.4.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- mostlyai/mock/__init__.py,sha256=eeJjZ-XPDr-e4iE44SBNNt_xYQKnT6OVm75Xr52CYWc,714
2
- mostlyai/mock/core.py,sha256=buDLbuCIGbNP91TtCnN-dg3wlHLtRcvzzlGQWm-7j8k,35183
3
- mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
4
- mostlyai_mock-0.1.2.dist-info/METADATA,sha256=pSjJ6D5ckyBdvpk_-5SjfF9-c6PXrEOjEc4oW5IE-g4,14161
5
- mostlyai_mock-0.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
- mostlyai_mock-0.1.2.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
7
- mostlyai_mock-0.1.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
- mostlyai_mock-0.1.2.dist-info/RECORD,,