mostlyai-mock 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the content changes between publicly released versions of this package as published to a supported registry. The information is provided for informational purposes only and reflects the differences between the package versions as they appear in the public registry.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.1.4" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.1.5" # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -18,7 +18,7 @@ import json
18
18
  from collections import deque
19
19
  from collections.abc import Generator
20
20
  from enum import Enum
21
- from typing import Any, Literal, Type
21
+ from typing import Any, Literal
22
22
 
23
23
  import litellm
24
24
  import pandas as pd
@@ -28,7 +28,7 @@ from tqdm import tqdm
28
28
  litellm.suppress_debug_info = True
29
29
 
30
30
  SYSTEM_PROMPT = """
31
- You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
31
+ You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
32
32
 
33
33
  Your task is to:
34
34
 
@@ -58,7 +58,7 @@ class LLMConfig(BaseModel):
58
58
 
59
59
 
60
60
  class MockConfig(RootModel[dict[str, "TableConfig"]]):
61
- root: dict[str, TableConfig] = Field(..., min_items=1)
61
+ root: dict[str, TableConfig] = Field(..., min_length=1)
62
62
 
63
63
  @field_validator("root")
64
64
  @classmethod
@@ -127,7 +127,7 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
127
127
 
128
128
  class TableConfig(BaseModel):
129
129
  prompt: str = ""
130
- columns: dict[str, ColumnConfig] = Field(..., min_items=1)
130
+ columns: dict[str, ColumnConfig] = Field(..., min_length=1)
131
131
  primary_key: str | None = None
132
132
  foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
133
133
 
@@ -261,7 +261,7 @@ def _create_table_prompt(
261
261
 
262
262
  # add existing data to augment
263
263
  if existing_data is not None:
264
- prompt += f"\n## Existing Data to Augment:\n\n"
264
+ prompt += "\n## Existing Data to Augment:\n\n"
265
265
  prompt += f"{existing_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
266
266
 
267
267
  # define foreign keys
@@ -314,11 +314,11 @@ def _create_table_prompt(
314
314
 
315
315
  if foreign_keys:
316
316
  prompt += (
317
- f"The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
318
- f"The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
319
- f"If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
320
- f"In this case, ensure that the foreign keys are consistent with primary keys of the table. "
321
- f"Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
317
+ "The first Foreign Key column from Foreign Keys section may only contain values from Context Table Data. "
318
+ "The following Foreign Key columns from Foreign Keys section (if exists) may only contain values from Non-Context Table Data sections. "
319
+ "If either relevant Context Table Data or Non-Context Table Data is not present, this means that table has self-dependency. "
320
+ "In this case, ensure that the foreign keys are consistent with primary keys of the table. "
321
+ "Pay attention to prompt of the Foreign Key column to understand the relationship.\n\n"
322
322
  )
323
323
 
324
324
  if existing_data is not None:
@@ -358,7 +358,7 @@ def _create_table_rows_generator(
358
358
  llm_config: LLMConfig,
359
359
  ) -> Generator[dict]:
360
360
  def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
361
- def create_annotation(column_config: ColumnConfig) -> Type:
361
+ def create_annotation(column_config: ColumnConfig) -> type:
362
362
  if column_config.values or column_config.dtype is DType.CATEGORY:
363
363
  return Literal[tuple(column_config.values)]
364
364
  return {
@@ -488,6 +488,12 @@ def _create_table_rows_generator(
488
488
  table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()
489
489
  }
490
490
 
491
+ if context_batch is None:
492
+ # for root tables, scale down batch size in order to prevent excessive generations
493
+ remaining_rows = sample_size - yielded_sequences
494
+ if batch_size >= remaining_rows:
495
+ batch_size = remaining_rows + 2 # +2 because LLM may not always count the rows correctly
496
+
491
497
  llm_prompt = _create_table_prompt(
492
498
  name=name,
493
499
  prompt=prompt,
@@ -553,6 +559,36 @@ def _convert_table_rows_generator_to_df(
553
559
  return df
554
560
 
555
561
 
562
+ def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataFrame] | None) -> dict[str, dict]:
563
+ def _infer_dtype(series: pd.Series) -> DType:
564
+ if pd.api.types.is_integer_dtype(series):
565
+ return DType.INTEGER
566
+ elif pd.api.types.is_float_dtype(series):
567
+ return DType.FLOAT
568
+ elif pd.api.types.is_datetime64_dtype(series):
569
+ return DType.DATETIME
570
+ elif pd.api.types.is_bool_dtype(series):
571
+ return DType.BOOLEAN
572
+ else:
573
+ return DType.STRING
574
+
575
+ if existing_data is None:
576
+ return tables
577
+
578
+ tables = tables.copy()
579
+ for table_name, existing_table in existing_data.items():
580
+ table_config = tables.setdefault(table_name, {})
581
+ column_configs = table_config.setdefault("columns", {})
582
+ existing_column_configs = {
583
+ existing_column: {"dtype": _infer_dtype(existing_table[existing_column])}
584
+ for existing_column in existing_table.columns
585
+ if existing_column not in column_configs
586
+ }
587
+ column_configs = {**existing_column_configs, **column_configs}
588
+ table_config["columns"] = column_configs
589
+ return tables
590
+
591
+
556
592
  def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig) -> dict[str, int]:
557
593
  if isinstance(sample_size, int):
558
594
  return {table_name: sample_size for table_name in config.root}
@@ -756,8 +792,6 @@ def sample(
756
792
  "patients": {
757
793
  "prompt": "Patients of a hospital in Finland",
758
794
  "columns": {
759
- "age": {},
760
- "gender": {},
761
795
  "full_name": {"prompt": "first name and last name of the patient", "dtype": "string"},
762
796
  "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
763
797
  "place_of_birth": {"prompt": "place of birth", "dtype": "string"},
@@ -769,7 +803,7 @@ def sample(
769
803
  "gender": ["male", "male", "female", "female"],
770
804
  })
771
805
  enriched_df = mock.sample(
772
- tables=tables,
806
+ tables=tables,
773
807
  existing_data={"patients": existing_df},
774
808
  model="openai/gpt-4.1-nano"
775
809
  )
@@ -833,7 +867,9 @@ def sample(
833
867
  ```
834
868
  """
835
869
 
870
+ tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
836
871
  config = MockConfig(tables)
872
+
837
873
  llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
838
874
 
839
875
  sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
@@ -1,3 +1,17 @@
1
+ # Copyright 2025 MOSTLY AI
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
1
15
  import os
2
16
  import tempfile
3
17
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: LLM-generated Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -169,7 +169,7 @@ tables = {
169
169
  }
170
170
  data = mock.sample(
171
171
  tables=tables,
172
- sample_size=2,
172
+ sample_size=2,
173
173
  model="openai/gpt-4.1"
174
174
  )
175
175
  print(data["customers"])
@@ -250,9 +250,6 @@ tables = {
250
250
  "guests": {
251
251
  "prompt": "Guests of an Alpine ski hotel in Austria",
252
252
  "columns": {
253
- "guest_id": {"prompt": "the unique id of the guest", "dtype": "integer"},
254
- "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
255
- "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
256
253
  "gender": {"dtype": "category", "values": ["male", "female"]},
257
254
  "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
258
255
  "room_number": {"prompt": "room number", "dtype": "integer"},
@@ -0,0 +1,8 @@
1
+ mostlyai/mock/__init__.py,sha256=-bfsVZJQ0OkN5b3IRP3F9aUCiA8Eq1-RmAqBmTg0O0g,714
2
+ mostlyai/mock/core.py,sha256=V7KG7nOQPU95v6lRoSIfJuYivS0pNZ3rbiNC6SqDZSc,38075
3
+ mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
4
+ mostlyai_mock-0.1.5.dist-info/METADATA,sha256=LfugCsu7ANDZk2ozNFHDxgCqY42etJIdkXcfc-S-cUE,13887
5
+ mostlyai_mock-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ mostlyai_mock-0.1.5.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
7
+ mostlyai_mock-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
+ mostlyai_mock-0.1.5.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- mostlyai/mock/__init__.py,sha256=EvV_Tp6ExzQPq4apGq_8F25qw_paNTcQEC94nIVOEog,714
2
- mostlyai/mock/core.py,sha256=ubarMA3VUlXdjUsCXQK_mD_kWPkTMOYvLz9G4OughGk,36532
3
- mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
4
- mostlyai_mock-0.1.4.dist-info/METADATA,sha256=jibPe0pKcwqyPBoyc7H98LPd72vkGZBStdw_yMNVvJI,14161
5
- mostlyai_mock-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
- mostlyai_mock-0.1.4.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
7
- mostlyai_mock-0.1.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
- mostlyai_mock-0.1.4.dist-info/RECORD,,