mostlyai-mock 0.0.1__tar.gz → 0.0.4__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.0.1
3
+ Version: 0.0.4
4
4
  Summary: Synthetic Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -8,7 +8,6 @@ Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
8
8
  Author-email: MOSTLY AI <dev@mostly.ai>
9
9
  License-Expression: Apache-2.0
10
10
  License-File: LICENSE
11
- License-File: LICENSE_HEADER
12
11
  Requires-Python: >=3.10
13
12
  Requires-Dist: litellm>=1.67.0
14
13
  Requires-Dist: numpy>=1.26.3
@@ -46,12 +45,13 @@ tables = {
46
45
  "columns": {
47
46
  "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
48
47
  "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
49
- "gender": {"prompt": "gender of the guest; male or female", "dtype": "string"},
48
+ "gender": {"dtype": "category", "values": ["male", "female"]},
50
49
  "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
51
50
  "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
52
51
  "checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
53
52
  "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
54
53
  "price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
54
+ "room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
55
55
  },
56
56
  }
57
57
  }
@@ -27,12 +27,13 @@ tables = {
27
27
  "columns": {
28
28
  "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
29
29
  "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
30
- "gender": {"prompt": "gender of the guest; male or female", "dtype": "string"},
30
+ "gender": {"dtype": "category", "values": ["male", "female"]},
31
31
  "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
32
32
  "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
33
33
  "checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
34
34
  "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
35
35
  "price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
36
+ "room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
36
37
  },
37
38
  }
38
39
  }
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.0.1" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.0.4" # Do not set this manually. Use poetry version [params].
@@ -18,10 +18,11 @@ import json
18
18
  from collections import deque
19
19
  from collections.abc import Generator
20
20
  from enum import Enum
21
+ from typing import Any, Literal, Type
21
22
 
22
23
  import litellm
23
24
  import pandas as pd
24
- from pydantic import BaseModel, Field, RootModel, create_model, field_validator
25
+ from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
25
26
  from tqdm import tqdm
26
27
 
27
28
  SYSTEM_PROMPT = f"""
@@ -97,14 +98,59 @@ class TableConfig(BaseModel):
97
98
 
98
99
 
99
100
  class ColumnConfig(BaseModel):
100
- prompt: str
101
+ prompt: str = ""
101
102
  dtype: DType
103
+ values: list[Any] = Field(default_factory=list)
104
+
105
+ @model_validator(mode="before")
106
+ def set_default_dtype(cls, data):
107
+ if isinstance(data, dict):
108
+ if "dtype" not in data:
109
+ if data.get("values"):
110
+ data["dtype"] = DType.CATEGORY
111
+ else:
112
+ data["dtype"] = DType.STRING
113
+ return data
114
+
115
+ @model_validator(mode="after")
116
+ def ensure_values_are_unique(self) -> ColumnConfig:
117
+ if self.values:
118
+ if len(self.values) != len(set(self.values)):
119
+ raise ValueError("Values must be unique")
120
+ return self
121
+
122
+ @model_validator(mode="after")
123
+ def ensure_values_are_provided_for_category_dtype(self) -> ColumnConfig:
124
+ if self.dtype == DType.CATEGORY and not self.values:
125
+ raise ValueError("At least one value must be provided when dtype is 'category'")
126
+ return self
127
+
128
+ @model_validator(mode="after")
129
+ def harmonize_values_with_dtypes(self) -> ColumnConfig:
130
+ if self.values:
131
+ cast_fn, convertible_to = {
132
+ DType.INTEGER: (int, "integers"),
133
+ DType.FLOAT: (float, "floats"),
134
+ DType.STRING: (str, "strings"),
135
+ DType.CATEGORY: (lambda c: c, "categories"),
136
+ DType.BOOLEAN: (bool, "booleans"),
137
+ DType.DATE: (str, "strings"),
138
+ DType.DATETIME: (str, "strings"),
139
+ }[self.dtype]
140
+ try:
141
+ self.values = [cast_fn(c) for c in self.values]
142
+ except ValueError:
143
+ raise ValueError(
144
+ f"All values must be convertible to {convertible_to} when dtype is '{self.dtype.value}'"
145
+ )
146
+ return self
102
147
 
103
148
 
104
149
  class DType(str, Enum):
105
150
  INTEGER = "integer"
106
151
  FLOAT = "float"
107
152
  STRING = "string"
153
+ CATEGORY = "category"
108
154
  BOOLEAN = "boolean"
109
155
  DATE = "date"
110
156
  DATETIME = "datetime"
@@ -234,19 +280,23 @@ def _create_table_rows_generator(
234
280
  llm_config: LLMConfig,
235
281
  ) -> Generator[dict]:
236
282
  def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
237
- dtype_to_pydantic_type = {
238
- DType.INTEGER: int,
239
- DType.FLOAT: float,
240
- DType.STRING: str,
241
- DType.BOOLEAN: bool,
242
- # response_format has limited support for JSON Schema features
243
- # thus we represent dates and datetimes as strings
244
- DType.DATE: str,
245
- DType.DATETIME: str,
246
- }
283
+ def create_annotation(column_config: ColumnConfig) -> Type:
284
+ if column_config.values or column_config.dtype is DType.CATEGORY:
285
+ return Literal[tuple(column_config.values)]
286
+ return {
287
+ DType.INTEGER: int,
288
+ DType.FLOAT: float,
289
+ DType.STRING: str,
290
+ DType.BOOLEAN: bool,
291
+ # response_format has limited support for JSON Schema features
292
+ # thus we represent dates and datetimes as strings
293
+ DType.DATE: str,
294
+ DType.DATETIME: str,
295
+ }[column_config.dtype]
296
+
247
297
  fields = {}
248
298
  for column_name, column_config in columns.items():
249
- annotation = dtype_to_pydantic_type[column_config.dtype]
299
+ annotation = create_annotation(column_config)
250
300
  fields[column_name] = (annotation, Field(...))
251
301
  TableRow = create_model("TableRow", **fields)
252
302
  TableRows = create_model("TableRows", rows=(list[TableRow], ...))
@@ -351,16 +401,14 @@ def _convert_table_rows_generator_to_df(
351
401
  def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
352
402
  for column_name, column_config in columns.items():
353
403
  if column_config.dtype in [DType.DATE, DType.DATETIME]:
354
- # datetime.date, datetime.datetime -> datetime64[ns] / datetime64[ns, tz]
355
404
  df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
356
405
  elif column_config.dtype in [DType.INTEGER, DType.FLOAT]:
357
- # int -> int64[pyarrow], float -> double[pyarrow]
358
406
  df[column_name] = pd.to_numeric(df[column_name], errors="coerce", dtype_backend="pyarrow")
359
407
  elif column_config.dtype is DType.BOOLEAN:
360
- # bool -> bool
361
408
  df[column_name] = df[column_name].astype(bool)
409
+ elif column_config.dtype is DType.CATEGORY:
410
+ df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
362
411
  else:
363
- # other -> string[pyarrow]
364
412
  df[column_name] = df[column_name].astype("string[pyarrow]")
365
413
  return df
366
414
 
@@ -404,6 +452,8 @@ def sample(
404
452
  - `openai/gpt-4.1`
405
453
  - `gemini/gemini-2.0-flash`
406
454
  - `gemini/gemini-2.5-flash-preview-04-17`
455
+ - `groq/llama-3.3-70b-versatile`
456
+ - `anthropic/claude-3-7-sonnet-latest`
407
457
  See https://docs.litellm.ai/docs/providers/ for more options.
408
458
  api_key (str | None): The API key to use for the LLM. If not provided, LiteLLM will take it from the environment variables.
409
459
  temperature (float): The temperature to use for the LLM. Default is 1.0.
@@ -423,12 +473,13 @@ def sample(
423
473
  "columns": {
424
474
  "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
425
475
  "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
426
- "gender": {"prompt": "gender of the guest; male or female", "dtype": "string"},
476
+ "gender": {"dtype": "category", "values": ["male", "female"]},
427
477
  "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
428
478
  "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
429
479
  "checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
430
480
  "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
431
481
  "price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
482
+ "room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
432
483
  },
433
484
  }
434
485
  }
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mostlyai-mock"
3
- version = "0.0.1"
3
+ version = "0.0.4"
4
4
  description = "Synthetic Mock Data"
5
5
  authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
6
6
  requires-python = ">=3.10"
@@ -1,13 +0,0 @@
1
- Copyright 2025 MOSTLY AI
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
File without changes
File without changes