mostlyai-mock 0.0.1__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mostlyai_mock-0.0.1 → mostlyai_mock-0.0.4}/PKG-INFO +3 -3
- {mostlyai_mock-0.0.1 → mostlyai_mock-0.0.4}/README.md +2 -1
- {mostlyai_mock-0.0.1 → mostlyai_mock-0.0.4}/mostlyai/mock/__init__.py +1 -1
- {mostlyai_mock-0.0.1 → mostlyai_mock-0.0.4}/mostlyai/mock/core.py +69 -18
- {mostlyai_mock-0.0.1 → mostlyai_mock-0.0.4}/pyproject.toml +1 -1
- mostlyai_mock-0.0.1/LICENSE_HEADER +0 -13
- {mostlyai_mock-0.0.1 → mostlyai_mock-0.0.4}/.gitignore +0 -0
- {mostlyai_mock-0.0.1 → mostlyai_mock-0.0.4}/LICENSE +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mostlyai-mock
|
3
|
-
Version: 0.0.1
|
3
|
+
Version: 0.0.4
|
4
4
|
Summary: Synthetic Mock Data
|
5
5
|
Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
|
6
6
|
Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
|
@@ -8,7 +8,6 @@ Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
|
|
8
8
|
Author-email: MOSTLY AI <dev@mostly.ai>
|
9
9
|
License-Expression: Apache-2.0
|
10
10
|
License-File: LICENSE
|
11
|
-
License-File: LICENSE_HEADER
|
12
11
|
Requires-Python: >=3.10
|
13
12
|
Requires-Dist: litellm>=1.67.0
|
14
13
|
Requires-Dist: numpy>=1.26.3
|
@@ -46,12 +45,13 @@ tables = {
|
|
46
45
|
"columns": {
|
47
46
|
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
48
47
|
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
49
|
-
"gender": {"
|
48
|
+
"gender": {"dtype": "category", "values": ["male", "female"]},
|
50
49
|
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
51
50
|
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
52
51
|
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
53
52
|
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
54
53
|
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
54
|
+
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
55
55
|
},
|
56
56
|
}
|
57
57
|
}
|
@@ -27,12 +27,13 @@ tables = {
|
|
27
27
|
"columns": {
|
28
28
|
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
29
29
|
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
30
|
-
"gender": {"
|
30
|
+
"gender": {"dtype": "category", "values": ["male", "female"]},
|
31
31
|
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
32
32
|
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
33
33
|
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
34
34
|
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
35
35
|
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
36
|
+
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
36
37
|
},
|
37
38
|
}
|
38
39
|
}
|
@@ -18,10 +18,11 @@ import json
|
|
18
18
|
from collections import deque
|
19
19
|
from collections.abc import Generator
|
20
20
|
from enum import Enum
|
21
|
+
from typing import Any, Literal, Type
|
21
22
|
|
22
23
|
import litellm
|
23
24
|
import pandas as pd
|
24
|
-
from pydantic import BaseModel, Field, RootModel, create_model, field_validator
|
25
|
+
from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
|
25
26
|
from tqdm import tqdm
|
26
27
|
|
27
28
|
SYSTEM_PROMPT = f"""
|
@@ -97,14 +98,59 @@ class TableConfig(BaseModel):
|
|
97
98
|
|
98
99
|
|
99
100
|
class ColumnConfig(BaseModel):
|
100
|
-
prompt: str
|
101
|
+
prompt: str = ""
|
101
102
|
dtype: DType
|
103
|
+
values: list[Any] = Field(default_factory=list)
|
104
|
+
|
105
|
+
@model_validator(mode="before")
|
106
|
+
def set_default_dtype(cls, data):
|
107
|
+
if isinstance(data, dict):
|
108
|
+
if "dtype" not in data:
|
109
|
+
if data.get("values"):
|
110
|
+
data["dtype"] = DType.CATEGORY
|
111
|
+
else:
|
112
|
+
data["dtype"] = DType.STRING
|
113
|
+
return data
|
114
|
+
|
115
|
+
@model_validator(mode="after")
|
116
|
+
def ensure_values_are_unique(self) -> ColumnConfig:
|
117
|
+
if self.values:
|
118
|
+
if len(self.values) != len(set(self.values)):
|
119
|
+
raise ValueError("Values must be unique")
|
120
|
+
return self
|
121
|
+
|
122
|
+
@model_validator(mode="after")
|
123
|
+
def ensure_values_are_provided_for_category_dtype(self) -> ColumnConfig:
|
124
|
+
if self.dtype == DType.CATEGORY and not self.values:
|
125
|
+
raise ValueError("At least one value must be provided when dtype is 'category'")
|
126
|
+
return self
|
127
|
+
|
128
|
+
@model_validator(mode="after")
|
129
|
+
def harmonize_values_with_dtypes(self) -> ColumnConfig:
|
130
|
+
if self.values:
|
131
|
+
cast_fn, convertible_to = {
|
132
|
+
DType.INTEGER: (int, "integers"),
|
133
|
+
DType.FLOAT: (float, "floats"),
|
134
|
+
DType.STRING: (str, "strings"),
|
135
|
+
DType.CATEGORY: (lambda c: c, "categories"),
|
136
|
+
DType.BOOLEAN: (bool, "booleans"),
|
137
|
+
DType.DATE: (str, "strings"),
|
138
|
+
DType.DATETIME: (str, "strings"),
|
139
|
+
}[self.dtype]
|
140
|
+
try:
|
141
|
+
self.values = [cast_fn(c) for c in self.values]
|
142
|
+
except ValueError:
|
143
|
+
raise ValueError(
|
144
|
+
f"All values must be convertible to {convertible_to} when dtype is '{self.dtype.value}'"
|
145
|
+
)
|
146
|
+
return self
|
102
147
|
|
103
148
|
|
104
149
|
class DType(str, Enum):
|
105
150
|
INTEGER = "integer"
|
106
151
|
FLOAT = "float"
|
107
152
|
STRING = "string"
|
153
|
+
CATEGORY = "category"
|
108
154
|
BOOLEAN = "boolean"
|
109
155
|
DATE = "date"
|
110
156
|
DATETIME = "datetime"
|
@@ -234,19 +280,23 @@ def _create_table_rows_generator(
|
|
234
280
|
llm_config: LLMConfig,
|
235
281
|
) -> Generator[dict]:
|
236
282
|
def create_table_response_format(columns: dict[str, ColumnConfig]) -> BaseModel:
|
237
|
-
|
238
|
-
DType.
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
283
|
+
def create_annotation(column_config: ColumnConfig) -> Type:
|
284
|
+
if column_config.values or column_config.dtype is DType.CATEGORY:
|
285
|
+
return Literal[tuple(column_config.values)]
|
286
|
+
return {
|
287
|
+
DType.INTEGER: int,
|
288
|
+
DType.FLOAT: float,
|
289
|
+
DType.STRING: str,
|
290
|
+
DType.BOOLEAN: bool,
|
291
|
+
# response_format has limited support for JSON Schema features
|
292
|
+
# thus we represent dates and datetimes as strings
|
293
|
+
DType.DATE: str,
|
294
|
+
DType.DATETIME: str,
|
295
|
+
}[column_config.dtype]
|
296
|
+
|
247
297
|
fields = {}
|
248
298
|
for column_name, column_config in columns.items():
|
249
|
-
annotation =
|
299
|
+
annotation = create_annotation(column_config)
|
250
300
|
fields[column_name] = (annotation, Field(...))
|
251
301
|
TableRow = create_model("TableRow", **fields)
|
252
302
|
TableRows = create_model("TableRows", rows=(list[TableRow], ...))
|
@@ -351,16 +401,14 @@ def _convert_table_rows_generator_to_df(
|
|
351
401
|
def align_df_dtypes_with_mock_dtypes(df: pd.DataFrame, columns: dict[str, ColumnConfig]) -> pd.DataFrame:
|
352
402
|
for column_name, column_config in columns.items():
|
353
403
|
if column_config.dtype in [DType.DATE, DType.DATETIME]:
|
354
|
-
# datetime.date, datetime.datetime -> datetime64[ns] / datetime64[ns, tz]
|
355
404
|
df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
|
356
405
|
elif column_config.dtype in [DType.INTEGER, DType.FLOAT]:
|
357
|
-
# int -> int64[pyarrow], float -> double[pyarrow]
|
358
406
|
df[column_name] = pd.to_numeric(df[column_name], errors="coerce", dtype_backend="pyarrow")
|
359
407
|
elif column_config.dtype is DType.BOOLEAN:
|
360
|
-
# bool -> bool
|
361
408
|
df[column_name] = df[column_name].astype(bool)
|
409
|
+
elif column_config.dtype is DType.CATEGORY:
|
410
|
+
df[column_name] = pd.Categorical(df[column_name], categories=column_config.values)
|
362
411
|
else:
|
363
|
-
# other -> string[pyarrow]
|
364
412
|
df[column_name] = df[column_name].astype("string[pyarrow]")
|
365
413
|
return df
|
366
414
|
|
@@ -404,6 +452,8 @@ def sample(
|
|
404
452
|
- `openai/gpt-4.1`
|
405
453
|
- `gemini/gemini-2.0-flash`
|
406
454
|
- `gemini/gemini-2.5-flash-preview-04-17`
|
455
|
+
- `groq/llama-3.3-70b-versatile`
|
456
|
+
- `anthropic/claude-3-7-sonnet-latest`
|
407
457
|
See https://docs.litellm.ai/docs/providers/ for more options.
|
408
458
|
api_key (str | None): The API key to use for the LLM. If not provided, LiteLLM will take it from the environment variables.
|
409
459
|
temperature (float): The temperature to use for the LLM. Default is 1.0.
|
@@ -423,12 +473,13 @@ def sample(
|
|
423
473
|
"columns": {
|
424
474
|
"nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
|
425
475
|
"name": {"prompt": "first name and last name of the guest", "dtype": "string"},
|
426
|
-
"gender": {"
|
476
|
+
"gender": {"dtype": "category", "values": ["male", "female"]},
|
427
477
|
"age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
|
428
478
|
"date_of_birth": {"prompt": "date of birth", "dtype": "date"},
|
429
479
|
"checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
|
430
480
|
"is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
|
431
481
|
"price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
|
482
|
+
"room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
|
432
483
|
},
|
433
484
|
}
|
434
485
|
}
|
@@ -1,13 +0,0 @@
|
|
1
|
-
Copyright 2025 MOSTLY AI
|
2
|
-
|
3
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
you may not use this file except in compliance with the License.
|
5
|
-
You may obtain a copy of the License at
|
6
|
-
|
7
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
|
9
|
-
Unless required by applicable law or agreed to in writing, software
|
10
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
See the License for the specific language governing permissions and
|
13
|
-
limitations under the License.
|
File without changes
|
File without changes
|