mostlyai-mock 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.0.5" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.0.6" # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -577,7 +577,7 @@ def sample(
577
577
  {
578
578
  "column": "customer_id",
579
579
  "referenced_table": "customers",
580
- "description": "each customer has anywhere between 1 and 3 orders",
580
+ "description": "each customer has anywhere between 2 and 3 orders",
581
581
  }
582
582
  ],
583
583
  },
@@ -593,7 +593,7 @@ def sample(
593
593
  {
594
594
  "column": "order_id",
595
595
  "referenced_table": "orders",
596
- "description": "each order has between 2 and 5 items",
596
+ "description": "each order has between 1 and 2 items",
597
597
  }
598
598
  ],
599
599
  },
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.4
2
+ Name: mostlyai-mock
3
+ Version: 0.0.6
4
+ Summary: Synthetic Mock Data
5
+ Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
+ Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
7
+ Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
8
+ Author-email: MOSTLY AI <dev@mostly.ai>
9
+ License-Expression: Apache-2.0
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Financial and Insurance Industry
14
+ Classifier: Intended Audience :: Healthcare Industry
15
+ Classifier: Intended Audience :: Information Technology
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: Intended Audience :: Telecommunications Industry
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Topic :: Software Development :: Libraries
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.10
27
+ Requires-Dist: litellm>=1.67.0
28
+ Requires-Dist: numpy>=1.26.3
29
+ Requires-Dist: pandas>=2.0.0
30
+ Requires-Dist: pyarrow>=14.0.0
31
+ Requires-Dist: pydantic<3.0.0,>=2.0.0
32
+ Description-Content-Type: text/markdown
33
+
34
+ # Synthetic Mock Data 🔮
35
+
36
+ [![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mostlyai-mock)
37
+
38
+ Create data out of nothing. Prompt LLMs for Tabular Data.
39
+
40
+ ## Key Features
41
+
42
+ * A lightweight Python client for prompting LLMs for mixed-type tabular data
43
+ * Select from a range of LLM endpoints that provide structured output
44
+ * Supports single-table as well as multi-table scenarios.
45
+ * Supports a variety of data types: `string`, `category`, `integer`, `float`, `boolean`, `date`, and `datetime`.
46
+ * Specify context, distributions and rules via dataset-, table- or column-level prompts.
47
+ * Tailor the diversity and realism of your generated data via temperature and top_p.
48
+
49
+ ## Getting Started
50
+
51
+ 1. Install the latest version of the `mostlyai-mock` python package.
52
+
53
+ ```bash
54
+ pip install -U mostlyai-mock
55
+ ```
56
+
57
+ 2. Set the API key of your LLM endpoint (if not done yet)
58
+
59
+ ```python
60
+ import os
61
+ os.environ["OPENAI_API_KEY"] = "your-api-key"
62
+ # os.environ["GEMINI_API_KEY"] = "your-api-key"
63
+ # os.environ["GROQ_API_KEY"] = "your-api-key"
64
+ ```
65
+
66
+ Note: You will need to obtain your API key directly from the LLM service provider (e.g. for OpenAI from [here](https://platform.openai.com/api-keys)). The LLM endpoint will be determined by the chosen `model` when making calls to `mock.sample`.
67
+
68
+ 3. Create your first basic synthetic table from scratch
69
+
70
+ ```python
71
+ from mostlyai import mock
72
+
73
+ tables = {
74
+ "guests": {
75
+ "description": "Guests of an Alpine ski hotel in Austria",
76
+ "columns": {
77
+ "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
78
+ "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
79
+ "gender": {"dtype": "category", "values": ["male", "female"]},
80
+ "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
81
+ "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
82
+ "checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
83
+ "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
84
+ "price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
85
+ "room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
86
+ },
87
+ }
88
+ }
89
+ df = mock.sample(
90
+ tables=tables, # provide table and column definitions
91
+ sample_size=10, # generate 10 records
92
+ model="openai/gpt-4.1-nano", # select the LLM model (optional)
93
+ )
94
+ print(df)
95
+ # nationality name gender age date_of_birth checkin_time is_vip price_per_night room_number
96
+ # 0 AT Anna Müller female 29 1994-09-15 2025-01-05 14:30:00 True 350.0 101
97
+ # 1 DE Johann Schmidt male 45 1978-11-20 2025-01-06 16:45:00 False 250.0 102
98
+ # 2 CH Lara Meier female 32 1991-04-12 2025-01-05 12:00:00 True 400.0 103
99
+ # 3 IT Marco Rossi male 38 1985-02-25 2025-01-07 09:15:00 False 280.0 201
100
+ # 4 FR Claire Dupont female 24 2000-07-08 2025-01-07 11:20:00 False 220.0 202
101
+ # 5 AT Felix Gruber male 52 1972-01-10 2025-01-06 17:50:00 True 375.0 203
102
+ # 6 DE Sophie Becker female 27 1996-03-30 2025-01-08 08:30:00 False 230.0 204
103
+ # 7 CH Max Keller male 31 1992-05-16 2025-01-09 14:10:00 False 290.0 101
104
+ # 8 IT Giulia Bianchi female 36 1988-08-19 2025-01-05 15:55:00 True 410.0 102
105
+ # 9 FR Louis Martin male 44 1980-12-05 2025-01-07 10:40:00 False 270.0 103
106
+ ```
107
+
108
+ 4. Create your first multi-table synthetic dataset
109
+
110
+ ```python
111
+ from mostlyai import mock
112
+
113
+ tables = {
114
+ "customers": {
115
+ "description": "Customers of a hardware store",
116
+ "columns": {
117
+ "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
118
+ "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
119
+ },
120
+ "primary_key": "customer_id",
121
+ },
122
+ "orders": {
123
+ "description": "Orders of a Customer",
124
+ "columns": {
125
+ "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
126
+ "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
127
+ "text": {"prompt": "order text description", "dtype": "string"},
128
+ "amount": {"prompt": "order amount in USD", "dtype": "float"},
129
+ },
130
+ "primary_key": "order_id",
131
+ "foreign_keys": [
132
+ {
133
+ "column": "customer_id",
134
+ "referenced_table": "customers",
135
+ "description": "each customer has anywhere between 2 and 3 orders",
136
+ }
137
+ ],
138
+ },
139
+ "items": {
140
+ "description": "Items in an Order",
141
+ "columns": {
142
+ "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
143
+ "order_id": {"prompt": "the order id for that item", "dtype": "string"},
144
+ "name": {"prompt": "the name of the item", "dtype": "string"},
145
+ "price": {"prompt": "the price of the item in USD", "dtype": "float"},
146
+ },
147
+ "foreign_keys": [
148
+ {
149
+ "column": "order_id",
150
+ "referenced_table": "orders",
151
+ "description": "each order has between 1 and 2 items",
152
+ }
153
+ ],
154
+ },
155
+ }
156
+ data = mock.sample(
157
+ tables=tables,
158
+ sample_size=2,
159
+ model="openai/gpt-4.1"
160
+ )
161
+ print(data["customers"])
162
+ # customer_id name
163
+ # 0 1 Michael Torres
164
+ # 1 2 Elaine Kim
165
+ print(data["orders"])
166
+ # customer_id order_id text amount
167
+ # 0 1 ORD20240612001 Home office desk and ergonomic chair bundle 412.95
168
+ # 1 1 ORD20240517322 Wireless noise-cancelling headphones 226.49
169
+ # 2 1 ORD20240430307 Smart LED desk lamp with USB charging port 69.99
170
+ # 3 2 ORD20240614015 Eco-friendly bamboo kitchen utensil set 39.95
171
+ # 4 2 ORD20240528356 Air fryer with digital touch screen, 5-quart c... 129.99
172
+ # 5 2 ORD20240510078 Double-walled glass coffee mugs, set of 4 48.5
173
+ print(data["items"])
174
+ # item_id order_id name price
175
+ # 0 ITEM100001A ORD20240612001 Ergonomic Mesh Office Chair 179.99
176
+ # 1 ITEM100001B ORD20240612001 Adjustable Home Office Desk 232.96
177
+ # 2 ITEM100002A ORD20240517322 Wireless Noise-Cancelling Headphones 226.49
178
+ # 3 ITEM100003A ORD20240430307 Smart LED Desk Lamp 59.99
179
+ # 4 ITEM100003B ORD20240430307 USB Charging Cable (Desk Lamp Compatible) 10.0
180
+ # 5 ITEM100004A ORD20240614015 Bamboo Cooking Spoon 13.49
181
+ # 6 ITEM100004B ORD20240614015 Bamboo Slotted Turner 12.99
182
+ # 7 ITEM100005A ORD20240528356 Digital Air Fryer (5-Quart, Black) 115.99
183
+ # 8 ITEM100005B ORD20240528356 Silicone Liner for Air Fryer (5-Quart) 13.99
184
+ # 9 ITEM100006A ORD20240510078 Double-Walled Glass Coffee Mug (12oz) 13.75
185
+ # 10 ITEM100006B ORD20240510078 Double-Walled Glass Coffee Mug (8oz) 11.25
186
+ ```
@@ -0,0 +1,6 @@
1
+ mostlyai/mock/__init__.py,sha256=wTqasBznmMbaFvtG6KDOFT7luVFNv7SXx7c3CoRQ8fQ,714
2
+ mostlyai/mock/core.py,sha256=yQfRe56eKeMv-XxHIXEZv4VF4NmZs0WRG-mFN7s7tuU,26351
3
+ mostlyai_mock-0.0.6.dist-info/METADATA,sha256=DE6sI855G4jQ0yHROWw3TYfzfA-nYOIA0ZJ7tpnzwZo,9285
4
+ mostlyai_mock-0.0.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
5
+ mostlyai_mock-0.0.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
6
+ mostlyai_mock-0.0.6.dist-info/RECORD,,
@@ -1,117 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: mostlyai-mock
3
- Version: 0.0.5
4
- Summary: Synthetic Mock Data
5
- Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
- Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
7
- Project-URL: documentation, https://mostly-ai.github.io/mostlyai-mock/
8
- Author-email: MOSTLY AI <dev@mostly.ai>
9
- License-Expression: Apache-2.0
10
- License-File: LICENSE
11
- Requires-Python: >=3.10
12
- Requires-Dist: litellm>=1.67.0
13
- Requires-Dist: numpy>=1.26.3
14
- Requires-Dist: pandas>=2.0.0
15
- Requires-Dist: pyarrow>=14.0.0
16
- Requires-Dist: pydantic<3.0.0,>=2.0.0
17
- Description-Content-Type: text/markdown
18
-
19
- # Synthetic Mock Data 🔮
20
-
21
- [![Documentation](https://img.shields.io/badge/docs-latest-green)](https://mostly-ai.github.io/mostlyai-mock/) [![stats](https://pepy.tech/badge/mostlyai-mock)](https://pypi.org/project/mostlyai-mock/) ![license](https://img.shields.io/github/license/mostly-ai/mostlyai-mock) ![GitHub Release](https://img.shields.io/github/v/release/mostly-ai/mostlyai-mock) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mostlyai-mock)
22
-
23
- Create data out of nothing. Prompt LLMs for Tabular Data.
24
-
25
- ## Installation
26
-
27
- The latest release of `mostlyai-mock` can be installed via pip:
28
-
29
- ```bash
30
- pip install -U mostlyai-mock
31
- ```
32
-
33
- Note: An API key to an LLM endpoint, with structured response, is required. It is recommended to set such a key as an environment variable (e.g. `OPENAI_API_KEY`, `GEMINI_API_KEY`, etc.). Alternatively, the key needs to be passed to every call to the library itself via the parameter `api_key`.
34
-
35
- ## Quick Start
36
-
37
- ### Single Table
38
-
39
- ```python
40
- from mostlyai import mock
41
-
42
- tables = {
43
- "guests": {
44
- "description": "Guests of an Alpine ski hotel in Austria",
45
- "columns": {
46
- "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
47
- "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
48
- "gender": {"dtype": "category", "values": ["male", "female"]},
49
- "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
50
- "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
51
- "checkin_time": {"prompt": "the check in timestamp of the guest; may 2025", "dtype": "datetime"},
52
- "is_vip": {"prompt": "is the guest a VIP", "dtype": "boolean"},
53
- "price_per_night": {"prompt": "price paid per night, in EUR", "dtype": "float"},
54
- "room_number": {"prompt": "room number", "dtype": "integer", "values": [101, 102, 103, 201, 202, 203, 204]}
55
- },
56
- }
57
- }
58
- df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-4.1-nano")
59
- print(df)
60
- ```
61
-
62
- ### Multiple Tables
63
-
64
- ```python
65
- from mostlyai import mock
66
-
67
- tables = {
68
- "customers": {
69
- "description": "Customers of a hardware store",
70
- "columns": {
71
- "customer_id": {"prompt": "the unique id of the customer", "dtype": "integer"},
72
- "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
73
- },
74
- "primary_key": "customer_id",
75
- },
76
- "orders": {
77
- "description": "Orders of a Customer",
78
- "columns": {
79
- "customer_id": {"prompt": "the customer id for that order", "dtype": "integer"},
80
- "order_id": {"prompt": "the unique id of the order", "dtype": "string"},
81
- "text": {"prompt": "order text description", "dtype": "string"},
82
- "amount": {"prompt": "order amount in USD", "dtype": "float"},
83
- },
84
- "primary_key": "order_id",
85
- "foreign_keys": [
86
- {
87
- "column": "customer_id",
88
- "referenced_table": "customers",
89
- "description": "each customer has anywhere between 1 and 3 orders",
90
- }
91
- ],
92
- },
93
- "items": {
94
- "description": "Items in an Order",
95
- "columns": {
96
- "item_id": {"prompt": "the unique id of the item", "dtype": "string"},
97
- "order_id": {"prompt": "the order id for that item", "dtype": "string"},
98
- "name": {"prompt": "the name of the item", "dtype": "string"},
99
- "price": {"prompt": "the price of the item in USD", "dtype": "float"},
100
- },
101
- "foreign_keys": [
102
- {
103
- "column": "order_id",
104
- "referenced_table": "orders",
105
- "description": "each order has between 2 and 5 items",
106
- }
107
- ],
108
- },
109
- }
110
- data = mock.sample(tables=tables, sample_size=2, model="openai/gpt-4.1")
111
- df_customers = data["customers"]
112
- df_orders = data["orders"]
113
- df_items = data["items"]
114
- print(df_customers)
115
- print(df_orders)
116
- print(df_items)
117
- ```
@@ -1,6 +0,0 @@
1
- mostlyai/mock/__init__.py,sha256=u9aoJD9XCxM0h1mIWBwkm_O_7VUf9d6-y8YiC4diUAM,714
2
- mostlyai/mock/core.py,sha256=zgUmv6yclP51NJFE5W0CtE4raXZUnKF6Fa8r8_idbpI,26351
3
- mostlyai_mock-0.0.5.dist-info/METADATA,sha256=Ek_7faZR-YNx7ntoiqqBZxahcwC5f2oiVVM1FseXMf8,4655
4
- mostlyai_mock-0.0.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
5
- mostlyai_mock-0.0.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
6
- mostlyai_mock-0.0.5.dist-info/RECORD,,