mostlyai-mock 0.1.17__tar.gz → 0.2.0__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mostlyai-mock
- Version: 0.1.17
+ Version: 0.2.0
  Summary: Synthetic Mock Data
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -47,7 +47,7 @@ Use LLMs to generate any Tabular Data towards your needs. Create from scratch, e
  * A light-weight python client for prompting LLMs for mixed-type tabular data.
  * Select from a wide range of LLM endpoints and LLM models.
  * Supports single-table as well as multi-table scenarios.
- * Supports variety of data types: `string`, `categorical`, `integer`, `float`, `boolean`, `date`, and `datetime`.
+ * Supports variety of data types: `string`, `integer`, `float`, `category`, `boolean`, `date`, and `datetime`.
  * Specify context, distributions and rules via dataset-, table- or column-level prompts.
  * Create from scratch or enrich existing datasets with new columns and/or rows.
  * Tailor the diversity and realism of your generated data via temperature and top_p.
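This hunk (and the matching README hunk below) renames the `categorical` dtype to `category`. A minimal sketch of a table specification using the renamed dtype; the column names, the `values` key, and reliance on the default model/endpoint (API key from the environment) are assumptions for illustration, not part of this diff:

```python
from mostlyai import mock

# hypothetical table spec showing the renamed `category` dtype;
# the `values` key and column names are illustrative assumptions
tables = {
    "customers": {
        "prompt": "Customers of a hardware store",
        "columns": {
            "name": {"prompt": "first and last name", "dtype": "string"},
            "segment": {"prompt": "customer segment", "dtype": "category", "values": ["diy", "pro"]},
            "signup_date": {"prompt": "date of first purchase", "dtype": "date"},
        },
    }
}

# assumes the default LLM endpoint is usable (e.g. OPENAI_API_KEY set in the environment)
df = mock.sample(tables=tables, sample_size=10)
print(df)
```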
@@ -9,7 +9,7 @@ Use LLMs to generate any Tabular Data towards your needs. Create from scratch, e
  * A light-weight python client for prompting LLMs for mixed-type tabular data.
  * Select from a wide range of LLM endpoints and LLM models.
  * Supports single-table as well as multi-table scenarios.
- * Supports variety of data types: `string`, `categorical`, `integer`, `float`, `boolean`, `date`, and `datetime`.
+ * Supports variety of data types: `string`, `integer`, `float`, `category`, `boolean`, `date`, and `datetime`.
  * Specify context, distributions and rules via dataset-, table- or column-level prompts.
  * Create from scratch or enrich existing datasets with new columns and/or rows.
  * Tailor the diversity and realism of your generated data via temperature and top_p.
@@ -15,4 +15,4 @@
  from mostlyai.mock.core import sample

  __all__ = ["sample"]
- __version__ = "0.1.17"  # Do not set this manually. Use poetry version [params].
+ __version__ = "0.2.0"  # Do not set this manually. Use poetry version [params].
@@ -18,8 +18,9 @@ import asyncio
  import concurrent.futures
  import json
  import math
+ import time
  from collections import deque
- from collections.abc import AsyncGenerator, Callable
+ from collections.abc import AsyncGenerator, Awaitable, Callable
  from enum import Enum
  from io import StringIO
  from typing import Any, Literal
@@ -29,7 +30,6 @@ import litellm
  import pandas as pd
  import tenacity
  from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
- from tqdm.asyncio import tqdm

  litellm.suppress_debug_info = True

@@ -249,8 +249,26 @@ async def _sample_table(
      n_workers: int,
      llm_config: LLMConfig,
      config: MockConfig,
-     progress_callback: Callable | None = None,
+     progress_callback: Callable[..., Awaitable[None]] | None = None,
  ) -> pd.DataFrame:
+     # provide a default progress callback if none is provided
+     if progress_callback is None:
+
+         async def default_progress_callback(**kwargs):
+             percentage = (kwargs["progress"] / kwargs["total"]) * 100 if kwargs["total"] > 0 else 0
+             rows_per_second = kwargs["rows"] / kwargs["elapsed_time"] if kwargs["elapsed_time"] > 0 else 0
+             message = (
+                 f"Generating table `{kwargs['table']}`".ljust(40)
+                 + f": {percentage:3.0f}%, {kwargs['rows']} rows, {kwargs['elapsed_time']:.0f}s, {rows_per_second:.1f} rows/s"
+             )
+             is_final = kwargs["progress"] >= kwargs["total"]
+             if is_final:
+                 print(f"\r{message}")  # final update with newline
+             else:
+                 print(f"\r{message}", end="", flush=True)  # in-progress update
+
+         progress_callback = default_progress_callback
+
      table_rows_generator = _create_table_rows_generator(
          name=name,
          prompt=prompt,
@@ -265,7 +283,6 @@ async def _sample_table(
          llm_config=llm_config,
          progress_callback=progress_callback,
      )
-     table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
      table_df = await _convert_table_rows_generator_to_df(
          table_rows_generator=table_rows_generator,
          columns=columns,
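With the tqdm wrapper removed, progress reporting now flows entirely through the awaitable callback, and the default shown above prints a single updating status line. A small standalone sketch of what that format produces; it condenses the default callback from the diff rather than importing anything, and the invocation values are made up:

```python
import asyncio

# condensed re-implementation of the default callback above, for illustration only
async def default_progress_callback(**kwargs):
    percentage = (kwargs["progress"] / kwargs["total"]) * 100 if kwargs["total"] > 0 else 0
    rows_per_second = kwargs["rows"] / kwargs["elapsed_time"] if kwargs["elapsed_time"] > 0 else 0
    message = (
        f"Generating table `{kwargs['table']}`".ljust(40)
        + f": {percentage:3.0f}%, {kwargs['rows']} rows, {kwargs['elapsed_time']:.0f}s, {rows_per_second:.1f} rows/s"
    )
    end = "\n" if kwargs["progress"] >= kwargs["total"] else ""
    print(f"\r{message}", end=end, flush=True)

# made-up progress values, just to show the output format; prints something like:
# Generating table `customers`            : 100%, 100 rows, 12s, 8.3 rows/s
asyncio.run(default_progress_callback(table="customers", progress=5, total=5, rows=100, elapsed_time=12.0))
```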
@@ -805,7 +822,7 @@ async def _create_table_rows_generator(
      non_context_size: int | None,
      n_workers: int,
      llm_config: LLMConfig,
-     progress_callback: Callable | None = None,
+     progress_callback: Callable[..., Awaitable[None]] | None = None,
  ) -> AsyncGenerator[dict]:
      batch_size = 20  # generate 20 root table rows at a time

@@ -927,6 +944,7 @@ async def _create_table_rows_generator(

      n_completed_batches = 0
      n_yielded_sequences = 0
+     table_start_time = time.time()
      while n_yielded_sequences < sample_size:
          if n_completed_batches >= n_total_batches:
              assert context_data is None, "n_total_batches is fixed for linked tables"
@@ -959,10 +977,13 @@ async def _create_table_rows_generator(
                  break
          n_completed_batches += 1
          if progress_callback:
+             elapsed_time = time.time() - table_start_time
              await progress_callback(
+                 table=name,
                  progress=n_completed_batches,
                  total=n_total_batches,
-                 message=f"Generating rows for table `{name}`: {n_completed_batches}/{n_total_batches}",
+                 rows=n_yielded_sequences,
+                 elapsed_time=round(elapsed_time, 2),
              )
          result_queue.task_done()

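With the callback now receiving `table`, `progress`, `total`, `rows`, and `elapsed_time` as keyword arguments, a caller can rebuild a progress bar on their side. A hedged sketch that drives tqdm from those kwargs; tqdm is assumed to be installed by the caller, since this diff removes its internal use:

```python
from tqdm import tqdm

_bars: dict[str, tqdm] = {}

# hypothetical custom callback: one bar per table, driven by the kwargs shown above
async def tqdm_progress_callback(**kwargs):
    table = kwargs["table"]
    bar = _bars.setdefault(table, tqdm(total=kwargs["total"], desc=f"Generating `{table}`"))
    bar.n = kwargs["progress"]
    bar.set_postfix(rows=kwargs["rows"], elapsed=f"{kwargs['elapsed_time']:.0f}s")
    bar.refresh()
    if kwargs["progress"] >= kwargs["total"]:
        bar.close()

# usage, e.g.:
# mock.sample(tables=tables, sample_size=100, progress_callback=tqdm_progress_callback)
```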
@@ -1232,7 +1253,7 @@ async def _sample_common(
      top_p: float = 0.95,
      n_workers: int = 10,
      return_type: Literal["auto", "dict"] = "auto",
-     progress_callback: Callable | None = None,
+     progress_callback: Callable[..., Awaitable[None]] | None = None,
  ):
      tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
      config = MockConfig(tables)
@@ -1290,6 +1311,7 @@ def sample(
      top_p: float = 0.95,
      n_workers: int = 10,
      return_type: Literal["auto", "dict"] = "auto",
+     progress_callback: Callable[..., Awaitable[None]] | None = None,
  ) -> pd.DataFrame | dict[str, pd.DataFrame]:
      """
      Generate synthetic data from scratch or enrich existing data with new columns.
@@ -1305,6 +1327,7 @@ def sample(
      Args:
          tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
              Note: Avoid using double quotes (`"`) and other special characters in column names.
+             Available dtypes: `string`, `integer`, `float`, `category`, `boolean`, `date`, `datetime`.
          sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
              If a single integer is provided, the same number of rows will be generated for each subject table.
              If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
@@ -1329,6 +1352,11 @@ def sample(
          n_workers (int): The number of concurrent workers making the LLM calls. Default is 10. The value is clamped to the range [1, 10].
              If n_workers is 1, the generation of batches becomes sequential and certain features for better data consistency are enabled.
          return_type (Literal["auto", "dict"]): The format of the returned data. Default is "auto".
+         progress_callback (Callable | None): Optional callback function to track progress during data generation.
+             If not provided, a default progress callback will display progress messages in the format:
+             "Generating table `table_name`: X%, Y rows, Zs, W.X rows/s"
+             The callback receives keyword arguments including: table, progress, total,
+             rows, and elapsed_time. Default is None.

      Returns:
          - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
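For non-console consumers, the documented keyword arguments can just as well be forwarded elsewhere, for example to the standard logging module. A minimal sketch; the handler setup and logger name are arbitrary choices, not part of the package:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("mostlyai-mock-progress")  # logger name chosen arbitrarily

# forwards the documented kwargs to a log record instead of printing to the console
async def logging_progress_callback(**kwargs):
    logger.info(
        "table=%s batches=%d/%d rows=%d elapsed=%.1fs",
        kwargs["table"],
        kwargs["progress"],
        kwargs["total"],
        kwargs["rows"],
        kwargs["elapsed_time"],
    )
```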
@@ -1368,7 +1396,7 @@ def sample(
                  "customer_id": {"prompt": "the unique id of the customer", "dtype": "string"},
                  "name": {"prompt": "first name and last name of the customer", "dtype": "string"},
              },
-             "primary_key": "customer_id",  # single string; no composite keys allowed; primary keys must have string dtype
+             "primary_key": "customer_id",  # no composite keys allowed;
          },
          "warehouses": {
              "prompt": "Warehouses of a hardware store",
@@ -1506,6 +1534,26 @@ def sample(
      df_customers = data["customers"]
      df_orders = data["orders"]
      ```
+
+     Example of using a custom progress callback to provide progress in JSON format:
+     ```python
+     from mostlyai import mock
+     import asyncio
+     import json
+
+     async def custom_progress_callback(**kwargs):
+         msg = f"\r{json.dumps(kwargs)}"
+         if kwargs["progress"] < kwargs["total"]:
+             print(msg, end="", flush=True)
+         else:
+             print(msg)
+
+     df = mock.sample(
+         tables=tables,
+         sample_size=10,
+         progress_callback=custom_progress_callback
+     )
+     ```
      """

      def sample_common_sync(*args, **kwargs) -> pd.DataFrame | dict[str, pd.DataFrame]:
@@ -1523,7 +1571,7 @@ def sample(
              top_p=top_p,
              n_workers=n_workers,
              return_type=return_type,
-             progress_callback=None,
+             progress_callback=progress_callback,
          )
          return future.result()

@@ -1539,7 +1587,7 @@ async def _asample(
      top_p: float = 0.95,
      n_workers: int = 10,
      return_type: Literal["auto", "dict"] = "auto",
-     progress_callback: Callable | None = None,
+     progress_callback: Callable[..., Awaitable[None]] | None = None,
  ) -> pd.DataFrame | dict[str, pd.DataFrame]:
      return await _sample_common(
          tables=tables,
@@ -1,6 +1,6 @@
  [project]
  name = "mostlyai-mock"
- version = "0.1.17"
+ version = "0.2.0"
  description = "Synthetic Mock Data"
  authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
  requires-python = ">=3.10"