mostlyai-mock 0.1.18__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mostlyai/mock/__init__.py CHANGED
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.1.18" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.2.1" # Do not set this manually. Use poetry version [params].
mostlyai/mock/core.py CHANGED
@@ -18,8 +18,9 @@ import asyncio
18
18
  import concurrent.futures
19
19
  import json
20
20
  import math
21
+ import time
21
22
  from collections import deque
22
- from collections.abc import AsyncGenerator, Callable
23
+ from collections.abc import AsyncGenerator, Awaitable, Callable
23
24
  from enum import Enum
24
25
  from io import StringIO
25
26
  from typing import Any, Literal
@@ -29,7 +30,6 @@ import litellm
29
30
  import pandas as pd
30
31
  import tenacity
31
32
  from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
32
- from tqdm.asyncio import tqdm
33
33
 
34
34
  litellm.suppress_debug_info = True
35
35
 
@@ -249,8 +249,26 @@ async def _sample_table(
249
249
  n_workers: int,
250
250
  llm_config: LLMConfig,
251
251
  config: MockConfig,
252
- progress_callback: Callable | None = None,
252
+ progress_callback: Callable[..., Awaitable[None]] | None = None,
253
253
  ) -> pd.DataFrame:
254
+ # provide a default progress callback if none is provided
255
+ if progress_callback is None:
256
+
257
+ async def default_progress_callback(**kwargs):
258
+ percentage = (kwargs["progress"] / kwargs["total"]) * 100 if kwargs["total"] > 0 else 0
259
+ rows_per_second = kwargs["rows"] / kwargs["elapsed_time"] if kwargs["elapsed_time"] > 0 else 0
260
+ message = (
261
+ f"Generating table `{kwargs['table']}`".ljust(40)
262
+ + f": {percentage:3.0f}%, {kwargs['rows']} rows, {kwargs['elapsed_time']:.0f}s, {rows_per_second:.1f} rows/s"
263
+ )
264
+ is_final = kwargs["progress"] >= kwargs["total"]
265
+ if is_final:
266
+ print(f"\r{message}") # final update with newline
267
+ else:
268
+ print(f"\r{message}", end="", flush=True) # in-progress update
269
+
270
+ progress_callback = default_progress_callback
271
+
254
272
  table_rows_generator = _create_table_rows_generator(
255
273
  name=name,
256
274
  prompt=prompt,
@@ -265,7 +283,6 @@ async def _sample_table(
265
283
  llm_config=llm_config,
266
284
  progress_callback=progress_callback,
267
285
  )
268
- table_rows_generator = tqdm(table_rows_generator, desc=f"Generating rows for table `{name}`".ljust(45))
269
286
  table_df = await _convert_table_rows_generator_to_df(
270
287
  table_rows_generator=table_rows_generator,
271
288
  columns=columns,
@@ -805,7 +822,7 @@ async def _create_table_rows_generator(
805
822
  non_context_size: int | None,
806
823
  n_workers: int,
807
824
  llm_config: LLMConfig,
808
- progress_callback: Callable | None = None,
825
+ progress_callback: Callable[..., Awaitable[None]] | None = None,
809
826
  ) -> AsyncGenerator[dict]:
810
827
  batch_size = 20 # generate 20 root table rows at a time
811
828
 
@@ -863,6 +880,10 @@ async def _create_table_rows_generator(
863
880
  # +2 because LLM may not always count the rows correctly
864
881
  batch_sizes[-1] = sample_size - sum(batch_sizes[:-1]) + 2
865
882
 
883
+ # emit initial progress message right away
884
+ if progress_callback:
885
+ await progress_callback(table=name, progress=0, total=n_total_batches, rows=0, elapsed_time=0)
886
+
866
887
  # initialize queues for async communication
867
888
  batch_queue = asyncio.PriorityQueue()
868
889
  result_queue = asyncio.Queue()
@@ -927,6 +948,8 @@ async def _create_table_rows_generator(
927
948
 
928
949
  n_completed_batches = 0
929
950
  n_yielded_sequences = 0
951
+ n_generated_rows = 0
952
+ table_start_time = time.time()
930
953
  while n_yielded_sequences < sample_size:
931
954
  if n_completed_batches >= n_total_batches:
932
955
  assert context_data is None, "n_total_batches is fixed for linked tables"
@@ -950,6 +973,7 @@ async def _create_table_rows_generator(
950
973
  rows = result
951
974
  for row_idx, row in enumerate(rows):
952
975
  yield (batch_idx, row)
976
+ n_generated_rows += 1
953
977
  if context_batches is None or row_idx == len(rows) - 1:
954
978
  # in case of flat table, each row is considered a single sequence
955
979
  # in case of linked table, all rows are considered a single sequence
@@ -959,10 +983,13 @@ async def _create_table_rows_generator(
959
983
  break
960
984
  n_completed_batches += 1
961
985
  if progress_callback:
986
+ elapsed_time = time.time() - table_start_time
962
987
  await progress_callback(
988
+ table=name,
963
989
  progress=n_completed_batches,
964
990
  total=n_total_batches,
965
- message=f"Generating rows for table `{name}`: {n_completed_batches}/{n_total_batches}",
991
+ rows=n_generated_rows,
992
+ elapsed_time=round(elapsed_time, 2),
966
993
  )
967
994
  result_queue.task_done()
968
995
 
@@ -1232,7 +1259,7 @@ async def _sample_common(
1232
1259
  top_p: float = 0.95,
1233
1260
  n_workers: int = 10,
1234
1261
  return_type: Literal["auto", "dict"] = "auto",
1235
- progress_callback: Callable | None = None,
1262
+ progress_callback: Callable[..., Awaitable[None]] | None = None,
1236
1263
  ):
1237
1264
  tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
1238
1265
  config = MockConfig(tables)
@@ -1290,6 +1317,7 @@ def sample(
1290
1317
  top_p: float = 0.95,
1291
1318
  n_workers: int = 10,
1292
1319
  return_type: Literal["auto", "dict"] = "auto",
1320
+ progress_callback: Callable[..., Awaitable[None]] | None = None,
1293
1321
  ) -> pd.DataFrame | dict[str, pd.DataFrame]:
1294
1322
  """
1295
1323
  Generate synthetic data from scratch or enrich existing data with new columns.
@@ -1305,6 +1333,7 @@ def sample(
1305
1333
  Args:
1306
1334
  tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
1307
1335
  Note: Avoid using double quotes (`"`) and other special characters in column names.
1336
+ Available dtypes: `string`, `integer`, `float`, `category`, `boolean`, `date`, `datetime`.
1308
1337
  sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
1309
1338
  If a single integer is provided, the same number of rows will be generated for each subject table.
1310
1339
  If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
@@ -1329,6 +1358,11 @@ def sample(
1329
1358
  n_workers (int): The number of concurrent workers making the LLM calls. Default is 10. The value is clamped to the range [1, 10].
1330
1359
  If n_workers is 1, the generation of batches becomes sequential and certain features for better data consistency are enabled.
1331
1360
  return_type (Literal["auto", "dict"]): The format of the returned data. Default is "auto".
1361
+ progress_callback (Callable[..., Awaitable[None]] | None): Optional async callback function (it is awaited) to track progress during data generation.
1362
+ If not provided, a default progress callback will display progress messages in the format:
1363
+ "Generating table `table_name`: X%, Y rows, Zs, W.X rows/s"
1364
+ The callback receives keyword arguments including: table (table name), progress (completed batches),
1365
+ total (total batches), rows (rows generated so far), and elapsed_time (seconds). Default is None.
1332
1366
 
1333
1367
  Returns:
1334
1368
  - pd.DataFrame: A single DataFrame containing the generated mock data, if only one table is provided.
@@ -1506,6 +1540,26 @@ def sample(
1506
1540
  df_customers = data["customers"]
1507
1541
  df_orders = data["orders"]
1508
1542
  ```
1543
+
1544
+ Example of using a custom progress callback to provide progress in JSON format:
1545
+ ```python
1546
+ from mostlyai import mock
1547
+ import asyncio
1548
+ import json
1549
+
1550
+ async def custom_progress_callback(**kwargs):
1551
+ msg = f"\r{json.dumps(kwargs)}"
1552
+ if kwargs["progress"] < kwargs["total"]:
1553
+ print(msg, end="", flush=True)
1554
+ else:
1555
+ print(msg)
1556
+
1557
+ df = mock.sample(
1558
+ tables=tables,
1559
+ sample_size=10,
1560
+ progress_callback=custom_progress_callback
1561
+ )
1562
+ ```
1509
1563
  """
1510
1564
 
1511
1565
  def sample_common_sync(*args, **kwargs) -> pd.DataFrame | dict[str, pd.DataFrame]:
@@ -1523,7 +1577,7 @@ def sample(
1523
1577
  top_p=top_p,
1524
1578
  n_workers=n_workers,
1525
1579
  return_type=return_type,
1526
- progress_callback=None,
1580
+ progress_callback=progress_callback,
1527
1581
  )
1528
1582
  return future.result()
1529
1583
 
@@ -1539,7 +1593,7 @@ async def _asample(
1539
1593
  top_p: float = 0.95,
1540
1594
  n_workers: int = 10,
1541
1595
  return_type: Literal["auto", "dict"] = "auto",
1542
- progress_callback: Callable | None = None,
1596
+ progress_callback: Callable[..., Awaitable[None]] | None = None,
1543
1597
  ) -> pd.DataFrame | dict[str, pd.DataFrame]:
1544
1598
  return await _sample_common(
1545
1599
  tables=tables,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.1.18
3
+ Version: 0.2.1
4
4
  Summary: Synthetic Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -47,7 +47,7 @@ Use LLMs to generate any Tabular Data towards your needs. Create from scratch, e
47
47
  * A light-weight python client for prompting LLMs for mixed-type tabular data.
48
48
  * Select from a wide range of LLM endpoints and LLM models.
49
49
  * Supports single-table as well as multi-table scenarios.
50
- * Supports variety of data types: `string`, `categorical`, `integer`, `float`, `boolean`, `date`, and `datetime`.
50
+ * Supports a variety of data types: `string`, `integer`, `float`, `category`, `boolean`, `date`, and `datetime`.
51
51
  * Specify context, distributions and rules via dataset-, table- or column-level prompts.
52
52
  * Create from scratch or enrich existing datasets with new columns and/or rows.
53
53
  * Tailor the diversity and realism of your generated data via temperature and top_p.
@@ -0,0 +1,8 @@
1
+ mostlyai/mock/__init__.py,sha256=JUs456bSY9qLCZk_m-QjtcFDf4e3c9DkzdQpBafJZKA,714
2
+ mostlyai/mock/core.py,sha256=TsBPnV068Dmgy52sWKDKe055_LWQtWy9YO4Jx5bRVLk,69227
3
+ mostlyai/mock/mcp_server.py,sha256=uDLg0SeMPV2VZhXviM-F769W0xlmhGwlmQiQhY0Q-Ik,2365
4
+ mostlyai_mock-0.2.1.dist-info/METADATA,sha256=eSwEcdV3RwSUw6ciEEjeBAzV99E_mf1wj9XGXjapTak,14253
5
+ mostlyai_mock-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ mostlyai_mock-0.2.1.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
7
+ mostlyai_mock-0.2.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
+ mostlyai_mock-0.2.1.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- mostlyai/mock/__init__.py,sha256=UKmnKlQ7fZVvB0ckh9_nXjojAE0JGa2Kd2mT0Ci8cDU,715
2
- mostlyai/mock/core.py,sha256=oGSpIXINL7R1X7ZN5dtdwItaPXDD0mGvkakA0CEzmwI,66880
3
- mostlyai/mock/mcp_server.py,sha256=uDLg0SeMPV2VZhXviM-F769W0xlmhGwlmQiQhY0Q-Ik,2365
4
- mostlyai_mock-0.1.18.dist-info/METADATA,sha256=EmLjpo-D-wJefswHIMk3TCK9TvzLML-3Sjo0OEi9qAI,14257
5
- mostlyai_mock-0.1.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
- mostlyai_mock-0.1.18.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
7
- mostlyai_mock-0.1.18.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
8
- mostlyai_mock-0.1.18.dist-info/RECORD,,