mostlyai-mock 0.2.0__tar.gz → 0.2.2__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mostlyai-mock
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Synthetic Mock Data
5
5
  Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
6
6
  Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -15,4 +15,4 @@
15
15
  from mostlyai.mock.core import sample
16
16
 
17
17
  __all__ = ["sample"]
18
- __version__ = "0.2.0" # Do not set this manually. Use poetry version [params].
18
+ __version__ = "0.2.2" # Do not set this manually. Use poetry version [params].
@@ -880,6 +880,10 @@ async def _create_table_rows_generator(
880
880
  # +2 because LLM may not always count the rows correctly
881
881
  batch_sizes[-1] = sample_size - sum(batch_sizes[:-1]) + 2
882
882
 
883
+ # emit initial progress message right away
884
+ if progress_callback:
885
+ await progress_callback(table=name, progress=0, total=n_total_batches, rows=0, elapsed_time=0)
886
+
883
887
  # initialize queues for async communication
884
888
  batch_queue = asyncio.PriorityQueue()
885
889
  result_queue = asyncio.Queue()
@@ -944,6 +948,7 @@ async def _create_table_rows_generator(
944
948
 
945
949
  n_completed_batches = 0
946
950
  n_yielded_sequences = 0
951
+ n_generated_rows = 0
947
952
  table_start_time = time.time()
948
953
  while n_yielded_sequences < sample_size:
949
954
  if n_completed_batches >= n_total_batches:
@@ -968,6 +973,7 @@ async def _create_table_rows_generator(
968
973
  rows = result
969
974
  for row_idx, row in enumerate(rows):
970
975
  yield (batch_idx, row)
976
+ n_generated_rows += 1
971
977
  if context_batches is None or row_idx == len(rows) - 1:
972
978
  # in case of flat table, each row is considered a single sequence
973
979
  # in case of linked table, all rows are considered a single sequence
@@ -982,7 +988,7 @@ async def _create_table_rows_generator(
982
988
  table=name,
983
989
  progress=n_completed_batches,
984
990
  total=n_total_batches,
985
- rows=n_yielded_sequences,
991
+ rows=n_generated_rows,
986
992
  elapsed_time=round(elapsed_time, 2),
987
993
  )
988
994
  result_queue.task_done()
@@ -1328,6 +1334,7 @@ def sample(
1328
1334
  tables (dict[str, dict]): The table specifications to generate mock data for. See examples for usage.
1329
1335
  Note: Avoid using double quotes (`"`) and other special characters in column names.
1330
1336
  Available dtypes: `string`, `integer`, `float`, `category`, `boolean`, `date`, `datetime`.
1337
+ Primary key dtypes: `integer` → auto-increment (1, 2, 3, ...); `string` → LLM-generated unique IDs.
1331
1338
  sample_size (int | dict[str, int]): The number of rows to generate for each subject table.
1332
1339
  If a single integer is provided, the same number of rows will be generated for each subject table.
1333
1340
  If a dictionary is provided, the number of rows to generate for each subject table can be specified individually.
@@ -1452,6 +1459,32 @@ def sample(
1452
1459
  df_items = data["items"]
1453
1460
  ```
1454
1461
 
1462
+ Example of auto-increment integer primary keys (self-referencing table):
1463
+ ```python
1464
+ from mostlyai import mock
1465
+
1466
+ tables = {
1467
+ "employees": {
1468
+ "prompt": "Employees of a company",
1469
+ "columns": {
1470
+ "employee_id": {"dtype": "integer"}, # integer PK → auto-increment (1, 2, 3, ...)
1471
+ "name": {"prompt": "first name and last name of the employee", "dtype": "string"},
1472
+ "boss_id": {"dtype": "integer"}, # integer FK → references auto-incremented values
1473
+ "role": {"prompt": "the role of the employee", "dtype": "string"},
1474
+ },
1475
+ "primary_key": "employee_id",
1476
+ "foreign_keys": [
1477
+ {
1478
+ "column": "boss_id",
1479
+ "referenced_table": "employees",
1480
+ "prompt": "each boss has at most 3 employees",
1481
+ },
1482
+ ],
1483
+ }
1484
+ }
1485
+ df = mock.sample(tables=tables, sample_size=10, model="openai/gpt-5", n_workers=1)
1486
+ ```
1487
+
1455
1488
  Example of enriching a single dataframe:
1456
1489
  ```python
1457
1490
  from mostlyai import mock
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mostlyai-mock"
3
- version = "0.2.0"
3
+ version = "0.2.2"
4
4
  description = "Synthetic Mock Data"
5
5
  authors = [{ name = "MOSTLY AI", email = "dev@mostly.ai" }]
6
6
  requires-python = ">=3.10"
File without changes
File without changes
File without changes