mostlyai-mock 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- mostlyai/mock/__init__.py +1 -1
- mostlyai/mock/core.py +168 -48
- mostlyai/mock/mcp_server.py +14 -0
- {mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/METADATA +3 -5
- mostlyai_mock-0.1.6.dist-info/RECORD +8 -0
- mostlyai_mock-0.1.4.dist-info/RECORD +0 -8
- {mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/WHEEL +0 -0
- {mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/entry_points.txt +0 -0
- {mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/licenses/LICENSE +0 -0
mostlyai/mock/__init__.py
CHANGED
mostlyai/mock/core.py
CHANGED
@@ -14,21 +14,23 @@
 
 from __future__ import annotations
 
+import itertools
 import json
 from collections import deque
 from collections.abc import Generator
 from enum import Enum
-from typing import Any, Literal
+from typing import Any, Literal
 
 import litellm
 import pandas as pd
+import tenacity
 from pydantic import BaseModel, Field, RootModel, create_model, field_validator, model_validator
 from tqdm import tqdm
 
 litellm.suppress_debug_info = True
 
 SYSTEM_PROMPT = """
-You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
+You are a specialized mock data generator designed to create highly realistic, contextually appropriate data based on schema definitions.
 
 Your task is to:
 
@@ -58,7 +60,7 @@ class LLMConfig(BaseModel):
 
 
 class MockConfig(RootModel[dict[str, "TableConfig"]]):
-    root: dict[str, TableConfig] = Field(...,
+    root: dict[str, TableConfig] = Field(..., min_length=1)
 
     @field_validator("root")
     @classmethod
@@ -127,7 +129,7 @@ class MockConfig(RootModel[dict[str, "TableConfig"]]):
 
 class TableConfig(BaseModel):
     prompt: str = ""
-    columns: dict[str, ColumnConfig] = Field(...,
+    columns: dict[str, ColumnConfig] = Field(..., min_length=1)
     primary_key: str | None = None
     foreign_keys: list[ForeignKeyConfig] = Field(default_factory=list)
 
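Both `MockConfig.root` and `TableConfig.columns` now carry a `min_length=1` constraint, so an empty table or column mapping is rejected at validation time. A minimal sketch of the pydantic v2 behaviour this relies on, using a hypothetical stand-in model:

```python
from pydantic import BaseModel, Field, ValidationError

class DemoTableConfig(BaseModel):
    # hypothetical stand-in for TableConfig: the mapping must hold at least one entry
    columns: dict[str, dict] = Field(..., min_length=1)

DemoTableConfig(columns={"age": {"dtype": "integer"}})  # passes validation

try:
    DemoTableConfig(columns={})  # rejected: fewer than min_length items
except ValidationError as exc:
    print(exc.error_count(), "validation error")
```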
@@ -246,52 +248,85 @@ def _create_table_prompt(
     prompt = f"# {prompt}\n\n"
 
     # define table
-    prompt += f"## Table: {name}
+    prompt += f"## Target Table: `{name}`\n\n"
 
-    prompt += f"
+    prompt += f"### Target Table Primary Key: `{primary_keys[name]}`\n\n"
 
     # add columns specifications
-    prompt += "
-
+    prompt += "### Target Table Column Specifications:\n\n"
+    column_specifications = {
+        name: config.model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True)
+        for name, config in columns.items()
+    }
+    if existing_data is not None:
+        # do not generate values for columns that already exist in existing data
+        column_specifications = {
+            column: spec for column, spec in column_specifications.items() if column not in existing_data.columns
+        }
+    prompt += f"{json.dumps(column_specifications, indent=2)}\n\n"
 
     # add previous rows as context to help the LLM generate consistent data
+    has_previous_rows_section = False
     if previous_rows:
-
+        has_previous_rows_section = True
+        prompt += f"\n## Previous `{len(previous_rows)}` Rows of Target Table `{name}`:\n\n"
         prompt += f"{json.dumps(previous_rows, indent=2)}\n\n"
 
     # add existing data to augment
+    has_existing_data_section = False
     if existing_data is not None:
-
+        has_existing_data_section = True
+        prompt += f"\n## Existing Data of Target Table `{name}` to Augment:\n\n"
         prompt += f"{existing_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
 
-    # define foreign keys
-
-
-
+    # define self referencing foreign keys
+    has_self_referencing_foreign_keys_section = False
+    self_referencing_foreign_keys = [fk for fk in foreign_keys if fk.referenced_table == name]
+    if self_referencing_foreign_keys:
+        has_self_referencing_foreign_keys_section = True
+        prompt += f"## Self Referencing Foreign Keys in Target Table `{name}`\n\n"
+        for fk in self_referencing_foreign_keys:
+            prompt += f"### Primary Key Column: `{primary_keys[name]}`\n\n"
+
+            prompt += f"### Foreign Key Column: `{fk.column}`\n\n"
+
+            prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+    foreign_keys = [fk for fk in foreign_keys if fk.referenced_table != name]  # exclude self-dependency going forward
 
     # add context table name, primary key and data
-
+    has_context_table_section = False
+    if foreign_keys:
+        has_context_table_section = True
         assert context_data is not None
         fk = foreign_keys[0]
         prompt += f"## Context Table: `{fk.referenced_table}`\n\n"
 
-        prompt += f"
+        prompt += f"### Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+
+        prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
 
-        prompt += "
+        prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
+
+        prompt += "### Context Table Data:\n\n"
         prompt += f"{context_data.to_json(orient='records', date_format='iso', indent=2)}\n\n"
 
     # add non-context table names, primary keys and data
+    has_non_context_tables_section = False
     if foreign_keys and len(foreign_keys) > 1:
+        has_non_context_tables_section = True
         for fk in foreign_keys[1:]:
-            if fk.referenced_table == name:  # self-dependency is not considered as non-context
-                continue
             assert non_context_data is not None
             assert fk.referenced_table in non_context_data
             prompt += f"## Non-Context Table: `{fk.referenced_table}`\n\n"
 
-            prompt += f"
+            prompt += f"### Non-Context Table Primary Key: `{primary_keys[fk.referenced_table]}`\n\n"
+
+            prompt += f"### Foreign Key Column in Target Table `{name}`: `{fk.column}`\n\n"
+
+            prompt += f"### Description of the Relationship: `{fk.prompt}`\n\n"
 
-            prompt += "
+            prompt += "### Non-Context Table Data:\n\n"
             prompt += (
                 f"{non_context_data[fk.referenced_table].to_json(orient='records', date_format='iso', indent=2)}\n\n"
             )
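The column specifications embedded in the prompt are now built with `model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True)`, which keeps only the fields the user actually set. A small sketch with a hypothetical, simplified column config:

```python
from pydantic import BaseModel

class DemoColumnConfig(BaseModel):
    # hypothetical, simplified stand-in for ColumnConfig
    prompt: str = ""
    dtype: str | None = None
    values: list[str] = []

cfg = DemoColumnConfig(prompt="age in years", dtype="integer")
# unset, default, and None fields are dropped, keeping the prompt JSON compact
print(cfg.model_dump(exclude_defaults=True, exclude_unset=True, exclude_none=True))
# -> {'prompt': 'age in years', 'dtype': 'integer'}
```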
@@ -304,42 +339,62 @@ def _create_table_prompt(
     n_rows = None
     if existing_data is not None:
         n_rows = len(existing_data)
-    elif not foreign_keys:
+    elif not foreign_keys and not self_referencing_foreign_keys:
         assert batch_size is not None
         n_rows = batch_size
 
-    prompt += f"{verb.capitalize()} data for the `{name}
+    prompt += f"{verb.capitalize()} data for the Target Table `{name}`.\n\n"
     if n_rows is not None:
         prompt += f"Number of rows to {verb}: `{n_rows}`.\n\n"
 
-    if
-
-
-
-
-
-
-
-
-
+    if has_context_table_section:
+        assert foreign_keys
+        prompt += f"Target Table Foreign Key column `{foreign_keys[0].column}` may only contain values from `Context Table Data`."
+        if has_previous_rows_section:
+            prompt += " Never use values from `Previous Rows of Target Table` section."
+        prompt += " Respect the `Description of the Relationship` of `Context Table` section to understand the relationship, in particular the number of rows to generate."
+        prompt += "\n\n"
+
+    if has_self_referencing_foreign_keys_section:
+        prompt += "Target Table Self Referencing Foreign Key columns defined in `Self Referencing Foreign Keys` must be consistent with the `Target Table Primary Key`."
+        prompt += " Respect the `Description of the Relationship` of `Self Referencing Foreign Keys` section to understand the relationship."
+        prompt += "\n\n"
+
+    if has_non_context_tables_section:
+        assert len(foreign_keys) > 1
+        prompt += "All other Target Table Foreign Key columns may only contain values from `Non-Context Table Data` of relevant `Non-Context Table` sections."
+        prompt += " Respect the `Description of the Relationship` of relevant `Non-Context Table` section to understand the relationship."
+        prompt += "\n\n"
+
+    if has_existing_data_section:
+        assert existing_data is not None
         prompt += (
             f"You are given existing data for the `{name}` table and asked to generate "
-            f"values for the missing columns. The existing data contains column(s): {
-            f"You need to generate values for column(s): {
+            f"values for the missing columns. The existing data contains column(s): {list(existing_data.columns)}. "
+            f"You need to generate values for column(s): {list(columns.keys() - existing_data.columns)}. "
             f"Ensure that the generated values are contextually appropriate and consistent with the existing data. "
             f"Use the existing columns' values to inform the generation of new values. "
             f"Don't generate new rows, only augment the existing data.\n\n"
         )
 
-    if
+    if has_previous_rows_section:
+        assert previous_rows is not None
         prompt += (
             f"{verb.capitalize()} new rows that maintain consistency with the previous rows where appropriate. "
             "Don't copy previous rows in the output. "
             "Don't pay attention to the number of previous rows; there might have been more generated than provided.\n\n"
         )
+
     prompt += f"Do not use code to {verb} the data.\n\n"
-    prompt += "Return the full data as a JSON string.\n"
 
+    prompt += "Return data as a JSON string."
+    prompt += " The JSON string should have 'rows' key at the top level. The value of 'rows' key should be a list of JSON objects."
+    prompt += " Each JSON object should have column names as keys and values as column values."
+    if existing_data is not None:
+        prompt += (
+            f" Only include the following columns in the JSON string: {list(columns.keys() - existing_data.columns)}."
+        )
+    prompt += "\n"
     return prompt
 
 
@@ -357,8 +412,10 @@ def _create_table_rows_generator(
     non_context_size: int | None,
     llm_config: LLMConfig,
 ) -> Generator[dict]:
-    def create_table_response_format(
-
+    def create_table_response_format(
+        columns: dict[str, ColumnConfig], existing_data: pd.DataFrame | None
+    ) -> tuple[type[BaseModel], int]:
+        def create_annotation(column_config: ColumnConfig) -> type:
             if column_config.values or column_config.dtype is DType.CATEGORY:
                 return Literal[tuple(column_config.values)]
             return {
@@ -374,11 +431,14 @@ def _create_table_rows_generator(
 
         fields = {}
         for column_name, column_config in columns.items():
+            if existing_data is not None and column_name in existing_data.columns:
+                continue  # skip columns that already exist in existing data
             annotation = create_annotation(column_config)
             fields[column_name] = (annotation, Field(...))
         TableRow = create_model("TableRow", **fields)
         TableRows = create_model("TableRows", rows=(list[TableRow], ...))
-
+        n_enforced_columns = len(fields)
+        return TableRows, n_enforced_columns
 
     def yield_rows_from_json_chunks_stream(response: litellm.CustomStreamWrapper) -> Generator[dict]:
         # starting with dirty buffer is to handle the `{"rows": []}` case
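`create_table_response_format` now skips columns that are already present in `existing_data` and reports how many columns are actually enforced. The dynamic model construction follows the usual pydantic `create_model` pattern; a standalone sketch with hypothetical columns:

```python
from typing import Literal
from pydantic import Field, create_model

# hypothetical column set: "gender" is restricted to two values, "age" is free-form
fields = {
    "gender": (Literal["male", "female"], Field(...)),
    "age": (int, Field(...)),
}
TableRow = create_model("TableRow", **fields)
TableRows = create_model("TableRows", rows=(list[TableRow], ...))

n_enforced_columns = len(fields)  # 2
print(list(TableRows.model_json_schema()["$defs"]["TableRow"]["properties"]))
# -> ['gender', 'age']
```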
@@ -419,6 +479,18 @@ def _create_table_rows_generator(
         for i in range(0, len(data), batch_size):
             yield data.iloc[i : i + batch_size]
 
+    def completion_with_retries(*args, **kwargs):
+        n_attempts = 3
+
+        def print_on_retry(_):
+            print(" * Trying again... * ", end="", flush=True)
+
+        # try up to 3 times, print a message to the user on each retry
+        retryer = tenacity.Retrying(
+            stop=tenacity.stop_after_attempt(n_attempts), reraise=True, before_sleep=print_on_retry
+        )
+        return retryer(litellm.completion, *args, **kwargs)
+
     if not llm_config.model.startswith("litellm_proxy/"):
         # ensure model supports response_format and json schema (this check does not work with litellm_proxy)
         supported_params = litellm.get_supported_openai_params(model=llm_config.model) or []
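The new `completion_with_retries` helper wraps `litellm.completion` in `tenacity.Retrying` so transient failures are retried up to three times before the original exception is re-raised. A self-contained sketch of the same retry pattern against a stand-in flaky function:

```python
import tenacity

attempts = {"n": 0}

def flaky_call() -> str:
    # stand-in for litellm.completion: fails twice, then succeeds
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise RuntimeError("transient failure")
    return "ok"

retryer = tenacity.Retrying(
    stop=tenacity.stop_after_attempt(3),  # give up after the third attempt
    reraise=True,  # re-raise the original exception instead of a RetryError
    before_sleep=lambda _state: print(" * Trying again... * ", end="", flush=True),
)
print(retryer(flaky_call))  # prints the retry notice twice, then "ok"
```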
@@ -453,7 +525,6 @@ def _create_table_rows_generator(
             non_context_data[non_context_table_name] = data[non_context_table_name]
 
     litellm_kwargs = {
-        "response_format": create_table_response_format(columns=columns),
         "temperature": llm_config.temperature,
         "top_p": llm_config.top_p,
         "model": llm_config.model,
@@ -488,6 +559,16 @@ def _create_table_rows_generator(
                 table_name: df.sample(frac=1.0).head(non_context_size) for table_name, df in non_context_data.items()
             }
 
+        if context_batch is None:
+            # for root tables, scale down batch size in order to prevent excessive generations
+            remaining_rows = sample_size - yielded_sequences
+            if batch_size >= remaining_rows:
+                batch_size = remaining_rows + 2  # +2 because LLM may not always count the rows correctly
+
+        response_format, n_enforced_columns = create_table_response_format(
+            columns=columns, existing_data=existing_batch
+        )
+
         llm_prompt = _create_table_prompt(
             name=name,
             prompt=prompt,
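For root tables, the batch size is now clamped to the number of rows still missing (plus a small buffer) so the final request does not over-generate. The arithmetic, with hypothetical numbers:

```python
# hypothetical state: 50 rows requested in total, 43 already yielded, default batch of 20
sample_size = 50
yielded_sequences = 43
batch_size = 20

remaining_rows = sample_size - yielded_sequences  # 7
if batch_size >= remaining_rows:
    # +2 because the LLM may not always count the rows correctly
    batch_size = remaining_rows + 2
print(batch_size)  # 9 instead of a full batch of 20
```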
@@ -502,12 +583,20 @@ def _create_table_rows_generator(
         )
         messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": llm_prompt}]
 
-
-
+        if n_enforced_columns != 0:
+            response = completion_with_retries(messages=messages, response_format=response_format, **litellm_kwargs)
+            rows_stream = yield_rows_from_json_chunks_stream(response)
+        else:
+            # skip roundtrip to LLM in case all columns are provided in existing data
+            rows_stream = itertools.repeat({})
 
+        batch_row_idx = 0
         while True:
             try:
-
+                row_generated_part = next(rows_stream)
+                row_existing_part = existing_batch.iloc[batch_row_idx].to_dict() if existing_batch is not None else {}
+                row = {**row_existing_part, **row_generated_part}
+                row = {column: row[column] for column in columns.keys()}  # keep columns order according to user's spec
             except StopIteration:
                 break  # move to next batch
             previous_rows.append(row)
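Each emitted row is now assembled from two parts: the columns already present in the existing batch and the columns generated by the LLM, reordered to match the user's column specification. A small sketch with hypothetical values:

```python
# hypothetical spec: "age" and "gender" come from existing data, "full_name" is generated
columns = {"full_name": None, "age": None, "gender": None}  # user's column order

row_existing_part = {"age": 25, "gender": "male"}
row_generated_part = {"full_name": "Aino Virtanen"}

row = {**row_existing_part, **row_generated_part}
row = {column: row[column] for column in columns.keys()}  # reorder to the user's spec
print(row)  # {'full_name': 'Aino Virtanen', 'age': 25, 'gender': 'male'}
```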
@@ -517,6 +606,7 @@ def _create_table_rows_generator(
                 yielded_sequences += 1
                 if yielded_sequences >= sample_size:
                     return  # move to next table
+            batch_row_idx += 1
         if context_batch is not None:
             # for each context_batch, full sequences are generated
             yielded_sequences += len(context_batch)
@@ -553,6 +643,36 @@ def _convert_table_rows_generator_to_df(
     return df
 
 
+def _harmonize_tables(tables: dict[str, dict], existing_data: dict[str, pd.DataFrame] | None) -> dict[str, dict]:
+    def _infer_dtype(series: pd.Series) -> DType:
+        if pd.api.types.is_integer_dtype(series):
+            return DType.INTEGER
+        elif pd.api.types.is_float_dtype(series):
+            return DType.FLOAT
+        elif pd.api.types.is_datetime64_dtype(series):
+            return DType.DATETIME
+        elif pd.api.types.is_bool_dtype(series):
+            return DType.BOOLEAN
+        else:
+            return DType.STRING
+
+    if existing_data is None:
+        return tables
+
+    tables = tables.copy()
+    for table_name, existing_table in existing_data.items():
+        table_config = tables.setdefault(table_name, {})
+        column_configs = table_config.setdefault("columns", {})
+        existing_column_configs = {
+            existing_column: {"dtype": _infer_dtype(existing_table[existing_column])}
+            for existing_column in existing_table.columns
+            if existing_column not in column_configs
+        }
+        column_configs = {**existing_column_configs, **column_configs}
+        table_config["columns"] = column_configs
+    return tables
+
+
 def _harmonize_sample_size(sample_size: int | dict[str, int], config: MockConfig) -> dict[str, int]:
     if isinstance(sample_size, int):
         return {table_name: sample_size for table_name in config.root}
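`_harmonize_tables` backfills column configs for columns that only exist in `existing_data`, inferring their dtype via `pd.api.types` checks in the order integer, float, datetime, boolean, string. The same checks applied to a hypothetical DataFrame:

```python
import pandas as pd

existing_table = pd.DataFrame({
    "age": [25, 41],
    "weight_kg": [61.5, 83.0],
    "admitted_at": pd.to_datetime(["2024-01-05", "2024-02-11"]),
    "insured": [True, False],
    "full_name": ["Aino Virtanen", "Onni Korhonen"],
})

for column in existing_table.columns:
    series = existing_table[column]
    if pd.api.types.is_integer_dtype(series):
        dtype = "integer"
    elif pd.api.types.is_float_dtype(series):
        dtype = "float"
    elif pd.api.types.is_datetime64_dtype(series):
        dtype = "datetime"
    elif pd.api.types.is_bool_dtype(series):
        dtype = "boolean"
    else:
        dtype = "string"
    print(column, "->", dtype)
# age -> integer, weight_kg -> float, admitted_at -> datetime, insured -> boolean, full_name -> string
```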
@@ -756,8 +876,6 @@ def sample(
         "patients": {
             "prompt": "Patients of a hospital in Finland",
             "columns": {
-                "age": {},
-                "gender": {},
                 "full_name": {"prompt": "first name and last name of the patient", "dtype": "string"},
                 "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
                 "place_of_birth": {"prompt": "place of birth", "dtype": "string"},
@@ -769,7 +887,7 @@ def sample(
         "gender": ["male", "male", "female", "female"],
     })
     enriched_df = mock.sample(
-        tables=tables,
+        tables=tables,
         existing_data={"patients": existing_df},
         model="openai/gpt-4.1-nano"
     )
@@ -833,7 +951,9 @@ def sample(
     ```
     """
 
+    tables: dict[str, TableConfig] = _harmonize_tables(tables, existing_data)
     config = MockConfig(tables)
+
     llm_config = LLMConfig(model=model, api_key=api_key, temperature=temperature, top_p=top_p)
 
     sample_size: dict[str, int] = _harmonize_sample_size(sample_size, config)
@@ -853,7 +973,7 @@ def sample(
             primary_keys=primary_keys,
             data=data,
             sample_size=sample_size[table_name],
-            batch_size=
+            batch_size=20,  # generate 20 root table rows at a time
             previous_rows_size=10,  # present 10 previously generated rows to the LLM
             non_context_size=10,  # pick 10 rows to choose from for each non-context foreign key
             llm_config=llm_config,
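Taken together, these changes let `sample()` augment user-supplied data: `_harmonize_tables` fills in configs for the existing columns, and values are generated only for the missing ones. A usage sketch assembled from the docstring fragments visible in this diff (the numeric ages are illustrative):

```python
import pandas as pd
from mostlyai import mock

tables = {
    "patients": {
        "prompt": "Patients of a hospital in Finland",
        "columns": {
            "full_name": {"prompt": "first name and last name of the patient", "dtype": "string"},
            "date_of_birth": {"prompt": "date of birth", "dtype": "date"},
        },
    }
}
existing_df = pd.DataFrame({
    "age": [25, 41, 38, 19],  # illustrative values
    "gender": ["male", "male", "female", "female"],
})
enriched_df = mock.sample(
    tables=tables,
    existing_data={"patients": existing_df},
    model="openai/gpt-4.1-nano",
)
print(enriched_df)
```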
mostlyai/mock/mcp_server.py
CHANGED
@@ -1,3 +1,17 @@
+# Copyright 2025 MOSTLY AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tempfile
 
{mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mostlyai-mock
-Version: 0.1.4
+Version: 0.1.6
 Summary: LLM-generated Mock Data
 Project-URL: homepage, https://github.com/mostly-ai/mostlyai-mock
 Project-URL: repository, https://github.com/mostly-ai/mostlyai-mock
@@ -30,6 +30,7 @@ Requires-Dist: numpy>=1.26.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=14.0.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
+Requires-Dist: tenacity>=9.1.2
 Description-Content-Type: text/markdown
 
 # LLM-generated Mock Data 🔮
@@ -169,7 +170,7 @@ tables = {
 }
 data = mock.sample(
     tables=tables,
-    sample_size=2,
+    sample_size=2,
     model="openai/gpt-4.1"
 )
 print(data["customers"])
@@ -250,9 +251,6 @@ tables = {
     "guests": {
         "prompt": "Guests of an Alpine ski hotel in Austria",
         "columns": {
-            "guest_id": {"prompt": "the unique id of the guest", "dtype": "integer"},
-            "name": {"prompt": "first name and last name of the guest", "dtype": "string"},
-            "nationality": {"prompt": "2-letter code for the nationality", "dtype": "string"},
             "gender": {"dtype": "category", "values": ["male", "female"]},
             "age": {"prompt": "age in years; min: 18, max: 80; avg: 25", "dtype": "integer"},
             "room_number": {"prompt": "room number", "dtype": "integer"},
mostlyai_mock-0.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+mostlyai/mock/__init__.py,sha256=8UddMHmwpfwSb7ChuVNvIaWNLTlWkN0Cxh63CskmtBw,714
+mostlyai/mock/core.py,sha256=NFfyucqjT3iC9lqfu4dPmRnYizxtfFH1Tf3KHRRxHvg,42242
+mostlyai/mock/mcp_server.py,sha256=kWMIjKCwnvYfjY8B2IdP4JNs8ik_8jA6ISCDqrG9utc,2137
+mostlyai_mock-0.1.6.dist-info/METADATA,sha256=RMYEgGG4P3WfhavNC_4ph6dTCtumqQ3uA-swot9WKyc,13918
+mostlyai_mock-0.1.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+mostlyai_mock-0.1.6.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
+mostlyai_mock-0.1.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mostlyai_mock-0.1.6.dist-info/RECORD,,
mostlyai_mock-0.1.4.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-mostlyai/mock/__init__.py,sha256=EvV_Tp6ExzQPq4apGq_8F25qw_paNTcQEC94nIVOEog,714
-mostlyai/mock/core.py,sha256=ubarMA3VUlXdjUsCXQK_mD_kWPkTMOYvLz9G4OughGk,36532
-mostlyai/mock/mcp_server.py,sha256=Vp0bWzE8wUyA6k4PHLa0TbkuI9s07E48xPrAUgf_5qU,1563
-mostlyai_mock-0.1.4.dist-info/METADATA,sha256=jibPe0pKcwqyPBoyc7H98LPd72vkGZBStdw_yMNVvJI,14161
-mostlyai_mock-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-mostlyai_mock-0.1.4.dist-info/entry_points.txt,sha256=XDbppUIAaCWW0nresVep8zb71pkzZuFA16jCBHq8CU8,61
-mostlyai_mock-0.1.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mostlyai_mock-0.1.4.dist-info/RECORD,,
{mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/WHEEL: file without changes
{mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/entry_points.txt: file without changes
{mostlyai_mock-0.1.4.dist-info → mostlyai_mock-0.1.6.dist-info}/licenses/LICENSE: file without changes