datasynth-py 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasynth_py-0.2.1/MANIFEST.in +3 -0
- datasynth_py-0.2.1/PKG-INFO +132 -0
- datasynth_py-0.2.1/README.md +84 -0
- datasynth_py-0.2.1/datasynth_py/__init__.py +48 -0
- datasynth_py-0.2.1/datasynth_py/client.py +344 -0
- datasynth_py-0.2.1/datasynth_py/config/__init__.py +39 -0
- datasynth_py-0.2.1/datasynth_py/config/blueprints.py +296 -0
- datasynth_py-0.2.1/datasynth_py/config/models.py +392 -0
- datasynth_py-0.2.1/datasynth_py/config/validation.py +80 -0
- datasynth_py-0.2.1/datasynth_py/fingerprint.py +454 -0
- datasynth_py-0.2.1/datasynth_py/py.typed +0 -0
- datasynth_py-0.2.1/datasynth_py/runtime/__init__.py +1 -0
- datasynth_py-0.2.1/datasynth_py.egg-info/PKG-INFO +132 -0
- datasynth_py-0.2.1/datasynth_py.egg-info/SOURCES.txt +19 -0
- datasynth_py-0.2.1/datasynth_py.egg-info/dependency_links.txt +1 -0
- datasynth_py-0.2.1/datasynth_py.egg-info/requires.txt +23 -0
- datasynth_py-0.2.1/datasynth_py.egg-info/top_level.txt +1 -0
- datasynth_py-0.2.1/pyproject.toml +91 -0
- datasynth_py-0.2.1/setup.cfg +4 -0
- datasynth_py-0.2.1/tests/test_blueprints.py +48 -0
- datasynth_py-0.2.1/tests/test_config.py +94 -0
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datasynth-py
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Python wrapper for DataSynth synthetic data generation
|
|
5
|
+
Author-email: EY ASU RnD <michael.ivertowski@ch.ey.com>
|
|
6
|
+
Maintainer-email: EY ASU RnD <michael.ivertowski@ch.ey.com>
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
Project-URL: Homepage, https://github.com/ey-asu-rnd/SyntheticData
|
|
9
|
+
Project-URL: Documentation, https://ey-asu-rnd.github.io/SyntheticData/
|
|
10
|
+
Project-URL: Repository, https://github.com/ey-asu-rnd/SyntheticData
|
|
11
|
+
Project-URL: Changelog, https://github.com/ey-asu-rnd/SyntheticData/blob/main/CHANGELOG.md
|
|
12
|
+
Project-URL: Issues, https://github.com/ey-asu-rnd/SyntheticData/issues
|
|
13
|
+
Keywords: synthetic-data,data-generation,testing,machine-learning,financial-data,accounting,journal-entries,fraud-detection
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
26
|
+
Classifier: Topic :: Software Development :: Testing :: Mocking
|
|
27
|
+
Classifier: Typing :: Typed
|
|
28
|
+
Requires-Python: >=3.9
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
Provides-Extra: cli
|
|
31
|
+
Requires-Dist: PyYAML>=6.0; extra == "cli"
|
|
32
|
+
Provides-Extra: memory
|
|
33
|
+
Requires-Dist: pandas>=2.0; extra == "memory"
|
|
34
|
+
Provides-Extra: streaming
|
|
35
|
+
Requires-Dist: websockets>=12.0; extra == "streaming"
|
|
36
|
+
Provides-Extra: all
|
|
37
|
+
Requires-Dist: PyYAML>=6.0; extra == "all"
|
|
38
|
+
Requires-Dist: pandas>=2.0; extra == "all"
|
|
39
|
+
Requires-Dist: websockets>=12.0; extra == "all"
|
|
40
|
+
Provides-Extra: dev
|
|
41
|
+
Requires-Dist: PyYAML>=6.0; extra == "dev"
|
|
42
|
+
Requires-Dist: pandas>=2.0; extra == "dev"
|
|
43
|
+
Requires-Dist: websockets>=12.0; extra == "dev"
|
|
44
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
45
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
46
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
47
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
48
|
+
|
|
49
|
+
# datasynth-py
|
|
50
|
+
|
|
51
|
+
Python wrapper for the DataSynth synthetic data generator.
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
### From PyPI
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install "datasynth-py[all]"
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Or install specific extras:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install datasynth-py # Core only (no dependencies)
|
|
65
|
+
pip install datasynth-py[cli] # CLI generation (PyYAML)
|
|
66
|
+
pip install datasynth-py[memory] # In-memory tables (pandas)
|
|
67
|
+
pip install datasynth-py[streaming] # Streaming (websockets)
|
|
68
|
+
pip install datasynth-py[all] # All optional dependencies
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### From Source
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
cd python
|
|
75
|
+
pip install -e ".[all]"
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Quick Start
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from datasynth_py import DataSynth, CompanyConfig, Config, GlobalSettings, ChartOfAccountsSettings
|
|
82
|
+
|
|
83
|
+
config = Config(
|
|
84
|
+
global_settings=GlobalSettings(
|
|
85
|
+
industry="retail",
|
|
86
|
+
start_date="2024-01-01",
|
|
87
|
+
period_months=12,
|
|
88
|
+
),
|
|
89
|
+
companies=[
|
|
90
|
+
CompanyConfig(code="C001", name="Retail Corp", currency="USD", country="US"),
|
|
91
|
+
],
|
|
92
|
+
chart_of_accounts=ChartOfAccountsSettings(complexity="small"),
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
synth = DataSynth()
|
|
96
|
+
result = synth.generate(config=config, output={"format": "csv", "sink": "temp_dir"})
|
|
97
|
+
print(result.output_dir)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Using Blueprints
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from datasynth_py import DataSynth
|
|
104
|
+
from datasynth_py.config import blueprints
|
|
105
|
+
|
|
106
|
+
config = blueprints.retail_small(companies=4, transactions=10000)
|
|
107
|
+
synth = DataSynth()
|
|
108
|
+
result = synth.generate(config=config, output={"format": "parquet", "sink": "path", "path": "./output"})
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Requirements
|
|
112
|
+
|
|
113
|
+
The wrapper shells out to the `datasynth-data` CLI binary. Build it with:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
cargo build --release
|
|
117
|
+
export DATASYNTH_BINARY=target/release/datasynth-data
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Or pass `binary_path` when creating the client:
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
synth = DataSynth(binary_path="/path/to/datasynth-data")
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Documentation
|
|
127
|
+
|
|
128
|
+
See the [Python Wrapper Guide](../docs/src/user-guide/python-wrapper.md) for complete documentation.
|
|
129
|
+
|
|
130
|
+
## License
|
|
131
|
+
|
|
132
|
+
Apache 2.0 License - see the main project LICENSE file.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# datasynth-py
|
|
2
|
+
|
|
3
|
+
Python wrapper for the DataSynth synthetic data generator.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
### From PyPI
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install "datasynth-py[all]"
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Or install specific extras:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install datasynth-py # Core only (no dependencies)
|
|
17
|
+
pip install datasynth-py[cli] # CLI generation (PyYAML)
|
|
18
|
+
pip install datasynth-py[memory] # In-memory tables (pandas)
|
|
19
|
+
pip install datasynth-py[streaming] # Streaming (websockets)
|
|
20
|
+
pip install datasynth-py[all] # All optional dependencies
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### From Source
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
cd python
|
|
27
|
+
pip install -e ".[all]"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Quick Start
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from datasynth_py import DataSynth, CompanyConfig, Config, GlobalSettings, ChartOfAccountsSettings
|
|
34
|
+
|
|
35
|
+
config = Config(
|
|
36
|
+
global_settings=GlobalSettings(
|
|
37
|
+
industry="retail",
|
|
38
|
+
start_date="2024-01-01",
|
|
39
|
+
period_months=12,
|
|
40
|
+
),
|
|
41
|
+
companies=[
|
|
42
|
+
CompanyConfig(code="C001", name="Retail Corp", currency="USD", country="US"),
|
|
43
|
+
],
|
|
44
|
+
chart_of_accounts=ChartOfAccountsSettings(complexity="small"),
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
synth = DataSynth()
|
|
48
|
+
result = synth.generate(config=config, output={"format": "csv", "sink": "temp_dir"})
|
|
49
|
+
print(result.output_dir)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Using Blueprints
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from datasynth_py import DataSynth
|
|
56
|
+
from datasynth_py.config import blueprints
|
|
57
|
+
|
|
58
|
+
config = blueprints.retail_small(companies=4, transactions=10000)
|
|
59
|
+
synth = DataSynth()
|
|
60
|
+
result = synth.generate(config=config, output={"format": "parquet", "sink": "path", "path": "./output"})
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Requirements
|
|
64
|
+
|
|
65
|
+
The wrapper shells out to the `datasynth-data` CLI binary. Build it with:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
cargo build --release
|
|
69
|
+
export DATASYNTH_BINARY=target/release/datasynth-data
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Or pass `binary_path` when creating the client:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
synth = DataSynth(binary_path="/path/to/datasynth-data")
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Documentation
|
|
79
|
+
|
|
80
|
+
See the [Python Wrapper Guide](../docs/src/user-guide/python-wrapper.md) for complete documentation.
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
|
|
84
|
+
Apache 2.0 License - see the main project LICENSE file.
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Python wrapper for DataSynth."""
|
|
2
|
+
|
|
3
|
+
from datasynth_py.client import DataSynth, GenerationResult, OutputSpec, StreamingSession
|
|
4
|
+
from datasynth_py.config import blueprints
|
|
5
|
+
from datasynth_py.config.models import (
|
|
6
|
+
AuditSettings,
|
|
7
|
+
BankingSettings,
|
|
8
|
+
ChartOfAccountsSettings,
|
|
9
|
+
CompanyConfig,
|
|
10
|
+
CompanySettings,
|
|
11
|
+
Config,
|
|
12
|
+
DataQualitySettings,
|
|
13
|
+
FraudSettings,
|
|
14
|
+
GlobalSettings,
|
|
15
|
+
GraphExportSettings,
|
|
16
|
+
OutputSettings,
|
|
17
|
+
ScenarioSettings,
|
|
18
|
+
TemporalDriftSettings,
|
|
19
|
+
TransactionSettings,
|
|
20
|
+
)
|
|
21
|
+
from datasynth_py.config.validation import ConfigValidationError
|
|
22
|
+
from datasynth_py.fingerprint import FidelityReport, FingerprintClient, FingerprintInfo
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"AuditSettings",
|
|
26
|
+
"BankingSettings",
|
|
27
|
+
"ChartOfAccountsSettings",
|
|
28
|
+
"CompanyConfig",
|
|
29
|
+
"CompanySettings",
|
|
30
|
+
"Config",
|
|
31
|
+
"ConfigValidationError",
|
|
32
|
+
"DataQualitySettings",
|
|
33
|
+
"DataSynth",
|
|
34
|
+
"FidelityReport",
|
|
35
|
+
"FingerprintClient",
|
|
36
|
+
"FingerprintInfo",
|
|
37
|
+
"FraudSettings",
|
|
38
|
+
"GenerationResult",
|
|
39
|
+
"GlobalSettings",
|
|
40
|
+
"GraphExportSettings",
|
|
41
|
+
"OutputSettings",
|
|
42
|
+
"OutputSpec",
|
|
43
|
+
"ScenarioSettings",
|
|
44
|
+
"StreamingSession",
|
|
45
|
+
"TemporalDriftSettings",
|
|
46
|
+
"TransactionSettings",
|
|
47
|
+
"blueprints",
|
|
48
|
+
]
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
"""Client entrypoint for the DataSynth Python wrapper."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import pathlib
|
|
8
|
+
import subprocess
|
|
9
|
+
import tempfile
|
|
10
|
+
import urllib.error
|
|
11
|
+
import urllib.request
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Any, AsyncIterator, Dict, List, Optional
|
|
14
|
+
|
|
15
|
+
import importlib.util
|
|
16
|
+
|
|
17
|
+
from datasynth_py.config.models import Config, MissingDependencyError
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class OutputSpec:
    """Describes where and how generated data should be written.

    Immutable value object consumed by ``DataSynth.generate``.
    """

    # File format for generated tables: "csv", "jsonl", or "parquet".
    format: str = "csv"
    # Destination kind: "temp_dir" (fresh temp directory), "path"
    # (explicit directory, requires `path`), or "memory" (pandas tables).
    sink: str = "temp_dir"
    # Target directory; must be set when sink == "path".
    path: Optional[str] = None
    # Optional compression codec passed through to the writer.
    compression: Optional[str] = None
    # In-memory table representation; only "pandas" is supported.
    table_format: str = "pandas"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class GenerationResult:
    """Outcome of a ``DataSynth.generate`` run.

    Exactly one of ``output_dir`` (file sinks) or ``tables`` (memory sink)
    is populated by the client.
    """

    # Directory containing generated files, or None for the memory sink.
    output_dir: Optional[str] = None
    # Mapping of table name -> pandas DataFrame, or None for file sinks.
    tables: Optional[Dict[str, Any]] = None
    # Free-form extra information about the run.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class DataSynth:
    """Python wrapper for running DataSynth generation.

    Batch generation shells out to the ``datasynth-data`` CLI binary;
    streaming talks to a running DataSynth server over HTTP/WebSocket.
    """

    def __init__(
        self,
        binary_path: Optional[str] = None,
        server_url: str = "http://localhost:3000",
        api_key: Optional[str] = None,
        request_timeout: float = 30.0,
    ) -> None:
        """Create a client.

        Args:
            binary_path: Path to the ``datasynth-data`` CLI. Falls back to
                the ``DATASYNTH_BINARY`` environment variable, then to plain
                ``datasynth-data`` resolved via PATH.
            server_url: Base URL of the DataSynth server (streaming only).
                A trailing slash is stripped.
            api_key: Optional API key sent as ``X-API-Key`` on requests.
            request_timeout: Timeout in seconds for server HTTP requests.
        """
        self._binary_path = binary_path or os.environ.get("DATASYNTH_BINARY", "datasynth-data")
        self._server_url = server_url.rstrip("/")
        self._api_key = api_key
        self._request_timeout = request_timeout
        # Created lazily on first access of the `fingerprint` property.
        self._fingerprint_client: Optional["FingerprintClient"] = None

    @property
    def fingerprint(self) -> "FingerprintClient":
        """Access fingerprint operations.

        Returns:
            FingerprintClient for extract, validate, info, evaluate operations.

        Example:
            >>> synth = DataSynth()
            >>> synth.fingerprint.extract("./data/", "./fp.dsf")
            >>> info = synth.fingerprint.info("./fp.dsf")
        """
        if self._fingerprint_client is None:
            # Imported lazily so importing this module does not pull in the
            # fingerprint module eagerly.
            from datasynth_py.fingerprint import FingerprintClient

            self._fingerprint_client = FingerprintClient(self._binary_path)
        return self._fingerprint_client

    def generate(
        self,
        config: Config,
        output: Optional[OutputSpec | Dict[str, Any]] = None,
        seed: Optional[int] = None,
    ) -> GenerationResult:
        """Run a batch generation via the CLI.

        Args:
            config: Generation configuration; validated before use.
            output: OutputSpec (or equivalent dict) describing format/sink.
            seed: Optional RNG seed overriding the config's global seed.

        Returns:
            GenerationResult with ``output_dir`` set for file sinks, or
            ``tables`` set for the memory sink.

        Raises:
            ValueError: If sink='path' without a path, or the sink is unknown.
            RuntimeError: If the CLI binary is missing or exits non-zero.
        """
        config.validate()
        output_spec = _coerce_output_spec(output)
        if seed is not None:
            # "global" is a Python keyword, hence the **{} spelling.
            config = config.override(**{"global": {"seed": seed}})
        if output_spec.sink == "path" and not output_spec.path:
            raise ValueError("OutputSpec.path must be set when sink='path'.")

        output_dir = self._resolve_output_dir(output_spec)
        config_path = self._write_config(config, output_dir, output_spec)
        try:
            self._run_cli(config_path=config_path, output_dir=output_dir)
        finally:
            # FIX: _write_config creates a mkstemp() file that was previously
            # never removed, leaking one temp file per generate() call.
            pathlib.Path(config_path).unlink(missing_ok=True)

        if output_spec.sink == "memory":
            tables = _load_tables(output_dir, output_spec)
            return GenerationResult(output_dir=None, tables=tables)
        return GenerationResult(output_dir=output_dir, tables=None)

    def stream(
        self,
        config: Optional[Config] = None,
        events_per_second: Optional[int] = None,
        max_events: Optional[int] = None,
        inject_anomalies: Optional[bool] = None,
        seed: Optional[int] = None,
    ) -> "StreamingSession":
        """Start a streaming session on the DataSynth server.

        Args:
            config: Optional configuration pushed to the server first.
            events_per_second: Optional emission rate.
            max_events: Optional cap on total events.
            inject_anomalies: Optionally enable anomaly injection.
            seed: Optional RNG seed forwarded with the config payload.

        Returns:
            StreamingSession bound to this client's server settings.
        """
        if config is not None:
            config.validate()
            payload = _config_to_server_payload(config, seed)
            self._post_json("/api/config", payload)

        # Only forward parameters the caller actually supplied.
        stream_payload: Dict[str, Any] = {}
        if events_per_second is not None:
            stream_payload["events_per_second"] = events_per_second
        if max_events is not None:
            stream_payload["max_events"] = max_events
        if inject_anomalies is not None:
            stream_payload["inject_anomalies"] = inject_anomalies
        self._post_json("/api/stream/start", stream_payload)
        return StreamingSession(
            server_url=self._server_url,
            api_key=self._api_key,
            request_timeout=self._request_timeout,
        )

    def _write_config(self, config: Config, output_dir: str, output_spec: OutputSpec) -> str:
        """Serialize *config* to a temp YAML file and return its path.

        The caller is responsible for deleting the returned file.

        Raises:
            MissingDependencyError: If PyYAML is not installed.
        """
        yaml_spec = importlib.util.find_spec("yaml")
        if yaml_spec is None:
            raise MissingDependencyError(
                "PyYAML is required to generate config files. Install with `pip install PyYAML`."
            )
        import yaml  # type: ignore

        payload = config.to_dict()

        # Ensure output section exists with required fields
        if "output" not in payload:
            payload["output"] = {}
        payload["output"]["output_directory"] = output_dir

        # Map the wrapper's format name onto the CLI's; unknown values
        # fall back to csv.
        format_map = {"csv": "csv", "jsonl": "json", "parquet": "parquet"}
        cli_format = format_map.get(output_spec.format, "csv")
        payload["output"]["formats"] = [cli_format]

        data = yaml.safe_dump(payload, sort_keys=False)
        fd, path = tempfile.mkstemp(prefix="datasynth_", suffix=".yaml")
        os.close(fd)
        pathlib.Path(path).write_text(data, encoding="utf-8")
        return path

    def _resolve_output_dir(self, output: OutputSpec) -> str:
        """Return the directory generated files should be written to.

        Raises:
            ValueError: For an unrecognized sink.
        """
        if output.sink == "path" and output.path:
            return output.path
        # The memory sink also generates to a temp directory first, then
        # loads the files into pandas.
        if output.sink in ("temp_dir", "memory"):
            return tempfile.mkdtemp(prefix="datasynth_output_")
        raise ValueError(f"Unknown output sink: {output.sink}")

    def _run_cli(self, config_path: str, output_dir: str) -> None:
        """Invoke the CLI binary; raise RuntimeError on any failure."""
        command = [
            self._binary_path,
            "generate",
            "--config",
            config_path,
            "--output",
            output_dir,
        ]
        try:
            subprocess.run(command, check=True, capture_output=True, text=True)
        except FileNotFoundError as exc:
            raise RuntimeError(
                "datasynth-data binary not found. Build it with `cargo build --release` "
                "and set DATASYNTH_BINARY or pass binary_path."
            ) from exc
        except subprocess.CalledProcessError as exc:
            raise RuntimeError(
                f"datasynth-data failed: {exc.stderr or exc.stdout}"
            ) from exc

    def _post_json(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST *payload* as JSON to the server and return the decoded reply.

        Raises:
            RuntimeError: On an HTTP error status, with the response body.
        """
        url = f"{self._server_url}{path}"
        data = json.dumps(payload).encode("utf-8")
        headers = {"Content-Type": "application/json"}
        if self._api_key:
            headers["X-API-Key"] = self._api_key
        request = urllib.request.Request(url, data=data, headers=headers, method="POST")
        try:
            with urllib.request.urlopen(request, timeout=self._request_timeout) as response:
                body = response.read().decode("utf-8")
        except urllib.error.HTTPError as exc:
            body = exc.read().decode("utf-8")
            raise RuntimeError(f"Server error ({exc.code}): {body}") from exc
        return json.loads(body) if body else {}
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
@dataclass(frozen=True)
class StreamingSession:
    """Handle for a server-side streaming run.

    Offers control-plane calls (pause/resume/stop/trigger) over HTTP and an
    async event feed over WebSocket.
    """

    # Base URL of the DataSynth server (http:// or https://, no trailing slash).
    server_url: str
    # Optional API key sent as X-API-Key on every request.
    api_key: Optional[str]
    # Timeout (seconds) for control-plane HTTP requests.
    request_timeout: float

    def pause(self) -> Dict[str, Any]:
        """Pause event emission on the server."""
        return self._control("/api/stream/pause")

    def resume(self) -> Dict[str, Any]:
        """Resume a paused stream."""
        return self._control("/api/stream/resume")

    def stop(self) -> Dict[str, Any]:
        """Stop the streaming session."""
        return self._control("/api/stream/stop")

    def trigger_pattern(self, pattern: str) -> Dict[str, Any]:
        """Trigger a pattern in the streaming session.

        Args:
            pattern: Pattern name (year_end_spike, period_end_spike, fraud_cluster, etc.)

        Returns:
            Response from the server.
        """
        return self._control(f"/api/stream/trigger/{pattern}")

    def trigger_year_end(self) -> Dict[str, Any]:
        """Trigger year-end closing patterns (high volume, accruals, adjustments)."""
        return self.trigger_pattern("year_end_spike")

    def trigger_month_end(self) -> Dict[str, Any]:
        """Trigger month-end/period-end patterns."""
        return self.trigger_pattern("period_end_spike")

    def trigger_fraud_cluster(self) -> Dict[str, Any]:
        """Trigger a cluster of fraud-related transactions."""
        return self.trigger_pattern("fraud_cluster")

    def trigger_quarter_end(self) -> Dict[str, Any]:
        """Trigger quarter-end closing patterns."""
        return self.trigger_pattern("quarter_end_spike")

    async def events(self) -> AsyncIterator[Dict[str, Any]]:
        """Yield streamed events as decoded JSON dicts.

        Raises:
            MissingDependencyError: If the ``websockets`` package is absent.
        """
        websockets_spec = importlib.util.find_spec("websockets")
        if websockets_spec is None:
            raise MissingDependencyError(
                "The websockets package is required for streaming. Install with `pip install websockets`."
            )
        import websockets  # type: ignore

        # FIX: the previous `server_url.replace("http", "ws")` substituted
        # EVERY occurrence of "http" in the URL (breaking hosts such as
        # httpbin.org); only the scheme prefix must be rewritten
        # (http -> ws, https -> wss).
        if self.server_url.startswith("https://"):
            ws_url = "wss://" + self.server_url[len("https://"):]
        else:
            ws_url = "ws://" + self.server_url.removeprefix("http://")
        ws_url += "/ws/events"

        headers = []
        if self.api_key:
            headers.append(("X-API-Key", self.api_key))
        # NOTE(review): `extra_headers` matches websockets>=12 as pinned in
        # the package metadata; websockets>=14 renamed it `additional_headers`.
        async with websockets.connect(ws_url, extra_headers=headers) as websocket:
            async for message in websocket:
                yield json.loads(message)

    def _control(self, path: str) -> Dict[str, Any]:
        """POST an empty JSON body to *path* and return the decoded reply."""
        url = f"{self.server_url}{path}"
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["X-API-Key"] = self.api_key
        request = urllib.request.Request(url, data=b"{}", headers=headers, method="POST")
        with urllib.request.urlopen(request, timeout=self.request_timeout) as response:
            body = response.read().decode("utf-8")
        return json.loads(body) if body else {}
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _coerce_output_spec(value: "Optional[OutputSpec | Dict[str, Any]]") -> "OutputSpec":
    """Normalize *value* into an ``OutputSpec``.

    Accepts an existing ``OutputSpec`` (returned unchanged), ``None``
    (all defaults), or a mapping of keyword arguments.
    """
    if isinstance(value, OutputSpec):
        return value
    if value is None:
        return OutputSpec()
    return OutputSpec(**value)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _load_tables(output_dir: str, output_spec: OutputSpec) -> Dict[str, Any]:
|
|
268
|
+
if output_spec.table_format != "pandas":
|
|
269
|
+
raise ValueError("Only pandas table_format is supported in this wrapper.")
|
|
270
|
+
pandas_spec = importlib.util.find_spec("pandas")
|
|
271
|
+
if pandas_spec is None:
|
|
272
|
+
raise MissingDependencyError(
|
|
273
|
+
"pandas is required for in-memory tables. Install with `pip install pandas`."
|
|
274
|
+
)
|
|
275
|
+
import pandas as pd # type: ignore
|
|
276
|
+
|
|
277
|
+
tables: Dict[str, Any] = {}
|
|
278
|
+
directory = pathlib.Path(output_dir)
|
|
279
|
+
if output_spec.format == "csv":
|
|
280
|
+
for csv_path in directory.rglob("*.csv"):
|
|
281
|
+
tables[csv_path.stem] = pd.read_csv(csv_path)
|
|
282
|
+
elif output_spec.format == "jsonl":
|
|
283
|
+
for json_path in directory.rglob("*.jsonl"):
|
|
284
|
+
tables[json_path.stem] = pd.read_json(json_path, lines=True)
|
|
285
|
+
elif output_spec.format == "parquet":
|
|
286
|
+
for parquet_path in directory.rglob("*.parquet"):
|
|
287
|
+
tables[parquet_path.stem] = pd.read_parquet(parquet_path)
|
|
288
|
+
else:
|
|
289
|
+
raise ValueError(f"Unsupported format for memory loading: {output_spec.format}")
|
|
290
|
+
return tables
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def _config_to_server_payload(config: Config, seed: Optional[int]) -> Dict[str, Any]:
|
|
294
|
+
"""Convert Config to server API payload format."""
|
|
295
|
+
payload = config.to_dict()
|
|
296
|
+
global_settings = payload.get("global", {})
|
|
297
|
+
companies = payload.get("companies", [])
|
|
298
|
+
chart_of_accounts = payload.get("chart_of_accounts", {})
|
|
299
|
+
fraud = payload.get("fraud", {})
|
|
300
|
+
|
|
301
|
+
# Extract values from the new schema structure
|
|
302
|
+
industry = global_settings.get("industry", "retail")
|
|
303
|
+
complexity = chart_of_accounts.get("complexity", "small")
|
|
304
|
+
start_date = global_settings.get("start_date", "2024-01-01")
|
|
305
|
+
period_months = global_settings.get("period_months", 12)
|
|
306
|
+
seed_value = seed if seed is not None else global_settings.get("seed")
|
|
307
|
+
|
|
308
|
+
# Companies is now a list of company configs
|
|
309
|
+
company_payloads: List[Dict[str, Any]] = []
|
|
310
|
+
if isinstance(companies, list):
|
|
311
|
+
for company in companies:
|
|
312
|
+
company_payloads.append({
|
|
313
|
+
"code": company.get("code", "C001"),
|
|
314
|
+
"name": company.get("name", "Company"),
|
|
315
|
+
"currency": company.get("currency", "USD"),
|
|
316
|
+
"country": company.get("country", "US"),
|
|
317
|
+
"annual_transaction_volume": 10000,
|
|
318
|
+
"volume_weight": company.get("volume_weight", 1.0),
|
|
319
|
+
})
|
|
320
|
+
else:
|
|
321
|
+
# Fallback for legacy format
|
|
322
|
+
company_payloads.append({
|
|
323
|
+
"code": "C001",
|
|
324
|
+
"name": "Company 1",
|
|
325
|
+
"currency": "USD",
|
|
326
|
+
"country": "US",
|
|
327
|
+
"annual_transaction_volume": 10000,
|
|
328
|
+
"volume_weight": 1.0,
|
|
329
|
+
})
|
|
330
|
+
|
|
331
|
+
# Extract fraud settings
|
|
332
|
+
fraud_enabled = fraud.get("enabled", False)
|
|
333
|
+
fraud_rate = fraud.get("rate", 0.0)
|
|
334
|
+
|
|
335
|
+
return {
|
|
336
|
+
"industry": industry,
|
|
337
|
+
"start_date": start_date,
|
|
338
|
+
"period_months": period_months,
|
|
339
|
+
"seed": seed_value,
|
|
340
|
+
"coa_complexity": complexity,
|
|
341
|
+
"companies": company_payloads,
|
|
342
|
+
"fraud_enabled": fraud_enabled,
|
|
343
|
+
"fraud_rate": fraud_rate,
|
|
344
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Configuration helpers for datasynth_py."""
|
|
2
|
+
|
|
3
|
+
from datasynth_py.config import blueprints
|
|
4
|
+
from datasynth_py.config.models import (
|
|
5
|
+
AuditSettings,
|
|
6
|
+
BankingSettings,
|
|
7
|
+
ChartOfAccountsSettings,
|
|
8
|
+
CompanyConfig,
|
|
9
|
+
CompanySettings, # Legacy alias
|
|
10
|
+
Config,
|
|
11
|
+
DataQualitySettings,
|
|
12
|
+
FraudSettings,
|
|
13
|
+
GlobalSettings,
|
|
14
|
+
GraphExportSettings,
|
|
15
|
+
OutputSettings,
|
|
16
|
+
ScenarioSettings,
|
|
17
|
+
TemporalDriftSettings,
|
|
18
|
+
TransactionSettings,
|
|
19
|
+
)
|
|
20
|
+
from datasynth_py.config.validation import ConfigValidationError
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"AuditSettings",
|
|
24
|
+
"BankingSettings",
|
|
25
|
+
"ChartOfAccountsSettings",
|
|
26
|
+
"CompanyConfig",
|
|
27
|
+
"CompanySettings",
|
|
28
|
+
"Config",
|
|
29
|
+
"ConfigValidationError",
|
|
30
|
+
"DataQualitySettings",
|
|
31
|
+
"FraudSettings",
|
|
32
|
+
"GlobalSettings",
|
|
33
|
+
"GraphExportSettings",
|
|
34
|
+
"OutputSettings",
|
|
35
|
+
"ScenarioSettings",
|
|
36
|
+
"TemporalDriftSettings",
|
|
37
|
+
"TransactionSettings",
|
|
38
|
+
"blueprints",
|
|
39
|
+
]
|