loggen-lg 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. loggen_lg-0.1.0/MANIFEST.in +4 -0
  2. loggen_lg-0.1.0/PKG-INFO +141 -0
  3. loggen_lg-0.1.0/README.md +128 -0
  4. loggen_lg-0.1.0/loggen-lg/__init__.py +32 -0
  5. loggen_lg-0.1.0/loggen-lg/__main__.py +3 -0
  6. loggen_lg-0.1.0/loggen-lg/_version.py +1 -0
  7. loggen_lg-0.1.0/loggen-lg/cli.py +221 -0
  8. loggen_lg-0.1.0/loggen-lg/core/__init__.py +5 -0
  9. loggen_lg-0.1.0/loggen-lg/core/generator.py +82 -0
  10. loggen_lg-0.1.0/loggen-lg/core/loaders.py +48 -0
  11. loggen_lg-0.1.0/loggen-lg/core/session.py +159 -0
  12. loggen_lg-0.1.0/loggen-lg/core/writer.py +54 -0
  13. loggen_lg-0.1.0/loggen-lg/formats/__init__.py +31 -0
  14. loggen_lg-0.1.0/loggen-lg/formats/cef.py +62 -0
  15. loggen_lg-0.1.0/loggen-lg/formats/json_fmt.py +10 -0
  16. loggen_lg-0.1.0/loggen-lg/formats/syslog_fmt.py +51 -0
  17. loggen_lg-0.1.0/loggen-lg/profiles/__init__.py +22 -0
  18. loggen_lg-0.1.0/loggen-lg/profiles/base.py +51 -0
  19. loggen_lg-0.1.0/loggen-lg/profiles/edr.py +332 -0
  20. loggen_lg-0.1.0/loggen-lg/profiles/linux.py +319 -0
  21. loggen_lg-0.1.0/loggen-lg/profiles/network.py +242 -0
  22. loggen_lg-0.1.0/loggen-lg/profiles/windows.py +510 -0
  23. loggen_lg-0.1.0/loggen-lg/scenarios/__init__.py +26 -0
  24. loggen_lg-0.1.0/loggen-lg/scenarios/base.py +31 -0
  25. loggen_lg-0.1.0/loggen-lg/scenarios/builtin.py +373 -0
  26. loggen_lg-0.1.0/loggen_lg.egg-info/PKG-INFO +141 -0
  27. loggen_lg-0.1.0/loggen_lg.egg-info/SOURCES.txt +30 -0
  28. loggen_lg-0.1.0/loggen_lg.egg-info/dependency_links.txt +1 -0
  29. loggen_lg-0.1.0/loggen_lg.egg-info/entry_points.txt +2 -0
  30. loggen_lg-0.1.0/loggen_lg.egg-info/top_level.txt +1 -0
  31. loggen_lg-0.1.0/pyproject.toml +30 -0
  32. loggen_lg-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,4 @@
1
+ global-exclude *.pyc
2
+ global-exclude *.pyo
3
+ global-exclude __pycache__
4
+ prune .vscode
@@ -0,0 +1,141 @@
1
+ Metadata-Version: 2.4
2
+ Name: loggen-lg
3
+ Version: 0.1.0
4
+ Summary: Synthetic log generator for SIEM and IR exercises
5
+ License-Expression: MIT
6
+ Keywords: siem,logs,security,testing,ir,blue-team
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Information Technology
9
+ Classifier: Topic :: Security
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+
14
+ # loggen
15
+
16
+ Synthetic log generator for SIEM and IR exercises. Reproducible, deterministic, supports arbitrary scales from MB to TB. Profiles for Windows, Linux, network, and EDR-like telemetry.
17
+
18
+ ## Install
19
+
20
+ ```bash
21
+ pip install -e .
22
+ ```
23
+
24
+ ## Quick start
25
+
26
+ ```python
27
+ from loggen import LogGen, Session
28
+
29
+ gen = LogGen(seed=42)
30
+
31
+ # Stream 1 000 mixed entries
32
+ for entry in gen.stream(["windows", "linux", "edr"], count=1000):
33
+ print(entry.asdict())
34
+
35
+ # Write 1 GB of mixed logs (gzip)
36
+ gen.write("corpus.jsonl.gz", ["windows", "linux", "network", "edr"], size="1GB")
37
+
38
+ # Run a built-in attack scenario
39
+ gen.write("attack.jsonl", [], scenario="lateral_movement", count=50)
40
+
41
+ # In-memory list
42
+ entries = gen.to_list(["network"], count=200)
43
+ ```
44
+
45
+ ## Custom data pools
46
+
47
+ By default loggen ships with a built-in set of hostnames, users, and IPs.
48
+ You can replace any of them with your own data.
49
+
50
+ **Inline lists**
51
+ ```python
52
+ from loggen import Session, LogGen
53
+
54
+ session = Session(
55
+ seed=42,
56
+ hosts=["prod-web-01", "prod-db-01", "prod-cache-01"],
57
+ users=["alice", "bob", "charlie"],
58
+ )
59
+ gen = LogGen(session=session)
60
+ ```
61
+
62
+ **From a file**
63
+ ```python
64
+ from loggen import Session, LogGen, WordList
65
+
66
+ session = Session(
67
+ seed=42,
68
+ hosts=WordList.from_file("wordlists/hosts.txt"),
69
+ users=WordList.from_file("wordlists/users.json"),
70
+ internal_ips=WordList.from_file("wordlists/ips.csv", column=0),
71
+ )
72
+ gen = LogGen(session=session)
73
+ ```
74
+
75
+ **From a JSON config file**
76
+ ```python
77
+ from loggen import Session, LogGen
78
+
79
+ session = Session.from_config("config/loggen.json")
80
+ gen = LogGen(session=session)
81
+ ```
82
+
83
+ See [WIKI.md](WIKI.md) for the config file format and all supported options.
84
+
85
+ **From an environment variable**
86
+ ```python
87
+ from loggen import WordList
88
+
89
+ hosts = WordList.from_env("LOGGEN_HOSTS") # comma-separated
90
+ users = WordList.from_env("LOGGEN_USERS", fallback=["admin"])
91
+ ```
92
+
93
+ ## CLI
94
+
95
+ ```bash
96
+ # 10 000 mixed entries to stdout (JSON Lines)
97
+ python -m loggen generate -p windows -p linux -n 10000
98
+
99
+ # 500 MB of network logs, gzip, reproducible
100
+ python -m loggen generate -p network --size 500MB --seed 42 -o network.jsonl.gz
101
+
102
+ # Brute-force scenario in CEF format
103
+ python -m loggen generate --scenario brute_force -f cef -o brute.cef
104
+
105
+ # Custom profile weights
106
+ python -m loggen generate -p windows -p edr --weight windows:4 --weight edr:1 -n 50000 -o out.jsonl
107
+
108
+ # List available options
109
+ python -m loggen list-profiles
110
+ python -m loggen list-scenarios
111
+ python -m loggen list-formats
112
+ ```
113
+
114
+ ## Profiles
115
+
116
+ | Profile | Covers |
117
+ |-----------|--------|
118
+ | `windows` | Security events 4624/4625/4688/4672/4698/7045/5140/4104 … |
119
+ | `linux` | sshd auth, sudo, cron, systemd, auditd, PAM, kernel |
120
+ | `network` | Firewall allow/deny, DNS, HTTP proxy, DHCP |
121
+ | `edr` | Process/file/network/registry/module events with hashes |
122
+
123
+ ## Scenarios
124
+
125
+ | Scenario | Description |
126
+ |--------------------|-------------|
127
+ | `brute_force` | Repeated logon failures → success |
128
+ | `lateral_movement` | Recon → SMB share access → remote execution |
129
+ | `priv_esc` | Service install → SYSTEM shell → 4672 |
130
+ | `data_exfil` | Archive creation → large outbound transfers → DNS tunnelling |
131
+ | `persistence` | Registry run key + scheduled task + binary drop |
132
+
133
+ ## Output formats
134
+
135
+ | Format | Description |
136
+ |----------|-------------|
137
+ | `jsonl` | JSON Lines / NDJSON (default) |
138
+ | `cef` | ArcSight CEF:0 |
139
+ | `syslog` | RFC 5424 syslog |
140
+
141
+ See [WIKI.md](WIKI.md) for full API reference.
@@ -0,0 +1,128 @@
1
+ # loggen
2
+
3
+ Synthetic log generator for SIEM and IR exercises. Reproducible, deterministic, supports arbitrary scales from MB to TB. Profiles for Windows, Linux, network, and EDR-like telemetry.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install -e .
9
+ ```
10
+
11
+ ## Quick start
12
+
13
+ ```python
14
+ from loggen import LogGen, Session
15
+
16
+ gen = LogGen(seed=42)
17
+
18
+ # Stream 1 000 mixed entries
19
+ for entry in gen.stream(["windows", "linux", "edr"], count=1000):
20
+ print(entry.asdict())
21
+
22
+ # Write 1 GB of mixed logs (gzip)
23
+ gen.write("corpus.jsonl.gz", ["windows", "linux", "network", "edr"], size="1GB")
24
+
25
+ # Run a built-in attack scenario
26
+ gen.write("attack.jsonl", [], scenario="lateral_movement", count=50)
27
+
28
+ # In-memory list
29
+ entries = gen.to_list(["network"], count=200)
30
+ ```
31
+
32
+ ## Custom data pools
33
+
34
+ By default loggen ships with a built-in set of hostnames, users, and IPs.
35
+ You can replace any of them with your own data.
36
+
37
+ **Inline lists**
38
+ ```python
39
+ from loggen import Session, LogGen
40
+
41
+ session = Session(
42
+ seed=42,
43
+ hosts=["prod-web-01", "prod-db-01", "prod-cache-01"],
44
+ users=["alice", "bob", "charlie"],
45
+ )
46
+ gen = LogGen(session=session)
47
+ ```
48
+
49
+ **From a file**
50
+ ```python
51
+ from loggen import Session, LogGen, WordList
52
+
53
+ session = Session(
54
+ seed=42,
55
+ hosts=WordList.from_file("wordlists/hosts.txt"),
56
+ users=WordList.from_file("wordlists/users.json"),
57
+ internal_ips=WordList.from_file("wordlists/ips.csv", column=0),
58
+ )
59
+ gen = LogGen(session=session)
60
+ ```
61
+
62
+ **From a JSON config file**
63
+ ```python
64
+ from loggen import Session, LogGen
65
+
66
+ session = Session.from_config("config/loggen.json")
67
+ gen = LogGen(session=session)
68
+ ```
69
+
70
+ See [WIKI.md](WIKI.md) for the config file format and all supported options.
71
+
72
+ **From an environment variable**
73
+ ```python
74
+ from loggen import WordList
75
+
76
+ hosts = WordList.from_env("LOGGEN_HOSTS") # comma-separated
77
+ users = WordList.from_env("LOGGEN_USERS", fallback=["admin"])
78
+ ```
79
+
80
+ ## CLI
81
+
82
+ ```bash
83
+ # 10 000 mixed entries to stdout (JSON Lines)
84
+ python -m loggen generate -p windows -p linux -n 10000
85
+
86
+ # 500 MB of network logs, gzip, reproducible
87
+ python -m loggen generate -p network --size 500MB --seed 42 -o network.jsonl.gz
88
+
89
+ # Brute-force scenario in CEF format
90
+ python -m loggen generate --scenario brute_force -f cef -o brute.cef
91
+
92
+ # Custom profile weights
93
+ python -m loggen generate -p windows -p edr --weight windows:4 --weight edr:1 -n 50000 -o out.jsonl
94
+
95
+ # List available options
96
+ python -m loggen list-profiles
97
+ python -m loggen list-scenarios
98
+ python -m loggen list-formats
99
+ ```
100
+
101
+ ## Profiles
102
+
103
+ | Profile | Covers |
104
+ |-----------|--------|
105
+ | `windows` | Security events 4624/4625/4688/4672/4698/7045/5140/4104 … |
106
+ | `linux` | sshd auth, sudo, cron, systemd, auditd, PAM, kernel |
107
+ | `network` | Firewall allow/deny, DNS, HTTP proxy, DHCP |
108
+ | `edr` | Process/file/network/registry/module events with hashes |
109
+
110
+ ## Scenarios
111
+
112
+ | Scenario | Description |
113
+ |--------------------|-------------|
114
+ | `brute_force` | Repeated logon failures → success |
115
+ | `lateral_movement` | Recon → SMB share access → remote execution |
116
+ | `priv_esc` | Service install → SYSTEM shell → 4672 |
117
+ | `data_exfil` | Archive creation → large outbound transfers → DNS tunnelling |
118
+ | `persistence` | Registry run key + scheduled task + binary drop |
119
+
120
+ ## Output formats
121
+
122
+ | Format | Description |
123
+ |----------|-------------|
124
+ | `jsonl` | JSON Lines / NDJSON (default) |
125
+ | `cef` | ArcSight CEF:0 |
126
+ | `syslog` | RFC 5424 syslog |
127
+
128
+ See [WIKI.md](WIKI.md) for full API reference.
@@ -0,0 +1,32 @@
1
+ from loggen._version import VERSION
2
+ from loggen.core.session import Session
3
+ from loggen.core.generator import LogGen
4
+ from loggen.core.loaders import WordList
5
+ from loggen.profiles.base import LogEntry
6
+ from loggen.profiles import (
7
+ WindowsProfile,
8
+ LinuxProfile,
9
+ NetworkProfile,
10
+ EDRProfile,
11
+ PROFILE_REGISTRY,
12
+ )
13
+ from loggen.formats import Format, get_formatter
14
+ from loggen.scenarios import SCENARIO_REGISTRY
15
+
16
+ __version__ = VERSION
17
+
18
+ __all__ = [
19
+ "LogGen",
20
+ "Session",
21
+ "LogEntry",
22
+ "WordList",
23
+ "Format",
24
+ "get_formatter",
25
+ "WindowsProfile",
26
+ "LinuxProfile",
27
+ "NetworkProfile",
28
+ "EDRProfile",
29
+ "PROFILE_REGISTRY",
30
+ "SCENARIO_REGISTRY",
31
+ "__version__",
32
+ ]
@@ -0,0 +1,3 @@
1
+ from loggen.cli import main
2
+
3
+ main()
@@ -0,0 +1 @@
1
+ VERSION = "0.1.0"
@@ -0,0 +1,221 @@
1
+ # loggen CLI
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+ from loggen._version import VERSION
9
+ from loggen.core.session import Session
10
+ from loggen.core.generator import LogGen
11
+ from loggen.profiles import PROFILE_REGISTRY
12
+ from loggen.formats import Format
13
+ from loggen.scenarios import SCENARIO_REGISTRY
14
+
15
+
16
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the ``loggen`` command-line parser and its sub-commands.

    Returns:
        A parser exposing ``generate`` (alias ``gen``) and the three
        ``list-*`` commands. The selected sub-command name is stored on the
        parsed namespace as ``command``.
    """
    # Rendered once; both strings are embedded in the option help texts below.
    profile_names = ", ".join(sorted(PROFILE_REGISTRY))
    scenario_names = ", ".join(sorted(SCENARIO_REGISTRY))

    parser = argparse.ArgumentParser(
        prog="loggen",
        description="Synthetic log generator for SIEM and IR exercises.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_EXAMPLES,
    )
    parser.add_argument("--version", action="version", version=f"loggen {VERSION}")

    commands = parser.add_subparsers(dest="command", metavar="COMMAND")

    # ---- generate -------------------------------------------------------
    generate = commands.add_parser(
        "generate", aliases=["gen"], help="Generate log entries."
    )
    generate.add_argument(
        "-p", "--profile",
        dest="profiles",
        action="append",
        default=[],
        metavar="NAME",
        help=f"Profile to include (repeatable). Choices: {profile_names}",
    )
    generate.add_argument(
        "-s", "--scenario",
        metavar="NAME",
        help="Run a built-in scenario instead of random profiles. "
             f"Choices: {scenario_names}",
    )
    generate.add_argument(
        "-n", "--count",
        type=int,
        default=None,
        metavar="N",
        help="Number of log entries to generate.",
    )
    generate.add_argument(
        "--size",
        metavar="SIZE",
        help="Target output size, e.g. 100MB, 1GB, 2TB. Overrides --count.",
    )
    generate.add_argument(
        "-f", "--format",
        default="jsonl",
        choices=[fmt.value for fmt in Format],
        help="Output format (default: jsonl).",
    )
    generate.add_argument(
        "-o", "--output",
        default=None,
        metavar="PATH",
        help="Output file path. Use .gz suffix for gzip. Defaults to stdout.",
    )
    generate.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Random seed for reproducibility (default: 0).",
    )
    generate.add_argument(
        "--start",
        metavar="DATETIME",
        help="Log start timestamp ISO-8601 (default: 2024-01-01T00:00:00Z).",
    )
    generate.add_argument(
        "--duration",
        type=float,
        default=24.0,
        metavar="HOURS",
        help="Time window in hours (default: 24).",
    )
    generate.add_argument(
        "--weight",
        dest="weights",
        action="append",
        default=[],
        metavar="PROFILE:WEIGHT",
        help="Relative sampling weight, e.g. windows:3 (repeatable).",
    )
    generate.add_argument(
        "--progress",
        action="store_true",
        help="Print progress to stderr.",
    )

    # ---- list-profiles / list-scenarios / list-formats --------------------
    # These commands take no options of their own, so they share one loop.
    listing_commands = (
        ("list-profiles", "profiles", "List available profiles."),
        ("list-scenarios", "scenarios", "List available scenarios."),
        ("list-formats", "formats", "List available output formats."),
    )
    for command_name, alias, summary in listing_commands:
        commands.add_parser(command_name, aliases=[alias], help=summary)

    return parser
101
+
102
+
103
+ _EXAMPLES = """
104
+ examples:
105
+ # 1 000 Windows + Linux events to stdout
106
+ loggen generate -p windows -p linux -n 1000
107
+
108
+ # 500 MB of network logs (gzipped), deterministic seed
109
+ loggen generate -p network --size 500MB --seed 42 -o network.jsonl.gz
110
+
111
+ # Run a brute-force scenario, CEF format
112
+ loggen generate --scenario brute_force -f cef -o brute.cef
113
+
114
+ # Mix all profiles with custom weights
115
+ loggen generate -p windows -p linux -p network -p edr \\
116
+ --weight windows:4 --weight edr:2 -n 50000 -o full.jsonl
117
+
118
+ # List what's available
119
+ loggen list-profiles
120
+ loggen list-scenarios
121
+ """
122
+
123
+
124
+ def _parse_weights(raw: list[str]) -> dict[str, float]:
125
+ result: dict[str, float] = {}
126
+ for item in raw:
127
+ if ":" not in item:
128
+ raise SystemExit(f"Invalid weight format {item!r} — expected PROFILE:WEIGHT")
129
+ k, v = item.rsplit(":", 1)
130
+ result[k] = float(v)
131
+ return result
132
+
133
+
134
+ def _parse_start(s: str | None):
135
+ if not s:
136
+ return None
137
+ from datetime import datetime, timezone
138
+ for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
139
+ try:
140
+ dt = datetime.strptime(s, fmt)
141
+ return dt.replace(tzinfo=timezone.utc)
142
+ except ValueError:
143
+ continue
144
+ raise SystemExit(f"Cannot parse datetime {s!r}. Use ISO-8601, e.g. 2024-06-01T00:00:00Z")
145
+
146
+
147
def main(argv: list[str] | None = None) -> None:
    """Entry point of the ``loggen`` command-line interface.

    Args:
        argv: Arguments to parse. ``None`` is passed straight to
            ``ArgumentParser.parse_args``, which then reads ``sys.argv``.

    Prints the help text when no sub-command (or an unrecognised one) is
    given; otherwise dispatches to the matching listing or to
    ``_run_generate``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    command = args.command

    if command in ("generate", "gen"):
        _run_generate(args)
    elif command in ("list-profiles", "profiles"):
        print("Available profiles:")
        for profile_name in sorted(PROFILE_REGISTRY):
            print(f" {profile_name}")
    elif command in ("list-scenarios", "scenarios"):
        print("Available scenarios:")
        from loggen.scenarios import SCENARIO_REGISTRY as scenario_registry
        for scenario_name, scenario_cls in sorted(scenario_registry.items()):
            print(f" {scenario_name:<20} {scenario_cls.description}")
    elif command in ("list-formats", "formats"):
        print("Available formats:")
        for fmt in Format:
            print(f" {fmt.value}")
    else:
        # Covers both "no sub-command" (None) and anything unrecognised.
        parser.print_help()
179
+
180
+
181
def _run_generate(args) -> None:
    """Run the ``generate`` sub-command from a parsed argument namespace.

    Args:
        args: Namespace produced by the parser from ``_build_parser``. The
            attributes read here are ``profiles``, ``scenario``, ``count``,
            ``size``, ``start``, ``seed``, ``duration``, ``weights``,
            ``output``, ``format`` and ``progress``.

    Raises:
        SystemExit: If neither a profile nor a scenario was given, if no
            ``--count``/``--size`` was given for profile-based generation,
            if a profile or scenario name is unknown, or if ``--start`` /
            ``--weight`` cannot be parsed.
    """
    if not args.profiles and not args.scenario:
        raise SystemExit(
            "Specify at least one --profile or a --scenario.\n"
            f"Profiles: {', '.join(sorted(PROFILE_REGISTRY))}\n"
            f"Scenarios: {', '.join(sorted(SCENARIO_REGISTRY))}"
        )

    if not args.count and not args.size and not args.scenario:
        raise SystemExit("Specify --count N or --size SIZE (e.g. --size 100MB).")

    # Validate names before any output is produced. Without this, a typo only
    # surfaces once generation starts, as an uncaught ValueError traceback.
    # Profiles are checked only when no scenario is selected, because a
    # scenario takes precedence over profiles during generation.
    if args.scenario:
        if args.scenario not in SCENARIO_REGISTRY:
            raise SystemExit(
                f"Unknown scenario {args.scenario!r}. "
                f"Available: {', '.join(sorted(SCENARIO_REGISTRY))}"
            )
    else:
        unknown = [name for name in args.profiles if name not in PROFILE_REGISTRY]
        if unknown:
            raise SystemExit(
                f"Unknown profile(s): {', '.join(repr(name) for name in unknown)}. "
                f"Available: {', '.join(sorted(PROFILE_REGISTRY))}"
            )

    start = _parse_start(args.start)
    session_kwargs: dict = {"seed": args.seed, "duration_hours": args.duration}
    if start:
        # Only override the session's own default start when one was supplied.
        session_kwargs["start"] = start
    session = Session(**session_kwargs)
    gen = LogGen(session=session)

    weights = _parse_weights(args.weights)
    dest = args.output  # None → stdout

    try:
        written = gen.write(
            dest=dest,
            profiles=args.profiles,
            count=args.count,
            size=args.size,
            format=args.format,
            progress=args.progress,
            weights=weights if weights else None,
            scenario=args.scenario,
        )
    except (BrokenPipeError, KeyboardInterrupt):
        # Deliberate best-effort: a closed pipe (e.g. `| head`) or Ctrl-C
        # ends generation quietly instead of printing a traceback.
        pass
    else:
        if args.output and args.progress:
            sys.stderr.write(f"Wrote {written / 1_048_576:.2f} MB → {args.output}\n")
218
+
219
+
220
+ if __name__ == "__main__":
221
+ main()
@@ -0,0 +1,5 @@
1
+ from loggen.core.session import Session
2
+ from loggen.core.generator import LogGen
3
+ from loggen.core.loaders import WordList
4
+
5
+ __all__ = ["Session", "LogGen", "WordList"]
@@ -0,0 +1,82 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterator
4
+
5
+ from loggen.core.session import Session
6
+ from loggen.profiles.base import LogEntry
7
+ from loggen.profiles import PROFILE_REGISTRY
8
+
9
+
10
+ def _parse_size(s: str | int | None) -> int | None:
11
+ if s is None:
12
+ return None
13
+ if isinstance(s, int):
14
+ return s
15
+ s = s.strip().upper()
16
+ for unit, mult in [("TB", 1 << 40), ("GB", 1 << 30), ("MB", 1 << 20), ("KB", 1 << 10), ("B", 1)]:
17
+ if s.endswith(unit):
18
+ return int(float(s[: -len(unit)]) * mult)
19
+ return int(s)
20
+
21
+
22
class LogGen:
    """Generate log entries from profiles or a scenario, driven by a ``Session``."""

    def __init__(self, seed: int = 0, session: Session | None = None) -> None:
        """Use ``session`` when given (and truthy), else create one from ``seed``."""
        self.session = session if session else Session(seed=seed)

    def stream(
        self,
        profiles: list[str],
        count: int | None = None,
        weights: dict[str, float] | None = None,
        scenario: str | None = None,
    ) -> Iterator[LogEntry]:
        """Lazily yield log entries.

        Args:
            profiles: Profile names to sample from; ignored when ``scenario``
                is set.
            count: Number of entries to yield; ``None`` means unbounded.
            weights: Optional per-profile sampling weights. Profiles that are
                not listed get a weight of ``1.0``.
            scenario: Name of a scenario to run instead of profile sampling.

        Raises:
            ValueError: If no profile is given or a profile name is unknown.
                As this is a generator, the error surfaces on first iteration.
        """
        if scenario:
            yield from self._stream_scenario(scenario, count)
            return

        if not profiles:
            raise ValueError("At least one profile required.")

        for candidate in profiles:
            if candidate not in PROFILE_REGISTRY:
                raise ValueError(
                    f"Unknown profile {candidate!r}. Available: {sorted(PROFILE_REGISTRY)}"
                )

        # One profile instance per requested name, paired with that name so
        # the weight lookup below stays aligned with the population.
        population = [(label, PROFILE_REGISTRY[label]()) for label in profiles]
        weight_table = weights or {}
        relative_weights = [weight_table.get(label, 1.0) for label, _ in population]

        rng = self.session.rng
        emitted = 0
        while count is None or emitted < count:
            _, chosen = rng.choices(population, weights=relative_weights, k=1)[0]
            # Each entry is generated from a session forked on its index.
            yield chosen.generate_one(self.session.fork(emitted))
            emitted += 1

    def to_list(self, profiles: list[str], count: int, **kwargs) -> list[LogEntry]:
        """Materialise ``stream(profiles, count=count, **kwargs)`` as a list."""
        entries = self.stream(profiles, count=count, **kwargs)
        return list(entries)

    def write(
        self,
        dest,
        profiles: list[str],
        count: int | None = None,
        size: str | int | None = None,
        format: str = "jsonl",
        progress: bool = False,
        **kwargs,
    ) -> int:
        """Stream entries to ``dest`` and return ``write_stream``'s result.

        Args:
            dest: Destination handed to ``write_stream`` unchanged.
            profiles: Profile names, forwarded to :meth:`stream`.
            count: Entry limit, forwarded to :meth:`stream`.
            size: Size limit as bytes or text such as ``"1GB"``.
            format: Name of the output formatter.
            progress: Forwarded to ``write_stream``.
            **kwargs: Extra keyword arguments for :meth:`stream`
                (e.g. ``weights``, ``scenario``).
        """
        from loggen.formats import get_formatter
        from loggen.core.writer import write_stream

        formatter = get_formatter(format)
        size_limit = _parse_size(size)
        source = self.stream(profiles, count=count, **kwargs)
        return write_stream(
            source, dest, formatter, size_limit=size_limit, progress=progress
        )

    def _stream_scenario(self, scenario: str, count: int | None) -> Iterator[LogEntry]:
        """Yield the entries of the registered scenario named ``scenario``.

        Raises:
            ValueError: If no scenario with that name is registered.
        """
        from loggen.scenarios import SCENARIO_REGISTRY

        scenario_cls = SCENARIO_REGISTRY.get(scenario)
        if scenario_cls is not None:
            yield from scenario_cls(self.session).stream(count=count)
            return
        raise ValueError(
            f"Unknown scenario {scenario!r}. Available: {sorted(SCENARIO_REGISTRY)}"
        )
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+
8
+
9
+ class WordList(list):
10
+ """A list that can be populated from files or env vars."""
11
+
12
+ @classmethod
13
+ def from_file(cls, path: str | Path, *, column: int = 0, comment: str = "#") -> "WordList":
14
+ """Load from .txt, .json, or .csv."""
15
+ path = str(path)
16
+ ext = os.path.splitext(path)[1].lower()
17
+
18
+ if ext == ".json":
19
+ with open(path) as f:
20
+ data = json.load(f)
21
+ if not isinstance(data, list):
22
+ raise ValueError(f"{path}: expected a JSON array")
23
+ return cls(str(x) for x in data)
24
+
25
+ if ext == ".csv":
26
+ with open(path, newline="") as f:
27
+ rows = csv.reader(f)
28
+ return cls(
29
+ row[column]
30
+ for row in rows
31
+ if row and not row[0].startswith(comment)
32
+ )
33
+
34
+ # plain text — one item per line
35
+ with open(path) as f:
36
+ return cls(
37
+ line.strip()
38
+ for line in f
39
+ if line.strip() and not line.startswith(comment)
40
+ )
41
+
42
+ @classmethod
43
+ def from_env(cls, var: str, sep: str = ",", fallback: list[str] | None = None) -> "WordList":
44
+ """Load from a comma-separated environment variable."""
45
+ raw = os.environ.get(var, "")
46
+ if raw:
47
+ return cls(item.strip() for item in raw.split(sep) if item.strip())
48
+ return cls(fallback or [])