loggen-lg 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loggen_lg-0.1.0/MANIFEST.in +4 -0
- loggen_lg-0.1.0/PKG-INFO +141 -0
- loggen_lg-0.1.0/README.md +128 -0
- loggen_lg-0.1.0/loggen-lg/__init__.py +32 -0
- loggen_lg-0.1.0/loggen-lg/__main__.py +3 -0
- loggen_lg-0.1.0/loggen-lg/_version.py +1 -0
- loggen_lg-0.1.0/loggen-lg/cli.py +221 -0
- loggen_lg-0.1.0/loggen-lg/core/__init__.py +5 -0
- loggen_lg-0.1.0/loggen-lg/core/generator.py +82 -0
- loggen_lg-0.1.0/loggen-lg/core/loaders.py +48 -0
- loggen_lg-0.1.0/loggen-lg/core/session.py +159 -0
- loggen_lg-0.1.0/loggen-lg/core/writer.py +54 -0
- loggen_lg-0.1.0/loggen-lg/formats/__init__.py +31 -0
- loggen_lg-0.1.0/loggen-lg/formats/cef.py +62 -0
- loggen_lg-0.1.0/loggen-lg/formats/json_fmt.py +10 -0
- loggen_lg-0.1.0/loggen-lg/formats/syslog_fmt.py +51 -0
- loggen_lg-0.1.0/loggen-lg/profiles/__init__.py +22 -0
- loggen_lg-0.1.0/loggen-lg/profiles/base.py +51 -0
- loggen_lg-0.1.0/loggen-lg/profiles/edr.py +332 -0
- loggen_lg-0.1.0/loggen-lg/profiles/linux.py +319 -0
- loggen_lg-0.1.0/loggen-lg/profiles/network.py +242 -0
- loggen_lg-0.1.0/loggen-lg/profiles/windows.py +510 -0
- loggen_lg-0.1.0/loggen-lg/scenarios/__init__.py +26 -0
- loggen_lg-0.1.0/loggen-lg/scenarios/base.py +31 -0
- loggen_lg-0.1.0/loggen-lg/scenarios/builtin.py +373 -0
- loggen_lg-0.1.0/loggen_lg.egg-info/PKG-INFO +141 -0
- loggen_lg-0.1.0/loggen_lg.egg-info/SOURCES.txt +30 -0
- loggen_lg-0.1.0/loggen_lg.egg-info/dependency_links.txt +1 -0
- loggen_lg-0.1.0/loggen_lg.egg-info/entry_points.txt +2 -0
- loggen_lg-0.1.0/loggen_lg.egg-info/top_level.txt +1 -0
- loggen_lg-0.1.0/pyproject.toml +30 -0
- loggen_lg-0.1.0/setup.cfg +4 -0
loggen_lg-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: loggen-lg
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Synthetic log generator for SIEM and IR exercises
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Keywords: siem,logs,security,testing,ir,blue-team
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Intended Audience :: Information Technology
|
|
9
|
+
Classifier: Topic :: Security
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# loggen
|
|
15
|
+
|
|
16
|
+
Synthetic log generator for SIEM and IR exercises. Output is deterministic and reproducible for a given seed, and scales from megabytes to terabytes. Ships with profiles for Windows, Linux, network, and EDR-like telemetry.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install -e .
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick start
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from loggen import LogGen, Session
|
|
28
|
+
|
|
29
|
+
gen = LogGen(seed=42)
|
|
30
|
+
|
|
31
|
+
# Stream 1 000 mixed entries
|
|
32
|
+
for entry in gen.stream(["windows", "linux", "edr"], count=1000):
|
|
33
|
+
print(entry.asdict())
|
|
34
|
+
|
|
35
|
+
# Write 1 GB of mixed logs (gzip)
|
|
36
|
+
gen.write("corpus.jsonl.gz", ["windows", "linux", "network", "edr"], size="1GB")
|
|
37
|
+
|
|
38
|
+
# Run a built-in attack scenario
|
|
39
|
+
gen.write("attack.jsonl", [], scenario="lateral_movement", count=50)
|
|
40
|
+
|
|
41
|
+
# In-memory list
|
|
42
|
+
entries = gen.to_list(["network"], count=200)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Custom data pools
|
|
46
|
+
|
|
47
|
+
By default loggen ships with a built-in set of hostnames, users, and IPs.
|
|
48
|
+
You can replace any of them with your own data.
|
|
49
|
+
|
|
50
|
+
**Inline lists**
|
|
51
|
+
```python
|
|
52
|
+
from loggen import Session, LogGen
|
|
53
|
+
|
|
54
|
+
session = Session(
|
|
55
|
+
seed=42,
|
|
56
|
+
hosts=["prod-web-01", "prod-db-01", "prod-cache-01"],
|
|
57
|
+
users=["alice", "bob", "charlie"],
|
|
58
|
+
)
|
|
59
|
+
gen = LogGen(session=session)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**From a file**
|
|
63
|
+
```python
|
|
64
|
+
from loggen import Session, LogGen, WordList
|
|
65
|
+
|
|
66
|
+
session = Session(
|
|
67
|
+
seed=42,
|
|
68
|
+
hosts=WordList.from_file("wordlists/hosts.txt"),
|
|
69
|
+
users=WordList.from_file("wordlists/users.json"),
|
|
70
|
+
internal_ips=WordList.from_file("wordlists/ips.csv", column=0),
|
|
71
|
+
)
|
|
72
|
+
gen = LogGen(session=session)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
**From a JSON config file**
|
|
76
|
+
```python
|
|
77
|
+
from loggen import Session, LogGen
|
|
78
|
+
|
|
79
|
+
session = Session.from_config("config/loggen.json")
|
|
80
|
+
gen = LogGen(session=session)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
See [WIKI.md](WIKI.md) for the config file format and all supported options.
|
|
84
|
+
|
|
85
|
+
**From an environment variable**
|
|
86
|
+
```python
|
|
87
|
+
from loggen import WordList
|
|
88
|
+
|
|
89
|
+
hosts = WordList.from_env("LOGGEN_HOSTS") # comma-separated
|
|
90
|
+
users = WordList.from_env("LOGGEN_USERS", fallback=["admin"])
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## CLI
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# 10 000 mixed entries to stdout (JSON Lines)
|
|
97
|
+
python -m loggen generate -p windows -p linux -n 10000
|
|
98
|
+
|
|
99
|
+
# 500 MB of network logs, gzip, reproducible
|
|
100
|
+
python -m loggen generate -p network --size 500MB --seed 42 -o network.jsonl.gz
|
|
101
|
+
|
|
102
|
+
# Brute-force scenario in CEF format
|
|
103
|
+
python -m loggen generate --scenario brute_force -f cef -o brute.cef
|
|
104
|
+
|
|
105
|
+
# Custom profile weights
|
|
106
|
+
python -m loggen generate -p windows -p edr --weight windows:4 --weight edr:1 -n 50000 -o out.jsonl
|
|
107
|
+
|
|
108
|
+
# List available options
|
|
109
|
+
python -m loggen list-profiles
|
|
110
|
+
python -m loggen list-scenarios
|
|
111
|
+
python -m loggen list-formats
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Profiles
|
|
115
|
+
|
|
116
|
+
| Profile | Covers |
|
|
117
|
+
|-----------|--------|
|
|
118
|
+
| `windows` | Security events 4624/4625/4688/4672/4698/7045/5140/4104 … |
|
|
119
|
+
| `linux` | sshd auth, sudo, cron, systemd, auditd, PAM, kernel |
|
|
120
|
+
| `network` | Firewall allow/deny, DNS, HTTP proxy, DHCP |
|
|
121
|
+
| `edr` | Process/file/network/registry/module events with hashes |
|
|
122
|
+
|
|
123
|
+
## Scenarios
|
|
124
|
+
|
|
125
|
+
| Scenario | Description |
|
|
126
|
+
|--------------------|-------------|
|
|
127
|
+
| `brute_force` | Repeated logon failures → success |
|
|
128
|
+
| `lateral_movement` | Recon → SMB share access → remote execution |
|
|
129
|
+
| `priv_esc` | Service install → SYSTEM shell → 4672 |
|
|
130
|
+
| `data_exfil` | Archive creation → large outbound transfers → DNS tunnelling |
|
|
131
|
+
| `persistence` | Registry run key + scheduled task + binary drop |
|
|
132
|
+
|
|
133
|
+
## Output formats
|
|
134
|
+
|
|
135
|
+
| Format | Description |
|
|
136
|
+
|----------|-------------|
|
|
137
|
+
| `jsonl` | JSON Lines / NDJSON (default) |
|
|
138
|
+
| `cef` | ArcSight CEF:0 |
|
|
139
|
+
| `syslog` | RFC 5424 syslog |
|
|
140
|
+
|
|
141
|
+
See [WIKI.md](WIKI.md) for full API reference.
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# loggen
|
|
2
|
+
|
|
3
|
+
Synthetic log generator for SIEM and IR exercises. Output is deterministic and reproducible for a given seed, and scales from megabytes to terabytes. Ships with profiles for Windows, Linux, network, and EDR-like telemetry.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install -e .
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from loggen import LogGen, Session
|
|
15
|
+
|
|
16
|
+
gen = LogGen(seed=42)
|
|
17
|
+
|
|
18
|
+
# Stream 1 000 mixed entries
|
|
19
|
+
for entry in gen.stream(["windows", "linux", "edr"], count=1000):
|
|
20
|
+
print(entry.asdict())
|
|
21
|
+
|
|
22
|
+
# Write 1 GB of mixed logs (gzip)
|
|
23
|
+
gen.write("corpus.jsonl.gz", ["windows", "linux", "network", "edr"], size="1GB")
|
|
24
|
+
|
|
25
|
+
# Run a built-in attack scenario
|
|
26
|
+
gen.write("attack.jsonl", [], scenario="lateral_movement", count=50)
|
|
27
|
+
|
|
28
|
+
# In-memory list
|
|
29
|
+
entries = gen.to_list(["network"], count=200)
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Custom data pools
|
|
33
|
+
|
|
34
|
+
By default loggen ships with a built-in set of hostnames, users, and IPs.
|
|
35
|
+
You can replace any of them with your own data.
|
|
36
|
+
|
|
37
|
+
**Inline lists**
|
|
38
|
+
```python
|
|
39
|
+
from loggen import Session, LogGen
|
|
40
|
+
|
|
41
|
+
session = Session(
|
|
42
|
+
seed=42,
|
|
43
|
+
hosts=["prod-web-01", "prod-db-01", "prod-cache-01"],
|
|
44
|
+
users=["alice", "bob", "charlie"],
|
|
45
|
+
)
|
|
46
|
+
gen = LogGen(session=session)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
**From a file**
|
|
50
|
+
```python
|
|
51
|
+
from loggen import Session, LogGen, WordList
|
|
52
|
+
|
|
53
|
+
session = Session(
|
|
54
|
+
seed=42,
|
|
55
|
+
hosts=WordList.from_file("wordlists/hosts.txt"),
|
|
56
|
+
users=WordList.from_file("wordlists/users.json"),
|
|
57
|
+
internal_ips=WordList.from_file("wordlists/ips.csv", column=0),
|
|
58
|
+
)
|
|
59
|
+
gen = LogGen(session=session)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**From a JSON config file**
|
|
63
|
+
```python
|
|
64
|
+
from loggen import Session, LogGen
|
|
65
|
+
|
|
66
|
+
session = Session.from_config("config/loggen.json")
|
|
67
|
+
gen = LogGen(session=session)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
See [WIKI.md](WIKI.md) for the config file format and all supported options.
|
|
71
|
+
|
|
72
|
+
**From an environment variable**
|
|
73
|
+
```python
|
|
74
|
+
from loggen import WordList
|
|
75
|
+
|
|
76
|
+
hosts = WordList.from_env("LOGGEN_HOSTS") # comma-separated
|
|
77
|
+
users = WordList.from_env("LOGGEN_USERS", fallback=["admin"])
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## CLI
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# 10 000 mixed entries to stdout (JSON Lines)
|
|
84
|
+
python -m loggen generate -p windows -p linux -n 10000
|
|
85
|
+
|
|
86
|
+
# 500 MB of network logs, gzip, reproducible
|
|
87
|
+
python -m loggen generate -p network --size 500MB --seed 42 -o network.jsonl.gz
|
|
88
|
+
|
|
89
|
+
# Brute-force scenario in CEF format
|
|
90
|
+
python -m loggen generate --scenario brute_force -f cef -o brute.cef
|
|
91
|
+
|
|
92
|
+
# Custom profile weights
|
|
93
|
+
python -m loggen generate -p windows -p edr --weight windows:4 --weight edr:1 -n 50000 -o out.jsonl
|
|
94
|
+
|
|
95
|
+
# List available options
|
|
96
|
+
python -m loggen list-profiles
|
|
97
|
+
python -m loggen list-scenarios
|
|
98
|
+
python -m loggen list-formats
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Profiles
|
|
102
|
+
|
|
103
|
+
| Profile | Covers |
|
|
104
|
+
|-----------|--------|
|
|
105
|
+
| `windows` | Security events 4624/4625/4688/4672/4698/7045/5140/4104 … |
|
|
106
|
+
| `linux` | sshd auth, sudo, cron, systemd, auditd, PAM, kernel |
|
|
107
|
+
| `network` | Firewall allow/deny, DNS, HTTP proxy, DHCP |
|
|
108
|
+
| `edr` | Process/file/network/registry/module events with hashes |
|
|
109
|
+
|
|
110
|
+
## Scenarios
|
|
111
|
+
|
|
112
|
+
| Scenario | Description |
|
|
113
|
+
|--------------------|-------------|
|
|
114
|
+
| `brute_force` | Repeated logon failures → success |
|
|
115
|
+
| `lateral_movement` | Recon → SMB share access → remote execution |
|
|
116
|
+
| `priv_esc` | Service install → SYSTEM shell → 4672 |
|
|
117
|
+
| `data_exfil` | Archive creation → large outbound transfers → DNS tunnelling |
|
|
118
|
+
| `persistence` | Registry run key + scheduled task + binary drop |
|
|
119
|
+
|
|
120
|
+
## Output formats
|
|
121
|
+
|
|
122
|
+
| Format | Description |
|
|
123
|
+
|----------|-------------|
|
|
124
|
+
| `jsonl` | JSON Lines / NDJSON (default) |
|
|
125
|
+
| `cef` | ArcSight CEF:0 |
|
|
126
|
+
| `syslog` | RFC 5424 syslog |
|
|
127
|
+
|
|
128
|
+
See [WIKI.md](WIKI.md) for full API reference.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from loggen._version import VERSION
|
|
2
|
+
from loggen.core.session import Session
|
|
3
|
+
from loggen.core.generator import LogGen
|
|
4
|
+
from loggen.core.loaders import WordList
|
|
5
|
+
from loggen.profiles.base import LogEntry
|
|
6
|
+
from loggen.profiles import (
|
|
7
|
+
WindowsProfile,
|
|
8
|
+
LinuxProfile,
|
|
9
|
+
NetworkProfile,
|
|
10
|
+
EDRProfile,
|
|
11
|
+
PROFILE_REGISTRY,
|
|
12
|
+
)
|
|
13
|
+
from loggen.formats import Format, get_formatter
|
|
14
|
+
from loggen.scenarios import SCENARIO_REGISTRY
|
|
15
|
+
|
|
16
|
+
__version__ = VERSION
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"LogGen",
|
|
20
|
+
"Session",
|
|
21
|
+
"LogEntry",
|
|
22
|
+
"WordList",
|
|
23
|
+
"Format",
|
|
24
|
+
"get_formatter",
|
|
25
|
+
"WindowsProfile",
|
|
26
|
+
"LinuxProfile",
|
|
27
|
+
"NetworkProfile",
|
|
28
|
+
"EDRProfile",
|
|
29
|
+
"PROFILE_REGISTRY",
|
|
30
|
+
"SCENARIO_REGISTRY",
|
|
31
|
+
"__version__",
|
|
32
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; re-exported as ``__version__`` by the package
# ``__init__`` and printed by the CLI's ``--version`` flag.
VERSION = "0.1.0"
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# loggen CLI
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from loggen._version import VERSION
|
|
9
|
+
from loggen.core.session import Session
|
|
10
|
+
from loggen.core.generator import LogGen
|
|
11
|
+
from loggen.profiles import PROFILE_REGISTRY
|
|
12
|
+
from loggen.formats import Format
|
|
13
|
+
from loggen.scenarios import SCENARIO_REGISTRY
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the ``loggen`` argument parser with all of its sub-commands.

    Returns:
        A parser exposing ``generate`` (alias ``gen``) plus the
        ``list-profiles``, ``list-scenarios`` and ``list-formats`` commands
        (aliases ``profiles``, ``scenarios`` and ``formats``). The name of
        the selected command is stored in ``args.command``.
    """
    profile_names = ", ".join(sorted(PROFILE_REGISTRY))
    scenario_names = ", ".join(sorted(SCENARIO_REGISTRY))

    parser = argparse.ArgumentParser(
        prog="loggen",
        description="Synthetic log generator for SIEM and IR exercises.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_EXAMPLES,
    )
    parser.add_argument("--version", action="version", version=f"loggen {VERSION}")

    commands = parser.add_subparsers(dest="command", metavar="COMMAND")

    # generate / gen -- every option below belongs to this sub-command.
    generate = commands.add_parser("generate", aliases=["gen"], help="Generate log entries.")

    # What to generate.
    generate.add_argument(
        "-p", "--profile", dest="profiles", action="append", default=[], metavar="NAME",
        help=f"Profile to include (repeatable). Choices: {profile_names}",
    )
    generate.add_argument(
        "-s", "--scenario", metavar="NAME",
        help="Run a built-in scenario instead of random profiles. "
             f"Choices: {scenario_names}",
    )

    # How much to generate.
    generate.add_argument(
        "-n", "--count", type=int, default=None, metavar="N",
        help="Number of log entries to generate.",
    )
    generate.add_argument(
        "--size", metavar="SIZE",
        help="Target output size, e.g. 100MB, 1GB, 2TB. Overrides --count.",
    )

    # Where and how to write it.
    generate.add_argument(
        "-f", "--format", default="jsonl", choices=[member.value for member in Format],
        help="Output format (default: jsonl).",
    )
    generate.add_argument(
        "-o", "--output", default=None, metavar="PATH",
        help="Output file path. Use .gz suffix for gzip. Defaults to stdout.",
    )

    # Session parameters.
    generate.add_argument(
        "--seed", type=int, default=0,
        help="Random seed for reproducibility (default: 0).",
    )
    generate.add_argument(
        "--start", metavar="DATETIME",
        help="Log start timestamp ISO-8601 (default: 2024-01-01T00:00:00Z).",
    )
    generate.add_argument(
        "--duration", type=float, default=24.0, metavar="HOURS",
        help="Time window in hours (default: 24).",
    )
    generate.add_argument(
        "--weight", dest="weights", action="append", default=[], metavar="PROFILE:WEIGHT",
        help="Relative sampling weight, e.g. windows:3 (repeatable).",
    )
    generate.add_argument(
        "--progress", action="store_true",
        help="Print progress to stderr.",
    )

    # The listing commands take no options of their own.
    commands.add_parser("list-profiles", aliases=["profiles"], help="List available profiles.")
    commands.add_parser("list-scenarios", aliases=["scenarios"], help="List available scenarios.")
    commands.add_parser("list-formats", aliases=["formats"], help="List available output formats.")

    return parser
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
_EXAMPLES = """
|
|
104
|
+
examples:
|
|
105
|
+
# 1 000 Windows + Linux events to stdout
|
|
106
|
+
loggen generate -p windows -p linux -n 1000
|
|
107
|
+
|
|
108
|
+
# 500 MB of network logs (gzipped), deterministic seed
|
|
109
|
+
loggen generate -p network --size 500MB --seed 42 -o network.jsonl.gz
|
|
110
|
+
|
|
111
|
+
# Run a brute-force scenario, CEF format
|
|
112
|
+
loggen generate --scenario brute_force -f cef -o brute.cef
|
|
113
|
+
|
|
114
|
+
# Mix all profiles with custom weights
|
|
115
|
+
loggen generate -p windows -p linux -p network -p edr \\
|
|
116
|
+
--weight windows:4 --weight edr:2 -n 50000 -o full.jsonl
|
|
117
|
+
|
|
118
|
+
# List what's available
|
|
119
|
+
loggen list-profiles
|
|
120
|
+
loggen list-scenarios
|
|
121
|
+
"""
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _parse_weights(raw: list[str]) -> dict[str, float]:
|
|
125
|
+
result: dict[str, float] = {}
|
|
126
|
+
for item in raw:
|
|
127
|
+
if ":" not in item:
|
|
128
|
+
raise SystemExit(f"Invalid weight format {item!r} — expected PROFILE:WEIGHT")
|
|
129
|
+
k, v = item.rsplit(":", 1)
|
|
130
|
+
result[k] = float(v)
|
|
131
|
+
return result
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _parse_start(s: str | None):
|
|
135
|
+
if not s:
|
|
136
|
+
return None
|
|
137
|
+
from datetime import datetime, timezone
|
|
138
|
+
for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
|
|
139
|
+
try:
|
|
140
|
+
dt = datetime.strptime(s, fmt)
|
|
141
|
+
return dt.replace(tzinfo=timezone.utc)
|
|
142
|
+
except ValueError:
|
|
143
|
+
continue
|
|
144
|
+
raise SystemExit(f"Cannot parse datetime {s!r}. Use ISO-8601, e.g. 2024-06-01T00:00:00Z")
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def main(argv: list[str] | None = None) -> None:
    """Entry point of the ``loggen`` command line interface.

    Parses *argv* and dispatches on the selected sub-command: the listing
    commands print their registry to stdout, ``generate`` is delegated to
    ``_run_generate``, and anything else prints the help text.

    Args:
        argv: Arguments to parse; passed to ``parse_args`` unchanged.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    command = args.command

    if command is None:
        # No sub-command given.
        parser.print_help()
    elif command in ("list-profiles", "profiles"):
        print("Available profiles:")
        for profile_name in sorted(PROFILE_REGISTRY):
            print(f" {profile_name}")
    elif command in ("list-scenarios", "scenarios"):
        print("Available scenarios:")
        from loggen.scenarios import SCENARIO_REGISTRY as SR
        for scenario_name, scenario_cls in sorted(SR.items()):
            print(f" {scenario_name:<20} {scenario_cls.description}")
    elif command in ("list-formats", "formats"):
        print("Available formats:")
        for member in Format:
            print(f" {member.value}")
    elif command in ("generate", "gen"):
        _run_generate(args)
    else:
        parser.print_help()
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _run_generate(args) -> None:
    """Execute the ``generate`` sub-command from parsed CLI arguments.

    Builds a ``Session`` from ``--seed``, ``--duration`` and (when given)
    ``--start``, then writes entries through ``LogGen.write`` to
    ``args.output``; ``None`` is passed through when no output was given.

    Args:
        args: Namespace produced by the ``generate`` sub-parser.

    Raises:
        SystemExit: If neither a profile nor a scenario was requested, if a
            profile run has neither ``--count`` nor ``--size``, or if the
            generator rejects the request with a ``ValueError`` (for example
            an unknown profile or scenario name). Reporting these through
            ``SystemExit`` keeps every CLI usage error traceback-free.
    """
    if not args.profiles and not args.scenario:
        raise SystemExit(
            "Specify at least one --profile or a --scenario.\n"
            f"Profiles: {', '.join(sorted(PROFILE_REGISTRY))}\n"
            f"Scenarios: {', '.join(sorted(SCENARIO_REGISTRY))}"
        )

    # Scenarios may run without an explicit amount; profile runs may not.
    if not args.count and not args.size and not args.scenario:
        raise SystemExit("Specify --count N or --size SIZE (e.g. --size 100MB).")

    start = _parse_start(args.start)
    session_kwargs: dict = {"seed": args.seed, "duration_hours": args.duration}
    if start:
        # Only override the session's own default start when one was given.
        session_kwargs["start"] = start
    session = Session(**session_kwargs)
    gen = LogGen(session=session)

    weights = _parse_weights(args.weights)
    dest = args.output  # None → stdout

    try:
        written = gen.write(
            dest=dest,
            profiles=args.profiles,
            count=args.count,
            size=args.size,
            format=args.format,
            progress=args.progress,
            weights=weights if weights else None,
            scenario=args.scenario,
        )
    except (BrokenPipeError, KeyboardInterrupt):
        # Deliberate best effort: the reader went away (e.g. `| head`) or the
        # user pressed Ctrl-C, so stop quietly without a traceback.
        pass
    except ValueError as exc:
        # Invalid user input detected by the generator; report it like the
        # other usage errors above.
        raise SystemExit(f"loggen: error: {exc}") from exc
    else:
        if args.output and args.progress:
            sys.stderr.write(f"Wrote {written / 1_048_576:.2f} MB → {args.output}\n")
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Iterator
|
|
4
|
+
|
|
5
|
+
from loggen.core.session import Session
|
|
6
|
+
from loggen.profiles.base import LogEntry
|
|
7
|
+
from loggen.profiles import PROFILE_REGISTRY
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _parse_size(s: str | int | None) -> int | None:
|
|
11
|
+
if s is None:
|
|
12
|
+
return None
|
|
13
|
+
if isinstance(s, int):
|
|
14
|
+
return s
|
|
15
|
+
s = s.strip().upper()
|
|
16
|
+
for unit, mult in [("TB", 1 << 40), ("GB", 1 << 30), ("MB", 1 << 20), ("KB", 1 << 10), ("B", 1)]:
|
|
17
|
+
if s.endswith(unit):
|
|
18
|
+
return int(float(s[: -len(unit)]) * mult)
|
|
19
|
+
return int(s)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class LogGen:
    """Produce synthetic log entries from named profiles or a scenario.

    All state is taken from ``self.session``: its ``rng`` picks the profile
    for each entry and ``session.fork(i)`` supplies the per-entry session
    handed to the chosen profile.
    """

    def __init__(self, seed: int = 0, session: Session | None = None) -> None:
        """Use *session* when given (and truthy), else ``Session(seed=seed)``."""
        if session:
            self.session = session
        else:
            self.session = Session(seed=seed)

    def stream(
        self,
        profiles: list[str],
        count: int | None = None,
        weights: dict[str, float] | None = None,
        scenario: str | None = None,
    ) -> Iterator[LogEntry]:
        """Lazily yield log entries.

        Args:
            profiles: Names registered in ``PROFILE_REGISTRY``; ignored when
                *scenario* is set.
            count: Number of entries to yield; ``None`` means no limit for
                profile runs and is forwarded as-is to scenarios.
            weights: Optional relative weight per profile name; profiles
                without an entry get ``1.0``.
            scenario: Name of a registered scenario to run instead of
                sampling profiles.

        Raises:
            ValueError: On first iteration, if no profile is given, or a
                profile or scenario name is unknown.
        """
        if scenario:
            yield from self._stream_scenario(scenario, count)
            return

        if not profiles:
            raise ValueError("At least one profile required.")

        for candidate in profiles:
            if candidate not in PROFILE_REGISTRY:
                raise ValueError(f"Unknown profile {candidate!r}. Available: {sorted(PROFILE_REGISTRY)}")

        instances = [(candidate, PROFILE_REGISTRY[candidate]()) for candidate in profiles]
        overrides = weights if weights else {}
        relative_weights = [overrides.get(label, 1.0) for label, _ in instances]

        rng = self.session.rng
        emitted = 0
        while True:
            if count is not None and emitted >= count:
                break
            picked = rng.choices(instances, weights=relative_weights, k=1)
            _, profile = picked[0]
            # Each entry gets its own forked session, keyed by its index.
            yield profile.generate_one(self.session.fork(emitted))
            emitted += 1

    def to_list(self, profiles: list[str], count: int, **kwargs) -> list[LogEntry]:
        """Collect ``stream(profiles, count=count, **kwargs)`` into a list."""
        entries = self.stream(profiles, count=count, **kwargs)
        return list(entries)

    def write(
        self,
        dest,
        profiles: list[str],
        count: int | None = None,
        size: str | int | None = None,
        format: str = "jsonl",
        progress: bool = False,
        **kwargs,
    ) -> int:
        """Stream entries to *dest* in the requested output format.

        Args:
            dest: Destination, passed to ``write_stream`` unchanged.
            profiles: Profile names, forwarded to ``stream``.
            count: Entry limit, forwarded to ``stream``.
            size: Size limit (int or string such as ``"1GB"``), parsed by
                ``_parse_size`` and passed on as ``size_limit``.
            format: Formatter name resolved through ``get_formatter``.
            progress: Forwarded to ``write_stream``.
            **kwargs: Extra keyword arguments for ``stream`` (for example
                ``weights`` or ``scenario``).

        Returns:
            The value returned by ``write_stream``.
        """
        from loggen.core.writer import write_stream
        from loggen.formats import get_formatter

        formatter = get_formatter(format)
        limit = _parse_size(size)
        source = self.stream(profiles, count=count, **kwargs)
        return write_stream(source, dest, formatter, size_limit=limit, progress=progress)

    def _stream_scenario(self, scenario: str, count: int | None) -> Iterator[LogEntry]:
        """Yield the entries of the registered scenario named *scenario*.

        Raises:
            ValueError: If the name is not in ``SCENARIO_REGISTRY``.
        """
        from loggen.scenarios import SCENARIO_REGISTRY

        scenario_cls = SCENARIO_REGISTRY.get(scenario)
        if scenario_cls is None:
            raise ValueError(f"Unknown scenario {scenario!r}. Available: {sorted(SCENARIO_REGISTRY)}")
        runner = scenario_cls(self.session)
        yield from runner.stream(count=count)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class WordList(list):
    """A list of strings that can be populated from files or env vars."""

    @classmethod
    def from_file(cls, path: str | Path, *, column: int = 0, comment: str = "#") -> "WordList":
        """Load entries from a ``.json``, ``.csv`` or plain-text file.

        The loader is chosen by the file extension (case-insensitive); any
        extension other than ``.json``/``.csv`` is read as plain text with
        one entry per line. Files are decoded as UTF-8, and a leading byte
        order mark is tolerated.

        Args:
            path: File to read.
            column: CSV only. Index of the cell to take from each row; rows
                that do not have this cell are skipped.
            comment: Text and CSV only. Lines (or rows whose first cell)
                starting with this marker, after leading whitespace, are
                skipped. An empty string disables comment handling.

        Returns:
            A ``WordList`` of stripped, non-empty strings for text and CSV
            files, or of ``str(item)`` for every item of a JSON array.

        Raises:
            ValueError: If a ``.json`` file does not contain an array.
        """
        path = str(path)
        ext = os.path.splitext(path)[1].lower()

        if ext == ".json":
            with open(path, encoding="utf-8-sig") as f:
                data = json.load(f)
            if not isinstance(data, list):
                raise ValueError(f"{path}: expected a JSON array")
            return cls(str(x) for x in data)

        if ext == ".csv":
            with open(path, newline="", encoding="utf-8-sig") as f:
                # The generator is fully consumed by ``cls(...)`` before the
                # file is closed.
                return cls(cls._csv_cells(csv.reader(f), column, comment))

        # plain text — one item per line
        with open(path, encoding="utf-8-sig") as f:
            stripped = (line.strip() for line in f)
            return cls(
                text
                for text in stripped
                if text and not (comment and text.startswith(comment))
            )

    @staticmethod
    def _csv_cells(rows, column, comment):
        """Yield the stripped, non-empty *column* cell of each data row."""
        for row in rows:
            if not row:
                continue
            if comment and row[0].lstrip().startswith(comment):
                continue
            try:
                cell = row[column].strip()
            except IndexError:
                # Row is too short to contain the requested column.
                continue
            if cell:
                yield cell

    @classmethod
    def from_env(cls, var: str, sep: str = ",", fallback: list[str] | None = None) -> "WordList":
        """Load entries from a separator-delimited environment variable.

        Args:
            var: Name of the environment variable.
            sep: Separator between entries (a comma by default).
            fallback: Entries to use when the variable is unset or yields no
                entries after stripping; ``None`` means an empty list.

        Returns:
            A ``WordList`` of the stripped, non-empty entries, or of
            *fallback* when there are none.
        """
        raw = os.environ.get(var, "")
        items = [item.strip() for item in raw.split(sep) if item.strip()] if raw else []
        if items:
            return cls(items)
        return cls(fallback or [])
|