cesnet-datazoo 0.0.16__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/PKG-INFO +2 -1
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/config.py +174 -167
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/constants.py +4 -6
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/cesnet_dataset.py +200 -172
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/datasets.py +22 -2
- cesnet-datazoo-0.1.0/cesnet_datazoo/datasets/datasets_constants.py +670 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/loaders.py +3 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +6 -5
- cesnet-datazoo-0.1.0/cesnet_datazoo/datasets/metadata/metadata.csv +4 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/statistics.py +36 -16
- cesnet-datazoo-0.1.0/cesnet_datazoo/pytables_data/data_scalers.py +110 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/indices_setup.py +29 -33
- cesnet-datazoo-0.1.0/cesnet_datazoo/pytables_data/pytables_dataset.py +294 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/class_info.py +7 -5
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/download.py +6 -1
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/PKG-INFO +2 -1
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/SOURCES.txt +2 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/requires.txt +1 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/pyproject.toml +2 -1
- cesnet-datazoo-0.0.16/cesnet_datazoo/datasets/metadata/metadata.csv +0 -4
- cesnet-datazoo-0.0.16/cesnet_datazoo/pytables_data/pytables_dataset.py +0 -420
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/LICENCE +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/README.md +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/__init__.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/__init__.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/metrics/__init__.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/metrics/classification_report.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/metrics/provider_metrics.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/__init__.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/__init__.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/fileutils.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/random.py +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/top_level.txt +0 -0
- {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cesnet-datazoo
|
3
|
-
Version: 0.0.16
|
3
|
+
Version: 0.1.0
|
4
4
|
Summary: A toolkit for large network traffic datasets
|
5
5
|
Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
6
6
|
Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
@@ -16,6 +16,7 @@ Classifier: Operating System :: OS Independent
|
|
16
16
|
Requires-Python: >=3.10
|
17
17
|
Description-Content-Type: text/markdown
|
18
18
|
License-File: LICENCE
|
19
|
+
Requires-Dist: cesnet_models
|
19
20
|
Requires-Dist: matplotlib
|
20
21
|
Requires-Dist: numpy
|
21
22
|
Requires-Dist: pandas
|
@@ -8,12 +8,11 @@ import warnings
|
|
8
8
|
from dataclasses import InitVar, field
|
9
9
|
from datetime import datetime
|
10
10
|
from enum import Enum
|
11
|
-
from typing import TYPE_CHECKING, Literal, Optional
|
11
|
+
from typing import TYPE_CHECKING, Callable, Literal, Optional
|
12
12
|
|
13
13
|
import yaml
|
14
14
|
from pydantic import model_validator
|
15
15
|
from pydantic.dataclasses import dataclass
|
16
|
-
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
|
17
16
|
|
18
17
|
from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, SELECTED_TCP_FLAGS,
|
19
18
|
TCP_PPI_CHANNELS, UDP_PPI_CHANNELS)
|
@@ -21,19 +20,6 @@ from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, SELECTED_TCP
|
|
21
20
|
if TYPE_CHECKING:
|
22
21
|
from cesnet_datazoo.datasets.cesnet_dataset import CesnetDataset
|
23
22
|
|
24
|
-
Scaler = RobustScaler | StandardScaler | MinMaxScaler | None
|
25
|
-
|
26
|
-
class ScalerEnum(Enum):
|
27
|
-
"""Available scalers for flow statistics, packet sizes, and inter-packet times."""
|
28
|
-
STANDARD = "standard"
|
29
|
-
"""Standardize features by removing the mean and scaling to unit variance - [`sklearn.preprocessing.StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)."""
|
30
|
-
ROBUST = "robust"
|
31
|
-
"""Robust scaling with the median and the interquartile range - [`sklearn.preprocessing.RobustScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)."""
|
32
|
-
MINMAX = "minmax"
|
33
|
-
"""Scaling to a (0, 1) range - [`sklearn.preprocessing.MinMaxScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)."""
|
34
|
-
NO_SCALER = "no-scaler"
|
35
|
-
"""No scaling."""
|
36
|
-
def __str__(self): return self.value
|
37
23
|
|
38
24
|
class Protocol(Enum):
|
39
25
|
TLS = "TLS"
|
@@ -48,25 +34,23 @@ class ValidationApproach(Enum):
|
|
48
34
|
is used to create a random stratified validation set. The fraction of validation samples is defined in `train_val_split_fraction`."""
|
49
35
|
VALIDATION_DATES = "validation-dates"
|
50
36
|
"""Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `val_period_name`."""
|
51
|
-
NO_VALIDATION = "no-validation"
|
52
|
-
"""Do not use validation. The validation dataloader and dataframe will not be available."""
|
53
37
|
def __str__(self): return self.value
|
54
38
|
|
55
39
|
class AppSelection(Enum):
|
56
40
|
"""
|
57
41
|
Applications can be divided into *known* and *unknown* classes. To use a dataset in the standard closed-world setting, use `ALL_KNOWN` to select all the applications as *known*.
|
58
|
-
Use `TOPX_KNOWN` or `
|
59
|
-
The `
|
42
|
+
Use `TOPX_KNOWN` or `BACKGROUND_UNKNOWN` for the open-world setting and evaluation of out-of-distribution or open-set recognition methods.
|
43
|
+
The `FIXED` is for manual selection of *known* and *unknown* applications.
|
60
44
|
"""
|
61
45
|
ALL_KNOWN = "all-known"
|
62
46
|
"""Use all applications as *known*."""
|
63
47
|
TOPX_KNOWN = "topx-known"
|
64
48
|
"""Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*.
|
65
49
|
Applications with the same provider are never separated, i.e., all applications of a given provider are either *known* or *unknown*."""
|
66
|
-
|
67
|
-
"""Use the
|
68
|
-
|
69
|
-
"""
|
50
|
+
BACKGROUND_UNKNOWN = "background-unknown"
|
51
|
+
"""Use the list of background traffic classes (`apps_selection_background_unknown`) as *unknown*, and the rest as *known*."""
|
52
|
+
FIXED = "fixed"
|
53
|
+
"""Manual application selection. Provide lists of *known* applications (`apps_selection_fixed_known`) and *unknown* applications (`apps_selection_fixed_unknown`)."""
|
70
54
|
def __str__(self): return self.value
|
71
55
|
|
72
56
|
class MinTrainSamplesCheck(Enum):
|
@@ -103,8 +87,9 @@ class TrainDataParams():
|
|
103
87
|
train_tables_paths: list[str]
|
104
88
|
apps_selection: AppSelection
|
105
89
|
apps_selection_topx: int
|
106
|
-
|
107
|
-
|
90
|
+
apps_selection_background_unknown: list[str]
|
91
|
+
apps_selection_fixed_known: list[str]
|
92
|
+
apps_selection_fixed_unknown: list[str]
|
108
93
|
disabled_apps: list[str]
|
109
94
|
min_train_samples_check: MinTrainSamplesCheck
|
110
95
|
min_train_samples_per_app: int
|
@@ -114,8 +99,8 @@ class TestDataParams():
|
|
114
99
|
database_filename: str
|
115
100
|
test_period_name: str
|
116
101
|
test_tables_paths: list[str]
|
117
|
-
|
118
|
-
|
102
|
+
known_apps: list[str]
|
103
|
+
unknown_apps: list[str]
|
119
104
|
|
120
105
|
class C:
|
121
106
|
arbitrary_types_allowed = True
|
@@ -128,38 +113,43 @@ class DatasetConfig():
|
|
128
113
|
|
129
114
|
- Train, validation, test sets (dates, sizes, validation approach).
|
130
115
|
- Application selection — either the standard closed-world setting (only *known* classes) or the open-world setting (*known* and *unknown* classes).
|
131
|
-
- Feature scaling. See the [data features][features] page for more information.
|
116
|
+
- Feature scaling. See the [data features][features] page for more information. DOCS_TODO
|
132
117
|
- Dataloader options like batch sizes, order of loading, or number of workers.
|
133
118
|
|
134
119
|
When initializing this class, pass a [`CesnetDataset`][datasets.cesnet_dataset.CesnetDataset] instance to be configured and the desired configuration. Available options are [here][config.DatasetConfig--configuration-options].
|
135
120
|
|
136
121
|
Attributes:
|
137
|
-
dataset: The dataset instance to be configured
|
138
|
-
data_root: Taken from the dataset instance
|
139
|
-
database_filename: Taken from the dataset instance
|
140
|
-
database_path: Taken from the dataset instance
|
141
|
-
servicemap_path: Taken from the dataset instance
|
142
|
-
flowstats_features: Taken from `dataset.metadata.flowstats_features`
|
143
|
-
|
122
|
+
dataset: The dataset instance to be configured.
|
123
|
+
data_root: Taken from the dataset instance.
|
124
|
+
database_filename: Taken from the dataset instance.
|
125
|
+
database_path: Taken from the dataset instance.
|
126
|
+
servicemap_path: Taken from the dataset instance.
|
127
|
+
flowstats_features: Taken from `dataset.metadata.flowstats_features`.
|
128
|
+
flowstats_features_boolean: Taken from `dataset.metadata.flowstats_features_boolean`.
|
129
|
+
flowstats_features_phist: Taken from `dataset.metadata.packet_histograms` if `use_packet_histograms` is true, otherwise an empty list.
|
130
|
+
other_fields: Taken from `dataset.metadata.other_fields` if `return_other_fields` is true, otherwise an empty list.
|
144
131
|
|
145
132
|
# Configuration options
|
146
133
|
|
147
134
|
Attributes:
|
135
|
+
need_train_set: Use to disable the train set. `Default: True`
|
136
|
+
need_val_set: Use to disable the validation set. When `need_train_set` is false, the validation set will also be disabled. `Default: True`
|
137
|
+
need_test_set: Use to disable the test set. `Default: True`
|
148
138
|
train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
149
139
|
train_dates: Dates used for creating a train set.
|
150
140
|
train_dates_weigths: To use a non-uniform distribution of samples across train dates.
|
151
|
-
val_approach: How a validation set should be created. Either split train data into train and validation
|
141
|
+
val_approach: How a validation set should be created. Either split train data into train and validation or have a separate validation period. `Default: SPLIT_FROM_TRAIN`
|
152
142
|
train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
|
153
143
|
val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
154
144
|
val_dates: Dates used for creating a validation set.
|
155
|
-
no_test_set: Disable the test set. `Default: False`
|
156
145
|
test_period_name: Name of the test period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
157
146
|
test_dates: Dates used for creating a test set.
|
158
147
|
|
159
148
|
apps_selection: How to select application classes. `Default: ALL_KNOWN`
|
160
149
|
apps_selection_topx: Take top X as known.
|
161
|
-
|
162
|
-
|
150
|
+
apps_selection_background_unknown: Provide a list of background traffic classes to be used as unknown.
|
151
|
+
apps_selection_fixed_known: Provide a list of manually selected known applications.
|
152
|
+
apps_selection_fixed_unknown: Provide a list of manually selected unknown applications.
|
163
153
|
disabled_apps: List of applications to be disabled and not used at all.
|
164
154
|
min_train_samples_check: How to handle applications with *not enough* training samples. `Default: DISABLE_APPS`
|
165
155
|
min_train_samples_per_app: Defines the threshold for *not enough*. `Default: 100`
|
@@ -182,21 +172,14 @@ class DatasetConfig():
|
|
182
172
|
train_dataloader_seed: Seed for loading train data in random order. `Default: None`
|
183
173
|
|
184
174
|
return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
|
185
|
-
|
186
|
-
raw_output: Return raw output without data scaling, clipping, and normalization. `Default: False`
|
175
|
+
return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
|
187
176
|
use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
|
188
|
-
normalize_packet_histograms: Whether to normalize packet histograms. If true, bins contain fractions instead of absolute numbers. `Default: True`
|
189
177
|
use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
|
190
178
|
use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
psizes_scaler: Which scaler to use for packet sizes. Options are [`ROBUST`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) | [`STANDARD`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) | [`MINMAX`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) | `NO_SCALER`. `Default: STANDARD`
|
196
|
-
psizes_max: Max clip packet sizes before scaling. `Default: 1500`
|
197
|
-
ipt_scaler: Which scaler to use for inter-packet times. Options are [`ROBUST`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) | [`STANDARD`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) | [`MINMAX`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) | `NO_SCALER`. `Default: STANDARD`
|
198
|
-
ipt_min: Min clip inter-packet times before scaling. `Default: 0`
|
199
|
-
ipt_max: Max clip inter-packet times before scaling. `Default: 65000`
|
179
|
+
fit_scalers_samples: Fraction of train samples used for fitting feature scalers, if float. The absolute number of samples otherwise. `Default: 0.25` DOCS_TODO
|
180
|
+
ppi_transform: Transform function for PPI sequences. `Default: None` DOCS_TODO
|
181
|
+
flowstats_transform: Transform function for flow statistics. `Default: None`
|
182
|
+
flowstats_phist_transform: Transform function for packet histograms. `Default: None`
|
200
183
|
|
201
184
|
# How to configure train, validation, and test sets
|
202
185
|
There are three options for how to define train/validation/test dates.
|
@@ -222,8 +205,13 @@ class DatasetConfig():
|
|
222
205
|
database_path: str = field(init=False)
|
223
206
|
servicemap_path: str = field(init=False)
|
224
207
|
flowstats_features: list[str] = field(init=False)
|
208
|
+
flowstats_features_boolean: list[str] = field(init=False)
|
209
|
+
flowstats_features_phist: list[str] = field(init=False)
|
225
210
|
other_fields: list[str] = field(init=False)
|
226
211
|
|
212
|
+
need_train_set: bool = True
|
213
|
+
need_val_set: bool = True
|
214
|
+
need_test_set: bool = True
|
227
215
|
train_period_name: str = ""
|
228
216
|
train_dates: list[str] = field(default_factory=list)
|
229
217
|
train_dates_weigths: Optional[list[int]] = None
|
@@ -231,14 +219,14 @@ class DatasetConfig():
|
|
231
219
|
train_val_split_fraction: float = 0.2
|
232
220
|
val_period_name: str = ""
|
233
221
|
val_dates: list[str] = field(default_factory=list)
|
234
|
-
no_test_set: bool = False
|
235
222
|
test_period_name: str = ""
|
236
223
|
test_dates: list[str] = field(default_factory=list)
|
237
224
|
|
238
225
|
apps_selection: AppSelection = AppSelection.ALL_KNOWN
|
239
226
|
apps_selection_topx: int = 0
|
240
|
-
|
241
|
-
|
227
|
+
apps_selection_background_unknown: list[str] = field(default_factory=list)
|
228
|
+
apps_selection_fixed_known: list[str] = field(default_factory=list)
|
229
|
+
apps_selection_fixed_unknown: list[str] = field(default_factory=list)
|
242
230
|
disabled_apps: list[str] = field(default_factory=list)
|
243
231
|
min_train_samples_check: MinTrainSamplesCheck = MinTrainSamplesCheck.DISABLE_APPS
|
244
232
|
min_train_samples_per_app: int = 100
|
@@ -261,21 +249,14 @@ class DatasetConfig():
|
|
261
249
|
train_dataloader_seed: Optional[int] = None
|
262
250
|
|
263
251
|
return_other_fields: bool = False
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
normalize_packet_histograms: bool = True
|
268
|
-
use_tcp_features: bool = True
|
252
|
+
return_tensors: bool = False
|
253
|
+
use_packet_histograms: bool = False
|
254
|
+
use_tcp_features: bool = False
|
269
255
|
use_push_flags: bool = False
|
270
|
-
zero_ppi_start: int = 0
|
271
256
|
fit_scalers_samples: int | float = 0.25
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
psizes_max: int = 1500
|
276
|
-
ipt_scaler: ScalerEnum = ScalerEnum.STANDARD
|
277
|
-
ipt_min: int = 0
|
278
|
-
ipt_max: int = 65000
|
257
|
+
ppi_transform: Optional[Callable] = None
|
258
|
+
flowstats_transform: Optional[Callable] = None
|
259
|
+
flowstats_phist_transform: Optional[Callable] = None
|
279
260
|
|
280
261
|
def __post_init__(self, dataset: CesnetDataset):
|
281
262
|
"""
|
@@ -285,23 +266,28 @@ class DatasetConfig():
|
|
285
266
|
self.servicemap_path = dataset.servicemap_path
|
286
267
|
self.database_filename = dataset.database_filename
|
287
268
|
self.database_path = dataset.database_path
|
288
|
-
self.flowstats_features = dataset.metadata.flowstats_features
|
289
|
-
self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
|
290
269
|
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
if self.
|
296
|
-
raise ValueError(
|
297
|
-
|
298
|
-
|
299
|
-
self.train_period_name
|
300
|
-
|
270
|
+
if not self.need_train_set:
|
271
|
+
self.need_val_set = False
|
272
|
+
if self.apps_selection != AppSelection.FIXED:
|
273
|
+
raise ValueError("Application selection has to be fixed when need_train_set is false")
|
274
|
+
if (len(self.train_dates) > 0 or self.train_period_name != ""):
|
275
|
+
raise ValueError("train_dates and train_period_name cannot be specified when need_train_set is false")
|
276
|
+
else:
|
277
|
+
# Configure train dates
|
278
|
+
if len(self.train_dates) > 0 and self.train_period_name == "":
|
279
|
+
raise ValueError("train_period_name has to be specified when train_dates are set")
|
280
|
+
if len(self.train_dates) == 0 and self.train_period_name != "":
|
281
|
+
if self.train_period_name not in dataset.time_periods:
|
282
|
+
raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
|
283
|
+
self.train_dates = dataset.time_periods[self.train_period_name]
|
284
|
+
if len(self.train_dates) == 0 and self.test_period_name == "":
|
285
|
+
self.train_period_name = dataset.default_train_period_name
|
286
|
+
self.train_dates = dataset.time_periods[dataset.default_train_period_name]
|
301
287
|
# Configure test dates
|
302
|
-
if self.no_test_set:
|
288
|
+
if not self.need_test_set:
|
303
289
|
if (len(self.test_dates) > 0 or self.test_period_name != ""):
|
304
|
-
raise ValueError("test_dates and test_period_name cannot be specified when
|
290
|
+
raise ValueError("test_dates and test_period_name cannot be specified when need_test_set is false")
|
305
291
|
else:
|
306
292
|
if len(self.test_dates) > 0 and self.test_period_name == "":
|
307
293
|
raise ValueError("test_period_name has to be specified when test_dates are set")
|
@@ -313,8 +299,8 @@ class DatasetConfig():
|
|
313
299
|
self.test_period_name = dataset.default_test_period_name
|
314
300
|
self.test_dates = dataset.time_periods[dataset.default_test_period_name]
|
315
301
|
# Configure val dates
|
316
|
-
if (self.
|
317
|
-
raise ValueError("val_dates and val_period_name cannot be specified when
|
302
|
+
if (not self.need_val_set or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
|
303
|
+
raise ValueError("val_dates and val_period_name cannot be specified when need_val_set is false or the validation approach is split-from-train")
|
318
304
|
if self.val_approach == ValidationApproach.VALIDATION_DATES:
|
319
305
|
if len(self.val_dates) > 0 and self.val_period_name == "":
|
320
306
|
raise ValueError("val_period_name has to be specified when val_dates are set")
|
@@ -323,57 +309,58 @@ class DatasetConfig():
|
|
323
309
|
raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
|
324
310
|
self.val_dates = dataset.time_periods[self.val_period_name]
|
325
311
|
if len(self.val_dates) == 0 and self.val_period_name == "":
|
326
|
-
raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when
|
312
|
+
raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when the validation approach is validation-dates")
|
327
313
|
# Check if train, val, and test dates are available in the dataset
|
328
|
-
if dataset.available_dates
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
314
|
+
bad_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
|
315
|
+
bad_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
|
316
|
+
bad_test_dates = [t for t in self.test_dates if t not in dataset.available_dates]
|
317
|
+
if len(bad_train_dates) > 0:
|
318
|
+
raise ValueError(f"Bad train dates {bad_train_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
319
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
320
|
+
if len(bad_val_dates) > 0:
|
321
|
+
raise ValueError(f"Bad validation dates {bad_val_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
322
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
323
|
+
if len(bad_test_dates) > 0:
|
324
|
+
raise ValueError(f"Bad test dates {bad_test_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
325
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
341
326
|
# Check time order of train, val, and test periods
|
342
327
|
train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
|
343
328
|
test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
|
344
|
-
if
|
329
|
+
if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
|
345
330
|
warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
346
331
|
if self.val_approach == ValidationApproach.VALIDATION_DATES:
|
332
|
+
# Train dates are guaranteed to be set
|
347
333
|
val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
|
348
334
|
if min(val_dates) <= max(train_dates):
|
349
335
|
warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
350
|
-
if
|
336
|
+
if len(test_dates) > 0 and min(test_dates) <= max(val_dates):
|
351
337
|
warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
352
338
|
# Configure features
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
self.flowstats_clip = 1.0
|
357
|
-
self.psizes_scaler = ScalerEnum.NO_SCALER
|
358
|
-
self.psizes_max = 1500
|
359
|
-
self.ipt_scaler = ScalerEnum.NO_SCALER
|
360
|
-
self.ipt_min = 0
|
361
|
-
self.ipt_max = 65000
|
362
|
-
if dataset.metadata.protocol == Protocol.TLS and self.use_tcp_features:
|
363
|
-
self.flowstats_features = self.flowstats_features + SELECTED_TCP_FLAGS
|
364
|
-
if self.use_push_flags and "PUSH_FLAG" not in dataset.metadata.features_in_packet_sequences:
|
365
|
-
raise ValueError("This TLS dataset does not support use_push_flags")
|
339
|
+
self.flowstats_features = dataset.metadata.flowstats_features
|
340
|
+
self.flowstats_features_boolean = dataset.metadata.flowstats_features_boolean
|
341
|
+
self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
|
366
342
|
if self.use_packet_histograms:
|
367
|
-
if len(dataset.metadata.
|
368
|
-
|
369
|
-
|
370
|
-
|
343
|
+
if len(dataset.metadata.packet_histograms) == 0:
|
344
|
+
raise ValueError("This dataset does not support use_packet_histograms")
|
345
|
+
self.flowstats_features_phist = dataset.metadata.packet_histograms
|
346
|
+
else:
|
347
|
+
self.flowstats_features_phist = []
|
348
|
+
if self.flowstats_phist_transform is not None:
|
349
|
+
raise ValueError("flowstats_phist_transform cannot be specified when use_packet_histograms is false")
|
350
|
+
if dataset.metadata.protocol == Protocol.TLS:
|
351
|
+
if self.use_tcp_features:
|
352
|
+
self.flowstats_features_boolean = self.flowstats_features_boolean + SELECTED_TCP_FLAGS
|
353
|
+
if self.use_push_flags and "PUSH_FLAG" not in dataset.metadata.ppi_features:
|
354
|
+
raise ValueError("This TLS dataset does not support use_push_flags")
|
371
355
|
if dataset.metadata.protocol == Protocol.QUIC:
|
372
|
-
self.use_tcp_features
|
356
|
+
if self.use_tcp_features:
|
357
|
+
raise ValueError("QUIC datasets do not support use_tcp_features")
|
373
358
|
if self.use_push_flags:
|
374
359
|
raise ValueError("QUIC datasets do not support use_push_flags")
|
375
360
|
# When train_dates_weigths are used, train_size and val_known_size have to be specified
|
376
361
|
if self.train_dates_weigths is not None:
|
362
|
+
if not self.need_train_set:
|
363
|
+
raise ValueError("train_dates_weigths cannot be specified when need_train_set is false")
|
377
364
|
if len(self.train_dates_weigths) != len(self.train_dates):
|
378
365
|
raise ValueError("train_dates_weigths has to have the same length as train_dates")
|
379
366
|
if self.train_size == "all":
|
@@ -384,59 +371,75 @@ class DatasetConfig():
|
|
384
371
|
if self.apps_selection == AppSelection.ALL_KNOWN:
|
385
372
|
self.val_unknown_size = 0
|
386
373
|
self.test_unknown_size = 0
|
387
|
-
if self.apps_selection_topx != 0 or len(self.
|
388
|
-
raise ValueError("apps_selection_topx,
|
389
|
-
if self.apps_selection == AppSelection.TOPX_KNOWN
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
374
|
+
if self.apps_selection_topx != 0 or len(self.apps_selection_background_unknown) > 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
|
375
|
+
raise ValueError("apps_selection_topx, apps_selection_background_unknown, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is all-known")
|
376
|
+
if self.apps_selection == AppSelection.TOPX_KNOWN:
|
377
|
+
if self.apps_selection_topx == 0:
|
378
|
+
raise ValueError("apps_selection_topx has to be greater than 0 when application selection is top-x-known")
|
379
|
+
if len(self.apps_selection_background_unknown) > 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
|
380
|
+
raise ValueError("apps_selection_background_unknown, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is top-x-known")
|
381
|
+
if self.apps_selection == AppSelection.BACKGROUND_UNKNOWN:
|
382
|
+
if len(self.apps_selection_background_unknown) == 0:
|
383
|
+
raise ValueError("apps_selection_background_unknown has to be specified when application selection is background-unknown")
|
384
|
+
bad_apps = [a for a in self.apps_selection_background_unknown if a not in dataset.available_classes]
|
385
|
+
if len(bad_apps) > 0:
|
386
|
+
raise ValueError(f"Bad applications in apps_selection_background_unknown {bad_apps}. Use applications available in dataset.available_classes")
|
387
|
+
if self.apps_selection_topx != 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
|
388
|
+
raise ValueError("apps_selection_topx, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is background-unknown")
|
389
|
+
if self.apps_selection == AppSelection.FIXED:
|
390
|
+
if len(self.apps_selection_fixed_known) == 0:
|
391
|
+
raise ValueError("apps_selection_fixed_known has to be specified when application selection is fixed")
|
392
|
+
bad_apps = [a for a in self.apps_selection_fixed_known + self.apps_selection_fixed_unknown if a not in dataset.available_classes]
|
393
|
+
if len(bad_apps) > 0:
|
394
|
+
raise ValueError(f"Bad applications in apps_selection_fixed_known or apps_selection_fixed_unknown {bad_apps}. Use applications available in dataset.available_classes")
|
396
395
|
if len(self.disabled_apps) > 0:
|
397
|
-
raise ValueError("disabled_apps cannot be specified when
|
398
|
-
if self.min_train_samples_per_app != 0:
|
399
|
-
|
400
|
-
|
401
|
-
|
396
|
+
raise ValueError("disabled_apps cannot be specified when application selection is fixed")
|
397
|
+
if self.min_train_samples_per_app != 0 and self.min_train_samples_per_app != 100:
|
398
|
+
warnings.warn("min_train_samples_per_app is not used when application selection is fixed")
|
399
|
+
if self.apps_selection_topx != 0 or len(self.apps_selection_background_unknown) > 0:
|
400
|
+
raise ValueError("apps_selection_topx and apps_selection_background_unknown cannot be specified when application selection is fixed")
|
402
401
|
# More asserts
|
403
|
-
|
404
|
-
|
402
|
+
bad_disabled_apps = [a for a in self.disabled_apps if a not in dataset.available_classes]
|
403
|
+
if len(bad_disabled_apps) > 0:
|
404
|
+
raise ValueError(f"Bad applications in disabled_apps {bad_disabled_apps}. Use applications available in dataset.available_classes")
|
405
405
|
if isinstance(self.fit_scalers_samples, float) and (self.fit_scalers_samples <= 0 or self.fit_scalers_samples > 1):
|
406
406
|
raise ValueError("fit_scalers_samples has to be either float between 0 and 1 (giving the fraction of training samples used for fitting scalers) or an integer")
|
407
407
|
|
408
408
|
def get_flowstats_features_len(self) -> int:
|
409
409
|
"""Gets the number of flow statistics features."""
|
410
|
-
|
411
|
-
for f in self.flowstats_features:
|
412
|
-
if f.startswith("PHIST_"):
|
413
|
-
n += PHIST_BIN_COUNT
|
414
|
-
else:
|
415
|
-
n += 1
|
416
|
-
return n
|
410
|
+
return len(self.flowstats_features) + len(self.flowstats_features_boolean) + PHIST_BIN_COUNT * len(self.flowstats_features_phist)
|
417
411
|
|
418
412
|
def get_flowstats_feature_names_expanded(self, shorter_names: bool = False) -> list[str]:
|
419
413
|
"""Gets names of flow statistics features. Packet histograms are expanded into bin features."""
|
420
|
-
|
414
|
+
phist_mapping = {
|
421
415
|
"PHIST_SRC_SIZES": [f"PSIZE_BIN{i}" for i in range(1, PHIST_BIN_COUNT + 1)],
|
422
416
|
"PHIST_DST_SIZES": [f"PSIZE_BIN{i}_REV" for i in range(1, PHIST_BIN_COUNT + 1)],
|
423
417
|
"PHIST_SRC_IPT": [f"IPT_BIN{i}" for i in range(1, PHIST_BIN_COUNT + 1)],
|
424
418
|
"PHIST_DST_IPT": [f"IPT_BIN{i}_REV" for i in range(1, PHIST_BIN_COUNT + 1)],
|
425
|
-
"FLOW_ENDREASON_IDLE": "FEND_IDLE" if shorter_names else "FLOW_ENDREASON_IDLE",
|
426
|
-
"FLOW_ENDREASON_ACTIVE": "FEND_ACTIVE" if shorter_names else "FLOW_ENDREASON_ACTIVE",
|
427
|
-
"FLOW_ENDREASON_END": "FEND_END" if shorter_names else "FLOW_ENDREASON_END",
|
428
|
-
"FLOW_ENDREASON_OTHER": "FEND_OTHER" if shorter_names else "FLOW_ENDREASON_OTHER",
|
429
419
|
}
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
420
|
+
short_names_mapping = {
|
421
|
+
"FLOW_ENDREASON_IDLE": "FEND_IDLE",
|
422
|
+
"FLOW_ENDREASON_ACTIVE": "FEND_ACTIVE",
|
423
|
+
"FLOW_ENDREASON_END": "FEND_END",
|
424
|
+
"FLOW_ENDREASON_OTHER": "FEND_OTHER",
|
425
|
+
"FLAG_CWR": "F_CWR",
|
426
|
+
"FLAG_CWR_REV": "F_CWR_REV",
|
427
|
+
"FLAG_ECE": "F_ECE",
|
428
|
+
"FLAG_ECE_REV": "F_ECE_REV",
|
429
|
+
"FLAG_PSH_REV": "F_PSH_REV",
|
430
|
+
"FLAG_RST": "F_RST",
|
431
|
+
"FLAG_RST_REV": "F_RST_REV",
|
432
|
+
"FLAG_FIN": "F_FIN",
|
433
|
+
"FLAG_FIN_REV": "F_FIN_REV",
|
434
|
+
}
|
435
|
+
feature_names = self.flowstats_features[:]
|
436
|
+
for f in self.flowstats_features_boolean:
|
437
|
+
if shorter_names and f in short_names_mapping:
|
438
|
+
feature_names.append(short_names_mapping[f])
|
438
439
|
else:
|
439
|
-
feature_names.append(
|
440
|
+
feature_names.append(f)
|
441
|
+
for f in self.flowstats_features_phist:
|
442
|
+
feature_names.extend(phist_mapping[f])
|
440
443
|
assert len(feature_names) == self.get_flowstats_features_len()
|
441
444
|
return feature_names
|
442
445
|
|
@@ -449,8 +452,8 @@ class DatasetConfig():
|
|
449
452
|
ppi_feature_names += [f"PUSH_{i}" for i in range(1, PPI_MAX_LEN + 1)]
|
450
453
|
return ppi_feature_names
|
451
454
|
|
452
|
-
def get_ppi_channels(self) -> int:
|
453
|
-
"""Gets the
|
455
|
+
def get_ppi_channels(self) -> list[int]:
|
456
|
+
"""Gets the available features (channels) in PPI sequences."""
|
454
457
|
if self.use_push_flags:
|
455
458
|
return TCP_PPI_CHANNELS
|
456
459
|
else:
|
@@ -485,8 +488,11 @@ class DatasetConfig():
|
|
485
488
|
return params_hash
|
486
489
|
|
487
490
|
def _get_train_data_path(self) -> str:
|
488
|
-
|
489
|
-
|
491
|
+
if self.need_train_set:
|
492
|
+
params_hash = self._get_train_data_hash()
|
493
|
+
return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
|
494
|
+
else:
|
495
|
+
return os.path.join(self.data_root, "train-data", "default")
|
490
496
|
|
491
497
|
def _get_train_data_params(self) -> TrainDataParams:
|
492
498
|
return TrainDataParams(
|
@@ -495,38 +501,39 @@ class DatasetConfig():
|
|
495
501
|
train_tables_paths=self._get_train_tables_paths(),
|
496
502
|
apps_selection=self.apps_selection,
|
497
503
|
apps_selection_topx=self.apps_selection_topx,
|
498
|
-
|
499
|
-
|
504
|
+
apps_selection_background_unknown=self.apps_selection_background_unknown,
|
505
|
+
apps_selection_fixed_known=self.apps_selection_fixed_known,
|
506
|
+
apps_selection_fixed_unknown=self.apps_selection_fixed_unknown,
|
500
507
|
disabled_apps=self.disabled_apps,
|
501
508
|
min_train_samples_per_app=self.min_train_samples_per_app,
|
502
509
|
min_train_samples_check=self.min_train_samples_check,)
|
503
510
|
|
504
|
-
def _get_val_data_params_and_path(self,
|
511
|
+
def _get_val_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
|
505
512
|
assert self.val_approach == ValidationApproach.VALIDATION_DATES
|
506
513
|
val_data_params = TestDataParams(
|
507
514
|
database_filename=self.database_filename,
|
508
515
|
test_period_name=self.val_period_name,
|
509
516
|
test_tables_paths=self._get_val_tables_paths(),
|
510
|
-
|
511
|
-
|
517
|
+
known_apps=known_apps,
|
518
|
+
unknown_apps=unknown_apps,)
|
512
519
|
params_hash = hashlib.sha256(json.dumps(dataclasses.asdict(val_data_params), sort_keys=True).encode()).hexdigest()
|
513
520
|
params_hash = params_hash[:10]
|
514
521
|
val_data_path = os.path.join(self.data_root, "val-data", f"{params_hash}_{self.random_state}")
|
515
522
|
return val_data_params, val_data_path
|
516
523
|
|
517
|
-
def _get_test_data_params_and_path(self,
|
524
|
+
def _get_test_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
|
518
525
|
test_data_params = TestDataParams(
|
519
526
|
database_filename=self.database_filename,
|
520
527
|
test_period_name=self.test_period_name,
|
521
528
|
test_tables_paths=self._get_test_tables_paths(),
|
522
|
-
|
523
|
-
|
529
|
+
known_apps=known_apps,
|
530
|
+
unknown_apps=unknown_apps,)
|
524
531
|
params_hash = hashlib.sha256(json.dumps(dataclasses.asdict(test_data_params), sort_keys=True).encode()).hexdigest()
|
525
532
|
params_hash = params_hash[:10]
|
526
533
|
test_data_path = os.path.join(self.data_root, "test-data", f"{params_hash}_{self.random_state}")
|
527
534
|
return test_data_params, test_data_path
|
528
535
|
|
529
|
-
@model_validator(mode="before")
|
536
|
+
@model_validator(mode="before") # type: ignore
|
530
537
|
@classmethod
|
531
538
|
def check_deprecated_args(cls, values):
|
532
539
|
kwargs = values.kwargs
|
@@ -6,20 +6,18 @@ DATASET_SIZES = {
|
|
6
6
|
}
|
7
7
|
|
8
8
|
# Per-packet information (PPI) constants
|
9
|
+
PPI_MAX_LEN = 30
|
9
10
|
IPT_POS = 0
|
10
11
|
DIR_POS = 1
|
11
12
|
SIZE_POS = 2
|
12
13
|
PUSH_FLAGS_POS = 3
|
13
|
-
|
14
|
-
|
15
|
-
UDP_PPI_CHANNELS = 3
|
14
|
+
TCP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS, PUSH_FLAGS_POS]
|
15
|
+
UDP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS]
|
16
16
|
|
17
17
|
# Features
|
18
18
|
FLOWSTATS_TO_SCALE = ["BYTES", "BYTES_REV", "PACKETS", "PACKETS_REV", "PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION", "DURATION"]
|
19
|
-
FLOWSTATS_NO_CLIP = ["
|
19
|
+
FLOWSTATS_NO_CLIP = ["DURATION", "PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION"]
|
20
20
|
SELECTED_TCP_FLAGS = ["FLAG_CWR", "FLAG_CWR_REV", "FLAG_ECE", "FLAG_ECE_REV", "FLAG_PSH_REV", "FLAG_RST", "FLAG_RST_REV", "FLAG_FIN", "FLAG_FIN_REV"]
|
21
|
-
FLOWEND_REASON_FEATURES = ["FLOW_ENDREASON_IDLE", "FLOW_ENDREASON_ACTIVE", "FLOW_ENDREASON_END", "FLOW_ENDREASON_OTHER"]
|
22
|
-
PHISTS_FEATURES = ["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"]
|
23
21
|
PHIST_BIN_COUNT = 8
|
24
22
|
|
25
23
|
# Column names
|