cesnet-datazoo 0.0.17__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cesnet_datazoo/config.py +173 -168
- cesnet_datazoo/constants.py +4 -6
- cesnet_datazoo/datasets/cesnet_dataset.py +200 -177
- cesnet_datazoo/datasets/datasets.py +22 -2
- cesnet_datazoo/datasets/datasets_constants.py +670 -0
- cesnet_datazoo/datasets/loaders.py +3 -0
- cesnet_datazoo/datasets/metadata/dataset_metadata.py +6 -5
- cesnet_datazoo/datasets/metadata/metadata.csv +4 -4
- cesnet_datazoo/datasets/statistics.py +36 -16
- cesnet_datazoo/pytables_data/data_scalers.py +68 -154
- cesnet_datazoo/pytables_data/indices_setup.py +29 -33
- cesnet_datazoo/pytables_data/pytables_dataset.py +99 -122
- cesnet_datazoo/utils/class_info.py +7 -5
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/METADATA +2 -1
- cesnet_datazoo-0.1.0.dist-info/RECORD +30 -0
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/WHEEL +1 -1
- cesnet_datazoo-0.0.17.dist-info/RECORD +0 -29
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/LICENCE +0 -0
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/top_level.txt +0 -0
cesnet_datazoo/config.py
CHANGED
@@ -8,12 +8,11 @@ import warnings
|
|
8
8
|
from dataclasses import InitVar, field
|
9
9
|
from datetime import datetime
|
10
10
|
from enum import Enum
|
11
|
-
from typing import TYPE_CHECKING, Literal, Optional
|
11
|
+
from typing import TYPE_CHECKING, Callable, Literal, Optional
|
12
12
|
|
13
13
|
import yaml
|
14
14
|
from pydantic import model_validator
|
15
15
|
from pydantic.dataclasses import dataclass
|
16
|
-
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
|
17
16
|
|
18
17
|
from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, SELECTED_TCP_FLAGS,
|
19
18
|
TCP_PPI_CHANNELS, UDP_PPI_CHANNELS)
|
@@ -21,19 +20,6 @@ from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, SELECTED_TCP
|
|
21
20
|
if TYPE_CHECKING:
|
22
21
|
from cesnet_datazoo.datasets.cesnet_dataset import CesnetDataset
|
23
22
|
|
24
|
-
Scaler = RobustScaler | StandardScaler | MinMaxScaler | None
|
25
|
-
|
26
|
-
class ScalerEnum(Enum):
|
27
|
-
"""Available scalers for flow statistics, packet sizes, and inter-packet times."""
|
28
|
-
STANDARD = "standard"
|
29
|
-
"""Standardize features by removing the mean and scaling to unit variance - [`sklearn.preprocessing.StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)."""
|
30
|
-
ROBUST = "robust"
|
31
|
-
"""Robust scaling with the median and the interquartile range - [`sklearn.preprocessing.RobustScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)."""
|
32
|
-
MINMAX = "minmax"
|
33
|
-
"""Scaling to a (0, 1) range - [`sklearn.preprocessing.MinMaxScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)."""
|
34
|
-
NO_SCALER = "no-scaler"
|
35
|
-
"""No scaling."""
|
36
|
-
def __str__(self): return self.value
|
37
23
|
|
38
24
|
class Protocol(Enum):
|
39
25
|
TLS = "TLS"
|
@@ -48,25 +34,23 @@ class ValidationApproach(Enum):
|
|
48
34
|
is used to create a random stratified validation set. The fraction of validation samples is defined in `train_val_split_fraction`."""
|
49
35
|
VALIDATION_DATES = "validation-dates"
|
50
36
|
"""Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `val_period_name`."""
|
51
|
-
NO_VALIDATION = "no-validation"
|
52
|
-
"""Do not use validation. The validation dataloader and dataframe will not be available."""
|
53
37
|
def __str__(self): return self.value
|
54
38
|
|
55
39
|
class AppSelection(Enum):
|
56
40
|
"""
|
57
41
|
Applications can be divided into *known* and *unknown* classes. To use a dataset in the standard closed-world setting, use `ALL_KNOWN ` to select all the applications as *known*.
|
58
|
-
Use `TOPX_KNOWN` or `
|
59
|
-
The `
|
42
|
+
Use `TOPX_KNOWN` or `BACKGROUND_UNKNOWN` for the open-world setting and evaluation of out-of-distribution or open-set recognition methods.
|
43
|
+
The `FIXED` is for manual selection of *known* and *unknown* applications.
|
60
44
|
"""
|
61
45
|
ALL_KNOWN = "all-known"
|
62
46
|
"""Use all applications as *known*."""
|
63
47
|
TOPX_KNOWN = "topx-known"
|
64
48
|
"""Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*.
|
65
49
|
Applications with the same provider are never separated, i.e., all applications of a given provider are either *known* or *unknown*."""
|
66
|
-
|
67
|
-
"""Use the
|
68
|
-
|
69
|
-
"""
|
50
|
+
BACKGROUND_UNKNOWN = "background-unknown"
|
51
|
+
"""Use the list of background traffic classes (`apps_selection_background_unknown`) as *unknown*, and the rest as *known*."""
|
52
|
+
FIXED = "fixed"
|
53
|
+
"""Manual application selection. Provide lists of *known* applications (`apps_selection_fixed_known`) and *unknown* applications (`apps_selection_fixed_unknown`)."""
|
70
54
|
def __str__(self): return self.value
|
71
55
|
|
72
56
|
class MinTrainSamplesCheck(Enum):
|
@@ -103,8 +87,9 @@ class TrainDataParams():
|
|
103
87
|
train_tables_paths: list[str]
|
104
88
|
apps_selection: AppSelection
|
105
89
|
apps_selection_topx: int
|
106
|
-
|
107
|
-
|
90
|
+
apps_selection_background_unknown: list[str]
|
91
|
+
apps_selection_fixed_known: list[str]
|
92
|
+
apps_selection_fixed_unknown: list[str]
|
108
93
|
disabled_apps: list[str]
|
109
94
|
min_train_samples_check: MinTrainSamplesCheck
|
110
95
|
min_train_samples_per_app: int
|
@@ -114,8 +99,8 @@ class TestDataParams():
|
|
114
99
|
database_filename: str
|
115
100
|
test_period_name: str
|
116
101
|
test_tables_paths: list[str]
|
117
|
-
|
118
|
-
|
102
|
+
known_apps: list[str]
|
103
|
+
unknown_apps: list[str]
|
119
104
|
|
120
105
|
class C:
|
121
106
|
arbitrary_types_allowed = True
|
@@ -128,38 +113,43 @@ class DatasetConfig():
|
|
128
113
|
|
129
114
|
- Train, validation, test sets (dates, sizes, validation approach).
|
130
115
|
- Application selection — either the standard closed-world setting (only *known* classes) or the open-world setting (*known* and *unknown* classes).
|
131
|
-
- Feature scaling. See the [data features][features] page for more information.
|
116
|
+
- Feature scaling. See the [data features][features] page for more information. DOCS_TODO
|
132
117
|
- Dataloader options like batch sizes, order of loading, or number of workers.
|
133
118
|
|
134
119
|
When initializing this class, pass a [`CesnetDataset`][datasets.cesnet_dataset.CesnetDataset] instance to be configured and the desired configuration. Available options are [here][config.DatasetConfig--configuration-options].
|
135
120
|
|
136
121
|
Attributes:
|
137
|
-
dataset: The dataset instance to be configured
|
138
|
-
data_root: Taken from the dataset instance
|
139
|
-
database_filename: Taken from the dataset instance
|
140
|
-
database_path: Taken from the dataset instance
|
141
|
-
servicemap_path: Taken from the dataset instance
|
142
|
-
flowstats_features: Taken from `dataset.metadata.flowstats_features
|
143
|
-
|
122
|
+
dataset: The dataset instance to be configured.
|
123
|
+
data_root: Taken from the dataset instance.
|
124
|
+
database_filename: Taken from the dataset instance.
|
125
|
+
database_path: Taken from the dataset instance.
|
126
|
+
servicemap_path: Taken from the dataset instance.
|
127
|
+
flowstats_features: Taken from `dataset.metadata.flowstats_features`.
|
128
|
+
flowstats_features_boolean: Taken from `dataset.metadata.flowstats_features_boolean`.
|
129
|
+
flowstats_features_phist: Taken from `dataset.metadata.packet_histograms` if `use_packet_histograms` is true, otherwise an empty list.
|
130
|
+
other_fields: Taken from `dataset.metadata.other_fields` if `return_other_fields` is true, otherwise an empty list.
|
144
131
|
|
145
132
|
# Configuration options
|
146
133
|
|
147
134
|
Attributes:
|
135
|
+
need_train_set: Use to disable the train set. `Default: True`
|
136
|
+
need_val_set: Use to disable the validation set. When `need_train_set` is false, the validation set will also be disabled. `Default: True`
|
137
|
+
need_test_set: Use to disable the test set. `Default: True`
|
148
138
|
train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
149
139
|
train_dates: Dates used for creating a train set.
|
150
140
|
train_dates_weigths: To use a non-uniform distribution of samples across train dates.
|
151
|
-
val_approach: How a validation set should be created. Either split train data into train and validation
|
141
|
+
val_approach: How a validation set should be created. Either split train data into train and validation or have a separate validation period. `Default: SPLIT_FROM_TRAIN`
|
152
142
|
train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
|
153
143
|
val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
154
144
|
val_dates: Dates used for creating a validation set.
|
155
|
-
no_test_set: Disable the test set. `Default: False`
|
156
145
|
test_period_name: Name of the test period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
157
146
|
test_dates: Dates used for creating a test set.
|
158
147
|
|
159
148
|
apps_selection: How to select application classes. `Default: ALL_KNOWN`
|
160
149
|
apps_selection_topx: Take top X as known.
|
161
|
-
|
162
|
-
|
150
|
+
apps_selection_background_unknown: Provide a list of background traffic classes to be used as unknown.
|
151
|
+
apps_selection_fixed_known: Provide a list of manually selected known applications.
|
152
|
+
apps_selection_fixed_unknown: Provide a list of manually selected unknown applications.
|
163
153
|
disabled_apps: List of applications to be disabled and not used at all.
|
164
154
|
min_train_samples_check: How to handle applications with *not enough* training samples. `Default: DISABLE_APPS`
|
165
155
|
min_train_samples_per_app: Defines the threshold for *not enough*. `Default: 100`
|
@@ -182,22 +172,14 @@ class DatasetConfig():
|
|
182
172
|
train_dataloader_seed: Seed for loading train data in random order. `Default: None`
|
183
173
|
|
184
174
|
return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
|
185
|
-
|
186
|
-
raw_output: Return raw output without data scaling, clipping, and normalization. `Default: False`
|
175
|
+
return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
|
187
176
|
use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
|
188
|
-
normalize_packet_histograms: Whether to normalize packet histograms. If true, bins contain fractions instead of absolute numbers. `Default: True`
|
189
177
|
use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
|
190
178
|
use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
ipt_scaler: Which scaler to use for inter-packet times. Options are [`ROBUST`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) | [`STANDARD`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) | [`MINMAX`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) | `NO_SCALER`. `Default: STANDARD`
|
196
|
-
scalers_attrs: Load data scalers from numeric values in this dict rather than from pickled files. `Default: None`
|
197
|
-
flowstats_clip: Quantile clip before the scaling of flow statistics. Should limit the influence of outliers. Set to `1` to disable. `Default: 0.99`
|
198
|
-
psizes_max: Max clip packet sizes before scaling. `Default: 1500`
|
199
|
-
ipt_min: Min clip inter-packet times before scaling. `Default: 0`
|
200
|
-
ipt_max: Max clip inter-packet times before scaling. `Default: 65000`
|
179
|
+
fit_scalers_samples: Fraction of train samples used for fitting feature scalers, if float. The absolute number of samples otherwise. `Default: 0.25` DOCS_TODO
|
180
|
+
ppi_transform: Transform function for PPI sequences. `Default: None` DOCS_TODO
|
181
|
+
flowstats_transform: Transform function for flow statistics. `Default: None`
|
182
|
+
flowstats_phist_transform: Transform function for packet histograms. `Default: None`
|
201
183
|
|
202
184
|
# How to configure train, validation, and test sets
|
203
185
|
There are three options for how to define train/validation/test dates.
|
@@ -223,8 +205,13 @@ class DatasetConfig():
|
|
223
205
|
database_path: str = field(init=False)
|
224
206
|
servicemap_path: str = field(init=False)
|
225
207
|
flowstats_features: list[str] = field(init=False)
|
208
|
+
flowstats_features_boolean: list[str] = field(init=False)
|
209
|
+
flowstats_features_phist: list[str] = field(init=False)
|
226
210
|
other_fields: list[str] = field(init=False)
|
227
211
|
|
212
|
+
need_train_set: bool = True
|
213
|
+
need_val_set: bool = True
|
214
|
+
need_test_set: bool = True
|
228
215
|
train_period_name: str = ""
|
229
216
|
train_dates: list[str] = field(default_factory=list)
|
230
217
|
train_dates_weigths: Optional[list[int]] = None
|
@@ -232,14 +219,14 @@ class DatasetConfig():
|
|
232
219
|
train_val_split_fraction: float = 0.2
|
233
220
|
val_period_name: str = ""
|
234
221
|
val_dates: list[str] = field(default_factory=list)
|
235
|
-
no_test_set: bool = False
|
236
222
|
test_period_name: str = ""
|
237
223
|
test_dates: list[str] = field(default_factory=list)
|
238
224
|
|
239
225
|
apps_selection: AppSelection = AppSelection.ALL_KNOWN
|
240
226
|
apps_selection_topx: int = 0
|
241
|
-
|
242
|
-
|
227
|
+
apps_selection_background_unknown: list[str] = field(default_factory=list)
|
228
|
+
apps_selection_fixed_known: list[str] = field(default_factory=list)
|
229
|
+
apps_selection_fixed_unknown: list[str] = field(default_factory=list)
|
243
230
|
disabled_apps: list[str] = field(default_factory=list)
|
244
231
|
min_train_samples_check: MinTrainSamplesCheck = MinTrainSamplesCheck.DISABLE_APPS
|
245
232
|
min_train_samples_per_app: int = 100
|
@@ -262,22 +249,14 @@ class DatasetConfig():
|
|
262
249
|
train_dataloader_seed: Optional[int] = None
|
263
250
|
|
264
251
|
return_other_fields: bool = False
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
normalize_packet_histograms: bool = True
|
269
|
-
use_tcp_features: bool = True
|
252
|
+
return_tensors: bool = False
|
253
|
+
use_packet_histograms: bool = False
|
254
|
+
use_tcp_features: bool = False
|
270
255
|
use_push_flags: bool = False
|
271
|
-
zero_ppi_start: int = 0
|
272
256
|
fit_scalers_samples: int | float = 0.25
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
scalers_attrs: Optional[dict] = None
|
277
|
-
flowstats_clip: float = 0.99
|
278
|
-
psizes_max: int = 1500
|
279
|
-
ipt_min: int = 0
|
280
|
-
ipt_max: int = 65000
|
257
|
+
ppi_transform: Optional[Callable] = None
|
258
|
+
flowstats_transform: Optional[Callable] = None
|
259
|
+
flowstats_phist_transform: Optional[Callable] = None
|
281
260
|
|
282
261
|
def __post_init__(self, dataset: CesnetDataset):
|
283
262
|
"""
|
@@ -287,23 +266,28 @@ class DatasetConfig():
|
|
287
266
|
self.servicemap_path = dataset.servicemap_path
|
288
267
|
self.database_filename = dataset.database_filename
|
289
268
|
self.database_path = dataset.database_path
|
290
|
-
self.flowstats_features = dataset.metadata.flowstats_features
|
291
|
-
self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
|
292
269
|
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
if self.
|
298
|
-
raise ValueError(
|
299
|
-
|
300
|
-
|
301
|
-
self.train_period_name
|
302
|
-
|
270
|
+
if not self.need_train_set:
|
271
|
+
self.need_val_set = False
|
272
|
+
if self.apps_selection != AppSelection.FIXED:
|
273
|
+
raise ValueError("Application selection has to be fixed when need_train_set is false")
|
274
|
+
if (len(self.train_dates) > 0 or self.train_period_name != ""):
|
275
|
+
raise ValueError("train_dates and train_period_name cannot be specified when need_train_set is false")
|
276
|
+
else:
|
277
|
+
# Configure train dates
|
278
|
+
if len(self.train_dates) > 0 and self.train_period_name == "":
|
279
|
+
raise ValueError("train_period_name has to be specified when train_dates are set")
|
280
|
+
if len(self.train_dates) == 0 and self.train_period_name != "":
|
281
|
+
if self.train_period_name not in dataset.time_periods:
|
282
|
+
raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
|
283
|
+
self.train_dates = dataset.time_periods[self.train_period_name]
|
284
|
+
if len(self.train_dates) == 0 and self.test_period_name == "":
|
285
|
+
self.train_period_name = dataset.default_train_period_name
|
286
|
+
self.train_dates = dataset.time_periods[dataset.default_train_period_name]
|
303
287
|
# Configure test dates
|
304
|
-
if self.
|
288
|
+
if not self.need_test_set:
|
305
289
|
if (len(self.test_dates) > 0 or self.test_period_name != ""):
|
306
|
-
raise ValueError("test_dates and test_period_name cannot be specified when
|
290
|
+
raise ValueError("test_dates and test_period_name cannot be specified when need_test_set is false")
|
307
291
|
else:
|
308
292
|
if len(self.test_dates) > 0 and self.test_period_name == "":
|
309
293
|
raise ValueError("test_period_name has to be specified when test_dates are set")
|
@@ -315,8 +299,8 @@ class DatasetConfig():
|
|
315
299
|
self.test_period_name = dataset.default_test_period_name
|
316
300
|
self.test_dates = dataset.time_periods[dataset.default_test_period_name]
|
317
301
|
# Configure val dates
|
318
|
-
if (self.
|
319
|
-
raise ValueError("val_dates and val_period_name cannot be specified when
|
302
|
+
if (not self.need_val_set or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
|
303
|
+
raise ValueError("val_dates and val_period_name cannot be specified when need_val_set is false or the validation approach is split-from-train")
|
320
304
|
if self.val_approach == ValidationApproach.VALIDATION_DATES:
|
321
305
|
if len(self.val_dates) > 0 and self.val_period_name == "":
|
322
306
|
raise ValueError("val_period_name has to be specified when val_dates are set")
|
@@ -325,57 +309,58 @@ class DatasetConfig():
|
|
325
309
|
raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
|
326
310
|
self.val_dates = dataset.time_periods[self.val_period_name]
|
327
311
|
if len(self.val_dates) == 0 and self.val_period_name == "":
|
328
|
-
raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when
|
312
|
+
raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when the validation approach is validation-dates")
|
329
313
|
# Check if train, val, and test dates are available in the dataset
|
330
|
-
if dataset.available_dates
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
314
|
+
bad_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
|
315
|
+
bad_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
|
316
|
+
bad_test_dates = [t for t in self.test_dates if t not in dataset.available_dates]
|
317
|
+
if len(bad_train_dates) > 0:
|
318
|
+
raise ValueError(f"Bad train dates {bad_train_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
319
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
320
|
+
if len(bad_val_dates) > 0:
|
321
|
+
raise ValueError(f"Bad validation dates {bad_val_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
322
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
323
|
+
if len(bad_test_dates) > 0:
|
324
|
+
raise ValueError(f"Bad test dates {bad_test_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
325
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
343
326
|
# Check time order of train, val, and test periods
|
344
327
|
train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
|
345
328
|
test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
|
346
|
-
if
|
329
|
+
if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
|
347
330
|
warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
348
331
|
if self.val_approach == ValidationApproach.VALIDATION_DATES:
|
332
|
+
# Train dates are guaranteed to be set
|
349
333
|
val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
|
350
334
|
if min(val_dates) <= max(train_dates):
|
351
335
|
warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
352
|
-
if
|
336
|
+
if len(test_dates) > 0 and min(test_dates) <= max(val_dates):
|
353
337
|
warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
354
338
|
# Configure features
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
self.flowstats_clip = 1.0
|
359
|
-
self.psizes_scaler = ScalerEnum.NO_SCALER
|
360
|
-
self.psizes_max = 1500
|
361
|
-
self.ipt_scaler = ScalerEnum.NO_SCALER
|
362
|
-
self.ipt_min = 0
|
363
|
-
self.ipt_max = 65000
|
364
|
-
if dataset.metadata.protocol == Protocol.TLS and self.use_tcp_features:
|
365
|
-
self.flowstats_features = self.flowstats_features + SELECTED_TCP_FLAGS
|
366
|
-
if self.use_push_flags and "PUSH_FLAG" not in dataset.metadata.features_in_packet_sequences:
|
367
|
-
raise ValueError("This TLS dataset does not support use_push_flags")
|
339
|
+
self.flowstats_features = dataset.metadata.flowstats_features
|
340
|
+
self.flowstats_features_boolean = dataset.metadata.flowstats_features_boolean
|
341
|
+
self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
|
368
342
|
if self.use_packet_histograms:
|
369
|
-
if len(dataset.metadata.
|
370
|
-
|
371
|
-
|
372
|
-
|
343
|
+
if len(dataset.metadata.packet_histograms) == 0:
|
344
|
+
raise ValueError("This dataset does not support use_packet_histograms")
|
345
|
+
self.flowstats_features_phist = dataset.metadata.packet_histograms
|
346
|
+
else:
|
347
|
+
self.flowstats_features_phist = []
|
348
|
+
if self.flowstats_phist_transform is not None:
|
349
|
+
raise ValueError("flowstats_phist_transform cannot be specified when use_packet_histograms is false")
|
350
|
+
if dataset.metadata.protocol == Protocol.TLS:
|
351
|
+
if self.use_tcp_features:
|
352
|
+
self.flowstats_features_boolean = self.flowstats_features_boolean + SELECTED_TCP_FLAGS
|
353
|
+
if self.use_push_flags and "PUSH_FLAG" not in dataset.metadata.ppi_features:
|
354
|
+
raise ValueError("This TLS dataset does not support use_push_flags")
|
373
355
|
if dataset.metadata.protocol == Protocol.QUIC:
|
374
|
-
self.use_tcp_features
|
356
|
+
if self.use_tcp_features:
|
357
|
+
raise ValueError("QUIC datasets do not support use_tcp_features")
|
375
358
|
if self.use_push_flags:
|
376
359
|
raise ValueError("QUIC datasets do not support use_push_flags")
|
377
360
|
# When train_dates_weigths are used, train_size and val_known_size have to be specified
|
378
361
|
if self.train_dates_weigths is not None:
|
362
|
+
if not self.need_train_set:
|
363
|
+
raise ValueError("train_dates_weigths cannot be specified when need_train_set is false")
|
379
364
|
if len(self.train_dates_weigths) != len(self.train_dates):
|
380
365
|
raise ValueError("train_dates_weigths has to have the same length as train_dates")
|
381
366
|
if self.train_size == "all":
|
@@ -386,59 +371,75 @@ class DatasetConfig():
|
|
386
371
|
if self.apps_selection == AppSelection.ALL_KNOWN:
|
387
372
|
self.val_unknown_size = 0
|
388
373
|
self.test_unknown_size = 0
|
389
|
-
if self.apps_selection_topx != 0 or len(self.
|
390
|
-
raise ValueError("apps_selection_topx,
|
391
|
-
if self.apps_selection == AppSelection.TOPX_KNOWN
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
374
|
+
if self.apps_selection_topx != 0 or len(self.apps_selection_background_unknown) > 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
|
375
|
+
raise ValueError("apps_selection_topx, apps_selection_background_unknown, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is all-known")
|
376
|
+
if self.apps_selection == AppSelection.TOPX_KNOWN:
|
377
|
+
if self.apps_selection_topx == 0:
|
378
|
+
raise ValueError("apps_selection_topx has to be greater than 0 when application selection is top-x-known")
|
379
|
+
if len(self.apps_selection_background_unknown) > 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
|
380
|
+
raise ValueError("apps_selection_background_unknown, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is top-x-known")
|
381
|
+
if self.apps_selection == AppSelection.BACKGROUND_UNKNOWN:
|
382
|
+
if len(self.apps_selection_background_unknown) == 0:
|
383
|
+
raise ValueError("apps_selection_background_unknown has to be specified when application selection is background-unknown")
|
384
|
+
bad_apps = [a for a in self.apps_selection_background_unknown if a not in dataset.available_classes]
|
385
|
+
if len(bad_apps) > 0:
|
386
|
+
raise ValueError(f"Bad applications in apps_selection_background_unknown {bad_apps}. Use applications available in dataset.available_classes")
|
387
|
+
if self.apps_selection_topx != 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
|
388
|
+
raise ValueError("apps_selection_topx, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is background-unknown")
|
389
|
+
if self.apps_selection == AppSelection.FIXED:
|
390
|
+
if len(self.apps_selection_fixed_known) == 0:
|
391
|
+
raise ValueError("apps_selection_fixed_known has to be specified when application selection is fixed")
|
392
|
+
bad_apps = [a for a in self.apps_selection_fixed_known + self.apps_selection_fixed_unknown if a not in dataset.available_classes]
|
393
|
+
if len(bad_apps) > 0:
|
394
|
+
raise ValueError(f"Bad applications in apps_selection_fixed_known or apps_selection_fixed_unknown {bad_apps}. Use applications available in dataset.available_classes")
|
398
395
|
if len(self.disabled_apps) > 0:
|
399
|
-
raise ValueError("disabled_apps cannot be specified when
|
400
|
-
if self.min_train_samples_per_app != 0:
|
401
|
-
|
402
|
-
|
403
|
-
|
396
|
+
raise ValueError("disabled_apps cannot be specified when application selection is fixed")
|
397
|
+
if self.min_train_samples_per_app != 0 and self.min_train_samples_per_app != 100:
|
398
|
+
warnings.warn("min_train_samples_per_app is not used when application selection is fixed")
|
399
|
+
if self.apps_selection_topx != 0 or len(self.apps_selection_background_unknown) > 0:
|
400
|
+
raise ValueError("apps_selection_topx and apps_selection_background_unknown cannot be specified when application selection is fixed")
|
404
401
|
# More asserts
|
405
|
-
|
406
|
-
|
402
|
+
bad_disabled_apps = [a for a in self.disabled_apps if a not in dataset.available_classes]
|
403
|
+
if len(bad_disabled_apps) > 0:
|
404
|
+
raise ValueError(f"Bad applications in disabled_apps {bad_disabled_apps}. Use applications available in dataset.available_classes")
|
407
405
|
if isinstance(self.fit_scalers_samples, float) and (self.fit_scalers_samples <= 0 or self.fit_scalers_samples > 1):
|
408
406
|
raise ValueError("fit_scalers_samples has to be either float between 0 and 1 (giving the fraction of training samples used for fitting scalers) or an integer")
|
409
407
|
|
410
408
|
def get_flowstats_features_len(self) -> int:
|
411
409
|
"""Gets the number of flow statistics features."""
|
412
|
-
|
413
|
-
for f in self.flowstats_features:
|
414
|
-
if f.startswith("PHIST_"):
|
415
|
-
n += PHIST_BIN_COUNT
|
416
|
-
else:
|
417
|
-
n += 1
|
418
|
-
return n
|
410
|
+
return len(self.flowstats_features) + len(self.flowstats_features_boolean) + PHIST_BIN_COUNT * len(self.flowstats_features_phist)
|
419
411
|
|
420
412
|
def get_flowstats_feature_names_expanded(self, shorter_names: bool = False) -> list[str]:
|
421
413
|
"""Gets names of flow statistics features. Packet histograms are expanded into bin features."""
|
422
|
-
|
414
|
+
phist_mapping = {
|
423
415
|
"PHIST_SRC_SIZES": [f"PSIZE_BIN{i}" for i in range(1, PHIST_BIN_COUNT + 1)],
|
424
416
|
"PHIST_DST_SIZES": [f"PSIZE_BIN{i}_REV" for i in range(1, PHIST_BIN_COUNT + 1)],
|
425
417
|
"PHIST_SRC_IPT": [f"IPT_BIN{i}" for i in range(1, PHIST_BIN_COUNT + 1)],
|
426
418
|
"PHIST_DST_IPT": [f"IPT_BIN{i}_REV" for i in range(1, PHIST_BIN_COUNT + 1)],
|
427
|
-
"FLOW_ENDREASON_IDLE": "FEND_IDLE" if shorter_names else "FLOW_ENDREASON_IDLE",
|
428
|
-
"FLOW_ENDREASON_ACTIVE": "FEND_ACTIVE" if shorter_names else "FLOW_ENDREASON_ACTIVE",
|
429
|
-
"FLOW_ENDREASON_END": "FEND_END" if shorter_names else "FLOW_ENDREASON_END",
|
430
|
-
"FLOW_ENDREASON_OTHER": "FEND_OTHER" if shorter_names else "FLOW_ENDREASON_OTHER",
|
431
419
|
}
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
420
|
+
short_names_mapping = {
|
421
|
+
"FLOW_ENDREASON_IDLE": "FEND_IDLE",
|
422
|
+
"FLOW_ENDREASON_ACTIVE": "FEND_ACTIVE",
|
423
|
+
"FLOW_ENDREASON_END": "FEND_END",
|
424
|
+
"FLOW_ENDREASON_OTHER": "FEND_OTHER",
|
425
|
+
"FLAG_CWR": "F_CWR",
|
426
|
+
"FLAG_CWR_REV": "F_CWR_REV",
|
427
|
+
"FLAG_ECE": "F_ECE",
|
428
|
+
"FLAG_ECE_REV": "F_ECE_REV",
|
429
|
+
"FLAG_PSH_REV": "F_PSH_REV",
|
430
|
+
"FLAG_RST": "F_RST",
|
431
|
+
"FLAG_RST_REV": "F_RST_REV",
|
432
|
+
"FLAG_FIN": "F_FIN",
|
433
|
+
"FLAG_FIN_REV": "F_FIN_REV",
|
434
|
+
}
|
435
|
+
feature_names = self.flowstats_features[:]
|
436
|
+
for f in self.flowstats_features_boolean:
|
437
|
+
if shorter_names and f in short_names_mapping:
|
438
|
+
feature_names.append(short_names_mapping[f])
|
440
439
|
else:
|
441
|
-
feature_names.append(
|
440
|
+
feature_names.append(f)
|
441
|
+
for f in self.flowstats_features_phist:
|
442
|
+
feature_names.extend(phist_mapping[f])
|
442
443
|
assert len(feature_names) == self.get_flowstats_features_len()
|
443
444
|
return feature_names
|
444
445
|
|
@@ -451,8 +452,8 @@ class DatasetConfig():
|
|
451
452
|
ppi_feature_names += [f"PUSH_{i}" for i in range(1, PPI_MAX_LEN + 1)]
|
452
453
|
return ppi_feature_names
|
453
454
|
|
454
|
-
def get_ppi_channels(self) -> int:
|
455
|
-
"""Gets the
|
455
|
+
def get_ppi_channels(self) -> list[int]:
|
456
|
+
"""Gets the available features (channels) in PPI sequences."""
|
456
457
|
if self.use_push_flags:
|
457
458
|
return TCP_PPI_CHANNELS
|
458
459
|
else:
|
@@ -487,8 +488,11 @@ class DatasetConfig():
|
|
487
488
|
return params_hash
|
488
489
|
|
489
490
|
def _get_train_data_path(self) -> str:
|
490
|
-
|
491
|
-
|
491
|
+
if self.need_train_set:
|
492
|
+
params_hash = self._get_train_data_hash()
|
493
|
+
return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
|
494
|
+
else:
|
495
|
+
return os.path.join(self.data_root, "train-data", "default")
|
492
496
|
|
493
497
|
def _get_train_data_params(self) -> TrainDataParams:
|
494
498
|
return TrainDataParams(
|
@@ -497,32 +501,33 @@ class DatasetConfig():
|
|
497
501
|
train_tables_paths=self._get_train_tables_paths(),
|
498
502
|
apps_selection=self.apps_selection,
|
499
503
|
apps_selection_topx=self.apps_selection_topx,
|
500
|
-
|
501
|
-
|
504
|
+
apps_selection_background_unknown=self.apps_selection_background_unknown,
|
505
|
+
apps_selection_fixed_known=self.apps_selection_fixed_known,
|
506
|
+
apps_selection_fixed_unknown=self.apps_selection_fixed_unknown,
|
502
507
|
disabled_apps=self.disabled_apps,
|
503
508
|
min_train_samples_per_app=self.min_train_samples_per_app,
|
504
509
|
min_train_samples_check=self.min_train_samples_check,)
|
505
510
|
|
506
|
-
def _get_val_data_params_and_path(self,
|
511
|
+
def _get_val_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
|
507
512
|
assert self.val_approach == ValidationApproach.VALIDATION_DATES
|
508
513
|
val_data_params = TestDataParams(
|
509
514
|
database_filename=self.database_filename,
|
510
515
|
test_period_name=self.val_period_name,
|
511
516
|
test_tables_paths=self._get_val_tables_paths(),
|
512
|
-
|
513
|
-
|
517
|
+
known_apps=known_apps,
|
518
|
+
unknown_apps=unknown_apps,)
|
514
519
|
params_hash = hashlib.sha256(json.dumps(dataclasses.asdict(val_data_params), sort_keys=True).encode()).hexdigest()
|
515
520
|
params_hash = params_hash[:10]
|
516
521
|
val_data_path = os.path.join(self.data_root, "val-data", f"{params_hash}_{self.random_state}")
|
517
522
|
return val_data_params, val_data_path
|
518
523
|
|
519
|
-
def _get_test_data_params_and_path(self,
|
524
|
+
def _get_test_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
|
520
525
|
test_data_params = TestDataParams(
|
521
526
|
database_filename=self.database_filename,
|
522
527
|
test_period_name=self.test_period_name,
|
523
528
|
test_tables_paths=self._get_test_tables_paths(),
|
524
|
-
|
525
|
-
|
529
|
+
known_apps=known_apps,
|
530
|
+
unknown_apps=unknown_apps,)
|
526
531
|
params_hash = hashlib.sha256(json.dumps(dataclasses.asdict(test_data_params), sort_keys=True).encode()).hexdigest()
|
527
532
|
params_hash = params_hash[:10]
|
528
533
|
test_data_path = os.path.join(self.data_root, "test-data", f"{params_hash}_{self.random_state}")
|
cesnet_datazoo/constants.py
CHANGED
@@ -6,20 +6,18 @@ DATASET_SIZES = {
|
|
6
6
|
}
|
7
7
|
|
8
8
|
# Per-packet information (PPI) constants
|
9
|
+
PPI_MAX_LEN = 30
|
9
10
|
IPT_POS = 0
|
10
11
|
DIR_POS = 1
|
11
12
|
SIZE_POS = 2
|
12
13
|
PUSH_FLAGS_POS = 3
|
13
|
-
|
14
|
-
|
15
|
-
UDP_PPI_CHANNELS = 3
|
14
|
+
TCP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS, PUSH_FLAGS_POS]
|
15
|
+
UDP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS]
|
16
16
|
|
17
17
|
# Features
|
18
18
|
FLOWSTATS_TO_SCALE = ["BYTES", "BYTES_REV", "PACKETS", "PACKETS_REV", "PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION", "DURATION"]
|
19
|
-
FLOWSTATS_NO_CLIP = ["
|
19
|
+
FLOWSTATS_NO_CLIP = ["DURATION", "PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION"]
|
20
20
|
SELECTED_TCP_FLAGS = ["FLAG_CWR", "FLAG_CWR_REV", "FLAG_ECE", "FLAG_ECE_REV", "FLAG_PSH_REV", "FLAG_RST", "FLAG_RST_REV", "FLAG_FIN", "FLAG_FIN_REV"]
|
21
|
-
FLOWEND_REASON_FEATURES = ["FLOW_ENDREASON_IDLE", "FLOW_ENDREASON_ACTIVE", "FLOW_ENDREASON_END", "FLOW_ENDREASON_OTHER"]
|
22
|
-
PHISTS_FEATURES = ["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"]
|
23
21
|
PHIST_BIN_COUNT = 8
|
24
22
|
|
25
23
|
# Column names
|