cesnet-datazoo 0.0.16__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/PKG-INFO +2 -1
  2. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/config.py +174 -167
  3. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/constants.py +4 -6
  4. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/cesnet_dataset.py +200 -172
  5. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/datasets.py +22 -2
  6. cesnet-datazoo-0.1.0/cesnet_datazoo/datasets/datasets_constants.py +670 -0
  7. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/loaders.py +3 -0
  8. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +6 -5
  9. cesnet-datazoo-0.1.0/cesnet_datazoo/datasets/metadata/metadata.csv +4 -0
  10. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/statistics.py +36 -16
  11. cesnet-datazoo-0.1.0/cesnet_datazoo/pytables_data/data_scalers.py +110 -0
  12. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/indices_setup.py +29 -33
  13. cesnet-datazoo-0.1.0/cesnet_datazoo/pytables_data/pytables_dataset.py +294 -0
  14. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/class_info.py +7 -5
  15. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/download.py +6 -1
  16. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/PKG-INFO +2 -1
  17. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/SOURCES.txt +2 -0
  18. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/requires.txt +1 -0
  19. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/pyproject.toml +2 -1
  20. cesnet-datazoo-0.0.16/cesnet_datazoo/datasets/metadata/metadata.csv +0 -4
  21. cesnet-datazoo-0.0.16/cesnet_datazoo/pytables_data/pytables_dataset.py +0 -420
  22. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/LICENCE +0 -0
  23. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/README.md +0 -0
  24. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/__init__.py +0 -0
  25. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/__init__.py +0 -0
  26. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
  27. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/metrics/__init__.py +0 -0
  28. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/metrics/classification_report.py +0 -0
  29. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/metrics/provider_metrics.py +0 -0
  30. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/__init__.py +0 -0
  31. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
  32. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/__init__.py +0 -0
  33. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/fileutils.py +0 -0
  34. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/random.py +0 -0
  35. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
  36. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/top_level.txt +0 -0
  37. {cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/setup.cfg +0 -0

{cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.0.16
+Version: 0.1.0
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -16,6 +16,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENCE
+Requires-Dist: cesnet_models
 Requires-Dist: matplotlib
 Requires-Dist: numpy
 Requires-Dist: pandas
```
{cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/config.py

```diff
@@ -8,12 +8,11 @@ import warnings
 from dataclasses import InitVar, field
 from datetime import datetime
 from enum import Enum
-from typing import TYPE_CHECKING, Literal, Optional
+from typing import TYPE_CHECKING, Callable, Literal, Optional
 
 import yaml
 from pydantic import model_validator
 from pydantic.dataclasses import dataclass
-from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
 
 from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, SELECTED_TCP_FLAGS,
                                       TCP_PPI_CHANNELS, UDP_PPI_CHANNELS)
@@ -21,19 +20,6 @@ from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, SELECTED_TCP
 if TYPE_CHECKING:
     from cesnet_datazoo.datasets.cesnet_dataset import CesnetDataset
 
-Scaler = RobustScaler | StandardScaler | MinMaxScaler | None
-
-class ScalerEnum(Enum):
-    """Available scalers for flow statistics, packet sizes, and inter-packet times."""
-    STANDARD = "standard"
-    """Standardize features by removing the mean and scaling to unit variance - [`sklearn.preprocessing.StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)."""
-    ROBUST = "robust"
-    """Robust scaling with the median and the interquartile range - [`sklearn.preprocessing.RobustScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)."""
-    MINMAX = "minmax"
-    """Scaling to a (0, 1) range - [`sklearn.preprocessing.MinMaxScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)."""
-    NO_SCALER = "no-scaler"
-    """No scaling."""
-    def __str__(self): return self.value
 
 class Protocol(Enum):
     TLS = "TLS"
```
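The removal of `Scaler` and `ScalerEnum` means 0.1.0 no longer hard-codes sklearn scalers into the config; scaling is delegated to the transform callables introduced further below (`ppi_transform`, `flowstats_transform`, `flowstats_phist_transform`). A minimal sketch of how the old `STANDARD` behavior could be recovered with a user-supplied transform, assuming transforms are plain callables fitted on training data (the new `pytables_data/data_scalers.py` module suggests fitting still happens inside the library; treat this as illustrative, not the package's actual API):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

class StandardFlowstatsTransform:
    """Stateful callable mimicking the removed ScalerEnum.STANDARD option."""
    def __init__(self) -> None:
        self.scaler = StandardScaler()
        self.fitted = False

    def fit(self, x: np.ndarray) -> None:
        # Fit on a sample of training data, e.g., the fit_scalers_samples fraction
        self.scaler.fit(x)
        self.fitted = True

    def __call__(self, x: np.ndarray) -> np.ndarray:
        assert self.fitted, "fit() must be called before the transform is used"
        return self.scaler.transform(x)
```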
```diff
@@ -48,25 +34,23 @@ class ValidationApproach(Enum):
     is used to create a random stratified validation set. The fraction of validation samples is defined in `train_val_split_fraction`."""
     VALIDATION_DATES = "validation-dates"
     """Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `val_period_name`."""
-    NO_VALIDATION = "no-validation"
-    """Do not use validation. The validation dataloader and dataframe will not be available."""
     def __str__(self): return self.value
 
 class AppSelection(Enum):
     """
     Applications can be divided into *known* and *unknown* classes. To use a dataset in the standard closed-world setting, use `ALL_KNOWN` to select all the applications as *known*.
-    Use `TOPX_KNOWN` or `EXPLICIT_UNKNOWN` for the open-world setting and evaluation of out-of-distribution or open-set recognition methods.
-    The `LONGTERM_FIXED` is for long-term measurements when it is desired to use the same applications for multiple subsequent train and test periods.
+    Use `TOPX_KNOWN` or `BACKGROUND_UNKNOWN` for the open-world setting and evaluation of out-of-distribution or open-set recognition methods.
+    The `FIXED` is for manual selection of *known* and *unknown* applications.
     """
     ALL_KNOWN = "all-known"
     """Use all applications as *known*."""
     TOPX_KNOWN = "topx-known"
     """Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*.
     Applications with the same provider are never separated, i.e., all applications of a given provider are either *known* or *unknown*."""
-    EXPLICIT_UNKNOWN = "explicit-unknown"
-    """Use the provided list of applications (`apps_selection_explicit_unknown`) as *unknown*, and the rest as *known*."""
-    LONGTERM_FIXED = "longterm-fixed"
-    """Use fixed application selection. Provide a tuple of `(known_apps_database_enum, unknown_apps_database_enum)` in `apps_selection_fixed_longterm`."""
+    BACKGROUND_UNKNOWN = "background-unknown"
+    """Use the list of background traffic classes (`apps_selection_background_unknown`) as *unknown*, and the rest as *known*."""
+    FIXED = "fixed"
+    """Manual application selection. Provide lists of *known* applications (`apps_selection_fixed_known`) and *unknown* applications (`apps_selection_fixed_unknown`)."""
     def __str__(self): return self.value
 
 class MinTrainSamplesCheck(Enum):
```
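A hedged sketch of the new `FIXED` selection mode wired into a `DatasetConfig` (the field names come from this diff; the dataset class, its constructor arguments, and the application names are assumptions for illustration):

```python
from cesnet_datazoo.config import AppSelection, DatasetConfig
from cesnet_datazoo.datasets import CESNET_QUIC22  # assumed dataset class

dataset = CESNET_QUIC22("/path/to/data", size="XS")  # illustrative arguments
config = DatasetConfig(
    dataset=dataset,
    apps_selection=AppSelection.FIXED,
    apps_selection_fixed_known=["app-a", "app-b"],  # placeholder class names
    apps_selection_fixed_unknown=["app-c"],         # placeholder class names
)
```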
```diff
@@ -103,8 +87,9 @@ class TrainDataParams():
     train_tables_paths: list[str]
     apps_selection: AppSelection
     apps_selection_topx: int
-    apps_selection_explicit_unknown: list[str]
-    apps_selection_fixed_longterm: Optional[tuple[dict[int, str], dict[int, str]]]
+    apps_selection_background_unknown: list[str]
+    apps_selection_fixed_known: list[str]
+    apps_selection_fixed_unknown: list[str]
     disabled_apps: list[str]
     min_train_samples_check: MinTrainSamplesCheck
     min_train_samples_per_app: int
@@ -114,8 +99,8 @@ class TestDataParams():
     database_filename: str
     test_period_name: str
     test_tables_paths: list[str]
-    known_apps_database_enum: dict[int, str]
-    unknown_apps_database_enum: dict[int, str]
+    known_apps: list[str]
+    unknown_apps: list[str]
 
     class C:
         arbitrary_types_allowed = True
@@ -128,38 +113,43 @@ class DatasetConfig():
 
     - Train, validation, test sets (dates, sizes, validation approach).
     - Application selection — either the standard closed-world setting (only *known* classes) or the open-world setting (*known* and *unknown* classes).
-    - Feature scaling. See the [data features][features] page for more information.
+    - Feature scaling. See the [data features][features] page for more information. DOCS_TODO
     - Dataloader options like batch sizes, order of loading, or number of workers.
 
     When initializing this class, pass a [`CesnetDataset`][datasets.cesnet_dataset.CesnetDataset] instance to be configured and the desired configuration. Available options are [here][config.DatasetConfig--configuration-options].
 
     Attributes:
-        dataset: The dataset instance to be configured
-        data_root: Taken from the dataset instance
-        database_filename: Taken from the dataset instance
-        database_path: Taken from the dataset instance
-        servicemap_path: Taken from the dataset instance
-        flowstats_features: Taken from `dataset.metadata.flowstats_features`
-        other_fields: Taken from `dataset.metadata.other_fields` if `return_other_fields` is true, otherwise an empty list
+        dataset: The dataset instance to be configured.
+        data_root: Taken from the dataset instance.
+        database_filename: Taken from the dataset instance.
+        database_path: Taken from the dataset instance.
+        servicemap_path: Taken from the dataset instance.
+        flowstats_features: Taken from `dataset.metadata.flowstats_features`.
+        flowstats_features_boolean: Taken from `dataset.metadata.flowstats_features_boolean`.
+        flowstats_features_phist: Taken from `dataset.metadata.packet_histograms` if `use_packet_histograms` is true, otherwise an empty list.
+        other_fields: Taken from `dataset.metadata.other_fields` if `return_other_fields` is true, otherwise an empty list.
 
     # Configuration options
 
     Attributes:
+        need_train_set: Use to disable the train set. `Default: True`
+        need_val_set: Use to disable the validation set. When `need_train_set` is false, the validation set will also be disabled. `Default: True`
+        need_test_set: Use to disable the test set. `Default: True`
         train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
         train_dates: Dates used for creating a train set.
         train_dates_weigths: To use a non-uniform distribution of samples across train dates.
-        val_approach: How a validation set should be created. Either split train data into train and validation, have a separate validation period, or no validation at all. `Default: SPLIT_FROM_TRAIN`
+        val_approach: How a validation set should be created. Either split train data into train and validation or have a separate validation period. `Default: SPLIT_FROM_TRAIN`
         train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
         val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
         val_dates: Dates used for creating a validation set.
-        no_test_set: Disable the test set. `Default: False`
         test_period_name: Name of the test period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
         test_dates: Dates used for creating a test set.
 
         apps_selection: How to select application classes. `Default: ALL_KNOWN`
         apps_selection_topx: Take top X as known.
-        apps_selection_explicit_unknown: Provide a list of unknown applications.
-        apps_selection_fixed_longterm: Provide enums of known and unknown applications. This is suitable for long-term measurements.
+        apps_selection_background_unknown: Provide a list of background traffic classes to be used as unknown.
+        apps_selection_fixed_known: Provide a list of manually selected known applications.
+        apps_selection_fixed_unknown: Provide a list of manually selected unknown applications.
         disabled_apps: List of applications to be disabled and not used at all.
         min_train_samples_check: How to handle applications with *not enough* training samples. `Default: DISABLE_APPS`
        min_train_samples_per_app: Defines the threshold for *not enough*. `Default: 100`
```
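The new `need_train_set` / `need_val_set` / `need_test_set` flags replace the old `no_test_set` switch and generalize it to all three sets. A sketch of a test-only configuration under the rules enforced later in this diff (disabling the train set requires `FIXED` application selection; `dataset` and the class names are placeholders as in the previous sketch):

```python
from cesnet_datazoo.config import AppSelection, DatasetConfig

# dataset is a CesnetDataset instance created as in the earlier sketch
config = DatasetConfig(
    dataset=dataset,
    need_train_set=False,               # also forces need_val_set to False
    apps_selection=AppSelection.FIXED,  # required when the train set is disabled
    apps_selection_fixed_known=["app-a", "app-b"],  # placeholder class names
)
```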
```diff
@@ -182,21 +172,14 @@ class DatasetConfig():
         train_dataloader_seed: Seed for loading train data in random order. `Default: None`
 
         return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
-        return_torch: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
-        raw_output: Return raw output without data scaling, clipping, and normalization. `Default: False`
+        return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
         use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
-        normalize_packet_histograms: Whether to normalize packet histograms. If true, bins contain fractions instead of absolute numbers. `Default: True`
         use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
         use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
-        zero_ppi_start: Zeroing out the first N packets of each packet sequence. `Default: 0`
-        fit_scalers_samples: Fraction of train samples used for fitting feature scalers, if float. The absolute number of samples otherwise. `Default: 0.25`
-        flowstats_scaler: Which scaler to use for flow statistics. Options are [`ROBUST`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) | [`STANDARD`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) | [`MINMAX`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) | `NO_SCALER`. `Default: ROBUST`
-        flowstats_clip: Quantile clip before the scaling of flow statistics. Should limit the influence of outliers. Set to `1` to disable. `Default: 0.99`
-        psizes_scaler: Which scaler to use for packet sizes. Options are [`ROBUST`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) | [`STANDARD`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) | [`MINMAX`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) | `NO_SCALER`. `Default: STANDARD`
-        psizes_max: Max clip packet sizes before scaling. `Default: 1500`
-        ipt_scaler: Which scaler to use for inter-packet times. Options are [`ROBUST`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) | [`STANDARD`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) | [`MINMAX`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) | `NO_SCALER`. `Default: STANDARD`
-        ipt_min: Min clip inter-packet times before scaling. `Default: 0`
-        ipt_max: Max clip inter-packet times before scaling. `Default: 65000`
+        fit_scalers_samples: Fraction of train samples used for fitting feature scalers, if float. The absolute number of samples otherwise. `Default: 0.25` DOCS_TODO
+        ppi_transform: Transform function for PPI sequences. `Default: None` DOCS_TODO
+        flowstats_transform: Transform function for flow statistics. `Default: None`
+        flowstats_phist_transform: Transform function for packet histograms. `Default: None`
 
     # How to configure train, validation, and test sets
     There are three options for how to define train/validation/test dates.
@@ -222,8 +205,13 @@ class DatasetConfig():
     database_path: str = field(init=False)
     servicemap_path: str = field(init=False)
     flowstats_features: list[str] = field(init=False)
+    flowstats_features_boolean: list[str] = field(init=False)
+    flowstats_features_phist: list[str] = field(init=False)
     other_fields: list[str] = field(init=False)
 
+    need_train_set: bool = True
+    need_val_set: bool = True
+    need_test_set: bool = True
     train_period_name: str = ""
     train_dates: list[str] = field(default_factory=list)
     train_dates_weigths: Optional[list[int]] = None
@@ -231,14 +219,14 @@ class DatasetConfig():
     train_val_split_fraction: float = 0.2
     val_period_name: str = ""
     val_dates: list[str] = field(default_factory=list)
-    no_test_set: bool = False
     test_period_name: str = ""
     test_dates: list[str] = field(default_factory=list)
 
     apps_selection: AppSelection = AppSelection.ALL_KNOWN
     apps_selection_topx: int = 0
-    apps_selection_explicit_unknown: list[str] = field(default_factory=list)
-    apps_selection_fixed_longterm: Optional[tuple[dict[int, str], dict[int, str]]] = None
+    apps_selection_background_unknown: list[str] = field(default_factory=list)
+    apps_selection_fixed_known: list[str] = field(default_factory=list)
+    apps_selection_fixed_unknown: list[str] = field(default_factory=list)
     disabled_apps: list[str] = field(default_factory=list)
     min_train_samples_check: MinTrainSamplesCheck = MinTrainSamplesCheck.DISABLE_APPS
     min_train_samples_per_app: int = 100
@@ -261,21 +249,14 @@ class DatasetConfig():
     train_dataloader_seed: Optional[int] = None
 
     return_other_fields: bool = False
-    return_torch: bool = False
-    raw_output: bool = False
-    use_packet_histograms: bool = True
-    normalize_packet_histograms: bool = True
-    use_tcp_features: bool = True
+    return_tensors: bool = False
+    use_packet_histograms: bool = False
+    use_tcp_features: bool = False
     use_push_flags: bool = False
-    zero_ppi_start: int = 0
     fit_scalers_samples: int | float = 0.25
-    flowstats_scaler: ScalerEnum = ScalerEnum.ROBUST
-    flowstats_clip: float = 0.99
-    psizes_scaler: ScalerEnum = ScalerEnum.STANDARD
-    psizes_max: int = 1500
-    ipt_scaler: ScalerEnum = ScalerEnum.STANDARD
-    ipt_min: int = 0
-    ipt_max: int = 65000
+    ppi_transform: Optional[Callable] = None
+    flowstats_transform: Optional[Callable] = None
+    flowstats_phist_transform: Optional[Callable] = None
 
     def __post_init__(self, dataset: CesnetDataset):
         """
```
```diff
@@ -285,23 +266,28 @@ class DatasetConfig():
         self.servicemap_path = dataset.servicemap_path
         self.database_filename = dataset.database_filename
         self.database_path = dataset.database_path
-        self.flowstats_features = dataset.metadata.flowstats_features
-        self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
 
-        # Configure train dates
-        if len(self.train_dates) > 0 and self.train_period_name == "":
-            raise ValueError("train_period_name has to be specified when train_dates are set")
-        if len(self.train_dates) == 0 and self.train_period_name != "":
-            if self.train_period_name not in dataset.time_periods:
-                raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
-            self.train_dates = dataset.time_periods[self.train_period_name]
-        if len(self.train_dates) == 0 and self.test_period_name == "":
-            self.train_period_name = dataset.default_train_period_name
-            self.train_dates = dataset.time_periods[dataset.default_train_period_name]
+        if not self.need_train_set:
+            self.need_val_set = False
+            if self.apps_selection != AppSelection.FIXED:
+                raise ValueError("Application selection has to be fixed when need_train_set is false")
+            if (len(self.train_dates) > 0 or self.train_period_name != ""):
+                raise ValueError("train_dates and train_period_name cannot be specified when need_train_set is false")
+        else:
+            # Configure train dates
+            if len(self.train_dates) > 0 and self.train_period_name == "":
+                raise ValueError("train_period_name has to be specified when train_dates are set")
+            if len(self.train_dates) == 0 and self.train_period_name != "":
+                if self.train_period_name not in dataset.time_periods:
+                    raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
+                self.train_dates = dataset.time_periods[self.train_period_name]
+            if len(self.train_dates) == 0 and self.test_period_name == "":
+                self.train_period_name = dataset.default_train_period_name
+                self.train_dates = dataset.time_periods[dataset.default_train_period_name]
         # Configure test dates
-        if self.no_test_set:
+        if not self.need_test_set:
             if (len(self.test_dates) > 0 or self.test_period_name != ""):
-                raise ValueError("test_dates and test_period_name cannot be specified when no_test_set is true")
+                raise ValueError("test_dates and test_period_name cannot be specified when need_test_set is false")
         else:
             if len(self.test_dates) > 0 and self.test_period_name == "":
                 raise ValueError("test_period_name has to be specified when test_dates are set")
@@ -313,8 +299,8 @@ class DatasetConfig():
             self.test_period_name = dataset.default_test_period_name
             self.test_dates = dataset.time_periods[dataset.default_test_period_name]
         # Configure val dates
-        if (self.val_approach == ValidationApproach.NO_VALIDATION or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
-            raise ValueError("val_dates and val_period_name cannot be specified when val_approach is no-validation or split-from-train")
+        if (not self.need_val_set or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
+            raise ValueError("val_dates and val_period_name cannot be specified when need_val_set is false or the validation approach is split-from-train")
         if self.val_approach == ValidationApproach.VALIDATION_DATES:
             if len(self.val_dates) > 0 and self.val_period_name == "":
                 raise ValueError("val_period_name has to be specified when val_dates are set")
@@ -323,57 +309,58 @@ class DatasetConfig():
                 raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
             self.val_dates = dataset.time_periods[self.val_period_name]
         if len(self.val_dates) == 0 and self.val_period_name == "":
-            raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when val_approach is validation-dates")
+            raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when the validation approach is validation-dates")
         # Check if train, val, and test dates are available in the dataset
-        if dataset.available_dates:
-            unknown_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
-            unknown_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
-            unknown_test_dates = [t for t in self.test_dates if t not in dataset.available_dates]
-            if len(unknown_train_dates) > 0:
-                raise ValueError(f"Unknown train dates {unknown_train_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
-                    + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
-            if len(unknown_val_dates) > 0:
-                raise ValueError(f"Unknown validation dates {unknown_val_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
-                    + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
-            if len(unknown_test_dates) > 0:
-                raise ValueError(f"Unknown test dates {unknown_test_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
-                    + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
+        bad_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
+        bad_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
+        bad_test_dates = [t for t in self.test_dates if t not in dataset.available_dates]
+        if len(bad_train_dates) > 0:
+            raise ValueError(f"Bad train dates {bad_train_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
+                + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
+        if len(bad_val_dates) > 0:
+            raise ValueError(f"Bad validation dates {bad_val_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
+                + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
+        if len(bad_test_dates) > 0:
+            raise ValueError(f"Bad test dates {bad_test_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
+                + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
         # Check time order of train, val, and test periods
         train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
         test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
-        if not self.no_test_set and min(test_dates) <= max(train_dates):
+        if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
             warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
         if self.val_approach == ValidationApproach.VALIDATION_DATES:
+            # Train dates are guaranteed to be set
             val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
             if min(val_dates) <= max(train_dates):
                 warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
-            if not self.no_test_set and min(test_dates) <= max(val_dates):
+            if len(test_dates) > 0 and min(test_dates) <= max(val_dates):
                 warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
         # Configure features
-        if self.raw_output:
-            self.normalize_packet_histograms = False
-            self.flowstats_scaler = ScalerEnum.NO_SCALER
-            self.flowstats_clip = 1.0
-            self.psizes_scaler = ScalerEnum.NO_SCALER
-            self.psizes_max = 1500
-            self.ipt_scaler = ScalerEnum.NO_SCALER
-            self.ipt_min = 0
-            self.ipt_max = 65000
-        if dataset.metadata.protocol == Protocol.TLS and self.use_tcp_features:
-            self.flowstats_features = self.flowstats_features + SELECTED_TCP_FLAGS
-            if self.use_push_flags and "PUSH_FLAG" not in dataset.metadata.features_in_packet_sequences:
-                raise ValueError("This TLS dataset does not support use_push_flags")
+        self.flowstats_features = dataset.metadata.flowstats_features
+        self.flowstats_features_boolean = dataset.metadata.flowstats_features_boolean
+        self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
         if self.use_packet_histograms:
-            if len(dataset.metadata.packet_histogram_features) > 0:
-                self.flowstats_features = self.flowstats_features + dataset.metadata.packet_histogram_features
-            else:
-                self.use_packet_histograms = False
+            if len(dataset.metadata.packet_histograms) == 0:
+                raise ValueError("This dataset does not support use_packet_histograms")
+            self.flowstats_features_phist = dataset.metadata.packet_histograms
+        else:
+            self.flowstats_features_phist = []
+            if self.flowstats_phist_transform is not None:
+                raise ValueError("flowstats_phist_transform cannot be specified when use_packet_histograms is false")
+        if dataset.metadata.protocol == Protocol.TLS:
+            if self.use_tcp_features:
+                self.flowstats_features_boolean = self.flowstats_features_boolean + SELECTED_TCP_FLAGS
+            if self.use_push_flags and "PUSH_FLAG" not in dataset.metadata.ppi_features:
+                raise ValueError("This TLS dataset does not support use_push_flags")
         if dataset.metadata.protocol == Protocol.QUIC:
-            self.use_tcp_features = False
+            if self.use_tcp_features:
+                raise ValueError("QUIC datasets do not support use_tcp_features")
             if self.use_push_flags:
                 raise ValueError("QUIC datasets do not support use_push_flags")
         # When train_dates_weigths are used, train_size and val_known_size have to be specified
         if self.train_dates_weigths is not None:
+            if not self.need_train_set:
+                raise ValueError("train_dates_weigths cannot be specified when need_train_set is false")
             if len(self.train_dates_weigths) != len(self.train_dates):
                 raise ValueError("train_dates_weigths has to have the same length as train_dates")
             if self.train_size == "all":
@@ -384,59 +371,75 @@ class DatasetConfig():
         if self.apps_selection == AppSelection.ALL_KNOWN:
             self.val_unknown_size = 0
             self.test_unknown_size = 0
-            if self.apps_selection_topx != 0 or len(self.apps_selection_explicit_unknown) > 0 or self.apps_selection_fixed_longterm is not None:
-                raise ValueError("apps_selection_topx, apps_selection_explicit_unknown, and apps_selection_fixed_longterm cannot be specified when apps_selection is all-known")
-        if self.apps_selection == AppSelection.TOPX_KNOWN and self.apps_selection_topx == 0:
-            raise ValueError("apps_selection_topx has to be greater than 0 when apps_selection is top-x-known")
-        if self.apps_selection == AppSelection.EXPLICIT_UNKNOWN and len(self.apps_selection_explicit_unknown) == 0:
-            raise ValueError("apps_selection_explicit_unknown has to be specified when apps_selection is explicit-unknown")
-        if self.apps_selection == AppSelection.LONGTERM_FIXED:
-            if self.apps_selection_fixed_longterm is None:
-                raise ValueError("apps_selection_fixed_longterm, a tuple of (known_apps_database_enum, unknown_apps_database_enum), has to be specified when apps_selection is longterm-fixed")
+            if self.apps_selection_topx != 0 or len(self.apps_selection_background_unknown) > 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
+                raise ValueError("apps_selection_topx, apps_selection_background_unknown, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is all-known")
+        if self.apps_selection == AppSelection.TOPX_KNOWN:
+            if self.apps_selection_topx == 0:
+                raise ValueError("apps_selection_topx has to be greater than 0 when application selection is top-x-known")
+            if len(self.apps_selection_background_unknown) > 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
+                raise ValueError("apps_selection_background_unknown, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is top-x-known")
+        if self.apps_selection == AppSelection.BACKGROUND_UNKNOWN:
+            if len(self.apps_selection_background_unknown) == 0:
+                raise ValueError("apps_selection_background_unknown has to be specified when application selection is background-unknown")
+            bad_apps = [a for a in self.apps_selection_background_unknown if a not in dataset.available_classes]
+            if len(bad_apps) > 0:
+                raise ValueError(f"Bad applications in apps_selection_background_unknown {bad_apps}. Use applications available in dataset.available_classes")
+            if self.apps_selection_topx != 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
+                raise ValueError("apps_selection_topx, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is background-unknown")
+        if self.apps_selection == AppSelection.FIXED:
+            if len(self.apps_selection_fixed_known) == 0:
+                raise ValueError("apps_selection_fixed_known has to be specified when application selection is fixed")
+            bad_apps = [a for a in self.apps_selection_fixed_known + self.apps_selection_fixed_unknown if a not in dataset.available_classes]
+            if len(bad_apps) > 0:
+                raise ValueError(f"Bad applications in apps_selection_fixed_known or apps_selection_fixed_unknown {bad_apps}. Use applications available in dataset.available_classes")
             if len(self.disabled_apps) > 0:
-                raise ValueError("disabled_apps cannot be specified when apps_selection is longterm-fixed")
-            if self.min_train_samples_per_app != 0:
-                raise ValueError("min_train_samples_per_app has to be 0 when apps_selection is longterm-fixed")
-        if sum((self.apps_selection_topx != 0, len(self.apps_selection_explicit_unknown) > 0, self.apps_selection_fixed_longterm is not None)) > 1:
-            raise ValueError("apps_selection_topx, apps_selection_explicit_unknown, and apps_selection_fixed_longterm should not be specified at the same time")
+                raise ValueError("disabled_apps cannot be specified when application selection is fixed")
+            if self.min_train_samples_per_app != 0 and self.min_train_samples_per_app != 100:
+                warnings.warn("min_train_samples_per_app is not used when application selection is fixed")
+            if self.apps_selection_topx != 0 or len(self.apps_selection_background_unknown) > 0:
+                raise ValueError("apps_selection_topx and apps_selection_background_unknown cannot be specified when application selection is fixed")
         # More asserts
-        if self.zero_ppi_start > PPI_MAX_LEN:
-            raise ValueError(f"zero_ppi_start has to be <= {PPI_MAX_LEN}")
+        bad_disabled_apps = [a for a in self.disabled_apps if a not in dataset.available_classes]
+        if len(bad_disabled_apps) > 0:
+            raise ValueError(f"Bad applications in disabled_apps {bad_disabled_apps}. Use applications available in dataset.available_classes")
         if isinstance(self.fit_scalers_samples, float) and (self.fit_scalers_samples <= 0 or self.fit_scalers_samples > 1):
             raise ValueError("fit_scalers_samples has to be either float between 0 and 1 (giving the fraction of training samples used for fitting scalers) or an integer")
 
     def get_flowstats_features_len(self) -> int:
         """Gets the number of flow statistics features."""
-        n = 0
-        for f in self.flowstats_features:
-            if f.startswith("PHIST_"):
-                n += PHIST_BIN_COUNT
-            else:
-                n += 1
-        return n
+        return len(self.flowstats_features) + len(self.flowstats_features_boolean) + PHIST_BIN_COUNT * len(self.flowstats_features_phist)
 
     def get_flowstats_feature_names_expanded(self, shorter_names: bool = False) -> list[str]:
         """Gets names of flow statistics features. Packet histograms are expanded into bin features."""
-        name_mapping = {
+        phist_mapping = {
             "PHIST_SRC_SIZES": [f"PSIZE_BIN{i}" for i in range(1, PHIST_BIN_COUNT + 1)],
             "PHIST_DST_SIZES": [f"PSIZE_BIN{i}_REV" for i in range(1, PHIST_BIN_COUNT + 1)],
             "PHIST_SRC_IPT": [f"IPT_BIN{i}" for i in range(1, PHIST_BIN_COUNT + 1)],
             "PHIST_DST_IPT": [f"IPT_BIN{i}_REV" for i in range(1, PHIST_BIN_COUNT + 1)],
-            "FLOW_ENDREASON_IDLE": "FEND_IDLE" if shorter_names else "FLOW_ENDREASON_IDLE",
-            "FLOW_ENDREASON_ACTIVE": "FEND_ACTIVE" if shorter_names else "FLOW_ENDREASON_ACTIVE",
-            "FLOW_ENDREASON_END": "FEND_END" if shorter_names else "FLOW_ENDREASON_END",
-            "FLOW_ENDREASON_OTHER": "FEND_OTHER" if shorter_names else "FLOW_ENDREASON_OTHER",
         }
-        feature_names = []
-        for f in self.flowstats_features:
-            if f not in name_mapping:
-                if shorter_names and f.startswith("FLAG"):
-                    f = "F" + f.lstrip("FLAG")
-                feature_names.append(f)
-            elif isinstance(name_mapping[f], list):
-                feature_names.extend(name_mapping[f])
+        short_names_mapping = {
+            "FLOW_ENDREASON_IDLE": "FEND_IDLE",
+            "FLOW_ENDREASON_ACTIVE": "FEND_ACTIVE",
+            "FLOW_ENDREASON_END": "FEND_END",
+            "FLOW_ENDREASON_OTHER": "FEND_OTHER",
+            "FLAG_CWR": "F_CWR",
+            "FLAG_CWR_REV": "F_CWR_REV",
+            "FLAG_ECE": "F_ECE",
+            "FLAG_ECE_REV": "F_ECE_REV",
+            "FLAG_PSH_REV": "F_PSH_REV",
+            "FLAG_RST": "F_RST",
+            "FLAG_RST_REV": "F_RST_REV",
+            "FLAG_FIN": "F_FIN",
+            "FLAG_FIN_REV": "F_FIN_REV",
+        }
+        feature_names = self.flowstats_features[:]
+        for f in self.flowstats_features_boolean:
+            if shorter_names and f in short_names_mapping:
+                feature_names.append(short_names_mapping[f])
             else:
-                feature_names.append(name_mapping[f])
+                feature_names.append(f)
+        for f in self.flowstats_features_phist:
+            feature_names.extend(phist_mapping[f])
         assert len(feature_names) == self.get_flowstats_features_len()
         return feature_names
 
```
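With the features now split into three config lists, the rewritten `get_flowstats_features_len` is simple arithmetic: plain flow statistics count once, boolean flags count once, and each packet histogram expands into `PHIST_BIN_COUNT` bins. An illustrative check (the counts are made up, not taken from a real dataset):

```python
# e.g., 10 flow statistics, 9 TCP flags, 4 packet histograms with 8 bins each
PHIST_BIN_COUNT = 8
n = 10 + 9 + PHIST_BIN_COUNT * 4
assert n == 51
```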
```diff
@@ -449,8 +452,8 @@ class DatasetConfig():
             ppi_feature_names += [f"PUSH_{i}" for i in range(1, PPI_MAX_LEN + 1)]
         return ppi_feature_names
 
-    def get_ppi_channels(self) -> int:
-        """Gets the number of features (channels) in PPI."""
+    def get_ppi_channels(self) -> list[int]:
+        """Gets the available features (channels) in PPI sequences."""
         if self.use_push_flags:
             return TCP_PPI_CHANNELS
         else:
@@ -485,8 +488,11 @@ class DatasetConfig():
         return params_hash
 
     def _get_train_data_path(self) -> str:
-        params_hash = self._get_train_data_hash()
-        return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
+        if self.need_train_set:
+            params_hash = self._get_train_data_hash()
+            return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
+        else:
+            return os.path.join(self.data_root, "train-data", "default")
 
     def _get_train_data_params(self) -> TrainDataParams:
         return TrainDataParams(
@@ -495,38 +501,39 @@ class DatasetConfig():
             train_tables_paths=self._get_train_tables_paths(),
             apps_selection=self.apps_selection,
             apps_selection_topx=self.apps_selection_topx,
-            apps_selection_explicit_unknown=self.apps_selection_explicit_unknown,
-            apps_selection_fixed_longterm=self.apps_selection_fixed_longterm,
+            apps_selection_background_unknown=self.apps_selection_background_unknown,
+            apps_selection_fixed_known=self.apps_selection_fixed_known,
+            apps_selection_fixed_unknown=self.apps_selection_fixed_unknown,
             disabled_apps=self.disabled_apps,
             min_train_samples_per_app=self.min_train_samples_per_app,
             min_train_samples_check=self.min_train_samples_check,)
 
-    def _get_val_data_params_and_path(self, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> tuple[TestDataParams, str]:
+    def _get_val_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
         assert self.val_approach == ValidationApproach.VALIDATION_DATES
         val_data_params = TestDataParams(
             database_filename=self.database_filename,
             test_period_name=self.val_period_name,
             test_tables_paths=self._get_val_tables_paths(),
-            known_apps_database_enum=known_apps_database_enum,
-            unknown_apps_database_enum=unknown_apps_database_enum,)
+            known_apps=known_apps,
+            unknown_apps=unknown_apps,)
         params_hash = hashlib.sha256(json.dumps(dataclasses.asdict(val_data_params), sort_keys=True).encode()).hexdigest()
         params_hash = params_hash[:10]
         val_data_path = os.path.join(self.data_root, "val-data", f"{params_hash}_{self.random_state}")
         return val_data_params, val_data_path
 
-    def _get_test_data_params_and_path(self, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> tuple[TestDataParams, str]:
+    def _get_test_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
         test_data_params = TestDataParams(
             database_filename=self.database_filename,
             test_period_name=self.test_period_name,
             test_tables_paths=self._get_test_tables_paths(),
-            known_apps_database_enum=known_apps_database_enum,
-            unknown_apps_database_enum=unknown_apps_database_enum,)
+            known_apps=known_apps,
+            unknown_apps=unknown_apps,)
         params_hash = hashlib.sha256(json.dumps(dataclasses.asdict(test_data_params), sort_keys=True).encode()).hexdigest()
         params_hash = params_hash[:10]
         test_data_path = os.path.join(self.data_root, "test-data", f"{params_hash}_{self.random_state}")
         return test_data_params, test_data_path
 
-    @model_validator(mode="before")
+    @model_validator(mode="before") # type: ignore
     @classmethod
     def check_deprecated_args(cls, values):
         kwargs = values.kwargs
```
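The val/test data paths illustrate the caching scheme these helpers implement: the parameters that define a split are serialized to sorted JSON, hashed, and the first 10 hex characters of the SHA-256 digest key the on-disk directory. A standalone sketch of the same technique (all values here are illustrative, not real dataset parameters):

```python
import hashlib
import json
import os

params = {"test_period_name": "test-period-1",  # placeholder values
          "known_apps": ["app-a", "app-b"],
          "unknown_apps": ["app-c"]}
params_hash = hashlib.sha256(json.dumps(params, sort_keys=True).encode()).hexdigest()[:10]
# random_state is appended so different seeds never share cached splits
test_data_path = os.path.join("data_root", "test-data", f"{params_hash}_42")
```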
{cesnet-datazoo-0.0.16 → cesnet-datazoo-0.1.0}/cesnet_datazoo/constants.py

```diff
@@ -6,20 +6,18 @@ DATASET_SIZES = {
 }
 
 # Per-packet information (PPI) constants
+PPI_MAX_LEN = 30
 IPT_POS = 0
 DIR_POS = 1
 SIZE_POS = 2
 PUSH_FLAGS_POS = 3
-PPI_MAX_LEN = 30
-TCP_PPI_CHANNELS = 4
-UDP_PPI_CHANNELS = 3
+TCP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS, PUSH_FLAGS_POS]
+UDP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS]
 
 # Features
 FLOWSTATS_TO_SCALE = ["BYTES", "BYTES_REV", "PACKETS", "PACKETS_REV", "PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION", "DURATION"]
-FLOWSTATS_NO_CLIP = ["PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION", "DURATION"]
+FLOWSTATS_NO_CLIP = ["DURATION", "PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION"]
 SELECTED_TCP_FLAGS = ["FLAG_CWR", "FLAG_CWR_REV", "FLAG_ECE", "FLAG_ECE_REV", "FLAG_PSH_REV", "FLAG_RST", "FLAG_RST_REV", "FLAG_FIN", "FLAG_FIN_REV"]
-FLOWEND_REASON_FEATURES = ["FLOW_ENDREASON_IDLE", "FLOW_ENDREASON_ACTIVE", "FLOW_ENDREASON_END", "FLOW_ENDREASON_OTHER"]
-PHISTS_FEATURES = ["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"]
 PHIST_BIN_COUNT = 8
 
 # Column names
```
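Changing `TCP_PPI_CHANNELS` / `UDP_PPI_CHANNELS` from plain counts to lists of channel indices (which also explains `get_ppi_channels` now returning `list[int]`) makes it possible to slice protocol-appropriate channels out of a PPI array directly. A hedged sketch, assuming PPI sequences are arrays of shape `(channels, PPI_MAX_LEN)`:

```python
import numpy as np

# Values as defined in cesnet_datazoo/constants.py after this change
PPI_MAX_LEN = 30
IPT_POS, DIR_POS, SIZE_POS, PUSH_FLAGS_POS = 0, 1, 2, 3
TCP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS, PUSH_FLAGS_POS]
UDP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS]

ppi = np.zeros((4, PPI_MAX_LEN))  # IPT, direction, size, push flags
udp_view = ppi[UDP_PPI_CHANNELS]  # drops the TCP-only push-flag channel
assert udp_view.shape == (3, PPI_MAX_LEN)
assert len(TCP_PPI_CHANNELS) == 4  # the old scalar value is still recoverable
```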