cesnet-datazoo 0.0.17__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/PKG-INFO +2 -1
  2. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/config.py +173 -168
  3. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/constants.py +4 -6
  4. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/cesnet_dataset.py +200 -177
  5. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/datasets.py +22 -2
  6. cesnet-datazoo-0.1.0/cesnet_datazoo/datasets/datasets_constants.py +670 -0
  7. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/loaders.py +3 -0
  8. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +6 -5
  9. cesnet-datazoo-0.1.0/cesnet_datazoo/datasets/metadata/metadata.csv +4 -0
  10. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/statistics.py +36 -16
  11. cesnet-datazoo-0.1.0/cesnet_datazoo/pytables_data/data_scalers.py +110 -0
  12. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/indices_setup.py +29 -33
  13. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/pytables_dataset.py +99 -122
  14. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/class_info.py +7 -5
  15. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/PKG-INFO +2 -1
  16. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/SOURCES.txt +1 -0
  17. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/requires.txt +1 -0
  18. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/pyproject.toml +2 -1
  19. cesnet-datazoo-0.0.17/cesnet_datazoo/datasets/metadata/metadata.csv +0 -4
  20. cesnet-datazoo-0.0.17/cesnet_datazoo/pytables_data/data_scalers.py +0 -196
  21. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/LICENCE +0 -0
  22. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/README.md +0 -0
  23. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/__init__.py +0 -0
  24. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/__init__.py +0 -0
  25. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
  26. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/metrics/__init__.py +0 -0
  27. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/metrics/classification_report.py +0 -0
  28. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/metrics/provider_metrics.py +0 -0
  29. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/__init__.py +0 -0
  30. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
  31. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/__init__.py +0 -0
  32. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/download.py +0 -0
  33. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/fileutils.py +0 -0
  34. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo/utils/random.py +0 -0
  35. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
  36. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/cesnet_datazoo.egg-info/top_level.txt +0 -0
  37. {cesnet-datazoo-0.0.17 → cesnet-datazoo-0.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cesnet-datazoo
- Version: 0.0.17
+ Version: 0.1.0
  Summary: A toolkit for large network traffic datasets
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -16,6 +16,7 @@ Classifier: Operating System :: OS Independent
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENCE
+ Requires-Dist: cesnet_models
  Requires-Dist: matplotlib
  Requires-Dist: numpy
  Requires-Dist: pandas
@@ -8,12 +8,11 @@ import warnings
  from dataclasses import InitVar, field
  from datetime import datetime
  from enum import Enum
- from typing import TYPE_CHECKING, Literal, Optional
+ from typing import TYPE_CHECKING, Callable, Literal, Optional

  import yaml
  from pydantic import model_validator
  from pydantic.dataclasses import dataclass
- from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

  from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, SELECTED_TCP_FLAGS,
                                        TCP_PPI_CHANNELS, UDP_PPI_CHANNELS)
@@ -21,19 +20,6 @@ from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, SELECTED_TCP
  if TYPE_CHECKING:
      from cesnet_datazoo.datasets.cesnet_dataset import CesnetDataset

- Scaler = RobustScaler | StandardScaler | MinMaxScaler | None
-
- class ScalerEnum(Enum):
-     """Available scalers for flow statistics, packet sizes, and inter-packet times."""
-     STANDARD = "standard"
-     """Standardize features by removing the mean and scaling to unit variance - [`sklearn.preprocessing.StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)."""
-     ROBUST = "robust"
-     """Robust scaling with the median and the interquartile range - [`sklearn.preprocessing.RobustScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)."""
-     MINMAX = "minmax"
-     """Scaling to a (0, 1) range - [`sklearn.preprocessing.MinMaxScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)."""
-     NO_SCALER = "no-scaler"
-     """No scaling."""
-     def __str__(self): return self.value

  class Protocol(Enum):
      TLS = "TLS"
@@ -48,25 +34,23 @@ class ValidationApproach(Enum):
      is used to create a random stratified validation set. The fraction of validation samples is defined in `train_val_split_fraction`."""
      VALIDATION_DATES = "validation-dates"
      """Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `val_period_name`."""
-     NO_VALIDATION = "no-validation"
-     """Do not use validation. The validation dataloader and dataframe will not be available."""
      def __str__(self): return self.value

  class AppSelection(Enum):
      """
      Applications can be divided into *known* and *unknown* classes. To use a dataset in the standard closed-world setting, use `ALL_KNOWN` to select all the applications as *known*.
-     Use `TOPX_KNOWN` or `EXPLICIT_UNKNOWN` for the open-world setting and evaluation of out-of-distribution or open-set recognition methods.
-     The `LONGTERM_FIXED` is for long-term measurements when it is desired to use the same applications for multiple subsequent train and test periods.
+     Use `TOPX_KNOWN` or `BACKGROUND_UNKNOWN` for the open-world setting and evaluation of out-of-distribution or open-set recognition methods.
+     The `FIXED` is for manual selection of *known* and *unknown* applications.
      """
      ALL_KNOWN = "all-known"
      """Use all applications as *known*."""
      TOPX_KNOWN = "topx-known"
      """Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*.
      Applications with the same provider are never separated, i.e., all applications of a given provider are either *known* or *unknown*."""
-     EXPLICIT_UNKNOWN = "explicit-unknown"
-     """Use the provided list of applications (`apps_selection_explicit_unknown`) as *unknown*, and the rest as *known*."""
-     LONGTERM_FIXED = "longterm-fixed"
-     """Use fixed application selection. Provide a tuple of `(known_apps_database_enum, unknown_apps_database_enum)` in `apps_selection_fixed_longterm`."""
+     BACKGROUND_UNKNOWN = "background-unknown"
+     """Use the list of background traffic classes (`apps_selection_background_unknown`) as *unknown*, and the rest as *known*."""
+     FIXED = "fixed"
+     """Manual application selection. Provide lists of *known* applications (`apps_selection_fixed_known`) and *unknown* applications (`apps_selection_fixed_unknown`)."""
      def __str__(self): return self.value

  class MinTrainSamplesCheck(Enum):
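As an illustration of the renamed selection modes, the two open-world configurations would be set up roughly as follows. This is an editor's sketch, not taken from the package docs: the constructor arguments and all application/class names are placeholders.

    from cesnet_datazoo.config import AppSelection, DatasetConfig
    from cesnet_datazoo.datasets import CESNET_TLS22

    dataset = CESNET_TLS22(data_root="~/datasets/CESNET-TLS22/", size="XS")  # placeholder arguments

    # Open-world setting: listed background traffic classes become *unknown*
    config = DatasetConfig(
        dataset=dataset,
        apps_selection=AppSelection.BACKGROUND_UNKNOWN,
        apps_selection_background_unknown=["background-a", "background-b"],  # placeholder class names
    )

    # Manual selection, replacing the removed EXPLICIT_UNKNOWN and LONGTERM_FIXED modes
    config = DatasetConfig(
        dataset=dataset,
        apps_selection=AppSelection.FIXED,
        apps_selection_fixed_known=["app-a", "app-b"],  # placeholder class names
        apps_selection_fixed_unknown=["app-c"],
    )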
@@ -103,8 +87,9 @@ class TrainDataParams():
      train_tables_paths: list[str]
      apps_selection: AppSelection
      apps_selection_topx: int
-     apps_selection_explicit_unknown: list[str]
-     apps_selection_fixed_longterm: Optional[tuple[dict[int, str], dict[int, str]]]
+     apps_selection_background_unknown: list[str]
+     apps_selection_fixed_known: list[str]
+     apps_selection_fixed_unknown: list[str]
      disabled_apps: list[str]
      min_train_samples_check: MinTrainSamplesCheck
      min_train_samples_per_app: int
@@ -114,8 +99,8 @@ class TestDataParams():
      database_filename: str
      test_period_name: str
      test_tables_paths: list[str]
-     known_apps_database_enum: dict[int, str]
-     unknown_apps_database_enum: dict[int, str]
+     known_apps: list[str]
+     unknown_apps: list[str]

  class C:
      arbitrary_types_allowed = True
@@ -128,38 +113,43 @@ class DatasetConfig():

      - Train, validation, test sets (dates, sizes, validation approach).
      - Application selection — either the standard closed-world setting (only *known* classes) or the open-world setting (*known* and *unknown* classes).
-     - Feature scaling. See the [data features][features] page for more information.
+     - Feature scaling. See the [data features][features] page for more information. DOCS_TODO
      - Dataloader options like batch sizes, order of loading, or number of workers.

      When initializing this class, pass a [`CesnetDataset`][datasets.cesnet_dataset.CesnetDataset] instance to be configured and the desired configuration. Available options are [here][config.DatasetConfig--configuration-options].

      Attributes:
-         dataset: The dataset instance to be configured
-         data_root: Taken from the dataset instance
-         database_filename: Taken from the dataset instance
-         database_path: Taken from the dataset instance
-         servicemap_path: Taken from the dataset instance
-         flowstats_features: Taken from `dataset.metadata.flowstats_features`
-         other_fields: Taken from `dataset.metadata.other_fields` if `return_other_fields` is true, otherwise an empty list
+         dataset: The dataset instance to be configured.
+         data_root: Taken from the dataset instance.
+         database_filename: Taken from the dataset instance.
+         database_path: Taken from the dataset instance.
+         servicemap_path: Taken from the dataset instance.
+         flowstats_features: Taken from `dataset.metadata.flowstats_features`.
+         flowstats_features_boolean: Taken from `dataset.metadata.flowstats_features_boolean`.
+         flowstats_features_phist: Taken from `dataset.metadata.packet_histograms` if `use_packet_histograms` is true, otherwise an empty list.
+         other_fields: Taken from `dataset.metadata.other_fields` if `return_other_fields` is true, otherwise an empty list.

      # Configuration options

      Attributes:
+         need_train_set: Use to disable the train set. `Default: True`
+         need_val_set: Use to disable the validation set. When `need_train_set` is false, the validation set will also be disabled. `Default: True`
+         need_test_set: Use to disable the test set. `Default: True`
          train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
          train_dates: Dates used for creating a train set.
          train_dates_weigths: To use a non-uniform distribution of samples across train dates.
-         val_approach: How a validation set should be created. Either split train data into train and validation, have a separate validation period, or no validation at all. `Default: SPLIT_FROM_TRAIN`
+         val_approach: How a validation set should be created. Either split train data into train and validation or have a separate validation period. `Default: SPLIT_FROM_TRAIN`
          train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
          val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
          val_dates: Dates used for creating a validation set.
-         no_test_set: Disable the test set. `Default: False`
          test_period_name: Name of the test period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
          test_dates: Dates used for creating a test set.

          apps_selection: How to select application classes. `Default: ALL_KNOWN`
          apps_selection_topx: Take top X as known.
-         apps_selection_explicit_unknown: Provide a list of unknown applications.
-         apps_selection_fixed_longterm: Provide enums of known and unknown applications. This is suitable for long-term measurements.
+         apps_selection_background_unknown: Provide a list of background traffic classes to be used as unknown.
+         apps_selection_fixed_known: Provide a list of manually selected known applications.
+         apps_selection_fixed_unknown: Provide a list of manually selected unknown applications.
          disabled_apps: List of applications to be disabled and not used at all.
          min_train_samples_check: How to handle applications with *not enough* training samples. `Default: DISABLE_APPS`
          min_train_samples_per_app: Defines the threshold for *not enough*. `Default: 100`
@@ -182,22 +172,14 @@ class DatasetConfig():
          train_dataloader_seed: Seed for loading train data in random order. `Default: None`

          return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
-         return_torch: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
-         raw_output: Return raw output without data scaling, clipping, and normalization. `Default: False`
+         return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
          use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
-         normalize_packet_histograms: Whether to normalize packet histograms. If true, bins contain fractions instead of absolute numbers. `Default: True`
          use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
          use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
-         zero_ppi_start: Zeroing out the first N packets of each packet sequence. `Default: 0`
-         fit_scalers_samples: Fraction of train samples used for fitting feature scalers, if float. The absolute number of samples otherwise. `Default: 0.25`
-         flowstats_scaler: Which scaler to use for flow statistics. Options are [`ROBUST`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) | [`STANDARD`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) | [`MINMAX`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) | `NO_SCALER`. `Default: ROBUST`
-         psizes_scaler: Which scaler to use for packet sizes. Options are [`ROBUST`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) | [`STANDARD`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) | [`MINMAX`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) | `NO_SCALER`. `Default: STANDARD`
-         ipt_scaler: Which scaler to use for inter-packet times. Options are [`ROBUST`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) | [`STANDARD`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) | [`MINMAX`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) | `NO_SCALER`. `Default: STANDARD`
-         scalers_attrs: Load data scalers from numeric values in this dict rather than from pickled files. `Default: None`
-         flowstats_clip: Quantile clip before the scaling of flow statistics. Should limit the influence of outliers. Set to `1` to disable. `Default: 0.99`
-         psizes_max: Max clip packet sizes before scaling. `Default: 1500`
-         ipt_min: Min clip inter-packet times before scaling. `Default: 0`
-         ipt_max: Max clip inter-packet times before scaling. `Default: 65000`
+         fit_scalers_samples: Fraction of train samples used for fitting feature scalers, if float. The absolute number of samples otherwise. `Default: 0.25` DOCS_TODO
+         ppi_transform: Transform function for PPI sequences. `Default: None` DOCS_TODO
+         flowstats_transform: Transform function for flow statistics. `Default: None`
+         flowstats_phist_transform: Transform function for packet histograms. `Default: None`

      # How to configure train, validation, and test sets
      There are three options for how to define train/validation/test dates.
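The scaler, clipping, and normalization options removed above are consolidated into the three `*_transform` callables. Below is a minimal sketch of a custom transform; the exact input the dataloader hands to the callable is an assumption here (a numpy array holding one PPI sequence of shape (channels, PPI_MAX_LEN)), and `scale_ppi` is a hypothetical user-defined function, not part of the package. The new `Requires-Dist: cesnet_models` in PKG-INFO suggests ready-made transforms are expected to come from that sibling package, but its API is not shown in this diff.

    import numpy as np

    def scale_ppi(ppi: np.ndarray) -> np.ndarray:
        # Clip packet sizes (the SIZE channel, index 2) to 1500 bytes and rescale
        # to (0, 1), mimicking the removed psizes_max + MINMAX scaler combination.
        ppi = ppi.astype(np.float32).copy()
        ppi[2] = np.clip(ppi[2], 0, 1500) / 1500
        return ppi

    config = DatasetConfig(
        dataset=dataset,         # an initialized CesnetDataset instance
        ppi_transform=scale_ppi,
    )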
@@ -223,8 +205,13 @@ class DatasetConfig():
      database_path: str = field(init=False)
      servicemap_path: str = field(init=False)
      flowstats_features: list[str] = field(init=False)
+     flowstats_features_boolean: list[str] = field(init=False)
+     flowstats_features_phist: list[str] = field(init=False)
      other_fields: list[str] = field(init=False)

+     need_train_set: bool = True
+     need_val_set: bool = True
+     need_test_set: bool = True
      train_period_name: str = ""
      train_dates: list[str] = field(default_factory=list)
      train_dates_weigths: Optional[list[int]] = None
@@ -232,14 +219,14 @@ class DatasetConfig():
      train_val_split_fraction: float = 0.2
      val_period_name: str = ""
      val_dates: list[str] = field(default_factory=list)
-     no_test_set: bool = False
      test_period_name: str = ""
      test_dates: list[str] = field(default_factory=list)

      apps_selection: AppSelection = AppSelection.ALL_KNOWN
      apps_selection_topx: int = 0
-     apps_selection_explicit_unknown: list[str] = field(default_factory=list)
-     apps_selection_fixed_longterm: Optional[tuple[dict[int, str], dict[int, str]]] = None
+     apps_selection_background_unknown: list[str] = field(default_factory=list)
+     apps_selection_fixed_known: list[str] = field(default_factory=list)
+     apps_selection_fixed_unknown: list[str] = field(default_factory=list)
      disabled_apps: list[str] = field(default_factory=list)
      min_train_samples_check: MinTrainSamplesCheck = MinTrainSamplesCheck.DISABLE_APPS
      min_train_samples_per_app: int = 100
@@ -262,22 +249,14 @@ class DatasetConfig():
      train_dataloader_seed: Optional[int] = None

      return_other_fields: bool = False
-     return_torch: bool = False
-     raw_output: bool = False
-     use_packet_histograms: bool = True
-     normalize_packet_histograms: bool = True
-     use_tcp_features: bool = True
+     return_tensors: bool = False
+     use_packet_histograms: bool = False
+     use_tcp_features: bool = False
      use_push_flags: bool = False
-     zero_ppi_start: int = 0
      fit_scalers_samples: int | float = 0.25
-     flowstats_scaler: ScalerEnum = ScalerEnum.ROBUST
-     psizes_scaler: ScalerEnum = ScalerEnum.STANDARD
-     ipt_scaler: ScalerEnum = ScalerEnum.STANDARD
-     scalers_attrs: Optional[dict] = None
-     flowstats_clip: float = 0.99
-     psizes_max: int = 1500
-     ipt_min: int = 0
-     ipt_max: int = 65000
+     ppi_transform: Optional[Callable] = None
+     flowstats_transform: Optional[Callable] = None
+     flowstats_phist_transform: Optional[Callable] = None

      def __post_init__(self, dataset: CesnetDataset):
          """
@@ -287,23 +266,28 @@ class DatasetConfig():
          self.servicemap_path = dataset.servicemap_path
          self.database_filename = dataset.database_filename
          self.database_path = dataset.database_path
-         self.flowstats_features = dataset.metadata.flowstats_features
-         self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []

-         # Configure train dates
-         if len(self.train_dates) > 0 and self.train_period_name == "":
-             raise ValueError("train_period_name has to be specified when train_dates are set")
-         if len(self.train_dates) == 0 and self.train_period_name != "":
-             if self.train_period_name not in dataset.time_periods:
-                 raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
-             self.train_dates = dataset.time_periods[self.train_period_name]
-         if len(self.train_dates) == 0 and self.test_period_name == "":
-             self.train_period_name = dataset.default_train_period_name
-             self.train_dates = dataset.time_periods[dataset.default_train_period_name]
+         if not self.need_train_set:
+             self.need_val_set = False
+             if self.apps_selection != AppSelection.FIXED:
+                 raise ValueError("Application selection has to be fixed when need_train_set is false")
+             if (len(self.train_dates) > 0 or self.train_period_name != ""):
+                 raise ValueError("train_dates and train_period_name cannot be specified when need_train_set is false")
+         else:
+             # Configure train dates
+             if len(self.train_dates) > 0 and self.train_period_name == "":
+                 raise ValueError("train_period_name has to be specified when train_dates are set")
+             if len(self.train_dates) == 0 and self.train_period_name != "":
+                 if self.train_period_name not in dataset.time_periods:
+                     raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
+                 self.train_dates = dataset.time_periods[self.train_period_name]
+             if len(self.train_dates) == 0 and self.test_period_name == "":
+                 self.train_period_name = dataset.default_train_period_name
+                 self.train_dates = dataset.time_periods[dataset.default_train_period_name]
          # Configure test dates
-         if self.no_test_set:
+         if not self.need_test_set:
              if (len(self.test_dates) > 0 or self.test_period_name != ""):
-                 raise ValueError("test_dates and test_period_name cannot be specified when no_test_set is true")
+                 raise ValueError("test_dates and test_period_name cannot be specified when need_test_set is false")
          else:
              if len(self.test_dates) > 0 and self.test_period_name == "":
                  raise ValueError("test_period_name has to be specified when test_dates are set")
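As the branch above enforces, disabling the train set requires fixed application selection and no train dates. A test-set-only configuration would therefore look roughly like this sketch (all names are placeholders):

    config = DatasetConfig(
        dataset=dataset,                    # an initialized CesnetDataset instance
        need_train_set=False,               # also disables the validation set
        apps_selection=AppSelection.FIXED,  # mandatory when need_train_set is false
        apps_selection_fixed_known=["app-a", "app-b"],  # placeholder class names
        test_period_name="W-2022-40",       # placeholder name from dataset.time_periods
    )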
@@ -315,8 +299,8 @@ class DatasetConfig():
              self.test_period_name = dataset.default_test_period_name
              self.test_dates = dataset.time_periods[dataset.default_test_period_name]
          # Configure val dates
-         if (self.val_approach == ValidationApproach.NO_VALIDATION or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
-             raise ValueError("val_dates and val_period_name cannot be specified when val_approach is no-validation or split-from-train")
+         if (not self.need_val_set or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
+             raise ValueError("val_dates and val_period_name cannot be specified when need_val_set is false or the validation approach is split-from-train")
          if self.val_approach == ValidationApproach.VALIDATION_DATES:
              if len(self.val_dates) > 0 and self.val_period_name == "":
                  raise ValueError("val_period_name has to be specified when val_dates are set")
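With NO_VALIDATION removed, turning validation off is now a matter of `need_val_set=False`, while a separate validation period is still configured via the validation-dates approach, sketched here with placeholder period names:

    from cesnet_datazoo.config import ValidationApproach

    config = DatasetConfig(
        dataset=dataset,
        train_period_name="W-2022-38",  # placeholder; explicit date lists can be given instead
        val_approach=ValidationApproach.VALIDATION_DATES,
        val_period_name="W-2022-39",    # placeholder name from dataset.time_periods
        test_period_name="W-2022-40",   # placeholder
    )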
@@ -325,57 +309,58 @@ class DatasetConfig():
                      raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
                  self.val_dates = dataset.time_periods[self.val_period_name]
              if len(self.val_dates) == 0 and self.val_period_name == "":
-                 raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when val_approach is validation-dates")
+                 raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when the validation approach is validation-dates")
          # Check if train, val, and test dates are available in the dataset
-         if dataset.available_dates:
-             unknown_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
-             unknown_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
-             unknown_test_dates = [t for t in self.test_dates if t not in dataset.available_dates]
-             if len(unknown_train_dates) > 0:
-                 raise ValueError(f"Unknown train dates {unknown_train_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
-                     + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
-             if len(unknown_val_dates) > 0:
-                 raise ValueError(f"Unknown validation dates {unknown_val_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
-                     + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
-             if len(unknown_test_dates) > 0:
-                 raise ValueError(f"Unknown test dates {unknown_test_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
-                     + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
+         bad_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
+         bad_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
+         bad_test_dates = [t for t in self.test_dates if t not in dataset.available_dates]
+         if len(bad_train_dates) > 0:
+             raise ValueError(f"Bad train dates {bad_train_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
+                 + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
+         if len(bad_val_dates) > 0:
+             raise ValueError(f"Bad validation dates {bad_val_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
+                 + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
+         if len(bad_test_dates) > 0:
+             raise ValueError(f"Bad test dates {bad_test_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
+                 + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
          # Check time order of train, val, and test periods
          train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
          test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
-         if not self.no_test_set and min(test_dates) <= max(train_dates):
+         if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
              warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
          if self.val_approach == ValidationApproach.VALIDATION_DATES:
+             # Train dates are guaranteed to be set
              val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
              if min(val_dates) <= max(train_dates):
                  warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
-             if not self.no_test_set and min(test_dates) <= max(val_dates):
+             if len(test_dates) > 0 and min(test_dates) <= max(val_dates):
                  warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
          # Configure features
-         if self.raw_output:
-             self.normalize_packet_histograms = False
-             self.flowstats_scaler = ScalerEnum.NO_SCALER
-             self.flowstats_clip = 1.0
-             self.psizes_scaler = ScalerEnum.NO_SCALER
-             self.psizes_max = 1500
-             self.ipt_scaler = ScalerEnum.NO_SCALER
-             self.ipt_min = 0
-             self.ipt_max = 65000
-         if dataset.metadata.protocol == Protocol.TLS and self.use_tcp_features:
-             self.flowstats_features = self.flowstats_features + SELECTED_TCP_FLAGS
-             if self.use_push_flags and "PUSH_FLAG" not in dataset.metadata.features_in_packet_sequences:
-                 raise ValueError("This TLS dataset does not support use_push_flags")
+         self.flowstats_features = dataset.metadata.flowstats_features
+         self.flowstats_features_boolean = dataset.metadata.flowstats_features_boolean
+         self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
          if self.use_packet_histograms:
-             if len(dataset.metadata.packet_histogram_features) > 0:
-                 self.flowstats_features = self.flowstats_features + dataset.metadata.packet_histogram_features
-             else:
-                 self.use_packet_histograms = False
+             if len(dataset.metadata.packet_histograms) == 0:
+                 raise ValueError("This dataset does not support use_packet_histograms")
+             self.flowstats_features_phist = dataset.metadata.packet_histograms
+         else:
+             self.flowstats_features_phist = []
+             if self.flowstats_phist_transform is not None:
+                 raise ValueError("flowstats_phist_transform cannot be specified when use_packet_histograms is false")
+         if dataset.metadata.protocol == Protocol.TLS:
+             if self.use_tcp_features:
+                 self.flowstats_features_boolean = self.flowstats_features_boolean + SELECTED_TCP_FLAGS
+             if self.use_push_flags and "PUSH_FLAG" not in dataset.metadata.ppi_features:
+                 raise ValueError("This TLS dataset does not support use_push_flags")
          if dataset.metadata.protocol == Protocol.QUIC:
-             self.use_tcp_features = False
+             if self.use_tcp_features:
+                 raise ValueError("QUIC datasets do not support use_tcp_features")
              if self.use_push_flags:
                  raise ValueError("QUIC datasets do not support use_push_flags")
          # When train_dates_weigths are used, train_size and val_known_size have to be specified
          if self.train_dates_weigths is not None:
+             if not self.need_train_set:
+                 raise ValueError("train_dates_weigths cannot be specified when need_train_set is false")
              if len(self.train_dates_weigths) != len(self.train_dates):
                  raise ValueError("train_dates_weigths has to have the same length as train_dates")
              if self.train_size == "all":
@@ -386,59 +371,75 @@ class DatasetConfig():
          if self.apps_selection == AppSelection.ALL_KNOWN:
              self.val_unknown_size = 0
              self.test_unknown_size = 0
-             if self.apps_selection_topx != 0 or len(self.apps_selection_explicit_unknown) > 0 or self.apps_selection_fixed_longterm is not None:
-                 raise ValueError("apps_selection_topx, apps_selection_explicit_unknown, and apps_selection_fixed_longterm cannot be specified when apps_selection is all-known")
-         if self.apps_selection == AppSelection.TOPX_KNOWN and self.apps_selection_topx == 0:
-             raise ValueError("apps_selection_topx has to be greater than 0 when apps_selection is top-x-known")
-         if self.apps_selection == AppSelection.EXPLICIT_UNKNOWN and len(self.apps_selection_explicit_unknown) == 0:
-             raise ValueError("apps_selection_explicit_unknown has to be specified when apps_selection is explicit-unknown")
-         if self.apps_selection == AppSelection.LONGTERM_FIXED:
-             if self.apps_selection_fixed_longterm is None:
-                 raise ValueError("apps_selection_fixed_longterm, a tuple of (known_apps_database_enum, unknown_apps_database_enum), has to be specified when apps_selection is longterm-fixed")
+             if self.apps_selection_topx != 0 or len(self.apps_selection_background_unknown) > 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
+                 raise ValueError("apps_selection_topx, apps_selection_background_unknown, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is all-known")
+         if self.apps_selection == AppSelection.TOPX_KNOWN:
+             if self.apps_selection_topx == 0:
+                 raise ValueError("apps_selection_topx has to be greater than 0 when application selection is top-x-known")
+             if len(self.apps_selection_background_unknown) > 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
+                 raise ValueError("apps_selection_background_unknown, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is top-x-known")
+         if self.apps_selection == AppSelection.BACKGROUND_UNKNOWN:
+             if len(self.apps_selection_background_unknown) == 0:
+                 raise ValueError("apps_selection_background_unknown has to be specified when application selection is background-unknown")
+             bad_apps = [a for a in self.apps_selection_background_unknown if a not in dataset.available_classes]
+             if len(bad_apps) > 0:
+                 raise ValueError(f"Bad applications in apps_selection_background_unknown {bad_apps}. Use applications available in dataset.available_classes")
+             if self.apps_selection_topx != 0 or len(self.apps_selection_fixed_known) > 0 or len(self.apps_selection_fixed_unknown) > 0:
+                 raise ValueError("apps_selection_topx, apps_selection_fixed_known, and apps_selection_fixed_unknown cannot be specified when application selection is background-unknown")
+         if self.apps_selection == AppSelection.FIXED:
+             if len(self.apps_selection_fixed_known) == 0:
+                 raise ValueError("apps_selection_fixed_known has to be specified when application selection is fixed")
+             bad_apps = [a for a in self.apps_selection_fixed_known + self.apps_selection_fixed_unknown if a not in dataset.available_classes]
+             if len(bad_apps) > 0:
+                 raise ValueError(f"Bad applications in apps_selection_fixed_known or apps_selection_fixed_unknown {bad_apps}. Use applications available in dataset.available_classes")
              if len(self.disabled_apps) > 0:
-                 raise ValueError("disabled_apps cannot be specified when apps_selection is longterm-fixed")
-             if self.min_train_samples_per_app != 0:
-                 raise ValueError("min_train_samples_per_app has to be 0 when apps_selection is longterm-fixed")
-         if sum((self.apps_selection_topx != 0, len(self.apps_selection_explicit_unknown) > 0, self.apps_selection_fixed_longterm is not None)) > 1:
-             raise ValueError("apps_selection_topx, apps_selection_explicit_unknown, and apps_selection_fixed_longterm should not be specified at the same time")
+                 raise ValueError("disabled_apps cannot be specified when application selection is fixed")
+             if self.min_train_samples_per_app != 0 and self.min_train_samples_per_app != 100:
+                 warnings.warn("min_train_samples_per_app is not used when application selection is fixed")
+             if self.apps_selection_topx != 0 or len(self.apps_selection_background_unknown) > 0:
+                 raise ValueError("apps_selection_topx and apps_selection_background_unknown cannot be specified when application selection is fixed")
          # More asserts
-         if self.zero_ppi_start > PPI_MAX_LEN:
-             raise ValueError(f"zero_ppi_start has to be <= {PPI_MAX_LEN}")
+         bad_disabled_apps = [a for a in self.disabled_apps if a not in dataset.available_classes]
+         if len(bad_disabled_apps) > 0:
+             raise ValueError(f"Bad applications in disabled_apps {bad_disabled_apps}. Use applications available in dataset.available_classes")
          if isinstance(self.fit_scalers_samples, float) and (self.fit_scalers_samples <= 0 or self.fit_scalers_samples > 1):
              raise ValueError("fit_scalers_samples has to be either float between 0 and 1 (giving the fraction of training samples used for fitting scalers) or an integer")

      def get_flowstats_features_len(self) -> int:
          """Gets the number of flow statistics features."""
-         n = 0
-         for f in self.flowstats_features:
-             if f.startswith("PHIST_"):
-                 n += PHIST_BIN_COUNT
-             else:
-                 n += 1
-         return n
+         return len(self.flowstats_features) + len(self.flowstats_features_boolean) + PHIST_BIN_COUNT * len(self.flowstats_features_phist)

      def get_flowstats_feature_names_expanded(self, shorter_names: bool = False) -> list[str]:
          """Gets names of flow statistics features. Packet histograms are expanded into bin features."""
-         name_mapping = {
+         phist_mapping = {
              "PHIST_SRC_SIZES": [f"PSIZE_BIN{i}" for i in range(1, PHIST_BIN_COUNT + 1)],
              "PHIST_DST_SIZES": [f"PSIZE_BIN{i}_REV" for i in range(1, PHIST_BIN_COUNT + 1)],
              "PHIST_SRC_IPT": [f"IPT_BIN{i}" for i in range(1, PHIST_BIN_COUNT + 1)],
              "PHIST_DST_IPT": [f"IPT_BIN{i}_REV" for i in range(1, PHIST_BIN_COUNT + 1)],
-             "FLOW_ENDREASON_IDLE": "FEND_IDLE" if shorter_names else "FLOW_ENDREASON_IDLE",
-             "FLOW_ENDREASON_ACTIVE": "FEND_ACTIVE" if shorter_names else "FLOW_ENDREASON_ACTIVE",
-             "FLOW_ENDREASON_END": "FEND_END" if shorter_names else "FLOW_ENDREASON_END",
-             "FLOW_ENDREASON_OTHER": "FEND_OTHER" if shorter_names else "FLOW_ENDREASON_OTHER",
          }
-         feature_names = []
-         for f in self.flowstats_features:
-             if f not in name_mapping:
-                 if shorter_names and f.startswith("FLAG"):
-                     f = "F" + f.lstrip("FLAG")
-                 feature_names.append(f)
-             elif isinstance(name_mapping[f], list):
-                 feature_names.extend(name_mapping[f])
+         short_names_mapping = {
+             "FLOW_ENDREASON_IDLE": "FEND_IDLE",
+             "FLOW_ENDREASON_ACTIVE": "FEND_ACTIVE",
+             "FLOW_ENDREASON_END": "FEND_END",
+             "FLOW_ENDREASON_OTHER": "FEND_OTHER",
+             "FLAG_CWR": "F_CWR",
+             "FLAG_CWR_REV": "F_CWR_REV",
+             "FLAG_ECE": "F_ECE",
+             "FLAG_ECE_REV": "F_ECE_REV",
+             "FLAG_PSH_REV": "F_PSH_REV",
+             "FLAG_RST": "F_RST",
+             "FLAG_RST_REV": "F_RST_REV",
+             "FLAG_FIN": "F_FIN",
+             "FLAG_FIN_REV": "F_FIN_REV",
+         }
+         feature_names = self.flowstats_features[:]
+         for f in self.flowstats_features_boolean:
+             if shorter_names and f in short_names_mapping:
+                 feature_names.append(short_names_mapping[f])
              else:
-                 feature_names.append(name_mapping[f])
+                 feature_names.append(f)
+         for f in self.flowstats_features_phist:
+             feature_names.extend(phist_mapping[f])
          assert len(feature_names) == self.get_flowstats_features_len()
          return feature_names

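The rewritten `get_flowstats_features_len` is now a closed-form count instead of a loop. A worked example with made-up feature counts: 10 numeric flow statistics, 13 boolean features (4 flow-end reasons plus the 9 SELECTED_TCP_FLAGS), and all 4 packet histograms expand to 10 + 13 + 8 * 4 = 55 features.

    # Illustrative counts only; the real values come from the dataset metadata.
    n_flowstats, n_boolean, n_phist = 10, 13, 4
    PHIST_BIN_COUNT = 8
    assert n_flowstats + n_boolean + PHIST_BIN_COUNT * n_phist == 55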
@@ -451,8 +452,8 @@ class DatasetConfig():
              ppi_feature_names += [f"PUSH_{i}" for i in range(1, PPI_MAX_LEN + 1)]
          return ppi_feature_names

-     def get_ppi_channels(self) -> int:
-         """Gets the number of features (channels) in PPI."""
+     def get_ppi_channels(self) -> list[int]:
+         """Gets the available features (channels) in PPI sequences."""
          if self.use_push_flags:
              return TCP_PPI_CHANNELS
          else:
@@ -487,8 +488,11 @@ class DatasetConfig():
          return params_hash

      def _get_train_data_path(self) -> str:
-         params_hash = self._get_train_data_hash()
-         return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
+         if self.need_train_set:
+             params_hash = self._get_train_data_hash()
+             return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
+         else:
+             return os.path.join(self.data_root, "train-data", "default")

      def _get_train_data_params(self) -> TrainDataParams:
          return TrainDataParams(
@@ -497,32 +501,33 @@ class DatasetConfig():
              train_tables_paths=self._get_train_tables_paths(),
              apps_selection=self.apps_selection,
              apps_selection_topx=self.apps_selection_topx,
-             apps_selection_explicit_unknown=self.apps_selection_explicit_unknown,
-             apps_selection_fixed_longterm=self.apps_selection_fixed_longterm,
+             apps_selection_background_unknown=self.apps_selection_background_unknown,
+             apps_selection_fixed_known=self.apps_selection_fixed_known,
+             apps_selection_fixed_unknown=self.apps_selection_fixed_unknown,
              disabled_apps=self.disabled_apps,
              min_train_samples_per_app=self.min_train_samples_per_app,
              min_train_samples_check=self.min_train_samples_check,)

-     def _get_val_data_params_and_path(self, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> tuple[TestDataParams, str]:
+     def _get_val_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
          assert self.val_approach == ValidationApproach.VALIDATION_DATES
          val_data_params = TestDataParams(
              database_filename=self.database_filename,
              test_period_name=self.val_period_name,
              test_tables_paths=self._get_val_tables_paths(),
-             known_apps_database_enum=known_apps_database_enum,
-             unknown_apps_database_enum=unknown_apps_database_enum,)
+             known_apps=known_apps,
+             unknown_apps=unknown_apps,)
          params_hash = hashlib.sha256(json.dumps(dataclasses.asdict(val_data_params), sort_keys=True).encode()).hexdigest()
          params_hash = params_hash[:10]
          val_data_path = os.path.join(self.data_root, "val-data", f"{params_hash}_{self.random_state}")
          return val_data_params, val_data_path

-     def _get_test_data_params_and_path(self, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> tuple[TestDataParams, str]:
+     def _get_test_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
          test_data_params = TestDataParams(
              database_filename=self.database_filename,
              test_period_name=self.test_period_name,
              test_tables_paths=self._get_test_tables_paths(),
-             known_apps_database_enum=known_apps_database_enum,
-             unknown_apps_database_enum=unknown_apps_database_enum,)
+             known_apps=known_apps,
+             unknown_apps=unknown_apps,)
          params_hash = hashlib.sha256(json.dumps(dataclasses.asdict(test_data_params), sort_keys=True).encode()).hexdigest()
          params_hash = params_hash[:10]
          test_data_path = os.path.join(self.data_root, "test-data", f"{params_hash}_{self.random_state}")
@@ -6,20 +6,18 @@ DATASET_SIZES = {
  }

  # Per-packet information (PPI) constants
+ PPI_MAX_LEN = 30
  IPT_POS = 0
  DIR_POS = 1
  SIZE_POS = 2
  PUSH_FLAGS_POS = 3
- PPI_MAX_LEN = 30
- TCP_PPI_CHANNELS = 4
- UDP_PPI_CHANNELS = 3
+ TCP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS, PUSH_FLAGS_POS]
+ UDP_PPI_CHANNELS = [IPT_POS, DIR_POS, SIZE_POS]

  # Features
  FLOWSTATS_TO_SCALE = ["BYTES", "BYTES_REV", "PACKETS", "PACKETS_REV", "PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION", "DURATION"]
- FLOWSTATS_NO_CLIP = ["PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION", "DURATION"]
+ FLOWSTATS_NO_CLIP = ["DURATION", "PPI_LEN", "PPI_ROUNDTRIPS", "PPI_DURATION"]
  SELECTED_TCP_FLAGS = ["FLAG_CWR", "FLAG_CWR_REV", "FLAG_ECE", "FLAG_ECE_REV", "FLAG_PSH_REV", "FLAG_RST", "FLAG_RST_REV", "FLAG_FIN", "FLAG_FIN_REV"]
- FLOWEND_REASON_FEATURES = ["FLOW_ENDREASON_IDLE", "FLOW_ENDREASON_ACTIVE", "FLOW_ENDREASON_END", "FLOW_ENDREASON_OTHER"]
- PHISTS_FEATURES = ["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"]
  PHIST_BIN_COUNT = 8

  # Column names
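Since `TCP_PPI_CHANNELS` and `UDP_PPI_CHANNELS` changed from integer counts to lists of channel indices, they can index PPI arrays directly, and the old counts remain available as `len(...)`. A small sketch; the (channels, PPI_MAX_LEN) array layout is an assumption made for illustration:

    import numpy as np
    from cesnet_datazoo.constants import PPI_MAX_LEN, UDP_PPI_CHANNELS

    ppi = np.zeros((4, PPI_MAX_LEN))  # IPT, direction, size, and push-flag channels
    udp_view = ppi[UDP_PPI_CHANNELS]  # select the IPT, DIR, and SIZE channels
    assert udp_view.shape == (len(UDP_PPI_CHANNELS), PPI_MAX_LEN)  # (3, 30)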