cesnet-datazoo 0.0.10__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/PKG-INFO +3 -3
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/README.md +2 -2
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/config.py +105 -62
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/cesnet_dataset.py +63 -26
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/datasets.py +8 -8
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/statistics.py +44 -36
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/metrics/classification_report.py +16 -16
- cesnet-datazoo-0.0.10/cesnet_datazoo/metrics/superclass_metrics.py → cesnet-datazoo-0.0.12/cesnet_datazoo/metrics/provider_metrics.py +9 -8
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/pytables_dataset.py +8 -7
- cesnet-datazoo-0.0.12/cesnet_datazoo/utils/class_info.py +50 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/PKG-INFO +3 -3
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/SOURCES.txt +1 -1
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/pyproject.toml +1 -1
- cesnet-datazoo-0.0.10/cesnet_datazoo/utils/class_info.py +0 -46
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/LICENCE +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/constants.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/loaders.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/metadata.csv +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/metrics/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/indices_setup.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/download.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/fileutils.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/random.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/requires.txt +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/top_level.txt +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cesnet-datazoo
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.12
|
4
4
|
Summary: A toolkit for large network traffic datasets
|
5
5
|
Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
6
6
|
Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
@@ -102,8 +102,8 @@ dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
|
|
102
102
|
dataset_config = DatasetConfig(
|
103
103
|
dataset=dataset,
|
104
104
|
apps_selection=AppSelection.ALL_KNOWN,
|
105
|
-
|
106
|
-
|
105
|
+
train_period_name="W-2022-44",
|
106
|
+
test_period_name="W-2022-45",
|
107
107
|
)
|
108
108
|
dataset.set_dataset_config_and_initialize(dataset_config)
|
109
109
|
train_dataframe = dataset.get_train_df()
|
@@ -60,8 +60,8 @@ dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
|
|
60
60
|
dataset_config = DatasetConfig(
|
61
61
|
dataset=dataset,
|
62
62
|
apps_selection=AppSelection.ALL_KNOWN,
|
63
|
-
|
64
|
-
|
63
|
+
train_period_name="W-2022-44",
|
64
|
+
test_period_name="W-2022-45",
|
65
65
|
)
|
66
66
|
dataset.set_dataset_config_and_initialize(dataset_config)
|
67
67
|
train_dataframe = dataset.get_train_df()
|
@@ -4,11 +4,14 @@ import dataclasses
|
|
4
4
|
import hashlib
|
5
5
|
import json
|
6
6
|
import os
|
7
|
+
import warnings
|
7
8
|
from dataclasses import InitVar, field
|
9
|
+
from datetime import datetime
|
8
10
|
from enum import Enum
|
9
11
|
from typing import TYPE_CHECKING, Literal, Optional
|
10
12
|
|
11
13
|
import yaml
|
14
|
+
from pydantic import model_validator
|
12
15
|
from pydantic.dataclasses import dataclass
|
13
16
|
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
|
14
17
|
|
@@ -21,10 +24,15 @@ if TYPE_CHECKING:
|
|
21
24
|
Scaler = RobustScaler | StandardScaler | MinMaxScaler | None
|
22
25
|
|
23
26
|
class ScalerEnum(Enum):
|
24
|
-
|
27
|
+
"""Available scalers for flow statistics, packet sizes, and inter-packet times."""
|
25
28
|
STANDARD = "standard"
|
29
|
+
"""Standardize features by removing the mean and scaling to unit variance - [`sklearn.preprocessing.StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)."""
|
30
|
+
ROBUST = "robust"
|
31
|
+
"""Robust scaling with the median and the interquartile range - [`sklearn.preprocessing.RobustScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)."""
|
26
32
|
MINMAX = "minmax"
|
33
|
+
"""Scaling to a (0, 1) range - [`sklearn.preprocessing.MinMaxScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)."""
|
27
34
|
NO_SCALER = "no-scaler"
|
35
|
+
"""No scaling."""
|
28
36
|
def __str__(self): return self.value
|
29
37
|
|
30
38
|
class Protocol(Enum):
|
@@ -39,7 +47,7 @@ class ValidationApproach(Enum):
|
|
39
47
|
Scikit-learn [`train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
|
40
48
|
is used to create a random stratified validation set. The fraction of validation samples is defined in `train_val_split_fraction`."""
|
41
49
|
VALIDATION_DATES = "validation-dates"
|
42
|
-
"""Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `
|
50
|
+
"""Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `val_period_name`."""
|
43
51
|
NO_VALIDATION = "no-validation"
|
44
52
|
"""Do not use validation. The validation dataloader and dataframe will not be available."""
|
45
53
|
def __str__(self): return self.value
|
@@ -53,7 +61,8 @@ class AppSelection(Enum):
|
|
53
61
|
ALL_KNOWN = "all-known"
|
54
62
|
"""Use all applications as *known*."""
|
55
63
|
TOPX_KNOWN = "topx-known"
|
56
|
-
"""Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*.
|
64
|
+
"""Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*.
|
65
|
+
Applications with the same provider are never separated, i.e., all applications of a given provider are either *known* or *unknown*."""
|
57
66
|
EXPLICIT_UNKNOWN = "explicit-unknown"
|
58
67
|
"""Use the provided list of applications (`apps_selection_explicit_unknown`) as *unknown*, and the rest as *known*."""
|
59
68
|
LONGTERM_FIXED = "longterm-fixed"
|
@@ -90,7 +99,7 @@ class DataLoaderOrder(Enum):
|
|
90
99
|
@dataclass(frozen=True)
|
91
100
|
class TrainDataParams():
|
92
101
|
database_filename: str
|
93
|
-
|
102
|
+
train_period_name: str
|
94
103
|
train_tables_paths: list[str]
|
95
104
|
apps_selection: AppSelection
|
96
105
|
apps_selection_topx: int
|
@@ -103,7 +112,7 @@ class TrainDataParams():
|
|
103
112
|
@dataclass(frozen=True)
|
104
113
|
class TestDataParams():
|
105
114
|
database_filename: str
|
106
|
-
|
115
|
+
test_period_name: str
|
107
116
|
test_tables_paths: list[str]
|
108
117
|
known_apps_database_enum: dict[int, str]
|
109
118
|
unknown_apps_database_enum: dict[int, str]
|
@@ -125,23 +134,24 @@ class DatasetConfig():
|
|
125
134
|
|
126
135
|
Attributes:
|
127
136
|
dataset: The dataset instance to be configured
|
128
|
-
data_root:
|
129
|
-
database_filename:
|
130
|
-
database_path:
|
131
|
-
servicemap_path:
|
132
|
-
flowstats_features:
|
137
|
+
data_root: Taken from the dataset instance
|
138
|
+
database_filename: Taken from the dataset instance
|
139
|
+
database_path: Taken from the dataset instance
|
140
|
+
servicemap_path: Taken from the dataset instance
|
141
|
+
flowstats_features: Taken from `dataset.metadata.flowstats_features`
|
133
142
|
|
134
143
|
# Configuration options
|
135
144
|
|
136
145
|
Attributes:
|
137
|
-
|
146
|
+
train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
138
147
|
train_dates: Dates used for creating a train set.
|
139
148
|
train_dates_weigths: To use a non-uniform distribution of samples across train dates.
|
140
149
|
val_approach: How a validation set should be created. Either split train data into train and validation, have a separate validation period, or no validation at all. `Default: SPLIT_FROM_TRAIN`
|
141
150
|
train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
|
142
|
-
|
151
|
+
val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
143
152
|
val_dates: Dates used for creating a validation set.
|
144
|
-
|
153
|
+
no_test_set: Disable the test set. `Default: False`
|
154
|
+
test_period_name: Name of the test period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
145
155
|
test_dates: Dates used for creating a test set.
|
146
156
|
|
147
157
|
apps_selection: How to select application classes. `Default: ALL_KNOWN`
|
@@ -171,6 +181,7 @@ class DatasetConfig():
|
|
171
181
|
return_ips: Use for IP-based classification. Dataloaders will return data in this tuple format `((SRC_IP, DST_IP, SRC_PORT, DST_PORT), LABELS)`. Dataframes are not available when this option is used. `Default: False`
|
172
182
|
return_torch: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
|
173
183
|
use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
|
184
|
+
normalize_packet_histograms: Whether to normalize packet histograms. If true, bins contain fractions instead of absolute numbers. `Default: True`
|
174
185
|
use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
|
175
186
|
use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
|
176
187
|
zero_ppi_start: Zeroing out the first N packets of each packet sequence. `Default: 0`
|
@@ -184,18 +195,18 @@ class DatasetConfig():
|
|
184
195
|
ipt_max: Max clip inter-packet times before scaling. `Default: 15000`
|
185
196
|
|
186
197
|
# How to configure train, validation, and test sets
|
187
|
-
There are three options for how to define train/validation/test
|
198
|
+
There are three options for how to define train/validation/test dates.
|
188
199
|
|
189
|
-
1.
|
190
|
-
2.
|
191
|
-
3.
|
200
|
+
1. Choose a predefined time period (`train_period_name`, `val_period_name`, or `test_period_name`) available in `dataset.time_periods` and leave the list of dates (`train_dates`, `val_dates`, or `test_dates`) empty.
|
201
|
+
2. Provide a list of dates and a name for the time period. The dates are checked against `dataset.available_dates`.
|
202
|
+
3. Do not specify anything and use the dataset's defaults `dataset.default_train_period_name` and `dataset.default_test_period_name`.
|
192
203
|
|
193
|
-
There are two options for configuring
|
204
|
+
There are two options for configuring sizes of train/validation/test sets.
|
194
205
|
|
195
206
|
1. Select an appropriate dataset size (default is `S`) when creating the [`CesnetDataset`][datasets.cesnet_dataset.CesnetDataset] instance and leave `train_size`, `val_known_size`, and `test_known_size` with their default `all` value.
|
196
207
|
This will create train/validation/test sets with all samples available in the selected dataset size (of course, depending on the selected dates and validation approach).
|
197
208
|
2. Provide exact sizes in `train_size`, `val_known_size`, and `test_known_size`. This will create train/validation/test sets of the given sizes by doing a random subset.
|
198
|
-
This is especially useful when using the `ORIG` dataset size and want to
|
209
|
+
This is especially useful when using the `ORIG` dataset size and want to control the size of experiments.
|
199
210
|
|
200
211
|
!!! tip Validation set
|
201
212
|
The default approach for creating a validation set is to randomly split the train data into train and validation. The second approach is to define separate validation dates. See [ValidationApproach][config.ValidationApproach].
|
@@ -208,14 +219,15 @@ class DatasetConfig():
|
|
208
219
|
servicemap_path: str = field(init=False)
|
209
220
|
flowstats_features: list[str] = field(init=False)
|
210
221
|
|
211
|
-
|
222
|
+
train_period_name: str = ""
|
212
223
|
train_dates: list[str] = field(default_factory=list)
|
213
224
|
train_dates_weigths: Optional[list[int]] = None
|
214
225
|
val_approach: ValidationApproach = ValidationApproach.SPLIT_FROM_TRAIN
|
215
226
|
train_val_split_fraction: float = 0.2
|
216
|
-
|
227
|
+
val_period_name: str = ""
|
217
228
|
val_dates: list[str] = field(default_factory=list)
|
218
|
-
|
229
|
+
no_test_set: bool = False
|
230
|
+
test_period_name: str = ""
|
219
231
|
test_dates: list[str] = field(default_factory=list)
|
220
232
|
|
221
233
|
apps_selection: AppSelection = AppSelection.ALL_KNOWN
|
@@ -245,6 +257,7 @@ class DatasetConfig():
|
|
245
257
|
return_ips: bool = False
|
246
258
|
return_torch: bool = False
|
247
259
|
use_packet_histograms: bool = True
|
260
|
+
normalize_packet_histograms: bool = True
|
248
261
|
use_tcp_features: bool = True
|
249
262
|
use_push_flags: bool = False
|
250
263
|
zero_ppi_start: int = 0
|
@@ -267,52 +280,67 @@ class DatasetConfig():
|
|
267
280
|
self.database_path = dataset.database_path
|
268
281
|
self.flowstats_features = dataset.metadata.flowstats_features
|
269
282
|
|
270
|
-
# Configure train dates
|
271
|
-
if len(self.train_dates) > 0 and self.
|
272
|
-
raise ValueError("
|
273
|
-
if len(self.train_dates) == 0 and self.
|
274
|
-
if self.
|
275
|
-
raise ValueError(f"Unknown
|
276
|
-
self.train_dates = dataset.time_periods[self.
|
277
|
-
if len(self.train_dates) == 0 and self.
|
278
|
-
self.
|
279
|
-
self.train_dates = dataset.time_periods[dataset.
|
280
|
-
# Configure test dates
|
281
|
-
if
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
283
|
+
# Configure train dates
|
284
|
+
if len(self.train_dates) > 0 and self.train_period_name == "":
|
285
|
+
raise ValueError("train_period_name has to be specified when train_dates are set")
|
286
|
+
if len(self.train_dates) == 0 and self.train_period_name != "":
|
287
|
+
if self.train_period_name not in dataset.time_periods:
|
288
|
+
raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
|
289
|
+
self.train_dates = dataset.time_periods[self.train_period_name]
|
290
|
+
if len(self.train_dates) == 0 and self.test_period_name == "":
|
291
|
+
self.train_period_name = dataset.default_train_period_name
|
292
|
+
self.train_dates = dataset.time_periods[dataset.default_train_period_name]
|
293
|
+
# Configure test dates
|
294
|
+
if self.no_test_set:
|
295
|
+
if (len(self.test_dates) > 0 or self.test_period_name != ""):
|
296
|
+
raise ValueError("test_dates and test_period_name cannot be specified when no_test_set is true")
|
297
|
+
else:
|
298
|
+
if len(self.test_dates) > 0 and self.test_period_name == "":
|
299
|
+
raise ValueError("test_period_name has to be specified when test_dates are set")
|
300
|
+
if len(self.test_dates) == 0 and self.test_period_name != "":
|
301
|
+
if self.test_period_name not in dataset.time_periods:
|
302
|
+
raise ValueError(f"Unknown test_period_name {self.test_period_name}. Use time period available in dataset.time_periods")
|
303
|
+
self.test_dates = dataset.time_periods[self.test_period_name]
|
304
|
+
if len(self.test_dates) == 0 and self.test_period_name == "":
|
305
|
+
self.test_period_name = dataset.default_test_period_name
|
306
|
+
self.test_dates = dataset.time_periods[dataset.default_test_period_name]
|
307
|
+
# Configure val dates
|
308
|
+
if (self.val_approach == ValidationApproach.NO_VALIDATION or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
|
309
|
+
raise ValueError("val_dates and val_period_name cannot be specified when val_approach is no-validation or split-from-train")
|
293
310
|
if self.val_approach == ValidationApproach.VALIDATION_DATES:
|
294
|
-
if len(self.val_dates) > 0 and self.
|
295
|
-
raise ValueError("
|
296
|
-
if len(self.val_dates) == 0 and self.
|
297
|
-
if self.
|
298
|
-
raise ValueError(f"Unknown
|
299
|
-
self.val_dates = dataset.time_periods[self.
|
300
|
-
if len(self.val_dates) == 0 and self.
|
301
|
-
raise ValueError("
|
311
|
+
if len(self.val_dates) > 0 and self.val_period_name == "":
|
312
|
+
raise ValueError("val_period_name has to be specified when val_dates are set")
|
313
|
+
if len(self.val_dates) == 0 and self.val_period_name != "":
|
314
|
+
if self.val_period_name not in dataset.time_periods:
|
315
|
+
raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
|
316
|
+
self.val_dates = dataset.time_periods[self.val_period_name]
|
317
|
+
if len(self.val_dates) == 0 and self.val_period_name == "":
|
318
|
+
raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when val_approach is validation-dates")
|
302
319
|
# Check if train, val, and test dates are available in the dataset
|
303
320
|
if dataset.available_dates:
|
304
321
|
unknown_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
|
305
322
|
unknown_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
|
306
323
|
unknown_test_dates = [t for t in self.test_dates if t not in dataset.available_dates]
|
307
324
|
if len(unknown_train_dates) > 0:
|
308
|
-
raise ValueError(f"Unknown train dates {unknown_train_dates}. Use dates available in dataset.available_dates" \
|
309
|
-
+ f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else "")
|
325
|
+
raise ValueError(f"Unknown train dates {unknown_train_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
326
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
310
327
|
if len(unknown_val_dates) > 0:
|
311
|
-
raise ValueError(f"Unknown validation dates {unknown_val_dates}. Use dates available in dataset.available_dates" \
|
312
|
-
+ f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else "")
|
328
|
+
raise ValueError(f"Unknown validation dates {unknown_val_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
329
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
313
330
|
if len(unknown_test_dates) > 0:
|
314
|
-
raise ValueError(f"Unknown test dates {unknown_test_dates}. Use dates available in dataset.available_dates" \
|
315
|
-
+ f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else "")
|
331
|
+
raise ValueError(f"Unknown test dates {unknown_test_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
332
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
333
|
+
# Check time order of train, val, and test periods
|
334
|
+
train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
|
335
|
+
test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
|
336
|
+
if not self.no_test_set and min(test_dates) <= max(train_dates):
|
337
|
+
warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
338
|
+
if self.val_approach == ValidationApproach.VALIDATION_DATES:
|
339
|
+
val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
|
340
|
+
if min(val_dates) <= max(train_dates):
|
341
|
+
warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
342
|
+
if not self.no_test_set and min(test_dates) <= max(val_dates):
|
343
|
+
warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
316
344
|
# Configure features
|
317
345
|
if dataset.metadata.protocol == Protocol.TLS and self.use_tcp_features:
|
318
346
|
self.flowstats_features = self.flowstats_features + SELECTED_TCP_FLAGS
|
@@ -446,7 +474,7 @@ class DatasetConfig():
|
|
446
474
|
def _get_train_data_params(self) -> TrainDataParams:
|
447
475
|
return TrainDataParams(
|
448
476
|
database_filename=self.database_filename,
|
449
|
-
|
477
|
+
train_period_name=self.train_period_name,
|
450
478
|
train_tables_paths=self._get_train_tables_paths(),
|
451
479
|
apps_selection=self.apps_selection,
|
452
480
|
apps_selection_topx=self.apps_selection_topx,
|
@@ -460,7 +488,7 @@ class DatasetConfig():
|
|
460
488
|
assert self.val_approach == ValidationApproach.VALIDATION_DATES
|
461
489
|
val_data_params = TestDataParams(
|
462
490
|
database_filename=self.database_filename,
|
463
|
-
|
491
|
+
test_period_name=self.val_period_name,
|
464
492
|
test_tables_paths=self._get_val_tables_paths(),
|
465
493
|
known_apps_database_enum=known_apps_database_enum,
|
466
494
|
unknown_apps_database_enum=unknown_apps_database_enum,)
|
@@ -472,7 +500,7 @@ class DatasetConfig():
|
|
472
500
|
def _get_test_data_params_and_path(self, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> tuple[TestDataParams, str]:
|
473
501
|
test_data_params = TestDataParams(
|
474
502
|
database_filename=self.database_filename,
|
475
|
-
|
503
|
+
test_period_name=self.test_period_name,
|
476
504
|
test_tables_paths=self._get_test_tables_paths(),
|
477
505
|
known_apps_database_enum=known_apps_database_enum,
|
478
506
|
unknown_apps_database_enum=unknown_apps_database_enum,)
|
@@ -481,6 +509,21 @@ class DatasetConfig():
|
|
481
509
|
test_data_path = os.path.join(self.data_root, "test-data", f"{params_hash}_{self.random_state}")
|
482
510
|
return test_data_params, test_data_path
|
483
511
|
|
512
|
+
@model_validator(mode="before")
|
513
|
+
@classmethod
|
514
|
+
def check_deprecated_args(cls, values):
|
515
|
+
kwargs = values.kwargs
|
516
|
+
if "train_period" in kwargs:
|
517
|
+
warnings.warn("train_period is deprecated. Use train_period_name instead.")
|
518
|
+
kwargs["train_period_name"] = kwargs["train_period"]
|
519
|
+
if "val_period" in kwargs:
|
520
|
+
warnings.warn("val_period is deprecated. Use val_period_name instead.")
|
521
|
+
kwargs["val_period_name"] = kwargs["val_period"]
|
522
|
+
if "test_period" in kwargs:
|
523
|
+
warnings.warn("test_period is deprecated. Use test_period_name instead.")
|
524
|
+
kwargs["test_period_name"] = kwargs["test_period"]
|
525
|
+
return values
|
526
|
+
|
484
527
|
def __str__(self):
|
485
528
|
_process_tag = yaml.emitter.Emitter.process_tag
|
486
529
|
_ignore_aliases = yaml.Dumper.ignore_aliases
|
@@ -30,7 +30,7 @@ from cesnet_datazoo.pytables_data.indices_setup import (IndicesTuple, compute_kn
|
|
30
30
|
from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, fit_or_load_scalers,
|
31
31
|
pytables_collate_fn,
|
32
32
|
pytables_ip_collate_fn, worker_init_fn)
|
33
|
-
from cesnet_datazoo.utils.class_info import ClassInfo,
|
33
|
+
from cesnet_datazoo.utils.class_info import ClassInfo, create_class_info
|
34
34
|
from cesnet_datazoo.utils.download import resumable_download, simple_download
|
35
35
|
from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
|
36
36
|
|
@@ -74,8 +74,8 @@ class CesnetDataset():
|
|
74
74
|
metadata: Additional [dataset metadata][metadata].
|
75
75
|
available_dates: List of all available dates in the dataset.
|
76
76
|
time_periods: Predefined time periods. Each time period is a list of dates.
|
77
|
-
|
78
|
-
|
77
|
+
default_train_period_name: Default time period for training.
|
78
|
+
default_test_period_name: Default time period for testing.
|
79
79
|
|
80
80
|
The following attributes are initialized when [`set_dataset_config_and_initialize`][datasets.cesnet_dataset.CesnetDataset.set_dataset_config_and_initialize] is called.
|
81
81
|
|
@@ -111,8 +111,8 @@ class CesnetDataset():
|
|
111
111
|
metadata: DatasetMetadata
|
112
112
|
available_dates: list[str]
|
113
113
|
time_periods: dict[str, list[str]]
|
114
|
-
|
115
|
-
|
114
|
+
default_train_period_name: str
|
115
|
+
default_test_period_name: str
|
116
116
|
time_periods_gen: bool = False
|
117
117
|
silent: bool = False
|
118
118
|
|
@@ -165,13 +165,16 @@ class CesnetDataset():
|
|
165
165
|
num_samples += len(database.get_node(p))
|
166
166
|
if self.size == "ORIG" and num_samples != self.metadata.available_samples:
|
167
167
|
raise ValueError(f"Expected {self.metadata.available_samples} samples, but got {num_samples} in the database. Please delete the data root folder, update cesnet-datazoo, and redownload the dataset.")
|
168
|
-
|
168
|
+
if self.size != "ORIG" and num_samples != DATASET_SIZES[self.size]:
|
169
169
|
raise ValueError(f"Expected {DATASET_SIZES[self.size]} samples, but got {num_samples} in the database. Please delete the data root folder, update cesnet-datazoo, and redownload the dataset.")
|
170
170
|
self.available_dates = list(map(lambda x: x.removeprefix("/flows/D"), tables_paths))
|
171
171
|
else:
|
172
172
|
self.available_dates = []
|
173
173
|
if self.time_periods_gen:
|
174
174
|
self._generate_time_periods()
|
175
|
+
# Add all available dates as single date time periods
|
176
|
+
for d in self.available_dates:
|
177
|
+
self.time_periods[d] = [d]
|
175
178
|
|
176
179
|
def set_dataset_config_and_initialize(self, dataset_config: DatasetConfig) -> None:
|
177
180
|
"""
|
@@ -249,9 +252,9 @@ class CesnetDataset():
|
|
249
252
|
"""
|
250
253
|
if self.dataset_config is None:
|
251
254
|
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting validaion dataloader")
|
252
|
-
assert self.val_dataset is not None
|
253
255
|
if self.dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
|
254
256
|
raise ValueError("Validation dataloader is not available when using no-validation")
|
257
|
+
assert self.val_dataset is not None
|
255
258
|
if self.val_dataloader:
|
256
259
|
return self.val_dataloader
|
257
260
|
batch_sampler = BatchSampler(sampler=SequentialSampler(self.val_dataset), batch_size=self.dataset_config.test_batch_size, drop_last=False)
|
@@ -288,6 +291,8 @@ class CesnetDataset():
|
|
288
291
|
"""
|
289
292
|
if self.dataset_config is None:
|
290
293
|
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting test dataloader")
|
294
|
+
if self.dataset_config.no_test_set:
|
295
|
+
raise ValueError("Test dataloader is not available when no_test_set is true")
|
291
296
|
assert self.test_dataset is not None
|
292
297
|
if self.test_dataloader:
|
293
298
|
return self.test_dataloader
|
@@ -358,7 +363,7 @@ class CesnetDataset():
|
|
358
363
|
Returns:
|
359
364
|
Validation data as a dataframe.
|
360
365
|
"""
|
361
|
-
self._check_before_dataframe()
|
366
|
+
self._check_before_dataframe(check_no_val=True)
|
362
367
|
assert self.dataset_config is not None and self.val_dataset is not None
|
363
368
|
if len(self.val_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
|
364
369
|
warnings.warn(f"Validation set has ({len(self.val_dataset)} samples), consider using get_val_dataloader() instead")
|
@@ -384,13 +389,31 @@ class CesnetDataset():
|
|
384
389
|
Returns:
|
385
390
|
Test data as a dataframe.
|
386
391
|
"""
|
387
|
-
self._check_before_dataframe()
|
392
|
+
self._check_before_dataframe(check_no_test=True)
|
388
393
|
assert self.dataset_config is not None and self.test_dataset is not None
|
389
394
|
if len(self.test_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
|
390
395
|
warnings.warn(f"Test set has ({len(self.test_dataset)} samples), consider using get_test_dataloader() instead")
|
391
396
|
feature_names = self.dataset_config.get_feature_names(flatten_ppi=flatten_ppi)
|
392
397
|
return create_df_from_dataloader(dataloader=self.get_test_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi, silent=self.silent)
|
393
398
|
|
399
|
+
def get_num_classes(self) -> int:
|
400
|
+
"""Returns the number of classes in the current configuration of the dataset."""
|
401
|
+
if self.class_info is None:
|
402
|
+
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting the number of classes")
|
403
|
+
return self.class_info.num_classes
|
404
|
+
|
405
|
+
def get_known_apps(self) -> list[str]:
|
406
|
+
"""Returns the list of known applications in the current configuration of the dataset."""
|
407
|
+
if self.class_info is None:
|
408
|
+
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting known apps")
|
409
|
+
return self.class_info.known_apps
|
410
|
+
|
411
|
+
def get_unknown_apps(self) -> list[str]:
|
412
|
+
"""Returns the list of unknown applications in the current configuration of the dataset."""
|
413
|
+
if self.class_info is None:
|
414
|
+
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting unknown apps")
|
415
|
+
return self.class_info.unknown_apps
|
416
|
+
|
394
417
|
def compute_dataset_statistics(self, num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, disabled_apps: Optional[list[str]] = None)-> None:
|
395
418
|
"""
|
396
419
|
Computes dataset statistics and saves them to the `statistics_path` folder.
|
@@ -401,8 +424,6 @@ class CesnetDataset():
|
|
401
424
|
batch_size: Number of samples per batch for loading data.
|
402
425
|
disabled_apps: List of applications to exclude from the statistics.
|
403
426
|
"""
|
404
|
-
if self.name.startswith("CESNET-TLS22"):
|
405
|
-
raise NotImplementedError("Dataset statistics are not supported for CESNET_TLS22")
|
406
427
|
flowstats_features = self.metadata.flowstats_features + self.metadata.packet_histogram_features + self.metadata.tcp_features
|
407
428
|
if not os.path.exists(self.statistics_path):
|
408
429
|
os.mkdir(self.statistics_path)
|
@@ -410,6 +431,7 @@ class CesnetDataset():
|
|
410
431
|
output_dir=self.statistics_path,
|
411
432
|
flowstats_features=flowstats_features,
|
412
433
|
protocol=self.metadata.protocol,
|
434
|
+
extra_fields=not self.name.startswith("CESNET-TLS22"),
|
413
435
|
disabled_apps=disabled_apps if disabled_apps is not None else [],
|
414
436
|
num_samples=num_samples,
|
415
437
|
num_workers=num_workers,
|
@@ -471,13 +493,17 @@ class CesnetDataset():
|
|
471
493
|
self.val_dataloader = None
|
472
494
|
self.test_dataloader = None
|
473
495
|
|
474
|
-
def _check_before_dataframe(self) -> None:
|
496
|
+
def _check_before_dataframe(self, check_no_val: bool = False, check_no_test: bool = False) -> None:
|
475
497
|
if self.dataset_config is None:
|
476
498
|
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting a dataframe")
|
477
499
|
if self.dataset_config.return_ips:
|
478
500
|
raise ValueError("Dataframes are not available when return_ips is set. Use a dataloader instead.")
|
479
501
|
if self.dataset_config.return_torch:
|
480
502
|
raise ValueError("Dataframes are not available when return_torch is set. Use a dataloader instead.")
|
503
|
+
if check_no_val and self.dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
|
504
|
+
raise ValueError("Validation dataframe is not available when using no-validation")
|
505
|
+
if check_no_test and self.dataset_config.no_test_set:
|
506
|
+
raise ValueError("Test dataframe is not available when no_test_set is true")
|
481
507
|
|
482
508
|
def _initialize_train_val_test(self) -> None:
|
483
509
|
assert self.dataset_config is not None
|
@@ -485,7 +511,12 @@ class CesnetDataset():
|
|
485
511
|
servicemap = pd.read_csv(dataset_config.servicemap_path, index_col="Tag")
|
486
512
|
# Initialize train and test indices
|
487
513
|
train_indices, train_unknown_indices, encoder, known_apps_database_enum, unknown_apps_database_enum = init_or_load_train_indices(dataset_config=dataset_config, servicemap=servicemap)
|
488
|
-
|
514
|
+
if self.dataset_config.no_test_set:
|
515
|
+
test_known_indices = np.empty((0,3), dtype=np.int64)
|
516
|
+
test_unknown_indices = np.empty((0,3), dtype=np.int64)
|
517
|
+
test_data_path = None
|
518
|
+
else:
|
519
|
+
test_known_indices, test_unknown_indices, test_data_path = init_or_load_test_indices(dataset_config=dataset_config, known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
|
489
520
|
# Date weight sampling of train indices
|
490
521
|
if dataset_config.train_dates_weigths is not None:
|
491
522
|
assert dataset_config.train_size != "all"
|
@@ -527,13 +558,13 @@ class CesnetDataset():
|
|
527
558
|
test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,
|
528
559
|
stratify=train_labels, shuffle=True, random_state=train_val_rng)
|
529
560
|
elif dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
|
530
|
-
val_data_path = None
|
531
561
|
val_known_indices = np.empty((0,3), dtype=np.int64)
|
532
562
|
val_unknown_indices = np.empty((0,3), dtype=np.int64)
|
563
|
+
val_data_path = None
|
533
564
|
else: assert_never(dataset_config.val_approach)
|
534
565
|
|
535
566
|
# Create class info
|
536
|
-
class_info =
|
567
|
+
class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
|
537
568
|
# Load or fit data scalers
|
538
569
|
flowstats_scaler, flowstats_quantiles, ipt_scaler, psizes_scaler = fit_or_load_scalers(dataset_config=dataset_config, train_indices=train_indices)
|
539
570
|
# Subset dataset indices based on the selected sizes and compute application counts
|
@@ -555,16 +586,21 @@ class CesnetDataset():
|
|
555
586
|
indices=dataset_indices.train_indices,
|
556
587
|
flowstats_features=dataset_config.flowstats_features,
|
557
588
|
return_ips=dataset_config.return_ips,)
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
589
|
+
if dataset_config.no_test_set:
|
590
|
+
test_dataset = None
|
591
|
+
else:
|
592
|
+
assert test_data_path is not None
|
593
|
+
test_dataset = PyTablesDataset(
|
594
|
+
database_path=dataset_config.database_path,
|
595
|
+
tables_paths=dataset_config._get_test_tables_paths(),
|
596
|
+
indices=test_combined_indices,
|
597
|
+
flowstats_features=dataset_config.flowstats_features,
|
598
|
+
preload=dataset_config.preload_test,
|
599
|
+
preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),
|
600
|
+
return_ips=dataset_config.return_ips,)
|
601
|
+
if dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
|
602
|
+
val_dataset = None
|
603
|
+
else:
|
568
604
|
assert val_data_path is not None
|
569
605
|
val_dataset = PyTablesDataset(
|
570
606
|
database_path=dataset_config.database_path,
|
@@ -579,7 +615,6 @@ class CesnetDataset():
|
|
579
615
|
collate_fn = pytables_ip_collate_fn
|
580
616
|
else:
|
581
617
|
collate_fn = partial(pytables_collate_fn, # type: ignore
|
582
|
-
use_packet_histograms=dataset_config.use_packet_histograms,
|
583
618
|
flowstats_scaler=flowstats_scaler,
|
584
619
|
flowstats_quantiles=flowstats_quantiles,
|
585
620
|
psizes_scaler=psizes_scaler,
|
@@ -588,6 +623,8 @@ class CesnetDataset():
|
|
588
623
|
ipt_min=dataset_config.ipt_min,
|
589
624
|
ipt_max=dataset_config.ipt_max,
|
590
625
|
use_push_flags=dataset_config.use_push_flags,
|
626
|
+
use_packet_histograms=dataset_config.use_packet_histograms,
|
627
|
+
normalize_packet_histograms=dataset_config.normalize_packet_histograms,
|
591
628
|
zero_ppi_start=dataset_config.zero_ppi_start,
|
592
629
|
encoder=encoder,
|
593
630
|
known_apps=class_info.known_apps,
|
@@ -10,8 +10,8 @@ class CESNET_TLS22(CesnetDataset):
|
|
10
10
|
"W-2021-40": ["20211004", "20211005", "20211006", "20211007", "20211008", "20211009", "20211010"],
|
11
11
|
"W-2021-41": ["20211011", "20211012", "20211013", "20211014", "20211015", "20211016", "20211017"],
|
12
12
|
}
|
13
|
-
|
14
|
-
|
13
|
+
default_train_period_name = "W-2021-40"
|
14
|
+
default_test_period_name = "W-2021-41"
|
15
15
|
|
16
16
|
class CESNET_QUIC22(CesnetDataset):
|
17
17
|
"""Dataset class for [CESNET-QUIC22][cesnet-quic22]."""
|
@@ -24,11 +24,11 @@ class CESNET_QUIC22(CesnetDataset):
|
|
24
24
|
"W-2022-46": ["20221114", "20221115", "20221116", "20221117", "20221118", "20221119", "20221120"],
|
25
25
|
"W-2022-47": ["20221121", "20221122", "20221123", "20221124", "20221125", "20221126", "20221127"],
|
26
26
|
"W45-47": ["20221107", "20221108", "20221109", "20221110", "20221111", "20221112", "20221113",
|
27
|
-
|
28
|
-
|
27
|
+
"20221114", "20221115", "20221116", "20221117", "20221118", "20221119", "20221120",
|
28
|
+
"20221121", "20221122", "20221123", "20221124", "20221125", "20221126", "20221127"],
|
29
29
|
}
|
30
|
-
|
31
|
-
|
30
|
+
default_train_period_name = "W-2022-44"
|
31
|
+
default_test_period_name = "W-2022-45"
|
32
32
|
|
33
33
|
class CESNET_TLS_Year22(CesnetDataset):
|
34
34
|
"""Dataset class for [CESNET-TLS-Year22][cesnet-tls-year22]."""
|
@@ -37,5 +37,5 @@ class CESNET_TLS_Year22(CesnetDataset):
|
|
37
37
|
bucket_url = "https://liberouter.org/datazoo/download?bucket=cesnet-tls-year22"
|
38
38
|
time_periods = {f"W-2022-{week}": [] for week in range(1, 53)} | {f"M-2022-{month}": [] for month in range(1, 13)}
|
39
39
|
time_periods_gen = True
|
40
|
-
|
41
|
-
|
40
|
+
default_train_period_name = "M-2022-9"
|
41
|
+
default_test_period_name = "M-2022-10"
|
@@ -16,7 +16,7 @@ from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, FLOWEND_REASO
|
|
16
16
|
PHISTS_FEATURES, PPI_COLUMN, SIZE_POS)
|
17
17
|
from cesnet_datazoo.pytables_data.indices_setup import sort_indices
|
18
18
|
from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, list_all_tables,
|
19
|
-
worker_init_fn)
|
19
|
+
load_database, worker_init_fn)
|
20
20
|
|
21
21
|
|
22
22
|
def pick_quic_fields(batch):
|
@@ -26,23 +26,27 @@ def pick_quic_fields(batch):
|
|
26
26
|
batch["QUIC_VERSION"],
|
27
27
|
)
|
28
28
|
|
29
|
-
def pick_stats_fields(batch
|
29
|
+
def pick_stats_fields(batch):
|
30
30
|
return (
|
31
31
|
batch[PPI_COLUMN],
|
32
32
|
batch["DURATION"],
|
33
33
|
batch["PACKETS"] + batch["PACKETS_REV"],
|
34
34
|
batch["BYTES"] + batch["BYTES_REV"],
|
35
|
+
batch[APP_COLUMN],
|
36
|
+
batch[CATEGORY_COLUMN],
|
37
|
+
)
|
38
|
+
|
39
|
+
def pick_extra_fields(batch, flowstats_features: list[str]):
|
40
|
+
return (
|
35
41
|
batch["DST_ASN"],
|
36
42
|
batch[PHISTS_FEATURES],
|
37
43
|
batch[[f for f in FLOWEND_REASON_FEATURES if f in flowstats_features]],
|
38
|
-
batch[APP_COLUMN],
|
39
|
-
batch[CATEGORY_COLUMN],
|
40
44
|
)
|
41
45
|
|
42
46
|
def simple_collate_fn(batch):
|
43
47
|
return batch
|
44
48
|
|
45
|
-
def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
|
49
|
+
def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, extra_fields: bool, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
|
46
50
|
stats_pdf_path = os.path.join(output_dir, "dataset-statistics.pdf")
|
47
51
|
stats_csv_path = os.path.join(output_dir, "dataset-statistics.csv")
|
48
52
|
categories_csv_path = os.path.join(output_dir, "categories.csv")
|
@@ -70,8 +74,8 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
70
74
|
packet_sizes_counter = Counter()
|
71
75
|
if not silent:
|
72
76
|
print(f"Reading data from {database_path} for statistics")
|
73
|
-
|
74
|
-
stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=
|
77
|
+
table_paths = list_all_tables(database_path)
|
78
|
+
stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=table_paths, flowstats_features=flowstats_features, disabled_apps=disabled_apps, indices=None, return_all_fields=True)
|
75
79
|
if num_samples != "all":
|
76
80
|
subset_indices = np.random.randint(low=0, high=len(stats_dataset.indices), size=num_samples)
|
77
81
|
stats_dataset.indices = sort_indices(stats_dataset.indices[subset_indices])
|
@@ -89,7 +93,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
89
93
|
stats_dataset.pytables_worker_init()
|
90
94
|
|
91
95
|
for batch, batch_idx in tqdm(stats_dloader, total=len(stats_dloader), disable=silent):
|
92
|
-
ppi, duration, packets_total, bytes_total,
|
96
|
+
ppi, duration, packets_total, bytes_total, app, cat = pick_stats_fields(batch)
|
93
97
|
# Saving feature values for distribution plots
|
94
98
|
feature_duration.append(duration)
|
95
99
|
feature_packets_total.append(packets_total)
|
@@ -97,8 +101,6 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
97
101
|
packet_sizes_counter.update(ppi[:, SIZE_POS, :].flatten())
|
98
102
|
# Aggregating features for value_counts
|
99
103
|
app_series = app_series.add(pd.Series(app).value_counts(), fill_value=0)
|
100
|
-
asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
|
101
|
-
flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
|
102
104
|
# Grouping features per categories
|
103
105
|
df1 = pd.DataFrame(data={"cat": cat, "BYTES_TOTAL": bytes_total})
|
104
106
|
flow_counts = df1["cat"].value_counts().rename("FLOW_COUNT")
|
@@ -110,9 +112,12 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
110
112
|
quic_sni_series = quic_sni_series.add(pd.Series(sni).str.decode("utf-8").value_counts(), fill_value=0)
|
111
113
|
quic_ua_series = quic_ua_series.add(pd.Series(user_agent).str.decode("utf-8").value_counts(), fill_value=0)
|
112
114
|
quic_version_series = quic_version_series.add(pd.Series(quic_version).value_counts(), fill_value=0)
|
113
|
-
|
114
|
-
|
115
|
-
|
115
|
+
if extra_fields:
|
116
|
+
asn, phist, flowend_reason = pick_extra_fields(batch, flowstats_features=flowstats_features)
|
117
|
+
asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
|
118
|
+
flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
|
119
|
+
df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=PHISTS_FEATURES)
|
120
|
+
df_phist = df_phist.add(df2, fill_value=0)
|
116
121
|
feature_duration = np.concatenate(feature_duration)
|
117
122
|
feature_packets_total = np.concatenate(feature_packets_total)
|
118
123
|
feature_bytes_total = np.concatenate(feature_bytes_total)
|
@@ -123,9 +128,12 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
123
128
|
# Flow statistics distribution output
|
124
129
|
df_flowstats = pd.DataFrame(data={"FLOW DURATION": feature_duration, "FLOW BYTE VOLUME": feature_bytes_total, "FLOW LENGTH": feature_packets_total}).describe()
|
125
130
|
df_flowstats.to_csv(stats_csv_path)
|
126
|
-
# Categories tikzpicture and csv output
|
127
|
-
|
128
|
-
|
131
|
+
# Categories tikzpicture and csv output; first, get the categories and applications enum
|
132
|
+
temp_database, temp_tables = load_database(database_path=database_path, tables_paths=table_paths[:1])
|
133
|
+
cat_enum = temp_tables[0].get_enum(CATEGORY_COLUMN)
|
134
|
+
app_enum = temp_tables[0].get_enum(APP_COLUMN)
|
135
|
+
temp_database.close()
|
136
|
+
df_categories.index = df_categories.index.map(cat_enum)
|
129
137
|
df_categories = df_categories.drop("default", errors="ignore")
|
130
138
|
df_categories["FLOW_PERC"] = df_categories["FLOW_COUNT"] / sum(df_categories["FLOW_COUNT"]) * 100
|
131
139
|
df_categories["BYTES_PERC"] = df_categories["BYTES_TOTAL"] / sum(df_categories["BYTES_TOTAL"]) * 100
|
@@ -139,20 +147,9 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
139
147
|
# Application distribution output
|
140
148
|
app_df = pd.DataFrame({"COUNT": app_series.sort_values(ascending=False).astype("int64")})
|
141
149
|
app_df["PERC"] = (app_df["COUNT"] / app_df["COUNT"].sum() * 100).round(3)
|
142
|
-
app_df.index = app_df.index.map(
|
150
|
+
app_df.index = app_df.index.map(app_enum)
|
143
151
|
app_df.index.name = "LABEL"
|
144
152
|
app_df.to_csv(app_path)
|
145
|
-
# ASN distribution output
|
146
|
-
asn_df = pd.DataFrame({"COUNT": asn_series.sort_values(ascending=False).astype("int64")})
|
147
|
-
asn_df["PERC"] = (asn_df["COUNT"] / asn_df["COUNT"].sum() * 100).round(3)
|
148
|
-
asn_df.index.name = "DESTINATION ASN"
|
149
|
-
asn_df.to_csv(asn_path)
|
150
|
-
# Flow end reason output
|
151
|
-
flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
|
152
|
-
flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
|
153
|
-
flow_endreason_df.index.name = "FLOW ENDREASON"
|
154
|
-
flow_endreason_df.index = pd.Index([f for f in FLOWEND_REASON_FEATURES if f in flowstats_features])
|
155
|
-
flow_endreason_df.to_csv(flow_endreason_path)
|
156
153
|
# Packet sizes histogram output
|
157
154
|
packet_sizes_df = pd.DataFrame({"COUNT": pd.Series(packet_sizes_counter)}).sort_index()
|
158
155
|
packet_sizes_df["PERC"] = (packet_sizes_df["COUNT"] / packet_sizes_df["COUNT"].sum() * 100).round(3)
|
@@ -168,13 +165,25 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
168
165
|
quic_version_df.index = quic_version_df.index.map(hex)
|
169
166
|
quic_version_df.index.name = "QUIC VERSION"
|
170
167
|
quic_version_df.to_csv(quic_version_path)
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
168
|
+
if extra_fields:
|
169
|
+
# ASN distribution output
|
170
|
+
asn_df = pd.DataFrame({"COUNT": asn_series.sort_values(ascending=False).astype("int64")})
|
171
|
+
asn_df["PERC"] = (asn_df["COUNT"] / asn_df["COUNT"].sum() * 100).round(3)
|
172
|
+
asn_df.index.name = "DESTINATION ASN"
|
173
|
+
asn_df.to_csv(asn_path)
|
174
|
+
# Flow end reason output
|
175
|
+
flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
|
176
|
+
flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
|
177
|
+
flow_endreason_df.index.name = "FLOW ENDREASON"
|
178
|
+
flow_endreason_df.index = pd.Index([f for f in FLOWEND_REASON_FEATURES if f in flowstats_features])
|
179
|
+
flow_endreason_df.to_csv(flow_endreason_path)
|
180
|
+
# PHIST output
|
181
|
+
df_phist.index.name = "BINS"
|
182
|
+
df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), PHISTS_FEATURES))
|
183
|
+
df_phist = df_phist.astype("int64")
|
184
|
+
for i, column in zip((1, 3, 5, 7), df_phist.columns):
|
185
|
+
df_phist.insert(i, column + " PERC", (df_phist[column] / df_phist[column].sum() * 100).round(3))
|
186
|
+
df_phist.to_csv(phist_path)
|
178
187
|
|
179
188
|
# Dataset stats figure
|
180
189
|
axes: Any
|
@@ -232,5 +241,4 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
232
241
|
ax4.set_xlabel("Bytes")
|
233
242
|
|
234
243
|
plt.tight_layout()
|
235
|
-
fig.show()
|
236
244
|
fig.savefig(stats_pdf_path, bbox_inches="tight")
|
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/metrics/classification_report.py
RENAMED
@@ -1,8 +1,8 @@
|
|
1
1
|
import numpy as np
|
2
2
|
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
|
3
3
|
|
4
|
-
from cesnet_datazoo.metrics.
|
5
|
-
|
4
|
+
from cesnet_datazoo.metrics.provider_metrics import (per_app_provider_metrics,
|
5
|
+
provider_accuracies)
|
6
6
|
from cesnet_datazoo.utils.class_info import ClassInfo
|
7
7
|
|
8
8
|
|
@@ -10,23 +10,23 @@ def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.
|
|
10
10
|
p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
|
11
11
|
labels=labels,
|
12
12
|
zero_division=zero_division)
|
13
|
-
sc_p, sc_r, sc_f1 =
|
13
|
+
sc_p, sc_r, sc_f1 = per_app_provider_metrics(cm, class_info=class_info)
|
14
14
|
predicted_unknown = cm[:, -1]
|
15
15
|
with np.errstate(divide="ignore", invalid="ignore"):
|
16
16
|
predicted_unknown_perc = predicted_unknown / s
|
17
17
|
predicted_unknown_perc = np.nan_to_num(predicted_unknown_perc)
|
18
|
-
headers = ["precision (
|
18
|
+
headers = ["precision (pr)", "recall (pr)", "f1-score (pr)", "pred unknown", "support"]
|
19
19
|
headers_fmt = "{:>{width}} {:>15} {:>15} {:>15} {:>15} {:>9}\n"
|
20
|
-
width = max(max(len(cn) for cn in class_info.target_names), len("failed
|
20
|
+
width = max(max(len(cn) for cn in class_info.target_names), len("failed provider acc"))
|
21
21
|
report = headers_fmt.format("", *headers, width=width)
|
22
22
|
report += "\n"
|
23
|
-
|
23
|
+
row_fmt_provider = "{:>{width}} " + 3 * " {:>7.{digits}f} ({:.{digits}f}) " + " {:>7} ({:.{digits}f}) " + "{:>9}\n"
|
24
24
|
row_fmt = "{:>{width}} " + 3 * " {:>7.{digits}f} " + " {:>7} ({:.{digits}f}) " + "{:>9}\n"
|
25
25
|
rows = zip(map(class_info.target_names.__getitem__, labels), p, sc_p, r, sc_r, f1, sc_f1, predicted_unknown, predicted_unknown_perc, s) # type: ignore
|
26
26
|
for row in rows:
|
27
27
|
app, p_, _, r_, _, f1_, _, u_, up_, s_ = row
|
28
|
-
if class_info.
|
29
|
-
report +=
|
28
|
+
if class_info.has_provider[app]:
|
29
|
+
report += row_fmt_provider.format(*row, width=width, digits=digits)
|
30
30
|
else:
|
31
31
|
report += row_fmt.format(app, p_, r_, f1_, u_, up_, s_, width=width, digits=digits)
|
32
32
|
report += "\n"
|
@@ -40,26 +40,26 @@ def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.
|
|
40
40
|
avg_sc_f1 = np.average(np.where(np.isnan([np.nan if x is None else x for x in sc_f1]), f1, sc_f1)[:-1])
|
41
41
|
row_avg = [avg_p, avg_sc_p, avg_r, avg_sc_r, avg_f1, avg_sc_f1, predicted_unknown_sum, samples_sum]
|
42
42
|
|
43
|
-
headers_avg = ["precision (
|
43
|
+
headers_avg = ["precision (pr)", "recall (pr)", "f1-score (pr)", "pred unknown", "support"]
|
44
44
|
row_fmt_avg = "{:>{width}} " + 3 * " {:>6.{digits}} ({:.{digits}f}) " + "{:>15} " + "{:>9}\n"
|
45
45
|
digits = 3 # show more precise averages
|
46
46
|
report += headers_fmt.format("", *headers_avg, width=width)
|
47
47
|
report += row_fmt_avg.format("macro avg", *row_avg, width=width, digits=digits)
|
48
48
|
|
49
49
|
acc = accuracy_score(y_true, y_pred)
|
50
|
-
|
50
|
+
provider_acc, failed_provider_acc = provider_accuracies(y_true, y_pred, class_info=class_info)
|
51
51
|
|
52
52
|
row_fmt_acc = "{:>{width}} {:>15} {:>15} {:>7.{digits}f}\n"
|
53
53
|
report += row_fmt_acc.format("acc", "", "", acc, width=width, digits=digits)
|
54
|
-
report += row_fmt_acc.format("
|
55
|
-
report += row_fmt_acc.format("failed
|
54
|
+
report += row_fmt_acc.format("provider acc", "", "", provider_acc, width=width, digits=digits)
|
55
|
+
report += row_fmt_acc.format("failed provider acc", "", "", failed_provider_acc, width=width, digits=digits)
|
56
56
|
metrics = {
|
57
57
|
"Test/Accuracy": acc,
|
58
|
-
"Test/
|
59
|
-
"Test/Failed
|
58
|
+
"Test/Provider Accuracy": provider_acc,
|
59
|
+
"Test/Failed Provider Accuracy": failed_provider_acc,
|
60
60
|
"Test/Fscore": avg_f1,
|
61
|
-
"Test/
|
61
|
+
"Test/Provider Fscore": avg_sc_f1,
|
62
62
|
"Test/Recall": avg_r,
|
63
|
-
"Test/
|
63
|
+
"Test/Provider Recall": avg_sc_r,
|
64
64
|
}
|
65
65
|
return report, metrics
|
@@ -3,18 +3,19 @@ import numpy as np
|
|
3
3
|
from cesnet_datazoo.utils.class_info import ClassInfo
|
4
4
|
|
5
5
|
|
6
|
-
def
|
7
|
-
|
8
|
-
|
6
|
+
def provider_accuracies(y_true: np.ndarray, y_pred: np.ndarray, class_info: ClassInfo) -> tuple[float, float]:
|
7
|
+
provider_mapping_arr = np.array(list(class_info.provider_mapping.values()))
|
8
|
+
y_true_sc = provider_mapping_arr[y_true]
|
9
|
+
y_pred_sc = provider_mapping_arr[y_pred]
|
9
10
|
mistakes = y_true != y_pred
|
10
|
-
|
11
|
-
|
12
|
-
return
|
11
|
+
provider_acc = (y_true_sc == y_pred_sc).sum() / len(y_true_sc)
|
12
|
+
failed_provider_acc = (y_true_sc[mistakes] == y_pred_sc[mistakes]).sum() / mistakes.sum()
|
13
|
+
return provider_acc, failed_provider_acc
|
13
14
|
|
14
|
-
def
|
15
|
+
def per_app_provider_metrics(cm, class_info: ClassInfo):
|
15
16
|
metrics = []
|
16
17
|
for i, app in enumerate(class_info.target_names):
|
17
|
-
if not class_info.
|
18
|
+
if not class_info.has_provider[app]:
|
18
19
|
metrics.append((None, None, None))
|
19
20
|
continue
|
20
21
|
group = class_info.group_matrix[i]
|
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/pytables_dataset.py
RENAMED
@@ -140,7 +140,7 @@ def pytables_collate_fn(batch: tuple,
                         flowstats_scaler: Scaler, flowstats_quantiles: pd.Series,
                         psizes_scaler: Scaler, psizes_max: int,
                         ipt_scaler: Scaler, ipt_min: int, ipt_max: int,
-                        use_push_flags: bool, use_packet_histograms: bool, zero_ppi_start: int,
+                        use_push_flags: bool, use_packet_histograms: bool, normalize_packet_histograms: bool, zero_ppi_start: int,
                         encoder: LabelEncoder, known_apps: list[str], return_torch: bool = False):
     x_ppi, x_flowstats, labels = batch
     x_ppi = x_ppi.transpose(0, 2, 1)
@@ -164,12 +164,13 @@ def pytables_collate_fn(batch: tuple,

     if use_packet_histograms:
         x_phist = structured_to_unstructured(x_flowstats[PHISTS_FEATURES], dtype="float32")
-
-
-
-
-
-
+        if normalize_packet_histograms:
+            src_sizes_pkt_count = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]
+            dst_sizes_pkt_count = x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)].sum(axis=1)[:, np.newaxis]
+            np.divide(x_phist[:, :PHIST_BIN_COUNT], src_sizes_pkt_count, out=x_phist[:, :PHIST_BIN_COUNT], where=src_sizes_pkt_count != 0)
+            np.divide(x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], dst_sizes_pkt_count, out=x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count != 0)
+            np.divide(x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], src_sizes_pkt_count - 1, out=x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], where=src_sizes_pkt_count > 1)
+            np.divide(x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], dst_sizes_pkt_count - 1, out=x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count > 1)
         x_flowstats = structured_to_unstructured(drop_fields(x_flowstats, PHISTS_FEATURES), dtype="float32")
         x_flowstats = np.concatenate([x_flowstats, x_phist], axis=1)
     else:
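The new normalize_packet_histograms branch turns the raw packet-histogram counts into per-direction fractions: the size-histogram bins are divided by the number of packets in each direction, and the inter-packet-time bins by that count minus one (presumably because a flow with n packets has n - 1 gaps), while the where= guards keep all-zero histograms untouched. A small standalone sketch of the same normalization on one invented flow, assuming a hypothetical bin count of 4:

import numpy as np

PHIST_BIN_COUNT = 4  # hypothetical bin count, for illustration only
# One flow: [src size bins | dst size bins | src IPT bins | dst IPT bins]
x_phist = np.array([[6., 2., 0., 0.,   3., 3., 2., 0.,   5., 2., 0., 0.,   4., 3., 0., 0.]])

src_cnt = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]                   # 8 packets sent
dst_cnt = x_phist[:, PHIST_BIN_COUNT:2*PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]  # 8 packets received
np.divide(x_phist[:, :PHIST_BIN_COUNT], src_cnt, out=x_phist[:, :PHIST_BIN_COUNT], where=src_cnt != 0)
np.divide(x_phist[:, PHIST_BIN_COUNT:2*PHIST_BIN_COUNT], dst_cnt, out=x_phist[:, PHIST_BIN_COUNT:2*PHIST_BIN_COUNT], where=dst_cnt != 0)
np.divide(x_phist[:, 2*PHIST_BIN_COUNT:3*PHIST_BIN_COUNT], src_cnt - 1, out=x_phist[:, 2*PHIST_BIN_COUNT:3*PHIST_BIN_COUNT], where=src_cnt > 1)
np.divide(x_phist[:, 3*PHIST_BIN_COUNT:4*PHIST_BIN_COUNT], dst_cnt - 1, out=x_phist[:, 3*PHIST_BIN_COUNT:4*PHIST_BIN_COUNT], where=dst_cnt > 1)
print(x_phist)  # size-bin blocks now sum to 1; IPT blocks sum to 1 over (count - 1) gaps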
cesnet-datazoo-0.0.12/cesnet_datazoo/utils/class_info.py
ADDED
@@ -0,0 +1,50 @@
+from dataclasses import dataclass
+from typing import Any, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROVIDER_COLUMN
+
+
+@dataclass()
+class ClassInfo:
+    target_names: list[str]
+    num_classes: int
+    known_apps: list[str]
+    unknown_apps: list[str]
+    unknown_class_label: int
+    group_matrix: np.ndarray
+    has_provider: dict[str, bool]
+    provider_mapping: dict[str, str]
+    provider_members: dict[str, list[str]]
+    categories_mapping: dict[str, Optional[str]]
+
+def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> ClassInfo:
+    known_apps = sorted(known_apps_database_enum.values())
+    unknown_apps = sorted(unknown_apps_database_enum.values())
+    target_names_arr = encoder.classes_
+    assert known_apps == list(target_names_arr[:-1])
+    group_matrix = np.array([[a == b or
+                              (a in servicemap.index and b in servicemap.index and
+                               not pd.isnull(servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN]) and not pd.isnull(servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN]) and
+                               servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN] == servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN])
+                              for a in target_names_arr] for b in target_names_arr])
+    has_provider = {app: app in servicemap.index and not pd.isnull(servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN]) for app in target_names_arr}
+    provider_mapping = {app: servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN] if has_provider[app] else app for app in target_names_arr}
+    providers = sorted({provider_mapping[app] for app in target_names_arr if has_provider[app]})
+    provider_members = {p: [app for app in target_names_arr if provider_mapping[app] == p] for p in providers}
+    categories_mapping = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr}
+    return ClassInfo(
+        target_names=list(target_names_arr),
+        num_classes=len(known_apps),
+        known_apps=known_apps,
+        unknown_apps=unknown_apps,
+        unknown_class_label=len(known_apps),
+        group_matrix=group_matrix,
+        has_provider=has_provider,
+        provider_mapping=provider_mapping,
+        provider_members=provider_members,
+        categories_mapping=categories_mapping,
+    )
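create_class_info builds the provider structures from a "servicemap" table indexed by application name: has_provider flags applications with a non-null provider, provider_mapping sends each application to its provider (or to itself), and group_matrix marks pairs of classes that are identical or share a provider. The following is a simplified, self-contained re-derivation of those structures on a made-up three-application servicemap; the real code reads the column name from SERVICEMAP_PROVIDER_COLUMN and operates on the label encoder's classes, which this sketch glosses over.

import numpy as np
import pandas as pd

# Invented servicemap fragment; "provider" stands in for SERVICEMAP_PROVIDER_COLUMN.
servicemap = pd.DataFrame({"provider": ["Google", "Google", None]},
                          index=["google-search", "youtube", "zoom"])
target_names = ["google-search", "youtube", "zoom", "unknown"]  # last class is the unknown bucket

has_provider = {app: app in servicemap.index and not pd.isnull(servicemap.loc[app, "provider"])
                for app in target_names}
provider_mapping = {app: servicemap.loc[app, "provider"] if has_provider[app] else app
                    for app in target_names}
group_matrix = np.array([[a == b or (has_provider[a] and has_provider[b] and
                                     provider_mapping[a] == provider_mapping[b])
                          for a in target_names] for b in target_names])
print(provider_mapping)           # google-search and youtube both map to "Google"
print(group_matrix.astype(int))   # the google-search/youtube pair is grouped together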
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.0.
+Version: 0.0.12
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -102,8 +102,8 @@ dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
 dataset_config = DatasetConfig(
     dataset=dataset,
     apps_selection=AppSelection.ALL_KNOWN,
-
-
+    train_period_name="W-2022-44",
+    test_period_name="W-2022-45",
 )
 dataset.set_dataset_config_and_initialize(dataset_config)
 train_dataframe = dataset.get_train_df()
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/SOURCES.txt
RENAMED
@@ -19,7 +19,7 @@ cesnet_datazoo/datasets/metadata/dataset_metadata.py
 cesnet_datazoo/datasets/metadata/metadata.csv
 cesnet_datazoo/metrics/__init__.py
 cesnet_datazoo/metrics/classification_report.py
-cesnet_datazoo/metrics/
+cesnet_datazoo/metrics/provider_metrics.py
 cesnet_datazoo/pytables_data/__init__.py
 cesnet_datazoo/pytables_data/apps_split.py
 cesnet_datazoo/pytables_data/indices_setup.py
cesnet-datazoo-0.0.10/cesnet_datazoo/utils/class_info.py
DELETED
@@ -1,46 +0,0 @@
-from dataclasses import dataclass
-
-import numpy as np
-import pandas as pd
-
-from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROVIDER_COLUMN
-
-
-@dataclass()
-class ClassInfo:
-    target_names: list[str]
-    known_apps: list[str]
-    group_matrix: np.ndarray
-    superclass_members: dict[str, list[str]]
-    has_superclass: dict[str, bool]
-    superclass_mapping: dict[str, str]
-    superclass_mapping_arr: np.ndarray
-    categories_mapping: dict[str, str]
-
-    def get_num_classes(self):
-        return len(self.known_apps)
-
-def create_superclass_structures(servicemap: pd.DataFrame, target_names: list[str]) -> ClassInfo:
-    known_apps = target_names[:-1]
-    target_names_arr = np.array(target_names)
-    group_matrix = np.array([[
-                              a in servicemap.index and b in servicemap.index and
-                              not pd.isnull(servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN]) and not pd.isnull(servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN]) and
-                              servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN] == servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN]
-                              for a in target_names_arr] for b in target_names_arr])
-    has_superclass = {app: app in servicemap.index and not pd.isnull(servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN]) for app in target_names_arr}
-    superclass_mapping: dict[str, str] = {app: servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN] if has_superclass[app] else app for app in target_names_arr} # type: ignore
-    superclass_mapping_arr = np.array(list(superclass_mapping.values()))
-    superclass_members = {superclass: servicemap.loc[servicemap[SERVICEMAP_PROVIDER_COLUMN] == superclass].index.to_list()
-                          for superclass in servicemap.loc[:, SERVICEMAP_PROVIDER_COLUMN].dropna().unique()}
-    categories_mapping: dict[str, str] = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr} # type: ignore
-    return ClassInfo(
-        target_names=target_names,
-        known_apps=known_apps,
-        group_matrix=group_matrix,
-        superclass_members=superclass_members,
-        has_superclass=has_superclass,
-        superclass_mapping=superclass_mapping,
-        superclass_mapping_arr=superclass_mapping_arr,
-        categories_mapping=categories_mapping,
-    )
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/__init__.py
RENAMED
File without changes
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/dataset_metadata.py
RENAMED
File without changes
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/metadata.csv
RENAMED
File without changes
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/indices_setup.py
RENAMED
File without changes
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/dependency_links.txt
RENAMED
File without changes