cesnet-datazoo-0.1.0.tar.gz → cesnet-datazoo-0.1.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/PKG-INFO +3 -6
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/README.md +2 -2
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/config.py +30 -25
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/constants.py +1 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/cesnet_dataset.py +10 -10
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/metadata/metadata.csv +1 -1
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/data_scalers.py +3 -37
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/indices_setup.py +4 -1
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/pytables_dataset.py +9 -5
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/class_info.py +0 -2
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/PKG-INFO +3 -6
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/requires.txt +0 -3
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/pyproject.toml +1 -4
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/LICENCE +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/__init__.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/__init__.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/datasets.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/datasets_constants.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/loaders.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/statistics.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/metrics/__init__.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/metrics/classification_report.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/metrics/provider_metrics.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/__init__.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/__init__.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/download.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/fileutils.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/random.py +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/SOURCES.txt +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/top_level.txt +0 -0
- {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/setup.cfg +0 -0
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.1.0
+Version: 0.1.2
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -29,10 +29,7 @@ Requires-Dist: tables>=3.8.0
 Requires-Dist: torch>=1.10
 Requires-Dist: tqdm
 Provides-Extra: dev
-Requires-Dist: black; extra == "dev"
 Requires-Dist: build; extra == "dev"
-Requires-Dist: jupyterlab; extra == "dev"
-Requires-Dist: lightgbm; extra == "dev"
 Requires-Dist: mkdocs-autorefs; extra == "dev"
 Requires-Dist: mkdocs-material-extensions; extra == "dev"
 Requires-Dist: mkdocs-material; extra == "dev"
@@ -57,7 +54,7 @@ The goal of this project is to provide tools for working with large network traf
 - Extensive configuration options for:
     - Selection of train, validation, and test periods.
     - Selection of application classes and splitting classes between *known* and *unknown*.
-    -
+    - Data transformations, such as feature scaling.
 - Built on suitable data structures for experiments with large datasets. There are several caching mechanisms to make repeated runs faster, for example, when searching for the best model configuration.
 - Datasets are offered in multiple sizes to give users an option to start the experiments at a smaller scale (also faster dataset download, disk space, etc.). The default is the `S` size containing 25 million samples.
 
@@ -72,7 +69,7 @@ The package is able to handle the following datasets:
 | _Collection duration_ | 2 weeks | 4 weeks | 1 year |
 | _Collection period_ | 4.10.2021 - 17.10.2021 | 31.10.2022 - 27.11.2022 | 1.1.2022 - 31.12.2022 |
 | _Application count_ | 191 | 102 | 180 |
-| _Available samples_ |
+| _Available samples_ | 141392195 | 153226273 | 507739073 |
 | _Available dataset sizes_ | XS, S, M, L | XS, S, M, L | XS, S, M, L |
 | _Cite_ | [https://doi.org/10.1016/j.comnet.2022.109467](https://doi.org/10.1016/j.comnet.2022.109467) | [https://doi.org/10.1016/j.dib.2023.108888](https://doi.org/10.1016/j.dib.2023.108888) | |
 | _Zenodo URL_ | [https://zenodo.org/record/7965515](https://zenodo.org/record/7965515) | [https://zenodo.org/record/7963302](https://zenodo.org/record/7963302) | |
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/README.md
RENAMED
@@ -14,7 +14,7 @@ The goal of this project is to provide tools for working with large network traf
 - Extensive configuration options for:
     - Selection of train, validation, and test periods.
     - Selection of application classes and splitting classes between *known* and *unknown*.
-    -
+    - Data transformations, such as feature scaling.
 - Built on suitable data structures for experiments with large datasets. There are several caching mechanisms to make repeated runs faster, for example, when searching for the best model configuration.
 - Datasets are offered in multiple sizes to give users an option to start the experiments at a smaller scale (also faster dataset download, disk space, etc.). The default is the `S` size containing 25 million samples.
 
@@ -29,7 +29,7 @@ The package is able to handle the following datasets:
 | _Collection duration_ | 2 weeks | 4 weeks | 1 year |
 | _Collection period_ | 4.10.2021 - 17.10.2021 | 31.10.2022 - 27.11.2022 | 1.1.2022 - 31.12.2022 |
 | _Application count_ | 191 | 102 | 180 |
-| _Available samples_ |
+| _Available samples_ | 141392195 | 153226273 | 507739073 |
 | _Available dataset sizes_ | XS, S, M, L | XS, S, M, L | XS, S, M, L |
 | _Cite_ | [https://doi.org/10.1016/j.comnet.2022.109467](https://doi.org/10.1016/j.comnet.2022.109467) | [https://doi.org/10.1016/j.dib.2023.108888](https://doi.org/10.1016/j.dib.2023.108888) | |
 | _Zenodo URL_ | [https://zenodo.org/record/7965515](https://zenodo.org/record/7965515) | [https://zenodo.org/record/7963302](https://zenodo.org/record/7963302) | |
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/config.py
RENAMED
@@ -113,7 +113,7 @@ class DatasetConfig():
 
     - Train, validation, test sets (dates, sizes, validation approach).
     - Application selection — either the standard closed-world setting (only *known* classes) or the open-world setting (*known* and *unknown* classes).
-    -
+    - Data transformations. See the [transforms][transforms] page for more information.
    - Dataloader options like batch sizes, order of loading, or number of workers.
 
     When initializing this class, pass a [`CesnetDataset`][datasets.cesnet_dataset.CesnetDataset] instance to be configured and the desired configuration. Available options are [here][config.DatasetConfig--configuration-options].
@@ -133,7 +133,7 @@ class DatasetConfig():
 
     Attributes:
         need_train_set: Use to disable the train set. `Default: True`
-        need_val_set: Use to disable the validation set.
+        need_val_set: Use to disable the validation set. `Default: True`
         need_test_set: Use to disable the test set. `Default: True`
         train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
         train_dates: Dates used for creating a train set.
@@ -161,7 +161,7 @@ class DatasetConfig():
         val_workers: Number of workers for loading validation data. `0` means that the data will be loaded in the main process. `Default: 1`
         batch_size: Number of samples per batch. `Default: 192`
         test_batch_size: Number of samples per batch for loading validation and test data. `Default: 2048`
-        preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default:
+        preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: False`
         preload_test: Whether to dump the test set with `numpy.savez_compressed` and preload it in future runs. `Default: False`
         train_size: Size of the train set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
         val_known_size: Size of the validation set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
@@ -176,10 +176,10 @@ class DatasetConfig():
         use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
         use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
         use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
-        fit_scalers_samples: Fraction of train samples used for fitting
-        ppi_transform: Transform function for PPI sequences. `Default: None`
-        flowstats_transform: Transform function for flow statistics. `Default: None`
-        flowstats_phist_transform: Transform function for packet histograms. `Default: None`
+        fit_scalers_samples: Used when scaling transformation is configured and requires fitting. Fraction of train samples used for fitting, if float. The absolute number of samples otherwise. `Default: 0.25`
+        ppi_transform: Transform function for PPI sequences. See the [transforms][transforms] page for more information. `Default: None`
+        flowstats_transform: Transform function for flow statistics. See the [transforms][transforms] page for more information. `Default: None`
+        flowstats_phist_transform: Transform function for packet histograms. See the [transforms][transforms] page for more information. `Default: None`
 
     # How to configure train, validation, and test sets
     There are three options for how to define train/validation/test dates.
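For orientation, a minimal sketch of how the options documented above fit together. The `CesnetQUIC22` class and its constructor arguments follow the package README; the field values simply restate the 0.1.2 defaults shown in the hunk, so treat this as illustration rather than the canonical API.

```python
# Hedged sketch, not taken from the diff itself.
from cesnet_datazoo.config import DatasetConfig
from cesnet_datazoo.datasets import CesnetQUIC22

dataset = CesnetQUIC22("datasets/CESNET-QUIC22", size="XS")
config = DatasetConfig(
    dataset=dataset,
    fit_scalers_samples=0.25,  # float: fraction of train samples; int: absolute count
    ppi_transform=None,        # see the transforms page; None disables scaling
    preload_val=False,         # default made explicit in 0.1.2
)
```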
@@ -238,7 +238,7 @@ class DatasetConfig():
     val_workers: int = 1
     batch_size: int = 192
     test_batch_size: int = 2048
-    preload_val: bool =
+    preload_val: bool = False
     preload_test: bool = False
     train_size: int | Literal["all"] = "all"
     val_known_size: int | Literal["all"] = "all"
@@ -268,7 +268,6 @@ class DatasetConfig():
         self.database_path = dataset.database_path
 
         if not self.need_train_set:
-            self.need_val_set = False
             if self.apps_selection != AppSelection.FIXED:
                 raise ValueError("Application selection has to be fixed when need_train_set is false")
             if (len(self.train_dates) > 0 or self.train_period_name != ""):
@@ -281,7 +280,7 @@ class DatasetConfig():
             if self.train_period_name not in dataset.time_periods:
                 raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
             self.train_dates = dataset.time_periods[self.train_period_name]
-        if len(self.train_dates) == 0 and self.
+        if len(self.train_dates) == 0 and self.train_period_name == "":
             self.train_period_name = dataset.default_train_period_name
             self.train_dates = dataset.time_periods[dataset.default_train_period_name]
         # Configure test dates
@@ -299,17 +298,24 @@ class DatasetConfig():
             self.test_period_name = dataset.default_test_period_name
             self.test_dates = dataset.time_periods[dataset.default_test_period_name]
         # Configure val dates
-        if
-
-
-
-
-
-
-
-
-
-
+        if not self.need_val_set:
+            if len(self.val_dates) > 0 or self.val_period_name != "" or self.val_approach != ValidationApproach.SPLIT_FROM_TRAIN:
+                raise ValueError("val_dates, val_period_name, and val_approach cannot be specified when need_val_set is false")
+        else:
+            if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
+                if len(self.val_dates) > 0 or self.val_period_name != "":
+                    raise ValueError("val_dates and val_period_name cannot be specified when the validation approach is split-from-train")
+                if not self.need_train_set:
+                    raise ValueError("Cannot use the split-from-train validation approach when need_train_set is false. Either use the validation-dates approach or set need_val_set to false.")
+            elif self.val_approach == ValidationApproach.VALIDATION_DATES:
+                if len(self.val_dates) > 0 and self.val_period_name == "":
+                    raise ValueError("val_period_name has to be specified when val_dates are set")
+                if len(self.val_dates) == 0 and self.val_period_name != "":
+                    if self.val_period_name not in dataset.time_periods:
+                        raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
+                    self.val_dates = dataset.time_periods[self.val_period_name]
+                if len(self.val_dates) == 0 and self.val_period_name == "":
+                    raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when the validation approach is validation-dates")
         # Check if train, val, and test dates are available in the dataset
         bad_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
         bad_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
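The reworked block above distinguishes two validation approaches. A short sketch of what each expects; the enum and field names come from the hunk, while `dataset` (as in the earlier sketch) and the period name are illustrative assumptions:

```python
from cesnet_datazoo.config import DatasetConfig, ValidationApproach

# Default: the validation set is split off the train data, so val_dates and
# val_period_name must stay empty and a train set is required.
config_a = DatasetConfig(dataset=dataset, val_approach=ValidationApproach.SPLIT_FROM_TRAIN)

# Dedicated validation dates: the period is resolved via dataset.time_periods.
config_b = DatasetConfig(dataset=dataset,
                         val_approach=ValidationApproach.VALIDATION_DATES,
                         val_period_name="W-47-2022")  # hypothetical period name
```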
@@ -326,12 +332,11 @@ class DatasetConfig():
         # Check time order of train, val, and test periods
         train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
         test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
-        if len(train_dates) > 0 and len(test_dates) > 0
+        if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
             warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
         if self.val_approach == ValidationApproach.VALIDATION_DATES:
-            # Train dates are guaranteed to be set
             val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
-            if min(val_dates) <= max(train_dates):
+            if len(train_dates) > 0 and min(val_dates) <= max(train_dates):
                 warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
             if len(test_dates) > 0 and min(test_dates) <= max(val_dates):
                 warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
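The tightened guard in isolation, with made-up dates. The point of the change is that min/max are only compared when both date lists are non-empty:

```python
from datetime import datetime

train_dates = [datetime.strptime(d, "%Y%m%d").date() for d in ("20221031", "20221106")]
test_dates = [datetime.strptime(d, "%Y%m%d").date() for d in ("20221101",)]
# Short-circuits on empty lists before calling min()/max():
if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
    print("overlap: DatasetConfig would emit the evaluation warning")
```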
@@ -475,7 +480,7 @@ class DatasetConfig():
 
     def _get_val_tables_paths(self) -> list[str]:
         if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
-            return
+            return self._get_train_tables_paths()
         return list(map(lambda t: f"/flows/D{t}", self.val_dates))
 
     def _get_test_tables_paths(self) -> list[str]:
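The path scheme used here, restated standalone: each date maps to a PyTables node `/flows/D<date>`, and split-from-train now reuses the train tables instead of returning nothing. Dates are illustrative:

```python
val_dates = ["20221114", "20221115"]
val_tables_paths = [f"/flows/D{d}" for d in val_dates]
assert val_tables_paths == ["/flows/D20221114", "/flows/D20221115"]
```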
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/cesnet_dataset.py
RENAMED
@@ -28,7 +28,7 @@ from cesnet_datazoo.pytables_data.indices_setup import (IndicesTuple, compute_kn
                                                         date_weight_sample_train_indices,
                                                         init_or_load_test_indices,
                                                         init_or_load_train_indices,
-                                                        init_or_load_val_indices,
+                                                        init_or_load_val_indices, no_indices,
                                                         subset_and_sort_indices)
 from cesnet_datazoo.pytables_data.pytables_dataset import PyTablesDataset, worker_init_fn
 from cesnet_datazoo.utils.class_info import ClassInfo, create_class_info
@@ -537,10 +537,10 @@ class CesnetDataset():
                 raise ValueError(f"Requested number of samples for weight sampling ({num_samples}) is larger than the number of available train samples ({len(train_indices)})")
             train_indices = date_weight_sample_train_indices(dataset_config=dataset_config, train_indices=train_indices, num_samples=num_samples)
         elif dataset_config.apps_selection == AppSelection.FIXED:
-            known_apps = dataset_config.apps_selection_fixed_known
-            unknown_apps = dataset_config.apps_selection_fixed_unknown
-            train_indices =
-            train_unknown_indices =
+            known_apps = sorted(dataset_config.apps_selection_fixed_known)
+            unknown_apps = sorted(dataset_config.apps_selection_fixed_unknown)
+            train_indices = no_indices()
+            train_unknown_indices = no_indices()
         else:
             raise ValueError("Either need train set or the fixed application selection")
         # Initialize validation set
@@ -577,8 +577,8 @@ class CesnetDataset():
                 test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,
                 stratify=train_labels, shuffle=True, random_state=train_val_rng)
         else:
-            val_known_indices =
-            val_unknown_indices =
+            val_known_indices = no_indices()
+            val_unknown_indices = no_indices()
             val_data_path = None
         # Initialize test set
         if dataset_config.need_test_set:
@@ -588,8 +588,8 @@ class CesnetDataset():
                 tables_app_enum=self._tables_app_enum,
                 disable_indices_cache=disable_indices_cache,)
         else:
-            test_known_indices =
-            test_unknown_indices =
+            test_known_indices = no_indices()
+            test_unknown_indices = no_indices()
             test_data_path = None
         # Fit scalers if needed
         if (dataset_config.ppi_transform is not None and dataset_config.ppi_transform.needs_fitting or
@@ -636,7 +636,7 @@ class CesnetDataset():
         assert val_data_path is not None
         val_dataset = PyTablesDataset(
             database_path=dataset_config.database_path,
-            tables_paths=dataset_config.
+            tables_paths=dataset_config._get_val_tables_paths(),
             indices=dataset_indices.val_known_indices,
             tables_app_enum=self._tables_app_enum,
             tables_cat_enum=self._tables_cat_enum,
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/metadata/metadata.csv
RENAMED
@@ -1,4 +1,4 @@
 Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic classes,PPI features,Flowstats features,Flowstats features boolean,Packet histograms,TCP features,Other fields,Cite,Zenodo URL,Related papers
-CESNET-TLS22,TLS,2022,2021,2 weeks,
+CESNET-TLS22,TLS,2022,2021,2 weeks,141392195,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION",,,"FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
 CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
 CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/data_scalers.py
RENAMED
@@ -17,18 +17,6 @@ from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_gene
 log = logging.getLogger(__name__)
 
 
-def get_scaler_attrs(scaler: StandardScaler | RobustScaler | MinMaxScaler) -> dict[str, list[float]]:
-    if isinstance(scaler, StandardScaler):
-        assert hasattr(scaler, "mean_") and scaler.mean_ is not None and hasattr(scaler, "scale_") and scaler.scale_ is not None
-        scaler_attrs = {"mean_": scaler.mean_.tolist(), "scale_": scaler.scale_.tolist()}
-    elif isinstance(scaler, RobustScaler):
-        assert hasattr(scaler, "center_") and hasattr(scaler, "scale_")
-        scaler_attrs = {"center_": scaler.center_.tolist(), "scale_": scaler.scale_.tolist()}
-    elif isinstance(scaler, MinMaxScaler):
-        assert hasattr(scaler, "min_") and hasattr(scaler, "scale_")
-        scaler_attrs = {"min_": scaler.min_.tolist(), "scale_": scaler.scale_.tolist()}
-    return scaler_attrs
-
 def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> None:
     # Define indices for fitting scalers
     if isinstance(dataset_config.fit_scalers_samples, int) and dataset_config.fit_scalers_samples > len(train_indices):
@@ -48,6 +36,7 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non
 
     clip_and_scale_ppi_transform = dataset_config.ppi_transform # TODO Fix after transforms composing is implemented
     clip_and_scale_flowstats_transform = dataset_config.flowstats_transform
+    train_data_path = dataset_config._get_train_data_path()
 
     # Fit the ClipAndScalePPI transform
     if clip_and_scale_ppi_transform is not None and clip_and_scale_ppi_transform.needs_fitting:
@@ -70,6 +59,7 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non
             train_psizes = np.concatenate((train_psizes, [0]))
         clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
         clip_and_scale_ppi_transform.needs_fitting = False
+        json.dump(clip_and_scale_ppi_transform.to_dict(), open(os.path.join(train_data_path, "transforms", "ppi-transform.json"), "w"), indent=4)
 
     # Fit the ClipAndScaleFlowstats transform
     if clip_and_scale_flowstats_transform is not None and clip_and_scale_flowstats_transform.needs_fitting:
@@ -82,29 +72,5 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non
         clip_and_scale_flowstats_transform.flowstats_scaler.fit(train_flowstats)
         clip_and_scale_flowstats_transform.flowstats_quantiles = flowstats_quantiles.tolist()
         clip_and_scale_flowstats_transform.needs_fitting = False
-
+        json.dump(clip_and_scale_flowstats_transform.to_dict(), open(os.path.join(train_data_path, "transforms", "flowstats-transform.json"), "w"), indent=4)
     log.info(f"Reading data and fitting scalers took {time.time() - start_time:.2f} seconds")
-    train_data_path = dataset_config._get_train_data_path()
-    if clip_and_scale_ppi_transform is not None:
-        ppi_transform_path = os.path.join(train_data_path, "transforms", "ppi-transform.json")
-        ppi_transform_dict = {
-            "psizes_scaler_enum": str(clip_and_scale_ppi_transform._psizes_scaler_enum),
-            "psizes_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.psizes_scaler),
-            "pszies_min": clip_and_scale_ppi_transform.pszies_min,
-            "psizes_max": clip_and_scale_ppi_transform.psizes_max,
-            "ipt_scaler_enum": str(clip_and_scale_ppi_transform._ipt_scaler_enum),
-            "ipt_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.ipt_scaler),
-            "ipt_min": clip_and_scale_ppi_transform.ipt_min,
-            "ipt_max": clip_and_scale_ppi_transform.ipt_max,
-        }
-        json.dump(ppi_transform_dict, open(ppi_transform_path, "w"), indent=4)
-    if clip_and_scale_flowstats_transform is not None:
-        assert clip_and_scale_flowstats_transform.flowstats_quantiles is not None
-        flowstats_transform_path = os.path.join(train_data_path, "transforms", "flowstats-transform.json")
-        flowstats_transform_dict = {
-            "flowstats_scaler_enum": str(clip_and_scale_flowstats_transform._flowstats_scaler_enum),
-            "flowstats_scaler_attrs": get_scaler_attrs(clip_and_scale_flowstats_transform.flowstats_scaler),
-            "flowstats_quantiles": clip_and_scale_flowstats_transform.flowstats_quantiles,
-            "quantile_clip": clip_and_scale_flowstats_transform.quantile_clip,
-        }
-        json.dump(flowstats_transform_dict, open(flowstats_transform_path, "w"), indent=4)
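The deleted get_scaler_attrs() helper and the hand-built dicts are replaced by each transform's to_dict(). What the helper did for a StandardScaler, as a standalone round trip:

```python
import json
import numpy as np
from sklearn.preprocessing import StandardScaler

# Fitted scaler attributes become plain lists so they survive a JSON dump,
# exactly as the removed helper did for the StandardScaler case.
scaler = StandardScaler().fit(np.array([[1.0], [2.0], [3.0]]))
attrs = {"mean_": scaler.mean_.tolist(), "scale_": scaler.scale_.tolist()}
print(json.dumps(attrs))  # {"mean_": [2.0], "scale_": [0.8164...]}
```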
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/indices_setup.py
RENAMED
@@ -132,7 +132,7 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str
         np.save(os.path.join(val_data_path, "val_known_indices.npy"), val_known_indices)
         np.save(os.path.join(val_data_path, "val_unknown_indices.npy"), val_unknown_indices)
     else:
-        val_known_indices = np.load(os.path.join(val_data_path, "val_known_indices.
+        val_known_indices = np.load(os.path.join(val_data_path, "val_known_indices.npy"))
         val_unknown_indices = np.load(os.path.join(val_data_path, "val_unknown_indices.npy"))
     return val_known_indices, val_unknown_indices, val_data_path
 
@@ -162,3 +162,6 @@ def init_train_data(train_data_path: str):
 def init_test_data(test_data_path: str):
     os.makedirs(test_data_path, exist_ok=True)
     os.makedirs(os.path.join(test_data_path, "preload"), exist_ok=True)
+
+def no_indices() -> np.ndarray:
+    return np.zeros((0,3), dtype=np.int64)
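The new no_indices() helper gives callers an empty array with the same column layout as real index arrays, so disabled splits need no special casing. A usage sketch; that the three columns correspond to table and row positions (INDICES_TABLE_POS, INDICES_INDEX_POS) is an inference, not stated in the diff:

```python
import numpy as np

def no_indices() -> np.ndarray:
    return np.zeros((0, 3), dtype=np.int64)

test_known_indices = no_indices()  # e.g. when need_test_set is false
assert test_known_indices.shape == (0, 3) and len(test_known_indices) == 0
```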
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/pytables_dataset.py
RENAMED
@@ -16,7 +16,8 @@ from typing_extensions import assert_never
 
 from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
                                    TrainDataParams)
-from cesnet_datazoo.constants import APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN
+from cesnet_datazoo.constants import (APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN,
+                                      QUIC_SNI_COLUMN, TLS_SNI_COLUMN)
 from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
                                                      split_apps_topx_with_provider_groups)
 
@@ -66,6 +67,7 @@ class PyTablesDataset(Dataset):
         self.target_transform = target_transform
         self.return_tensors = return_tensors
         self.return_all_fields = return_all_fields
+        self.sni_column = TLS_SNI_COLUMN if TLS_SNI_COLUMN in self.other_fields else QUIC_SNI_COLUMN if QUIC_SNI_COLUMN in self.other_fields else None
 
         self.preload = preload
         self.preload_blob = preload_blob
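The added sni_column line, unrolled for readability. The real string values of the two constants live in constants.py; "TLS_SNI" and "QUIC_SNI" are assumed here for illustration:

```python
TLS_SNI_COLUMN, QUIC_SNI_COLUMN = "TLS_SNI", "QUIC_SNI"  # assumed values
other_fields = ["ID", "QUIC_SNI", "QUIC_USERAGENT"]
# Prefer the TLS SNI column, fall back to the QUIC one, else no SNI available.
sni_column = (TLS_SNI_COLUMN if TLS_SNI_COLUMN in other_fields
              else QUIC_SNI_COLUMN if QUIC_SNI_COLUMN in other_fields
              else None)
assert sni_column == "QUIC_SNI"
```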
@@ -179,7 +181,7 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     start_time = time.time()
     for i, table_path in enumerate(train_data_params.train_tables_paths):
         all_app_labels[i] = train_tables[i].read(field=APP_COLUMN)
-        log.info(f"Reading app column for
+        log.info(f"Reading app column for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
         app_counts = app_counts.add(pd.Series(all_app_labels[i]).value_counts(), fill_value=0)
     database.close()
     # Handle disabled apps and apps with less than min_samples_per_app samples
@@ -223,13 +225,15 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     else:
         known_apps = train_data_params.apps_selection_fixed_known
         unknown_apps = train_data_params.apps_selection_fixed_unknown
+    known_apps = sorted(known_apps)
+    unknown_apps = sorted(unknown_apps)
     known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
     unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]
 
     train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(train_known_indices)
     rng.shuffle(train_unknown_indices)
-    log.info(f"Processing
+    log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
     return train_known_indices, train_unknown_indices, known_apps, unknown_apps
 
 def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
|
@@ -240,7 +244,7 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, tabl
|
|
240
244
|
start_time = time.time()
|
241
245
|
for i, table_path in enumerate(test_data_params.test_tables_paths):
|
242
246
|
base_labels[i] = test_tables[i].read(field=APP_COLUMN)
|
243
|
-
log.info(f"Reading app column for
|
247
|
+
log.info(f"Reading app column for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
|
244
248
|
base_indices[i] = np.arange(len(test_tables[i]))
|
245
249
|
database.close()
|
246
250
|
known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
|
@@ -248,7 +252,7 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, tabl
     test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(test_known_indices)
     rng.shuffle(test_unknown_indices)
-    log.info(f"Processing
+    log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
     return test_known_indices, test_unknown_indices
 
 def load_database(database_path: str, tables_paths: Optional[list[str]] = None, mode: str = "r") -> tuple[tb.File, dict[int, Any]]: # dict[int, tb.Table]
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/class_info.py
RENAMED
@@ -23,8 +23,6 @@ class ClassInfo:
     categories_mapping: dict[str, Optional[str]]
 
 def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps: list[str], unknown_apps: list[str]) -> ClassInfo:
-    known_apps = sorted(known_apps)
-    unknown_apps = sorted(unknown_apps)
     target_names_arr = encoder.classes_
     assert known_apps == list(target_names_arr[:-1])
     group_matrix = np.array([[a == b or
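Sorting was removed here because the known/unknown app lists are now sorted once at selection time (see the cesnet_dataset.py and pytables_dataset.py hunks), so the class order is already deterministic when create_class_info() asserts it:

```python
# App names are made up; the assert mirrors the check kept in create_class_info().
known_apps = sorted(["spotify", "discord", "bittorrent"])
assert known_apps == ["bittorrent", "discord", "spotify"]
# encoder.classes_ is then expected to equal known_apps plus a trailing unknown class.
```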
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.1.0
+Version: 0.1.2
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -29,10 +29,7 @@ Requires-Dist: tables>=3.8.0
 Requires-Dist: torch>=1.10
 Requires-Dist: tqdm
 Provides-Extra: dev
-Requires-Dist: black; extra == "dev"
 Requires-Dist: build; extra == "dev"
-Requires-Dist: jupyterlab; extra == "dev"
-Requires-Dist: lightgbm; extra == "dev"
 Requires-Dist: mkdocs-autorefs; extra == "dev"
 Requires-Dist: mkdocs-material-extensions; extra == "dev"
 Requires-Dist: mkdocs-material; extra == "dev"
@@ -57,7 +54,7 @@ The goal of this project is to provide tools for working with large network traf
 - Extensive configuration options for:
     - Selection of train, validation, and test periods.
     - Selection of application classes and splitting classes between *known* and *unknown*.
-    -
+    - Data transformations, such as feature scaling.
 - Built on suitable data structures for experiments with large datasets. There are several caching mechanisms to make repeated runs faster, for example, when searching for the best model configuration.
 - Datasets are offered in multiple sizes to give users an option to start the experiments at a smaller scale (also faster dataset download, disk space, etc.). The default is the `S` size containing 25 million samples.
 
@@ -72,7 +69,7 @@ The package is able to handle the following datasets:
 | _Collection duration_ | 2 weeks | 4 weeks | 1 year |
 | _Collection period_ | 4.10.2021 - 17.10.2021 | 31.10.2022 - 27.11.2022 | 1.1.2022 - 31.12.2022 |
 | _Application count_ | 191 | 102 | 180 |
-| _Available samples_ |
+| _Available samples_ | 141392195 | 153226273 | 507739073 |
 | _Available dataset sizes_ | XS, S, M, L | XS, S, M, L | XS, S, M, L |
 | _Cite_ | [https://doi.org/10.1016/j.comnet.2022.109467](https://doi.org/10.1016/j.comnet.2022.109467) | [https://doi.org/10.1016/j.dib.2023.108888](https://doi.org/10.1016/j.dib.2023.108888) | |
 | _Zenodo URL_ | [https://zenodo.org/record/7965515](https://zenodo.org/record/7965515) | [https://zenodo.org/record/7963302](https://zenodo.org/record/7963302) | |
{cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/pyproject.toml
RENAMED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cesnet-datazoo"
-version = "0.1.0"
+version = "0.1.2"
 authors = [
     {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
     {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},
@@ -45,10 +45,7 @@ dependencies = [
 
 [project.optional-dependencies]
 dev = [
-    "black",
     "build",
-    "jupyterlab",
-    "lightgbm",
     "mkdocs-autorefs",
     "mkdocs-material-extensions",
     "mkdocs-material",