cesnet-datazoo 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cesnet_datazoo/config.py +5 -1
- cesnet_datazoo/datasets/cesnet_dataset.py +10 -5
- cesnet_datazoo/pytables_data/pytables_dataset.py +2 -1
- {cesnet_datazoo-0.1.13.dist-info → cesnet_datazoo-0.1.15.dist-info}/METADATA +1 -1
- {cesnet_datazoo-0.1.13.dist-info → cesnet_datazoo-0.1.15.dist-info}/RECORD +8 -8
- {cesnet_datazoo-0.1.13.dist-info → cesnet_datazoo-0.1.15.dist-info}/WHEEL +0 -0
- {cesnet_datazoo-0.1.13.dist-info → cesnet_datazoo-0.1.15.dist-info}/licenses/LICENCE +0 -0
- {cesnet_datazoo-0.1.13.dist-info → cesnet_datazoo-0.1.15.dist-info}/top_level.txt +0 -0
cesnet_datazoo/config.py
CHANGED
@@ -166,7 +166,8 @@ class DatasetConfig():
|
|
166
166
|
val_workers: Number of workers for loading validation data. `0` means that the data will be loaded in the main process. `Default: 1`
|
167
167
|
batch_size: Number of samples per batch. `Default: 192`
|
168
168
|
test_batch_size: Number of samples per batch for loading validation and test data. `Default: 2048`
|
169
|
-
|
169
|
+
preload_train: Whether to dump the train set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: False`
|
170
|
+
preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. `Default: False`
|
170
171
|
preload_test: Whether to dump the test set with `numpy.savez_compressed` and preload it in future runs. `Default: False`
|
171
172
|
train_size: Size of the train set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
|
172
173
|
val_known_size: Size of the validation set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
|
@@ -178,6 +179,7 @@ class DatasetConfig():
|
|
178
179
|
|
179
180
|
return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
|
180
181
|
return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
|
182
|
+
disable_label_encoding: Whether to disable label encoding and return application names as strings. The original labels of configured unknown classes are preserved. `Default: False`
|
181
183
|
use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: False`
|
182
184
|
use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: False`
|
183
185
|
use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
|
@@ -243,6 +245,7 @@ class DatasetConfig():
|
|
243
245
|
val_workers: int = 1
|
244
246
|
batch_size: int = 192
|
245
247
|
test_batch_size: int = 2048
|
248
|
+
preload_train: bool = False
|
246
249
|
preload_val: bool = False
|
247
250
|
preload_test: bool = False
|
248
251
|
train_size: int | Literal["all"] = "all"
|
@@ -255,6 +258,7 @@ class DatasetConfig():
|
|
255
258
|
|
256
259
|
return_other_fields: bool = False
|
257
260
|
return_tensors: bool = False
|
261
|
+
disable_label_encoding: bool = False
|
258
262
|
use_packet_histograms: bool = False
|
259
263
|
use_tcp_features: bool = False
|
260
264
|
use_push_flags: bool = False
|
cesnet_datazoo/datasets/cesnet_dataset.py
CHANGED
@@ -619,7 +619,10 @@ class CesnetDataset():
|
|
619
619
|
encoder = LabelEncoder().fit(known_apps)
|
620
620
|
encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
|
621
621
|
class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps=known_apps, unknown_apps=unknown_apps)
|
622
|
-
|
622
|
+
if dataset_config.disable_label_encoding:
|
623
|
+
label_encoder_fn = None
|
624
|
+
else:
|
625
|
+
label_encoder_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
|
623
626
|
# Create train, validation, and test datasets
|
624
627
|
train_dataset = val_dataset = test_dataset = None
|
625
628
|
if dataset_config.need_train_set:
|
@@ -638,8 +641,10 @@ class CesnetDataset():
|
|
638
641
|
ppi_transform=dataset_config.ppi_transform,
|
639
642
|
flowstats_transform=dataset_config.flowstats_transform,
|
640
643
|
flowstats_phist_transform=dataset_config.flowstats_phist_transform,
|
641
|
-
target_transform=
|
642
|
-
return_tensors=dataset_config.return_tensors,
|
644
|
+
target_transform=label_encoder_fn,
|
645
|
+
return_tensors=dataset_config.return_tensors,
|
646
|
+
preload=dataset_config.preload_train,
|
647
|
+
preload_blob=os.path.join(dataset_config._get_train_data_path(), "preload", f"train_dataset-{dataset_config.train_size}.npz"),)
|
643
648
|
if dataset_config.need_val_set:
|
644
649
|
assert val_data_path is not None
|
645
650
|
val_dataset = PyTablesDataset(
|
@@ -657,7 +662,7 @@ class CesnetDataset():
|
|
657
662
|
ppi_transform=dataset_config.ppi_transform,
|
658
663
|
flowstats_transform=dataset_config.flowstats_transform,
|
659
664
|
flowstats_phist_transform=dataset_config.flowstats_phist_transform,
|
660
|
-
target_transform=
|
665
|
+
target_transform=label_encoder_fn,
|
661
666
|
return_tensors=dataset_config.return_tensors,
|
662
667
|
preload=dataset_config.preload_val,
|
663
668
|
preload_blob=os.path.join(val_data_path, "preload", f"val_dataset-{dataset_config.val_known_size}.npz"),)
|
@@ -678,7 +683,7 @@ class CesnetDataset():
|
|
678
683
|
ppi_transform=dataset_config.ppi_transform,
|
679
684
|
flowstats_transform=dataset_config.flowstats_transform,
|
680
685
|
flowstats_phist_transform=dataset_config.flowstats_phist_transform,
|
681
|
-
target_transform=
|
686
|
+
target_transform=label_encoder_fn,
|
682
687
|
return_tensors=dataset_config.return_tensors,
|
683
688
|
preload=dataset_config.preload_test,
|
684
689
|
preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),)
|
cesnet_datazoo/pytables_data/pytables_dataset.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import atexit
|
2
2
|
import logging
|
3
3
|
import os
|
4
|
+
import sys
|
4
5
|
import time
|
5
6
|
import warnings
|
6
7
|
from datetime import datetime
|
@@ -198,7 +199,7 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
|
|
198
199
|
if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
|
199
200
|
warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
|
200
201
|
"To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
|
201
|
-
exit()
|
202
|
+
sys.exit()
|
202
203
|
elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
|
203
204
|
log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
|
204
205
|
"Disabling these applications")
|
{cesnet_datazoo-0.1.13.dist-info → cesnet_datazoo-0.1.15.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cesnet-datazoo
|
3
|
-
Version: 0.1.13
|
3
|
+
Version: 0.1.15
|
4
4
|
Summary: A toolkit for large network traffic datasets
|
5
5
|
Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
6
6
|
Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
{cesnet_datazoo-0.1.13.dist-info → cesnet_datazoo-0.1.15.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
cesnet_datazoo/config.py,sha256=
|
2
|
+
cesnet_datazoo/config.py,sha256=OEVdJQP9p7RYIgHxbXnwKsLZ85xJzlsA1CCoJ8SO4Ps,38933
|
3
3
|
cesnet_datazoo/constants.py,sha256=6GhcIyjVnWYrVnxRgTlGuiWRtvwZL1KqyzMJS26ge2E,1481
|
4
4
|
cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
|
5
|
-
cesnet_datazoo/datasets/cesnet_dataset.py,sha256=
|
5
|
+
cesnet_datazoo/datasets/cesnet_dataset.py,sha256=zP97SwDHHvpRsFVP8kdOntV9tNkXYX_YpYpy1qLt8Zc,48016
|
6
6
|
cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
|
7
7
|
cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
|
8
8
|
cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
|
@@ -17,14 +17,14 @@ cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
17
17
|
cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
|
18
18
|
cesnet_datazoo/pytables_data/data_scalers.py,sha256=xPL0SCLByDOgKv1Apqi5XQd501mIfsF8FdonmRQ0zzQ,5236
|
19
19
|
cesnet_datazoo/pytables_data/indices_setup.py,sha256=yCYWjkCPIj0en3btnC-C7cte0CqbqMZzOnaVR9jaNes,13717
|
20
|
-
cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=
|
20
|
+
cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=Xk5BwKoQPewqL1gj5-EuiA9HfhJPUsYs7lOsG3CEzlQ,19447
|
21
21
|
cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
cesnet_datazoo/utils/class_info.py,sha256=H5UgyRqXIepBJmkLQ1gAIXV4owKSoIllguRiqFTu5XU,2462
|
23
23
|
cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
|
24
24
|
cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
|
25
25
|
cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
|
26
|
-
cesnet_datazoo-0.1.
|
27
|
-
cesnet_datazoo-0.1.
|
28
|
-
cesnet_datazoo-0.1.
|
29
|
-
cesnet_datazoo-0.1.
|
30
|
-
cesnet_datazoo-0.1.
|
26
|
+
cesnet_datazoo-0.1.15.dist-info/licenses/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
|
27
|
+
cesnet_datazoo-0.1.15.dist-info/METADATA,sha256=iVq2zImXujDGrI8Jca4cVpUY0LqWrBv7oGKVOVTJZ8k,12574
|
28
|
+
cesnet_datazoo-0.1.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
29
|
+
cesnet_datazoo-0.1.15.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
|
30
|
+
cesnet_datazoo-0.1.15.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|