PyPI - cesnet-datazoo - Versions diffs - 0.1.13__tar.gz → 0.1.15__tar.gz - Mend

cesnet-datazoo 0.1.13.tar.gz → 0.1.15.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

{cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cesnet-datazoo
-Version: 0.1.13
+Version: 0.1.15
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>

{cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/config.py RENAMED Viewed

@@ -166,7 +166,8 @@ class DatasetConfig():
         val_workers: Number of workers for loading validation data. `0` means that the data will be loaded in the main process. `Default: 1`
         batch_size: Number of samples per batch. `Default: 192`
         test_batch_size: Number of samples per batch for loading validation and test data. `Default: 2048`
-        preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: False`
+        preload_train: Whether to dump the train set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: False`
+        preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. `Default: False`
         preload_test: Whether to dump the test set with `numpy.savez_compressed` and preload it in future runs. `Default: False`
         train_size: Size of the train set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
         val_known_size: Size of the validation set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
@@ -178,6 +179,7 @@ class DatasetConfig():
         return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
         return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
+        disable_label_encoding: Whether to disable label encoding and return application names as strings. The original labels of configured unknown classes are preserved. `Default: False`
         use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
         use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
         use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
@@ -243,6 +245,7 @@ class DatasetConfig():
     val_workers: int = 1
     batch_size: int = 192
     test_batch_size: int = 2048
+    preload_train: bool = False
     preload_val: bool = False
     preload_test: bool = False
     train_size: int | Literal["all"] = "all"
@@ -255,6 +258,7 @@ class DatasetConfig():
     return_other_fields: bool = False
     return_tensors: bool = False
+    disable_label_encoding: bool = False
     use_packet_histograms: bool = False
     use_tcp_features: bool = False
     use_push_flags: bool = False

{cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/cesnet_dataset.py RENAMED Viewed

@@ -619,7 +619,10 @@ class CesnetDataset():
         encoder = LabelEncoder().fit(known_apps)
         encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
         class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps=known_apps, unknown_apps=unknown_apps)
-        encode_labels_with_unknown_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
+        if dataset_config.disable_label_encoding:
+            label_encoder_fn = None
+        else:
+            label_encoder_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
         # Create train, validation, and test datasets
         train_dataset = val_dataset = test_dataset = None
         if dataset_config.need_train_set:
@@ -638,8 +641,10 @@ class CesnetDataset():
                 ppi_transform=dataset_config.ppi_transform,
                 flowstats_transform=dataset_config.flowstats_transform,
                 flowstats_phist_transform=dataset_config.flowstats_phist_transform,
-                target_transform=encode_labels_with_unknown_fn,
-                return_tensors=dataset_config.return_tensors,)
+                target_transform=label_encoder_fn,
+                return_tensors=dataset_config.return_tensors,
+                preload=dataset_config.preload_train,
+                preload_blob=os.path.join(dataset_config._get_train_data_path(), "preload", f"train_dataset-{dataset_config.train_size}.npz"),)
         if dataset_config.need_val_set:
             assert val_data_path is not None
             val_dataset = PyTablesDataset(
@@ -657,7 +662,7 @@ class CesnetDataset():
                 ppi_transform=dataset_config.ppi_transform,
                 flowstats_transform=dataset_config.flowstats_transform,
                 flowstats_phist_transform=dataset_config.flowstats_phist_transform,
-                target_transform=encode_labels_with_unknown_fn,
+                target_transform=label_encoder_fn,
                 return_tensors=dataset_config.return_tensors,
                 preload=dataset_config.preload_val,
                 preload_blob=os.path.join(val_data_path, "preload", f"val_dataset-{dataset_config.val_known_size}.npz"),)
@@ -678,7 +683,7 @@ class CesnetDataset():
                 ppi_transform=dataset_config.ppi_transform,
                 flowstats_transform=dataset_config.flowstats_transform,
                 flowstats_phist_transform=dataset_config.flowstats_phist_transform,
-                target_transform=encode_labels_with_unknown_fn,
+                target_transform=label_encoder_fn,
                 return_tensors=dataset_config.return_tensors,
                 preload=dataset_config.preload_test,
                 preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),)

{cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/pytables_data/pytables_dataset.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import atexit
 import logging
 import os
+import sys
 import time
 import warnings
 from datetime import datetime
@@ -198,7 +199,7 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
         if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
             warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                             "To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
-            exit()
+            sys.exit()
         elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
             log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                        "Disabling these applications")

{cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cesnet-datazoo
-Version: 0.1.13
+Version: 0.1.15
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>

{cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "cesnet-datazoo"
-version = "0.1.13"
+version = "0.1.15"
 authors = [
   {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
   {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},