cesnet-datazoo 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- cesnet_datazoo/config.py +11 -9
- cesnet_datazoo/datasets/cesnet_dataset.py +9 -6
- cesnet_datazoo/pytables_data/indices_setup.py +2 -2
- cesnet_datazoo/pytables_data/pytables_dataset.py +2 -1
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/METADATA +1 -1
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/RECORD +9 -9
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/WHEEL +0 -0
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/licenses/LICENCE +0 -0
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/top_level.txt +0 -0
cesnet_datazoo/config.py
CHANGED
@@ -142,7 +142,7 @@ class DatasetConfig():
         need_test_set: Use to disable the test set. `Default: True`
         train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
         train_dates: Dates used for creating a train set.
-
+        train_dates_weights: To use a non-uniform distribution of samples across train dates.
         val_approach: How a validation set should be created. Either split train data into train and validation or have a separate validation period. `Default: SPLIT_FROM_TRAIN`
         train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
         val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
@@ -178,6 +178,7 @@ class DatasetConfig():

         return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
         return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
+        disable_label_encoding: Whether to disable label encoding and return application names as strings. The original labels of configured unknown classes are preserved. `Default: False`
         use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
         use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
         use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
@@ -219,7 +220,7 @@ class DatasetConfig():
     need_test_set: bool = True
     train_period_name: str = ""
     train_dates: list[str] = field(default_factory=list)
-
+    train_dates_weights: Optional[list[int]] = None
     val_approach: ValidationApproach = ValidationApproach.SPLIT_FROM_TRAIN
     train_val_split_fraction: float = 0.2
     val_period_name: str = ""
@@ -255,6 +256,7 @@ class DatasetConfig():

     return_other_fields: bool = False
     return_tensors: bool = False
+    disable_label_encoding: bool = False
     use_packet_histograms: bool = False
     use_tcp_features: bool = False
     use_push_flags: bool = False
@@ -369,16 +371,16 @@ class DatasetConfig():
             raise ValueError("QUIC datasets do not support use_tcp_features")
         if self.use_push_flags:
             raise ValueError("QUIC datasets do not support use_push_flags")
-        # When
-        if self.
+        # When train_dates_weights are used, train_size and val_known_size have to be specified
+        if self.train_dates_weights is not None:
            if not self.need_train_set:
-                raise ValueError("
-            if len(self.
-                raise ValueError("
+                raise ValueError("train_dates_weights cannot be specified when need_train_set is false")
+            if len(self.train_dates_weights) != len(self.train_dates):
+                raise ValueError("train_dates_weights has to have the same length as train_dates")
             if self.train_size == "all":
-                raise ValueError("train_size cannot be 'all' when
+                raise ValueError("train_size cannot be 'all' when train_dates_weights are speficied")
             if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN and self.val_known_size == "all":
-                raise ValueError("val_known_size cannot be 'all' when
+                raise ValueError("val_known_size cannot be 'all' when train_dates_weights are speficied and validation_approach is split-from-train")
         # App selection
         if self.apps_selection == AppSelection.ALL_KNOWN:
             self.val_unknown_size = 0
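The new `train_dates_weights` option pairs one integer weight with each entry of `train_dates`, and the validation added above requires explicit sizes when it is used: `train_size` cannot be `"all"`, and `val_known_size` cannot be `"all"` under the split-from-train validation approach. A minimal sketch of a configuration using the new options follows; the field values are illustrative, any `DatasetConfig` arguments not touched by this diff (such as the dataset reference itself) are omitted, and it assumes `ValidationApproach` is importable alongside `DatasetConfig` from `cesnet_datazoo.config`.

```python
from cesnet_datazoo.config import DatasetConfig, ValidationApproach

config = DatasetConfig(
    # Required arguments not shown in this diff are omitted here.
    train_period_name="example-train-period",    # hypothetical period name
    train_dates=["date-1", "date-2", "date-3"],  # hypothetical date strings
    train_dates_weights=[1, 1, 2],               # one integer weight per train date
    train_size=100_000,                          # must not be "all" when weights are used
    val_approach=ValidationApproach.SPLIT_FROM_TRAIN,
    val_known_size=20_000,                       # also has to be explicit with split-from-train
    disable_label_encoding=True,                 # return application names as strings
)
```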
cesnet_datazoo/datasets/cesnet_dataset.py
CHANGED
@@ -532,7 +532,7 @@ class CesnetDataset():
             servicemap=servicemap,
             disable_indices_cache=disable_indices_cache,)
         # Date weight sampling of train indices
-        if dataset_config.
+        if dataset_config.train_dates_weights is not None:
             assert dataset_config.train_size != "all"
             if dataset_config.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
                 # requested number of samples is train_size + val_known_size when using the split-from-train validation approach
@@ -563,7 +563,7 @@ class CesnetDataset():
             val_data_path = dataset_config._get_train_data_path()
             val_unknown_indices = train_unknown_indices
             train_labels = train_indices[INDICES_APP_FIELD]
-            if dataset_config.
+            if dataset_config.train_dates_weights is not None:
                 assert dataset_config.val_known_size != "all"
                 # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to no enough samples in some train dates
                 if dataset_config.val_known_size > len(train_indices):
@@ -619,7 +619,10 @@ class CesnetDataset():
         encoder = LabelEncoder().fit(known_apps)
         encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
         class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps=known_apps, unknown_apps=unknown_apps)
-
+        if dataset_config.disable_label_encoding:
+            label_encoder_fn = None
+        else:
+            label_encoder_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
         # Create train, validation, and test datasets
         train_dataset = val_dataset = test_dataset = None
         if dataset_config.need_train_set:
@@ -638,7 +641,7 @@ class CesnetDataset():
                 ppi_transform=dataset_config.ppi_transform,
                 flowstats_transform=dataset_config.flowstats_transform,
                 flowstats_phist_transform=dataset_config.flowstats_phist_transform,
-                target_transform=
+                target_transform=label_encoder_fn,
                 return_tensors=dataset_config.return_tensors,)
         if dataset_config.need_val_set:
             assert val_data_path is not None
@@ -657,7 +660,7 @@ class CesnetDataset():
                 ppi_transform=dataset_config.ppi_transform,
                 flowstats_transform=dataset_config.flowstats_transform,
                 flowstats_phist_transform=dataset_config.flowstats_phist_transform,
-                target_transform=
+                target_transform=label_encoder_fn,
                 return_tensors=dataset_config.return_tensors,
                 preload=dataset_config.preload_val,
                 preload_blob=os.path.join(val_data_path, "preload", f"val_dataset-{dataset_config.val_known_size}.npz"),)
@@ -678,7 +681,7 @@ class CesnetDataset():
                 ppi_transform=dataset_config.ppi_transform,
                 flowstats_transform=dataset_config.flowstats_transform,
                 flowstats_phist_transform=dataset_config.flowstats_phist_transform,
-                target_transform=
+                target_transform=label_encoder_fn,
                 return_tensors=dataset_config.return_tensors,
                 preload=dataset_config.preload_test,
                 preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),)
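These changes route the new `disable_label_encoding` flag into the `target_transform` of the train, validation, and test datasets: with encoding enabled, labels pass through `_encode_labels_with_unknown` using an encoder fitted on the known applications plus an appended `UNKNOWN_STR_LABEL` class; with it disabled, `target_transform` is `None` and the original application names pass through as strings. A small sketch of the encoder construction shown above, with hypothetical application names and a placeholder unknown label (`_encode_labels_with_unknown` itself is not reproduced here):

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

UNKNOWN_STR_LABEL = "unknown"        # placeholder; the package defines its own constant
known_apps = ["app-a", "app-b"]      # hypothetical known applications

# Same construction as in the diff: fit on known apps, then append the unknown class
encoder = LabelEncoder().fit(known_apps)
encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)

print(encoder.transform(["app-a", "unknown"]))  # [0 2] -> integer labels
# With disable_label_encoding=True, no target transform is applied, so the
# dataloaders yield the string names and unknown classes keep their original labels.
```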
cesnet_datazoo/pytables_data/indices_setup.py
CHANGED
@@ -64,11 +64,11 @@ def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: Indi
 def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indices: np.ndarray, num_samples: int) -> np.ndarray:
     rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.DATE_WEIGHT_SAMPLING)
     indices_per_date = [train_indices[train_indices[INDICES_TABLE_FIELD] == i] for i in np.unique(train_indices[INDICES_TABLE_FIELD])]
-    weights = np.array(dataset_config.
+    weights = np.array(dataset_config.train_dates_weights)
     weights = weights / weights.sum()
     samples_per_date = np.ceil((weights * (num_samples))).astype(int)
     samples_per_date_clipped = np.clip(samples_per_date, a_max=list(map(len, indices_per_date)), a_min=0)
-    df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.
+    df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.train_dates_weights, "Requested Samples": samples_per_date, "Available Samples": samples_per_date_clipped})
     log.info(f"Weight sampling per date with requsted total number of samples {num_samples} (train_size + val_known_size when using the split-from-train validation approach; train_size otherwise)")
     for l in df.to_string(index=False).splitlines():
         log.info(l)
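`date_weight_sample_train_indices` normalizes the configured weights, requests `ceil(weight * num_samples)` indices from each train date, and clips each request to what is actually available for that date, which is why the resulting train set can be smaller than requested. A standalone sketch of that arithmetic with made-up numbers:

```python
import numpy as np

# Hypothetical setup: three train dates, their train_dates_weights,
# and the number of train indices available per date.
weights = np.array([1, 1, 2])
available_per_date = np.array([50_000, 8_000, 50_000])
num_samples = 60_000  # train_size + val_known_size under split-from-train

# Same steps as in the diff: normalize, scale, ceil, then clip to availability.
weights = weights / weights.sum()
samples_per_date = np.ceil(weights * num_samples).astype(int)
samples_per_date_clipped = np.clip(samples_per_date, a_min=0, a_max=available_per_date)

print(samples_per_date)          # [15000 15000 30000] requested
print(samples_per_date_clipped)  # [15000  8000 30000] -> only 53000 samples in total
```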
cesnet_datazoo/pytables_data/pytables_dataset.py
CHANGED
@@ -1,6 +1,7 @@
 import atexit
 import logging
 import os
+import sys
 import time
 import warnings
 from datetime import datetime
@@ -198,7 +199,7 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
         warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                       "To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
-        exit()
+        sys.exit()
     elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
         log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                  "Disabling these applications")
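Switching from the built-in `exit()` to `sys.exit()` (with the matching `import sys` above) removes the dependency on the `exit` helper that only exists when the `site` module injects it; `sys.exit()` is always available and raises `SystemExit`, which the interpreter turns into a normal shutdown. A minimal illustration, with a message that is hypothetical rather than taken from the package:

```python
import sys

# Raises SystemExit; unlike the interactive exit() helper, sys.exit() also works
# when the site module is not loaded (e.g. python -S) or in embedded interpreters.
sys.exit("Found applications with too few train samples")
```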
{cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cesnet-datazoo
-Version: 0.1.12
+Version: 0.1.14
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
{cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
 cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/config.py,sha256=
+cesnet_datazoo/config.py,sha256=AoYMX_drWqb6K6MhdTQRyvUPf_6kZeL3ie04hUgJzgE,38768
 cesnet_datazoo/constants.py,sha256=6GhcIyjVnWYrVnxRgTlGuiWRtvwZL1KqyzMJS26ge2E,1481
 cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
-cesnet_datazoo/datasets/cesnet_dataset.py,sha256=
+cesnet_datazoo/datasets/cesnet_dataset.py,sha256=AgxdiEwtJrOug9F8LJcMwttc7NMrligkq7cy4lRmLAs,47817
 cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
 cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
 cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
@@ -16,15 +16,15 @@ cesnet_datazoo/metrics/provider_metrics.py,sha256=zoX0ps8BzEs3ml70g9dWWeLPflNAKU
 cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
 cesnet_datazoo/pytables_data/data_scalers.py,sha256=xPL0SCLByDOgKv1Apqi5XQd501mIfsF8FdonmRQ0zzQ,5236
-cesnet_datazoo/pytables_data/indices_setup.py,sha256=
-cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=
+cesnet_datazoo/pytables_data/indices_setup.py,sha256=yCYWjkCPIj0en3btnC-C7cte0CqbqMZzOnaVR9jaNes,13717
+cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=Xk5BwKoQPewqL1gj5-EuiA9HfhJPUsYs7lOsG3CEzlQ,19447
 cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cesnet_datazoo/utils/class_info.py,sha256=H5UgyRqXIepBJmkLQ1gAIXV4owKSoIllguRiqFTu5XU,2462
 cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
 cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
 cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
+cesnet_datazoo-0.1.14.dist-info/licenses/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
+cesnet_datazoo-0.1.14.dist-info/METADATA,sha256=yp-ld51MKd-oTx-z86x0BwbJEvdY2zlc1fIbfYIDKWY,12574
+cesnet_datazoo-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cesnet_datazoo-0.1.14.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
+cesnet_datazoo-0.1.14.dist-info/RECORD,,
The remaining files ({cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/WHEEL, {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/licenses/LICENCE, and {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/top_level.txt) are without changes.