cesnet-datazoo 0.0.10__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/PKG-INFO +3 -3
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/README.md +2 -2
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/config.py +105 -62
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/cesnet_dataset.py +63 -26
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/datasets.py +8 -8
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/statistics.py +44 -36
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/metrics/classification_report.py +16 -16
- cesnet-datazoo-0.0.10/cesnet_datazoo/metrics/superclass_metrics.py → cesnet-datazoo-0.0.12/cesnet_datazoo/metrics/provider_metrics.py +9 -8
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/pytables_dataset.py +8 -7
- cesnet-datazoo-0.0.12/cesnet_datazoo/utils/class_info.py +50 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/PKG-INFO +3 -3
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/SOURCES.txt +1 -1
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/pyproject.toml +1 -1
- cesnet-datazoo-0.0.10/cesnet_datazoo/utils/class_info.py +0 -46
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/LICENCE +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/constants.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/loaders.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/metadata.csv +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/metrics/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/indices_setup.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/__init__.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/download.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/fileutils.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/random.py +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/requires.txt +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/top_level.txt +0 -0
- {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cesnet-datazoo
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.12
|
4
4
|
Summary: A toolkit for large network traffic datasets
|
5
5
|
Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
6
6
|
Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
@@ -102,8 +102,8 @@ dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
|
|
102
102
|
dataset_config = DatasetConfig(
|
103
103
|
dataset=dataset,
|
104
104
|
apps_selection=AppSelection.ALL_KNOWN,
|
105
|
-
|
106
|
-
|
105
|
+
train_period_name="W-2022-44",
|
106
|
+
test_period_name="W-2022-45",
|
107
107
|
)
|
108
108
|
dataset.set_dataset_config_and_initialize(dataset_config)
|
109
109
|
train_dataframe = dataset.get_train_df()
|
@@ -60,8 +60,8 @@ dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
|
|
60
60
|
dataset_config = DatasetConfig(
|
61
61
|
dataset=dataset,
|
62
62
|
apps_selection=AppSelection.ALL_KNOWN,
|
63
|
-
|
64
|
-
|
63
|
+
train_period_name="W-2022-44",
|
64
|
+
test_period_name="W-2022-45",
|
65
65
|
)
|
66
66
|
dataset.set_dataset_config_and_initialize(dataset_config)
|
67
67
|
train_dataframe = dataset.get_train_df()
|
@@ -4,11 +4,14 @@ import dataclasses
|
|
4
4
|
import hashlib
|
5
5
|
import json
|
6
6
|
import os
|
7
|
+
import warnings
|
7
8
|
from dataclasses import InitVar, field
|
9
|
+
from datetime import datetime
|
8
10
|
from enum import Enum
|
9
11
|
from typing import TYPE_CHECKING, Literal, Optional
|
10
12
|
|
11
13
|
import yaml
|
14
|
+
from pydantic import model_validator
|
12
15
|
from pydantic.dataclasses import dataclass
|
13
16
|
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
|
14
17
|
|
@@ -21,10 +24,15 @@ if TYPE_CHECKING:
|
|
21
24
|
Scaler = RobustScaler | StandardScaler | MinMaxScaler | None
|
22
25
|
|
23
26
|
class ScalerEnum(Enum):
|
24
|
-
|
27
|
+
"""Available scalers for flow statistics, packet sizes, and inter-packet times."""
|
25
28
|
STANDARD = "standard"
|
29
|
+
"""Standardize features by removing the mean and scaling to unit variance - [`sklearn.preprocessing.StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)."""
|
30
|
+
ROBUST = "robust"
|
31
|
+
"""Robust scaling with the median and the interquartile range - [`sklearn.preprocessing.RobustScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)."""
|
26
32
|
MINMAX = "minmax"
|
33
|
+
"""Scaling to a (0, 1) range - [`sklearn.preprocessing.MinMaxScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)."""
|
27
34
|
NO_SCALER = "no-scaler"
|
35
|
+
"""No scaling."""
|
28
36
|
def __str__(self): return self.value
|
29
37
|
|
30
38
|
class Protocol(Enum):
|
@@ -39,7 +47,7 @@ class ValidationApproach(Enum):
|
|
39
47
|
Scikit-learn [`train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
|
40
48
|
is used to create a random stratified validation set. The fraction of validation samples is defined in `train_val_split_fraction`."""
|
41
49
|
VALIDATION_DATES = "validation-dates"
|
42
|
-
"""Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `
|
50
|
+
"""Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `val_period_name`."""
|
43
51
|
NO_VALIDATION = "no-validation"
|
44
52
|
"""Do not use validation. The validation dataloader and dataframe will not be available."""
|
45
53
|
def __str__(self): return self.value
|
@@ -53,7 +61,8 @@ class AppSelection(Enum):
|
|
53
61
|
ALL_KNOWN = "all-known"
|
54
62
|
"""Use all applications as *known*."""
|
55
63
|
TOPX_KNOWN = "topx-known"
|
56
|
-
"""Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*.
|
64
|
+
"""Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*.
|
65
|
+
Applications with the same provider are never separated, i.e., all applications of a given provider are either *known* or *unknown*."""
|
57
66
|
EXPLICIT_UNKNOWN = "explicit-unknown"
|
58
67
|
"""Use the provided list of applications (`apps_selection_explicit_unknown`) as *unknown*, and the rest as *known*."""
|
59
68
|
LONGTERM_FIXED = "longterm-fixed"
|
@@ -90,7 +99,7 @@ class DataLoaderOrder(Enum):
|
|
90
99
|
@dataclass(frozen=True)
|
91
100
|
class TrainDataParams():
|
92
101
|
database_filename: str
|
93
|
-
|
102
|
+
train_period_name: str
|
94
103
|
train_tables_paths: list[str]
|
95
104
|
apps_selection: AppSelection
|
96
105
|
apps_selection_topx: int
|
@@ -103,7 +112,7 @@ class TrainDataParams():
|
|
103
112
|
@dataclass(frozen=True)
|
104
113
|
class TestDataParams():
|
105
114
|
database_filename: str
|
106
|
-
|
115
|
+
test_period_name: str
|
107
116
|
test_tables_paths: list[str]
|
108
117
|
known_apps_database_enum: dict[int, str]
|
109
118
|
unknown_apps_database_enum: dict[int, str]
|
@@ -125,23 +134,24 @@ class DatasetConfig():
|
|
125
134
|
|
126
135
|
Attributes:
|
127
136
|
dataset: The dataset instance to be configured
|
128
|
-
data_root:
|
129
|
-
database_filename:
|
130
|
-
database_path:
|
131
|
-
servicemap_path:
|
132
|
-
flowstats_features:
|
137
|
+
data_root: Taken from the dataset instance
|
138
|
+
database_filename: Taken from the dataset instance
|
139
|
+
database_path: Taken from the dataset instance
|
140
|
+
servicemap_path: Taken from the dataset instance
|
141
|
+
flowstats_features: Taken from `dataset.metadata.flowstats_features`
|
133
142
|
|
134
143
|
# Configuration options
|
135
144
|
|
136
145
|
Attributes:
|
137
|
-
|
146
|
+
train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
138
147
|
train_dates: Dates used for creating a train set.
|
139
148
|
train_dates_weigths: To use a non-uniform distribution of samples across train dates.
|
140
149
|
val_approach: How a validation set should be created. Either split train data into train and validation, have a separate validation period, or no validation at all. `Default: SPLIT_FROM_TRAIN`
|
141
150
|
train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
|
142
|
-
|
151
|
+
val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
143
152
|
val_dates: Dates used for creating a validation set.
|
144
|
-
|
153
|
+
no_test_set: Disable the test set. `Default: False`
|
154
|
+
test_period_name: Name of the test period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
145
155
|
test_dates: Dates used for creating a test set.
|
146
156
|
|
147
157
|
apps_selection: How to select application classes. `Default: ALL_KNOWN`
|
@@ -171,6 +181,7 @@ class DatasetConfig():
|
|
171
181
|
return_ips: Use for IP-based classification. Dataloaders will return data in this tuple format `((SRC_IP, DST_IP, SRC_PORT, DST_PORT), LABELS)`. Dataframes are not available when this option is used. `Default: False`
|
172
182
|
return_torch: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
|
173
183
|
use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
|
184
|
+
normalize_packet_histograms: Whether to normalize packet histograms. If true, bins contain fractions instead of absolute numbers. `Default: True`
|
174
185
|
use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
|
175
186
|
use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
|
176
187
|
zero_ppi_start: Zeroing out the first N packets of each packet sequence. `Default: 0`
|
@@ -184,18 +195,18 @@ class DatasetConfig():
|
|
184
195
|
ipt_max: Max clip inter-packet times before scaling. `Default: 15000`
|
185
196
|
|
186
197
|
# How to configure train, validation, and test sets
|
187
|
-
There are three options for how to define train/validation/test
|
198
|
+
There are three options for how to define train/validation/test dates.
|
188
199
|
|
189
|
-
1.
|
190
|
-
2.
|
191
|
-
3.
|
200
|
+
1. Choose a predefined time period (`train_period_name`, `val_period_name`, or `test_period_name`) available in `dataset.time_periods` and leave the list of dates (`train_dates`, `val_dates`, or `test_dates`) empty.
|
201
|
+
2. Provide a list of dates and a name for the time period. The dates are checked against `dataset.available_dates`.
|
202
|
+
3. Do not specify anything and use the dataset's defaults `dataset.default_train_period_name` and `dataset.default_test_period_name`.
|
192
203
|
|
193
|
-
There are two options for configuring
|
204
|
+
There are two options for configuring sizes of train/validation/test sets.
|
194
205
|
|
195
206
|
1. Select an appropriate dataset size (default is `S`) when creating the [`CesnetDataset`][datasets.cesnet_dataset.CesnetDataset] instance and leave `train_size`, `val_known_size`, and `test_known_size` with their default `all` value.
|
196
207
|
This will create train/validation/test sets with all samples available in the selected dataset size (of course, depending on the selected dates and validation approach).
|
197
208
|
2. Provide exact sizes in `train_size`, `val_known_size`, and `test_known_size`. This will create train/validation/test sets of the given sizes by doing a random subset.
|
198
|
-
This is especially useful when using the `ORIG` dataset size and want to
|
209
|
+
This is especially useful when using the `ORIG` dataset size and want to control the size of experiments.
|
199
210
|
|
200
211
|
!!! tip Validation set
|
201
212
|
The default approach for creating a validation set is to randomly split the train data into train and validation. The second approach is to define separate validation dates. See [ValidationApproach][config.ValidationApproach].
|
@@ -208,14 +219,15 @@ class DatasetConfig():
|
|
208
219
|
servicemap_path: str = field(init=False)
|
209
220
|
flowstats_features: list[str] = field(init=False)
|
210
221
|
|
211
|
-
|
222
|
+
train_period_name: str = ""
|
212
223
|
train_dates: list[str] = field(default_factory=list)
|
213
224
|
train_dates_weigths: Optional[list[int]] = None
|
214
225
|
val_approach: ValidationApproach = ValidationApproach.SPLIT_FROM_TRAIN
|
215
226
|
train_val_split_fraction: float = 0.2
|
216
|
-
|
227
|
+
val_period_name: str = ""
|
217
228
|
val_dates: list[str] = field(default_factory=list)
|
218
|
-
|
229
|
+
no_test_set: bool = False
|
230
|
+
test_period_name: str = ""
|
219
231
|
test_dates: list[str] = field(default_factory=list)
|
220
232
|
|
221
233
|
apps_selection: AppSelection = AppSelection.ALL_KNOWN
|
@@ -245,6 +257,7 @@ class DatasetConfig():
|
|
245
257
|
return_ips: bool = False
|
246
258
|
return_torch: bool = False
|
247
259
|
use_packet_histograms: bool = True
|
260
|
+
normalize_packet_histograms: bool = True
|
248
261
|
use_tcp_features: bool = True
|
249
262
|
use_push_flags: bool = False
|
250
263
|
zero_ppi_start: int = 0
|
@@ -267,52 +280,67 @@ class DatasetConfig():
|
|
267
280
|
self.database_path = dataset.database_path
|
268
281
|
self.flowstats_features = dataset.metadata.flowstats_features
|
269
282
|
|
270
|
-
# Configure train dates
|
271
|
-
if len(self.train_dates) > 0 and self.
|
272
|
-
raise ValueError("
|
273
|
-
if len(self.train_dates) == 0 and self.
|
274
|
-
if self.
|
275
|
-
raise ValueError(f"Unknown
|
276
|
-
self.train_dates = dataset.time_periods[self.
|
277
|
-
if len(self.train_dates) == 0 and self.
|
278
|
-
self.
|
279
|
-
self.train_dates = dataset.time_periods[dataset.
|
280
|
-
# Configure test dates
|
281
|
-
if
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
283
|
+
# Configure train dates
|
284
|
+
if len(self.train_dates) > 0 and self.train_period_name == "":
|
285
|
+
raise ValueError("train_period_name has to be specified when train_dates are set")
|
286
|
+
if len(self.train_dates) == 0 and self.train_period_name != "":
|
287
|
+
if self.train_period_name not in dataset.time_periods:
|
288
|
+
raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
|
289
|
+
self.train_dates = dataset.time_periods[self.train_period_name]
|
290
|
+
if len(self.train_dates) == 0 and self.test_period_name == "":
|
291
|
+
self.train_period_name = dataset.default_train_period_name
|
292
|
+
self.train_dates = dataset.time_periods[dataset.default_train_period_name]
|
293
|
+
# Configure test dates
|
294
|
+
if self.no_test_set:
|
295
|
+
if (len(self.test_dates) > 0 or self.test_period_name != ""):
|
296
|
+
raise ValueError("test_dates and test_period_name cannot be specified when no_test_set is true")
|
297
|
+
else:
|
298
|
+
if len(self.test_dates) > 0 and self.test_period_name == "":
|
299
|
+
raise ValueError("test_period_name has to be specified when test_dates are set")
|
300
|
+
if len(self.test_dates) == 0 and self.test_period_name != "":
|
301
|
+
if self.test_period_name not in dataset.time_periods:
|
302
|
+
raise ValueError(f"Unknown test_period_name {self.test_period_name}. Use time period available in dataset.time_periods")
|
303
|
+
self.test_dates = dataset.time_periods[self.test_period_name]
|
304
|
+
if len(self.test_dates) == 0 and self.test_period_name == "":
|
305
|
+
self.test_period_name = dataset.default_test_period_name
|
306
|
+
self.test_dates = dataset.time_periods[dataset.default_test_period_name]
|
307
|
+
# Configure val dates
|
308
|
+
if (self.val_approach == ValidationApproach.NO_VALIDATION or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
|
309
|
+
raise ValueError("val_dates and val_period_name cannot be specified when val_approach is no-validation or split-from-train")
|
293
310
|
if self.val_approach == ValidationApproach.VALIDATION_DATES:
|
294
|
-
if len(self.val_dates) > 0 and self.
|
295
|
-
raise ValueError("
|
296
|
-
if len(self.val_dates) == 0 and self.
|
297
|
-
if self.
|
298
|
-
raise ValueError(f"Unknown
|
299
|
-
self.val_dates = dataset.time_periods[self.
|
300
|
-
if len(self.val_dates) == 0 and self.
|
301
|
-
raise ValueError("
|
311
|
+
if len(self.val_dates) > 0 and self.val_period_name == "":
|
312
|
+
raise ValueError("val_period_name has to be specified when val_dates are set")
|
313
|
+
if len(self.val_dates) == 0 and self.val_period_name != "":
|
314
|
+
if self.val_period_name not in dataset.time_periods:
|
315
|
+
raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
|
316
|
+
self.val_dates = dataset.time_periods[self.val_period_name]
|
317
|
+
if len(self.val_dates) == 0 and self.val_period_name == "":
|
318
|
+
raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when val_approach is validation-dates")
|
302
319
|
# Check if train, val, and test dates are available in the dataset
|
303
320
|
if dataset.available_dates:
|
304
321
|
unknown_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
|
305
322
|
unknown_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
|
306
323
|
unknown_test_dates = [t for t in self.test_dates if t not in dataset.available_dates]
|
307
324
|
if len(unknown_train_dates) > 0:
|
308
|
-
raise ValueError(f"Unknown train dates {unknown_train_dates}. Use dates available in dataset.available_dates" \
|
309
|
-
+ f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else "")
|
325
|
+
raise ValueError(f"Unknown train dates {unknown_train_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
326
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
310
327
|
if len(unknown_val_dates) > 0:
|
311
|
-
raise ValueError(f"Unknown validation dates {unknown_val_dates}. Use dates available in dataset.available_dates" \
|
312
|
-
+ f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else "")
|
328
|
+
raise ValueError(f"Unknown validation dates {unknown_val_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
329
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
313
330
|
if len(unknown_test_dates) > 0:
|
314
|
-
raise ValueError(f"Unknown test dates {unknown_test_dates}. Use dates available in dataset.available_dates" \
|
315
|
-
+ f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else "")
|
331
|
+
raise ValueError(f"Unknown test dates {unknown_test_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
|
332
|
+
+ (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
|
333
|
+
# Check time order of train, val, and test periods
|
334
|
+
train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
|
335
|
+
test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
|
336
|
+
if not self.no_test_set and min(test_dates) <= max(train_dates):
|
337
|
+
warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
338
|
+
if self.val_approach == ValidationApproach.VALIDATION_DATES:
|
339
|
+
val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
|
340
|
+
if min(val_dates) <= max(train_dates):
|
341
|
+
warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
342
|
+
if not self.no_test_set and min(test_dates) <= max(val_dates):
|
343
|
+
warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
|
316
344
|
# Configure features
|
317
345
|
if dataset.metadata.protocol == Protocol.TLS and self.use_tcp_features:
|
318
346
|
self.flowstats_features = self.flowstats_features + SELECTED_TCP_FLAGS
|
@@ -446,7 +474,7 @@ class DatasetConfig():
|
|
446
474
|
def _get_train_data_params(self) -> TrainDataParams:
|
447
475
|
return TrainDataParams(
|
448
476
|
database_filename=self.database_filename,
|
449
|
-
|
477
|
+
train_period_name=self.train_period_name,
|
450
478
|
train_tables_paths=self._get_train_tables_paths(),
|
451
479
|
apps_selection=self.apps_selection,
|
452
480
|
apps_selection_topx=self.apps_selection_topx,
|
@@ -460,7 +488,7 @@ class DatasetConfig():
|
|
460
488
|
assert self.val_approach == ValidationApproach.VALIDATION_DATES
|
461
489
|
val_data_params = TestDataParams(
|
462
490
|
database_filename=self.database_filename,
|
463
|
-
|
491
|
+
test_period_name=self.val_period_name,
|
464
492
|
test_tables_paths=self._get_val_tables_paths(),
|
465
493
|
known_apps_database_enum=known_apps_database_enum,
|
466
494
|
unknown_apps_database_enum=unknown_apps_database_enum,)
|
@@ -472,7 +500,7 @@ class DatasetConfig():
|
|
472
500
|
def _get_test_data_params_and_path(self, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> tuple[TestDataParams, str]:
|
473
501
|
test_data_params = TestDataParams(
|
474
502
|
database_filename=self.database_filename,
|
475
|
-
|
503
|
+
test_period_name=self.test_period_name,
|
476
504
|
test_tables_paths=self._get_test_tables_paths(),
|
477
505
|
known_apps_database_enum=known_apps_database_enum,
|
478
506
|
unknown_apps_database_enum=unknown_apps_database_enum,)
|
@@ -481,6 +509,21 @@ class DatasetConfig():
|
|
481
509
|
test_data_path = os.path.join(self.data_root, "test-data", f"{params_hash}_{self.random_state}")
|
482
510
|
return test_data_params, test_data_path
|
483
511
|
|
512
|
+
@model_validator(mode="before")
|
513
|
+
@classmethod
|
514
|
+
def check_deprecated_args(cls, values):
|
515
|
+
kwargs = values.kwargs
|
516
|
+
if "train_period" in kwargs:
|
517
|
+
warnings.warn("train_period is deprecated. Use train_period_name instead.")
|
518
|
+
kwargs["train_period_name"] = kwargs["train_period"]
|
519
|
+
if "val_period" in kwargs:
|
520
|
+
warnings.warn("val_period is deprecated. Use val_period_name instead.")
|
521
|
+
kwargs["val_period_name"] = kwargs["val_period"]
|
522
|
+
if "test_period" in kwargs:
|
523
|
+
warnings.warn("test_period is deprecated. Use test_period_name instead.")
|
524
|
+
kwargs["test_period_name"] = kwargs["test_period"]
|
525
|
+
return values
|
526
|
+
|
484
527
|
def __str__(self):
|
485
528
|
_process_tag = yaml.emitter.Emitter.process_tag
|
486
529
|
_ignore_aliases = yaml.Dumper.ignore_aliases
|
@@ -30,7 +30,7 @@ from cesnet_datazoo.pytables_data.indices_setup import (IndicesTuple, compute_kn
|
|
30
30
|
from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, fit_or_load_scalers,
|
31
31
|
pytables_collate_fn,
|
32
32
|
pytables_ip_collate_fn, worker_init_fn)
|
33
|
-
from cesnet_datazoo.utils.class_info import ClassInfo,
|
33
|
+
from cesnet_datazoo.utils.class_info import ClassInfo, create_class_info
|
34
34
|
from cesnet_datazoo.utils.download import resumable_download, simple_download
|
35
35
|
from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
|
36
36
|
|
@@ -74,8 +74,8 @@ class CesnetDataset():
|
|
74
74
|
metadata: Additional [dataset metadata][metadata].
|
75
75
|
available_dates: List of all available dates in the dataset.
|
76
76
|
time_periods: Predefined time periods. Each time period is a list of dates.
|
77
|
-
|
78
|
-
|
77
|
+
default_train_period_name: Default time period for training.
|
78
|
+
default_test_period_name: Default time period for testing.
|
79
79
|
|
80
80
|
The following attributes are initialized when [`set_dataset_config_and_initialize`][datasets.cesnet_dataset.CesnetDataset.set_dataset_config_and_initialize] is called.
|
81
81
|
|
@@ -111,8 +111,8 @@ class CesnetDataset():
|
|
111
111
|
metadata: DatasetMetadata
|
112
112
|
available_dates: list[str]
|
113
113
|
time_periods: dict[str, list[str]]
|
114
|
-
|
115
|
-
|
114
|
+
default_train_period_name: str
|
115
|
+
default_test_period_name: str
|
116
116
|
time_periods_gen: bool = False
|
117
117
|
silent: bool = False
|
118
118
|
|
@@ -165,13 +165,16 @@ class CesnetDataset():
|
|
165
165
|
num_samples += len(database.get_node(p))
|
166
166
|
if self.size == "ORIG" and num_samples != self.metadata.available_samples:
|
167
167
|
raise ValueError(f"Expected {self.metadata.available_samples} samples, but got {num_samples} in the database. Please delete the data root folder, update cesnet-datazoo, and redownload the dataset.")
|
168
|
-
|
168
|
+
if self.size != "ORIG" and num_samples != DATASET_SIZES[self.size]:
|
169
169
|
raise ValueError(f"Expected {DATASET_SIZES[self.size]} samples, but got {num_samples} in the database. Please delete the data root folder, update cesnet-datazoo, and redownload the dataset.")
|
170
170
|
self.available_dates = list(map(lambda x: x.removeprefix("/flows/D"), tables_paths))
|
171
171
|
else:
|
172
172
|
self.available_dates = []
|
173
173
|
if self.time_periods_gen:
|
174
174
|
self._generate_time_periods()
|
175
|
+
# Add all available dates as single date time periods
|
176
|
+
for d in self.available_dates:
|
177
|
+
self.time_periods[d] = [d]
|
175
178
|
|
176
179
|
def set_dataset_config_and_initialize(self, dataset_config: DatasetConfig) -> None:
|
177
180
|
"""
|
@@ -249,9 +252,9 @@ class CesnetDataset():
|
|
249
252
|
"""
|
250
253
|
if self.dataset_config is None:
|
251
254
|
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting validaion dataloader")
|
252
|
-
assert self.val_dataset is not None
|
253
255
|
if self.dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
|
254
256
|
raise ValueError("Validation dataloader is not available when using no-validation")
|
257
|
+
assert self.val_dataset is not None
|
255
258
|
if self.val_dataloader:
|
256
259
|
return self.val_dataloader
|
257
260
|
batch_sampler = BatchSampler(sampler=SequentialSampler(self.val_dataset), batch_size=self.dataset_config.test_batch_size, drop_last=False)
|
@@ -288,6 +291,8 @@ class CesnetDataset():
|
|
288
291
|
"""
|
289
292
|
if self.dataset_config is None:
|
290
293
|
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting test dataloader")
|
294
|
+
if self.dataset_config.no_test_set:
|
295
|
+
raise ValueError("Test dataloader is not available when no_test_set is true")
|
291
296
|
assert self.test_dataset is not None
|
292
297
|
if self.test_dataloader:
|
293
298
|
return self.test_dataloader
|
@@ -358,7 +363,7 @@ class CesnetDataset():
|
|
358
363
|
Returns:
|
359
364
|
Validation data as a dataframe.
|
360
365
|
"""
|
361
|
-
self._check_before_dataframe()
|
366
|
+
self._check_before_dataframe(check_no_val=True)
|
362
367
|
assert self.dataset_config is not None and self.val_dataset is not None
|
363
368
|
if len(self.val_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
|
364
369
|
warnings.warn(f"Validation set has ({len(self.val_dataset)} samples), consider using get_val_dataloader() instead")
|
@@ -384,13 +389,31 @@ class CesnetDataset():
|
|
384
389
|
Returns:
|
385
390
|
Test data as a dataframe.
|
386
391
|
"""
|
387
|
-
self._check_before_dataframe()
|
392
|
+
self._check_before_dataframe(check_no_test=True)
|
388
393
|
assert self.dataset_config is not None and self.test_dataset is not None
|
389
394
|
if len(self.test_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
|
390
395
|
warnings.warn(f"Test set has ({len(self.test_dataset)} samples), consider using get_test_dataloader() instead")
|
391
396
|
feature_names = self.dataset_config.get_feature_names(flatten_ppi=flatten_ppi)
|
392
397
|
return create_df_from_dataloader(dataloader=self.get_test_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi, silent=self.silent)
|
393
398
|
|
399
|
+
def get_num_classes(self) -> int:
|
400
|
+
"""Returns the number of classes in the current configuration of the dataset."""
|
401
|
+
if self.class_info is None:
|
402
|
+
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting the number of classes")
|
403
|
+
return self.class_info.num_classes
|
404
|
+
|
405
|
+
def get_known_apps(self) -> list[str]:
|
406
|
+
"""Returns the list of known applications in the current configuration of the dataset."""
|
407
|
+
if self.class_info is None:
|
408
|
+
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting known apps")
|
409
|
+
return self.class_info.known_apps
|
410
|
+
|
411
|
+
def get_unknown_apps(self) -> list[str]:
|
412
|
+
"""Returns the list of unknown applications in the current configuration of the dataset."""
|
413
|
+
if self.class_info is None:
|
414
|
+
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting unknown apps")
|
415
|
+
return self.class_info.unknown_apps
|
416
|
+
|
394
417
|
def compute_dataset_statistics(self, num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, disabled_apps: Optional[list[str]] = None)-> None:
|
395
418
|
"""
|
396
419
|
Computes dataset statistics and saves them to the `statistics_path` folder.
|
@@ -401,8 +424,6 @@ class CesnetDataset():
|
|
401
424
|
batch_size: Number of samples per batch for loading data.
|
402
425
|
disabled_apps: List of applications to exclude from the statistics.
|
403
426
|
"""
|
404
|
-
if self.name.startswith("CESNET-TLS22"):
|
405
|
-
raise NotImplementedError("Dataset statistics are not supported for CESNET_TLS22")
|
406
427
|
flowstats_features = self.metadata.flowstats_features + self.metadata.packet_histogram_features + self.metadata.tcp_features
|
407
428
|
if not os.path.exists(self.statistics_path):
|
408
429
|
os.mkdir(self.statistics_path)
|
@@ -410,6 +431,7 @@ class CesnetDataset():
|
|
410
431
|
output_dir=self.statistics_path,
|
411
432
|
flowstats_features=flowstats_features,
|
412
433
|
protocol=self.metadata.protocol,
|
434
|
+
extra_fields=not self.name.startswith("CESNET-TLS22"),
|
413
435
|
disabled_apps=disabled_apps if disabled_apps is not None else [],
|
414
436
|
num_samples=num_samples,
|
415
437
|
num_workers=num_workers,
|
@@ -471,13 +493,17 @@ class CesnetDataset():
|
|
471
493
|
self.val_dataloader = None
|
472
494
|
self.test_dataloader = None
|
473
495
|
|
474
|
-
def _check_before_dataframe(self) -> None:
|
496
|
+
def _check_before_dataframe(self, check_no_val: bool = False, check_no_test: bool = False) -> None:
|
475
497
|
if self.dataset_config is None:
|
476
498
|
raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting a dataframe")
|
477
499
|
if self.dataset_config.return_ips:
|
478
500
|
raise ValueError("Dataframes are not available when return_ips is set. Use a dataloader instead.")
|
479
501
|
if self.dataset_config.return_torch:
|
480
502
|
raise ValueError("Dataframes are not available when return_torch is set. Use a dataloader instead.")
|
503
|
+
if check_no_val and self.dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
|
504
|
+
raise ValueError("Validation dataframe is not available when using no-validation")
|
505
|
+
if check_no_test and self.dataset_config.no_test_set:
|
506
|
+
raise ValueError("Test dataframe is not available when no_test_set is true")
|
481
507
|
|
482
508
|
def _initialize_train_val_test(self) -> None:
|
483
509
|
assert self.dataset_config is not None
|
@@ -485,7 +511,12 @@ class CesnetDataset():
|
|
485
511
|
servicemap = pd.read_csv(dataset_config.servicemap_path, index_col="Tag")
|
486
512
|
# Initialize train and test indices
|
487
513
|
train_indices, train_unknown_indices, encoder, known_apps_database_enum, unknown_apps_database_enum = init_or_load_train_indices(dataset_config=dataset_config, servicemap=servicemap)
|
488
|
-
|
514
|
+
if self.dataset_config.no_test_set:
|
515
|
+
test_known_indices = np.empty((0,3), dtype=np.int64)
|
516
|
+
test_unknown_indices = np.empty((0,3), dtype=np.int64)
|
517
|
+
test_data_path = None
|
518
|
+
else:
|
519
|
+
test_known_indices, test_unknown_indices, test_data_path = init_or_load_test_indices(dataset_config=dataset_config, known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
|
489
520
|
# Date weight sampling of train indices
|
490
521
|
if dataset_config.train_dates_weigths is not None:
|
491
522
|
assert dataset_config.train_size != "all"
|
@@ -527,13 +558,13 @@ class CesnetDataset():
|
|
527
558
|
test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,
|
528
559
|
stratify=train_labels, shuffle=True, random_state=train_val_rng)
|
529
560
|
elif dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
|
530
|
-
val_data_path = None
|
531
561
|
val_known_indices = np.empty((0,3), dtype=np.int64)
|
532
562
|
val_unknown_indices = np.empty((0,3), dtype=np.int64)
|
563
|
+
val_data_path = None
|
533
564
|
else: assert_never(dataset_config.val_approach)
|
534
565
|
|
535
566
|
# Create class info
|
536
|
-
class_info =
|
567
|
+
class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
|
537
568
|
# Load or fit data scalers
|
538
569
|
flowstats_scaler, flowstats_quantiles, ipt_scaler, psizes_scaler = fit_or_load_scalers(dataset_config=dataset_config, train_indices=train_indices)
|
539
570
|
# Subset dataset indices based on the selected sizes and compute application counts
|
@@ -555,16 +586,21 @@ class CesnetDataset():
|
|
555
586
|
indices=dataset_indices.train_indices,
|
556
587
|
flowstats_features=dataset_config.flowstats_features,
|
557
588
|
return_ips=dataset_config.return_ips,)
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
589
|
+
if dataset_config.no_test_set:
|
590
|
+
test_dataset = None
|
591
|
+
else:
|
592
|
+
assert test_data_path is not None
|
593
|
+
test_dataset = PyTablesDataset(
|
594
|
+
database_path=dataset_config.database_path,
|
595
|
+
tables_paths=dataset_config._get_test_tables_paths(),
|
596
|
+
indices=test_combined_indices,
|
597
|
+
flowstats_features=dataset_config.flowstats_features,
|
598
|
+
preload=dataset_config.preload_test,
|
599
|
+
preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),
|
600
|
+
return_ips=dataset_config.return_ips,)
|
601
|
+
if dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
|
602
|
+
val_dataset = None
|
603
|
+
else:
|
568
604
|
assert val_data_path is not None
|
569
605
|
val_dataset = PyTablesDataset(
|
570
606
|
database_path=dataset_config.database_path,
|
@@ -579,7 +615,6 @@ class CesnetDataset():
|
|
579
615
|
collate_fn = pytables_ip_collate_fn
|
580
616
|
else:
|
581
617
|
collate_fn = partial(pytables_collate_fn, # type: ignore
|
582
|
-
use_packet_histograms=dataset_config.use_packet_histograms,
|
583
618
|
flowstats_scaler=flowstats_scaler,
|
584
619
|
flowstats_quantiles=flowstats_quantiles,
|
585
620
|
psizes_scaler=psizes_scaler,
|
@@ -588,6 +623,8 @@ class CesnetDataset():
|
|
588
623
|
ipt_min=dataset_config.ipt_min,
|
589
624
|
ipt_max=dataset_config.ipt_max,
|
590
625
|
use_push_flags=dataset_config.use_push_flags,
|
626
|
+
use_packet_histograms=dataset_config.use_packet_histograms,
|
627
|
+
normalize_packet_histograms=dataset_config.normalize_packet_histograms,
|
591
628
|
zero_ppi_start=dataset_config.zero_ppi_start,
|
592
629
|
encoder=encoder,
|
593
630
|
known_apps=class_info.known_apps,
|
@@ -10,8 +10,8 @@ class CESNET_TLS22(CesnetDataset):
|
|
10
10
|
"W-2021-40": ["20211004", "20211005", "20211006", "20211007", "20211008", "20211009", "20211010"],
|
11
11
|
"W-2021-41": ["20211011", "20211012", "20211013", "20211014", "20211015", "20211016", "20211017"],
|
12
12
|
}
|
13
|
-
|
14
|
-
|
13
|
+
default_train_period_name = "W-2021-40"
|
14
|
+
default_test_period_name = "W-2021-41"
|
15
15
|
|
16
16
|
class CESNET_QUIC22(CesnetDataset):
|
17
17
|
"""Dataset class for [CESNET-QUIC22][cesnet-quic22]."""
|
@@ -24,11 +24,11 @@ class CESNET_QUIC22(CesnetDataset):
|
|
24
24
|
"W-2022-46": ["20221114", "20221115", "20221116", "20221117", "20221118", "20221119", "20221120"],
|
25
25
|
"W-2022-47": ["20221121", "20221122", "20221123", "20221124", "20221125", "20221126", "20221127"],
|
26
26
|
"W45-47": ["20221107", "20221108", "20221109", "20221110", "20221111", "20221112", "20221113",
|
27
|
-
|
28
|
-
|
27
|
+
"20221114", "20221115", "20221116", "20221117", "20221118", "20221119", "20221120",
|
28
|
+
"20221121", "20221122", "20221123", "20221124", "20221125", "20221126", "20221127"],
|
29
29
|
}
|
30
|
-
|
31
|
-
|
30
|
+
default_train_period_name = "W-2022-44"
|
31
|
+
default_test_period_name = "W-2022-45"
|
32
32
|
|
33
33
|
class CESNET_TLS_Year22(CesnetDataset):
|
34
34
|
"""Dataset class for [CESNET-TLS-Year22][cesnet-tls-year22]."""
|
@@ -37,5 +37,5 @@ class CESNET_TLS_Year22(CesnetDataset):
|
|
37
37
|
bucket_url = "https://liberouter.org/datazoo/download?bucket=cesnet-tls-year22"
|
38
38
|
time_periods = {f"W-2022-{week}": [] for week in range(1, 53)} | {f"M-2022-{month}": [] for month in range(1, 13)}
|
39
39
|
time_periods_gen = True
|
40
|
-
|
41
|
-
|
40
|
+
default_train_period_name = "M-2022-9"
|
41
|
+
default_test_period_name = "M-2022-10"
|
@@ -16,7 +16,7 @@ from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, FLOWEND_REASO
|
|
16
16
|
PHISTS_FEATURES, PPI_COLUMN, SIZE_POS)
|
17
17
|
from cesnet_datazoo.pytables_data.indices_setup import sort_indices
|
18
18
|
from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, list_all_tables,
|
19
|
-
worker_init_fn)
|
19
|
+
load_database, worker_init_fn)
|
20
20
|
|
21
21
|
|
22
22
|
def pick_quic_fields(batch):
|
@@ -26,23 +26,27 @@ def pick_quic_fields(batch):
|
|
26
26
|
batch["QUIC_VERSION"],
|
27
27
|
)
|
28
28
|
|
29
|
-
def pick_stats_fields(batch
|
29
|
+
def pick_stats_fields(batch):
|
30
30
|
return (
|
31
31
|
batch[PPI_COLUMN],
|
32
32
|
batch["DURATION"],
|
33
33
|
batch["PACKETS"] + batch["PACKETS_REV"],
|
34
34
|
batch["BYTES"] + batch["BYTES_REV"],
|
35
|
+
batch[APP_COLUMN],
|
36
|
+
batch[CATEGORY_COLUMN],
|
37
|
+
)
|
38
|
+
|
39
|
+
def pick_extra_fields(batch, flowstats_features: list[str]):
|
40
|
+
return (
|
35
41
|
batch["DST_ASN"],
|
36
42
|
batch[PHISTS_FEATURES],
|
37
43
|
batch[[f for f in FLOWEND_REASON_FEATURES if f in flowstats_features]],
|
38
|
-
batch[APP_COLUMN],
|
39
|
-
batch[CATEGORY_COLUMN],
|
40
44
|
)
|
41
45
|
|
42
46
|
def simple_collate_fn(batch):
|
43
47
|
return batch
|
44
48
|
|
45
|
-
def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
|
49
|
+
def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, extra_fields: bool, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
|
46
50
|
stats_pdf_path = os.path.join(output_dir, "dataset-statistics.pdf")
|
47
51
|
stats_csv_path = os.path.join(output_dir, "dataset-statistics.csv")
|
48
52
|
categories_csv_path = os.path.join(output_dir, "categories.csv")
|
@@ -70,8 +74,8 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
70
74
|
packet_sizes_counter = Counter()
|
71
75
|
if not silent:
|
72
76
|
print(f"Reading data from {database_path} for statistics")
|
73
|
-
|
74
|
-
stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=
|
77
|
+
table_paths = list_all_tables(database_path)
|
78
|
+
stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=table_paths, flowstats_features=flowstats_features, disabled_apps=disabled_apps, indices=None, return_all_fields=True)
|
75
79
|
if num_samples != "all":
|
76
80
|
subset_indices = np.random.randint(low=0, high=len(stats_dataset.indices), size=num_samples)
|
77
81
|
stats_dataset.indices = sort_indices(stats_dataset.indices[subset_indices])
|
@@ -89,7 +93,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
89
93
|
stats_dataset.pytables_worker_init()
|
90
94
|
|
91
95
|
for batch, batch_idx in tqdm(stats_dloader, total=len(stats_dloader), disable=silent):
|
92
|
-
ppi, duration, packets_total, bytes_total,
|
96
|
+
ppi, duration, packets_total, bytes_total, app, cat = pick_stats_fields(batch)
|
93
97
|
# Saving feature values for distribution plots
|
94
98
|
feature_duration.append(duration)
|
95
99
|
feature_packets_total.append(packets_total)
|
@@ -97,8 +101,6 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
97
101
|
packet_sizes_counter.update(ppi[:, SIZE_POS, :].flatten())
|
98
102
|
# Aggregating features for value_counts
|
99
103
|
app_series = app_series.add(pd.Series(app).value_counts(), fill_value=0)
|
100
|
-
asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
|
101
|
-
flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
|
102
104
|
# Grouping features per categories
|
103
105
|
df1 = pd.DataFrame(data={"cat": cat, "BYTES_TOTAL": bytes_total})
|
104
106
|
flow_counts = df1["cat"].value_counts().rename("FLOW_COUNT")
|
@@ -110,9 +112,12 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
110
112
|
quic_sni_series = quic_sni_series.add(pd.Series(sni).str.decode("utf-8").value_counts(), fill_value=0)
|
111
113
|
quic_ua_series = quic_ua_series.add(pd.Series(user_agent).str.decode("utf-8").value_counts(), fill_value=0)
|
112
114
|
quic_version_series = quic_version_series.add(pd.Series(quic_version).value_counts(), fill_value=0)
|
113
|
-
|
114
|
-
|
115
|
-
|
115
|
+
if extra_fields:
|
116
|
+
asn, phist, flowend_reason = pick_extra_fields(batch, flowstats_features=flowstats_features)
|
117
|
+
asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
|
118
|
+
flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
|
119
|
+
df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=PHISTS_FEATURES)
|
120
|
+
df_phist = df_phist.add(df2, fill_value=0)
|
116
121
|
feature_duration = np.concatenate(feature_duration)
|
117
122
|
feature_packets_total = np.concatenate(feature_packets_total)
|
118
123
|
feature_bytes_total = np.concatenate(feature_bytes_total)
|
@@ -123,9 +128,12 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
123
128
|
# Flow statistics distribution output
|
124
129
|
df_flowstats = pd.DataFrame(data={"FLOW DURATION": feature_duration, "FLOW BYTE VOLUME": feature_bytes_total, "FLOW LENGTH": feature_packets_total}).describe()
|
125
130
|
df_flowstats.to_csv(stats_csv_path)
|
126
|
-
# Categories tikzpicture and csv output
|
127
|
-
|
128
|
-
|
131
|
+
# Categories tikzpicture and csv output; first, get the categories and applications enum
|
132
|
+
temp_database, temp_tables = load_database(database_path=database_path, tables_paths=table_paths[:1])
|
133
|
+
cat_enum = temp_tables[0].get_enum(CATEGORY_COLUMN)
|
134
|
+
app_enum = temp_tables[0].get_enum(APP_COLUMN)
|
135
|
+
temp_database.close()
|
136
|
+
df_categories.index = df_categories.index.map(cat_enum)
|
129
137
|
df_categories = df_categories.drop("default", errors="ignore")
|
130
138
|
df_categories["FLOW_PERC"] = df_categories["FLOW_COUNT"] / sum(df_categories["FLOW_COUNT"]) * 100
|
131
139
|
df_categories["BYTES_PERC"] = df_categories["BYTES_TOTAL"] / sum(df_categories["BYTES_TOTAL"]) * 100
|
@@ -139,20 +147,9 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
139
147
|
# Application distribution output
|
140
148
|
app_df = pd.DataFrame({"COUNT": app_series.sort_values(ascending=False).astype("int64")})
|
141
149
|
app_df["PERC"] = (app_df["COUNT"] / app_df["COUNT"].sum() * 100).round(3)
|
142
|
-
app_df.index = app_df.index.map(
|
150
|
+
app_df.index = app_df.index.map(app_enum)
|
143
151
|
app_df.index.name = "LABEL"
|
144
152
|
app_df.to_csv(app_path)
|
145
|
-
# ASN distribution output
|
146
|
-
asn_df = pd.DataFrame({"COUNT": asn_series.sort_values(ascending=False).astype("int64")})
|
147
|
-
asn_df["PERC"] = (asn_df["COUNT"] / asn_df["COUNT"].sum() * 100).round(3)
|
148
|
-
asn_df.index.name = "DESTINATION ASN"
|
149
|
-
asn_df.to_csv(asn_path)
|
150
|
-
# Flow end reason output
|
151
|
-
flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
|
152
|
-
flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
|
153
|
-
flow_endreason_df.index.name = "FLOW ENDREASON"
|
154
|
-
flow_endreason_df.index = pd.Index([f for f in FLOWEND_REASON_FEATURES if f in flowstats_features])
|
155
|
-
flow_endreason_df.to_csv(flow_endreason_path)
|
156
153
|
# Packet sizes histogram output
|
157
154
|
packet_sizes_df = pd.DataFrame({"COUNT": pd.Series(packet_sizes_counter)}).sort_index()
|
158
155
|
packet_sizes_df["PERC"] = (packet_sizes_df["COUNT"] / packet_sizes_df["COUNT"].sum() * 100).round(3)
|
@@ -168,13 +165,25 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
168
165
|
quic_version_df.index = quic_version_df.index.map(hex)
|
169
166
|
quic_version_df.index.name = "QUIC VERSION"
|
170
167
|
quic_version_df.to_csv(quic_version_path)
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
168
|
+
if extra_fields:
|
169
|
+
# ASN distribution output
|
170
|
+
asn_df = pd.DataFrame({"COUNT": asn_series.sort_values(ascending=False).astype("int64")})
|
171
|
+
asn_df["PERC"] = (asn_df["COUNT"] / asn_df["COUNT"].sum() * 100).round(3)
|
172
|
+
asn_df.index.name = "DESTINATION ASN"
|
173
|
+
asn_df.to_csv(asn_path)
|
174
|
+
# Flow end reason output
|
175
|
+
flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
|
176
|
+
flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
|
177
|
+
flow_endreason_df.index.name = "FLOW ENDREASON"
|
178
|
+
flow_endreason_df.index = pd.Index([f for f in FLOWEND_REASON_FEATURES if f in flowstats_features])
|
179
|
+
flow_endreason_df.to_csv(flow_endreason_path)
|
180
|
+
# PHIST output
|
181
|
+
df_phist.index.name = "BINS"
|
182
|
+
df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), PHISTS_FEATURES))
|
183
|
+
df_phist = df_phist.astype("int64")
|
184
|
+
for i, column in zip((1, 3, 5, 7), df_phist.columns):
|
185
|
+
df_phist.insert(i, column + " PERC", (df_phist[column] / df_phist[column].sum() * 100).round(3))
|
186
|
+
df_phist.to_csv(phist_path)
|
178
187
|
|
179
188
|
# Dataset stats figure
|
180
189
|
axes: Any
|
@@ -232,5 +241,4 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
|
|
232
241
|
ax4.set_xlabel("Bytes")
|
233
242
|
|
234
243
|
plt.tight_layout()
|
235
|
-
fig.show()
|
236
244
|
fig.savefig(stats_pdf_path, bbox_inches="tight")
|
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/metrics/classification_report.py
RENAMED
@@ -1,8 +1,8 @@
|
|
1
1
|
import numpy as np
|
2
2
|
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
|
3
3
|
|
4
|
-
from cesnet_datazoo.metrics.
|
5
|
-
|
4
|
+
from cesnet_datazoo.metrics.provider_metrics import (per_app_provider_metrics,
|
5
|
+
provider_accuracies)
|
6
6
|
from cesnet_datazoo.utils.class_info import ClassInfo
|
7
7
|
|
8
8
|
|
@@ -10,23 +10,23 @@ def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.
|
|
10
10
|
p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
|
11
11
|
labels=labels,
|
12
12
|
zero_division=zero_division)
|
13
|
-
sc_p, sc_r, sc_f1 =
|
13
|
+
sc_p, sc_r, sc_f1 = per_app_provider_metrics(cm, class_info=class_info)
|
14
14
|
predicted_unknown = cm[:, -1]
|
15
15
|
with np.errstate(divide="ignore", invalid="ignore"):
|
16
16
|
predicted_unknown_perc = predicted_unknown / s
|
17
17
|
predicted_unknown_perc = np.nan_to_num(predicted_unknown_perc)
|
18
|
-
headers = ["precision (
|
18
|
+
headers = ["precision (pr)", "recall (pr)", "f1-score (pr)", "pred unknown", "support"]
|
19
19
|
headers_fmt = "{:>{width}} {:>15} {:>15} {:>15} {:>15} {:>9}\n"
|
20
|
-
width = max(max(len(cn) for cn in class_info.target_names), len("failed
|
20
|
+
width = max(max(len(cn) for cn in class_info.target_names), len("failed provider acc"))
|
21
21
|
report = headers_fmt.format("", *headers, width=width)
|
22
22
|
report += "\n"
|
23
|
-
|
23
|
+
row_fmt_provider = "{:>{width}} " + 3 * " {:>7.{digits}f} ({:.{digits}f}) " + " {:>7} ({:.{digits}f}) " + "{:>9}\n"
|
24
24
|
row_fmt = "{:>{width}} " + 3 * " {:>7.{digits}f} " + " {:>7} ({:.{digits}f}) " + "{:>9}\n"
|
25
25
|
rows = zip(map(class_info.target_names.__getitem__, labels), p, sc_p, r, sc_r, f1, sc_f1, predicted_unknown, predicted_unknown_perc, s) # type: ignore
|
26
26
|
for row in rows:
|
27
27
|
app, p_, _, r_, _, f1_, _, u_, up_, s_ = row
|
28
|
-
if class_info.
|
29
|
-
report +=
|
28
|
+
if class_info.has_provider[app]:
|
29
|
+
report += row_fmt_provider.format(*row, width=width, digits=digits)
|
30
30
|
else:
|
31
31
|
report += row_fmt.format(app, p_, r_, f1_, u_, up_, s_, width=width, digits=digits)
|
32
32
|
report += "\n"
|
@@ -40,26 +40,26 @@ def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.
|
|
40
40
|
avg_sc_f1 = np.average(np.where(np.isnan([np.nan if x is None else x for x in sc_f1]), f1, sc_f1)[:-1])
|
41
41
|
row_avg = [avg_p, avg_sc_p, avg_r, avg_sc_r, avg_f1, avg_sc_f1, predicted_unknown_sum, samples_sum]
|
42
42
|
|
43
|
-
headers_avg = ["precision (
|
43
|
+
headers_avg = ["precision (pr)", "recall (pr)", "f1-score (pr)", "pred unknown", "support"]
|
44
44
|
row_fmt_avg = "{:>{width}} " + 3 * " {:>6.{digits}} ({:.{digits}f}) " + "{:>15} " + "{:>9}\n"
|
45
45
|
digits = 3 # show more precise averages
|
46
46
|
report += headers_fmt.format("", *headers_avg, width=width)
|
47
47
|
report += row_fmt_avg.format("macro avg", *row_avg, width=width, digits=digits)
|
48
48
|
|
49
49
|
acc = accuracy_score(y_true, y_pred)
|
50
|
-
|
50
|
+
provider_acc, failed_provider_acc = provider_accuracies(y_true, y_pred, class_info=class_info)
|
51
51
|
|
52
52
|
row_fmt_acc = "{:>{width}} {:>15} {:>15} {:>7.{digits}f}\n"
|
53
53
|
report += row_fmt_acc.format("acc", "", "", acc, width=width, digits=digits)
|
54
|
-
report += row_fmt_acc.format("
|
55
|
-
report += row_fmt_acc.format("failed
|
54
|
+
report += row_fmt_acc.format("provider acc", "", "", provider_acc, width=width, digits=digits)
|
55
|
+
report += row_fmt_acc.format("failed provider acc", "", "", failed_provider_acc, width=width, digits=digits)
|
56
56
|
metrics = {
|
57
57
|
"Test/Accuracy": acc,
|
58
|
-
"Test/
|
59
|
-
"Test/Failed
|
58
|
+
"Test/Provider Accuracy": provider_acc,
|
59
|
+
"Test/Failed Provider Accuracy": failed_provider_acc,
|
60
60
|
"Test/Fscore": avg_f1,
|
61
|
-
"Test/
|
61
|
+
"Test/Provider Fscore": avg_sc_f1,
|
62
62
|
"Test/Recall": avg_r,
|
63
|
-
"Test/
|
63
|
+
"Test/Provider Recall": avg_sc_r,
|
64
64
|
}
|
65
65
|
return report, metrics
|
@@ -3,18 +3,19 @@ import numpy as np
|
|
3
3
|
from cesnet_datazoo.utils.class_info import ClassInfo
|
4
4
|
|
5
5
|
|
6
|
-
def
|
7
|
-
|
8
|
-
|
6
|
+
def provider_accuracies(y_true: np.ndarray, y_pred: np.ndarray, class_info: ClassInfo) -> tuple[float, float]:
|
7
|
+
provider_mapping_arr = np.array(list(class_info.provider_mapping.values()))
|
8
|
+
y_true_sc = provider_mapping_arr[y_true]
|
9
|
+
y_pred_sc = provider_mapping_arr[y_pred]
|
9
10
|
mistakes = y_true != y_pred
|
10
|
-
|
11
|
-
|
12
|
-
return
|
11
|
+
provider_acc = (y_true_sc == y_pred_sc).sum() / len(y_true_sc)
|
12
|
+
failed_provider_acc = (y_true_sc[mistakes] == y_pred_sc[mistakes]).sum() / mistakes.sum()
|
13
|
+
return provider_acc, failed_provider_acc
|
13
14
|
|
14
|
-
def
|
15
|
+
def per_app_provider_metrics(cm, class_info: ClassInfo):
|
15
16
|
metrics = []
|
16
17
|
for i, app in enumerate(class_info.target_names):
|
17
|
-
if not class_info.
|
18
|
+
if not class_info.has_provider[app]:
|
18
19
|
metrics.append((None, None, None))
|
19
20
|
continue
|
20
21
|
group = class_info.group_matrix[i]
|
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/pytables_dataset.py
RENAMED
@@ -140,7 +140,7 @@ def pytables_collate_fn(batch: tuple,
                         flowstats_scaler: Scaler, flowstats_quantiles: pd.Series,
                         psizes_scaler: Scaler, psizes_max: int,
                         ipt_scaler: Scaler, ipt_min: int, ipt_max: int,
-                        use_push_flags: bool, use_packet_histograms: bool, zero_ppi_start: int,
+                        use_push_flags: bool, use_packet_histograms: bool, normalize_packet_histograms: bool, zero_ppi_start: int,
                         encoder: LabelEncoder, known_apps: list[str], return_torch: bool = False):
     x_ppi, x_flowstats, labels = batch
     x_ppi = x_ppi.transpose(0, 2, 1)
@@ -164,12 +164,13 @@ def pytables_collate_fn(batch: tuple,

     if use_packet_histograms:
         x_phist = structured_to_unstructured(x_flowstats[PHISTS_FEATURES], dtype="float32")
-
-
-
-
-
-
+        if normalize_packet_histograms:
+            src_sizes_pkt_count = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]
+            dst_sizes_pkt_count = x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)].sum(axis=1)[:, np.newaxis]
+            np.divide(x_phist[:, :PHIST_BIN_COUNT], src_sizes_pkt_count, out=x_phist[:, :PHIST_BIN_COUNT], where=src_sizes_pkt_count != 0)
+            np.divide(x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], dst_sizes_pkt_count, out=x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count != 0)
+            np.divide(x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], src_sizes_pkt_count - 1, out=x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], where=src_sizes_pkt_count > 1)
+            np.divide(x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], dst_sizes_pkt_count - 1, out=x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count > 1)
         x_flowstats = structured_to_unstructured(drop_fields(x_flowstats, PHISTS_FEATURES), dtype="float32")
         x_flowstats = np.concatenate([x_flowstats, x_phist], axis=1)
     else:
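The new normalize_packet_histograms branch turns the raw packet-histogram counts into per-direction fractions: the size-histogram bins are divided by the number of packets in each direction, and the inter-packet-time bins by that count minus one (presumably because a flow with n packets has n - 1 gaps), while the where= guards keep all-zero histograms untouched. A small standalone sketch of the same normalization on one invented flow, assuming a hypothetical bin count of 4:

import numpy as np

PHIST_BIN_COUNT = 4  # hypothetical bin count, for illustration only
# One flow: [src size bins | dst size bins | src IPT bins | dst IPT bins]
x_phist = np.array([[6., 2., 0., 0.,   3., 3., 2., 0.,   5., 2., 0., 0.,   4., 3., 0., 0.]])

src_cnt = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]                   # 8 packets sent
dst_cnt = x_phist[:, PHIST_BIN_COUNT:2*PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]  # 8 packets received
np.divide(x_phist[:, :PHIST_BIN_COUNT], src_cnt, out=x_phist[:, :PHIST_BIN_COUNT], where=src_cnt != 0)
np.divide(x_phist[:, PHIST_BIN_COUNT:2*PHIST_BIN_COUNT], dst_cnt, out=x_phist[:, PHIST_BIN_COUNT:2*PHIST_BIN_COUNT], where=dst_cnt != 0)
np.divide(x_phist[:, 2*PHIST_BIN_COUNT:3*PHIST_BIN_COUNT], src_cnt - 1, out=x_phist[:, 2*PHIST_BIN_COUNT:3*PHIST_BIN_COUNT], where=src_cnt > 1)
np.divide(x_phist[:, 3*PHIST_BIN_COUNT:4*PHIST_BIN_COUNT], dst_cnt - 1, out=x_phist[:, 3*PHIST_BIN_COUNT:4*PHIST_BIN_COUNT], where=dst_cnt > 1)
print(x_phist)  # size-bin blocks now sum to 1; IPT blocks sum to 1 over (count - 1) gaps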
cesnet-datazoo-0.0.12/cesnet_datazoo/utils/class_info.py
ADDED
@@ -0,0 +1,50 @@
+from dataclasses import dataclass
+from typing import Any, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROVIDER_COLUMN
+
+
+@dataclass()
+class ClassInfo:
+    target_names: list[str]
+    num_classes: int
+    known_apps: list[str]
+    unknown_apps: list[str]
+    unknown_class_label: int
+    group_matrix: np.ndarray
+    has_provider: dict[str, bool]
+    provider_mapping: dict[str, str]
+    provider_members: dict[str, list[str]]
+    categories_mapping: dict[str, Optional[str]]
+
+def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> ClassInfo:
+    known_apps = sorted(known_apps_database_enum.values())
+    unknown_apps = sorted(unknown_apps_database_enum.values())
+    target_names_arr = encoder.classes_
+    assert known_apps == list(target_names_arr[:-1])
+    group_matrix = np.array([[a == b or
+                              (a in servicemap.index and b in servicemap.index and
+                               not pd.isnull(servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN]) and not pd.isnull(servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN]) and
+                               servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN] == servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN])
+                              for a in target_names_arr] for b in target_names_arr])
+    has_provider = {app: app in servicemap.index and not pd.isnull(servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN]) for app in target_names_arr}
+    provider_mapping = {app: servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN] if has_provider[app] else app for app in target_names_arr}
+    providers = sorted({provider_mapping[app] for app in target_names_arr if has_provider[app]})
+    provider_members = {p: [app for app in target_names_arr if provider_mapping[app] == p] for p in providers}
+    categories_mapping = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr}
+    return ClassInfo(
+        target_names=list(target_names_arr),
+        num_classes=len(known_apps),
+        known_apps=known_apps,
+        unknown_apps=unknown_apps,
+        unknown_class_label=len(known_apps),
+        group_matrix=group_matrix,
+        has_provider=has_provider,
+        provider_mapping=provider_mapping,
+        provider_members=provider_members,
+        categories_mapping=categories_mapping,
+    )
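create_class_info builds the provider structures from a "servicemap" table indexed by application name: has_provider flags applications with a non-null provider, provider_mapping sends each application to its provider (or to itself), and group_matrix marks pairs of classes that are identical or share a provider. The following is a simplified, self-contained re-derivation of those structures on a made-up three-application servicemap; the real code reads the column name from SERVICEMAP_PROVIDER_COLUMN and operates on the label encoder's classes, which this sketch glosses over.

import numpy as np
import pandas as pd

# Invented servicemap fragment; "provider" stands in for SERVICEMAP_PROVIDER_COLUMN.
servicemap = pd.DataFrame({"provider": ["Google", "Google", None]},
                          index=["google-search", "youtube", "zoom"])
target_names = ["google-search", "youtube", "zoom", "unknown"]  # last class is the unknown bucket

has_provider = {app: app in servicemap.index and not pd.isnull(servicemap.loc[app, "provider"])
                for app in target_names}
provider_mapping = {app: servicemap.loc[app, "provider"] if has_provider[app] else app
                    for app in target_names}
group_matrix = np.array([[a == b or (has_provider[a] and has_provider[b] and
                                     provider_mapping[a] == provider_mapping[b])
                          for a in target_names] for b in target_names])
print(provider_mapping)           # google-search and youtube both map to "Google"
print(group_matrix.astype(int))   # the google-search/youtube pair is grouped together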
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.0.
+Version: 0.0.12
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -102,8 +102,8 @@ dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
 dataset_config = DatasetConfig(
     dataset=dataset,
     apps_selection=AppSelection.ALL_KNOWN,
-
-
+    train_period_name="W-2022-44",
+    test_period_name="W-2022-45",
 )
 dataset.set_dataset_config_and_initialize(dataset_config)
 train_dataframe = dataset.get_train_df()
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/SOURCES.txt
RENAMED
@@ -19,7 +19,7 @@ cesnet_datazoo/datasets/metadata/dataset_metadata.py
 cesnet_datazoo/datasets/metadata/metadata.csv
 cesnet_datazoo/metrics/__init__.py
 cesnet_datazoo/metrics/classification_report.py
-cesnet_datazoo/metrics/
+cesnet_datazoo/metrics/provider_metrics.py
 cesnet_datazoo/pytables_data/__init__.py
 cesnet_datazoo/pytables_data/apps_split.py
 cesnet_datazoo/pytables_data/indices_setup.py
cesnet-datazoo-0.0.10/cesnet_datazoo/utils/class_info.py
DELETED
@@ -1,46 +0,0 @@
-from dataclasses import dataclass
-
-import numpy as np
-import pandas as pd
-
-from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROVIDER_COLUMN
-
-
-@dataclass()
-class ClassInfo:
-    target_names: list[str]
-    known_apps: list[str]
-    group_matrix: np.ndarray
-    superclass_members: dict[str, list[str]]
-    has_superclass: dict[str, bool]
-    superclass_mapping: dict[str, str]
-    superclass_mapping_arr: np.ndarray
-    categories_mapping: dict[str, str]
-
-    def get_num_classes(self):
-        return len(self.known_apps)
-
-def create_superclass_structures(servicemap: pd.DataFrame, target_names: list[str]) -> ClassInfo:
-    known_apps = target_names[:-1]
-    target_names_arr = np.array(target_names)
-    group_matrix = np.array([[
-                              a in servicemap.index and b in servicemap.index and
-                              not pd.isnull(servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN]) and not pd.isnull(servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN]) and
-                              servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN] == servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN]
-                              for a in target_names_arr] for b in target_names_arr])
-    has_superclass = {app: app in servicemap.index and not pd.isnull(servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN]) for app in target_names_arr}
-    superclass_mapping: dict[str, str] = {app: servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN] if has_superclass[app] else app for app in target_names_arr} # type: ignore
-    superclass_mapping_arr = np.array(list(superclass_mapping.values()))
-    superclass_members = {superclass: servicemap.loc[servicemap[SERVICEMAP_PROVIDER_COLUMN] == superclass].index.to_list()
-                          for superclass in servicemap.loc[:, SERVICEMAP_PROVIDER_COLUMN].dropna().unique()}
-    categories_mapping: dict[str, str] = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr} # type: ignore
-    return ClassInfo(
-        target_names=target_names,
-        known_apps=known_apps,
-        group_matrix=group_matrix,
-        superclass_members=superclass_members,
-        has_superclass=has_superclass,
-        superclass_mapping=superclass_mapping,
-        superclass_mapping_arr=superclass_mapping_arr,
-        categories_mapping=categories_mapping,
-    )
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/__init__.py
RENAMED
File without changes
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/dataset_metadata.py
RENAMED
File without changes
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/metadata.csv
RENAMED
File without changes
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/indices_setup.py
RENAMED
File without changes
{cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/dependency_links.txt
RENAMED
File without changes