cesnet-datazoo 0.0.17__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
@@ -15,11 +15,14 @@ from sklearn.preprocessing import LabelEncoder
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, Sampler, SequentialSampler
 from typing_extensions import assert_never
 
-from cesnet_datazoo.config import DataLoaderOrder, DatasetConfig, Scaler, ValidationApproach
-from cesnet_datazoo.constants import DATASET_SIZES, INDICES_LABEL_POS, SERVICEMAP_FILE
-from cesnet_datazoo.datasets.loaders import create_df_from_dataloader
+from cesnet_datazoo.config import AppSelection, DataLoaderOrder, DatasetConfig, ValidationApproach
+from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES, INDICES_LABEL_POS,
+                                      SERVICEMAP_FILE, UNKNOWN_STR_LABEL)
+from cesnet_datazoo.datasets.loaders import collate_fn_simple, create_df_from_dataloader
 from cesnet_datazoo.datasets.metadata.dataset_metadata import DatasetMetadata, load_metadata
 from cesnet_datazoo.datasets.statistics import compute_dataset_statistics
+from cesnet_datazoo.pytables_data.apps_split import is_background_app
+from cesnet_datazoo.pytables_data.data_scalers import fit_scalers
 from cesnet_datazoo.pytables_data.indices_setup import (IndicesTuple, compute_known_app_counts,
                                                         compute_unknown_app_counts,
                                                         date_weight_sample_train_indices,
@@ -27,10 +30,8 @@ from cesnet_datazoo.pytables_data.indices_setup import (IndicesTuple, compute_kn
                                                         init_or_load_train_indices,
                                                         init_or_load_val_indices,
                                                         subset_and_sort_indices)
-from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, pytables_collate_fn,
-                                                           worker_init_fn)
+from cesnet_datazoo.pytables_data.pytables_dataset import PyTablesDataset, worker_init_fn
 from cesnet_datazoo.utils.class_info import ClassInfo, create_class_info
-from cesnet_datazoo.pytables_data.data_scalers import fit_or_load_scalers
 from cesnet_datazoo.utils.download import resumable_download, simple_download
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
 
@@ -39,8 +40,7 @@ DATAFRAME_SAMPLES_WARNING_THRESHOLD = 20_000_000
 
 class CesnetDataset():
     """
-    The main class for accessing CESNET datasets. It handles downloading, data preprocessing,
-    train/validation/test splitting, and class selection. Access to data is provided through:
+    The main class for accessing CESNET datasets. It handles downloading, train/validation/test splitting, and class selection. Access to data is provided through:
 
     - Iterable PyTorch DataLoader for batch processing. See [using dataloaders][using-dataloaders] for more details.
     - Pandas DataFrame for loading the entire train, validation, or test set at once.
@@ -54,7 +54,7 @@ class CesnetDataset():
 
     1. Create an instance of the [dataset class][dataset-classes] with the desired size and data root. This will download the dataset if it has not already been downloaded.
     2. Create an instance of [`DatasetConfig`][config.DatasetConfig] and set it with [`set_dataset_config_and_initialize`][datasets.cesnet_dataset.CesnetDataset.set_dataset_config_and_initialize].
-    This will initialize the dataset — select classes, split data into train/validation/test sets, and fit data scalers. All is done according to the provided configuration and is cached for later use.
+    This will initialize the dataset — select classes, split data into train/validation/test sets, and fit data scalers if needed. All is done according to the provided configuration and is cached for later use.
     3. Use [`get_train_dataloader`][datasets.cesnet_dataset.CesnetDataset.get_train_dataloader] or [`get_train_df`][datasets.cesnet_dataset.CesnetDataset.get_train_df] to get training data for a classification model.
     4. Validate the model and perform the hyperparameter optimization on [`get_val_dataloader`][datasets.cesnet_dataset.CesnetDataset.get_val_dataloader] or [`get_val_df`][datasets.cesnet_dataset.CesnetDataset.get_val_df].
    5. Evaluate the model on [`get_test_dataloader`][datasets.cesnet_dataset.CesnetDataset.get_test_dataloader] or [`get_test_df`][datasets.cesnet_dataset.CesnetDataset.get_test_df].
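For orientation, the five steps above (unchanged in this release) correspond to roughly the following usage. This is a minimal sketch: the CESNET_QUIC22 class, data root path, and size are illustrative and not part of this diff.

    from cesnet_datazoo.config import DatasetConfig
    from cesnet_datazoo.datasets import CESNET_QUIC22

    # Step 1: instantiating a dataset class downloads the data on first use
    dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
    # Step 2: select classes, split train/validation/test, and fit scalers if needed
    dataset.set_dataset_config_and_initialize(DatasetConfig(dataset=dataset))
    # Steps 3-5: fetch data for training, validation, and final evaluation
    train_df = dataset.get_train_df()
    val_df = dataset.get_val_df()
    test_df = dataset.get_test_df()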
@@ -69,9 +69,10 @@ class CesnetDataset():
         database_filename: Name of the database file.
         database_path: Path to the database file.
         servicemap_path: Path to the servicemap file.
-        statistics_path: Path to the dataset statistics.
+        statistics_path: Path to the dataset statistics folder.
         bucket_url: URL of the bucket where the database is stored.
         metadata: Additional [dataset metadata][metadata].
+        available_classes: List of all available classes in the dataset.
         available_dates: List of all available dates in the dataset.
         time_periods: Predefined time periods. Each time period is a list of dates.
         default_train_period_name: Default time period for training.
@@ -86,36 +87,30 @@ class CesnetDataset():
         train_dataset: Train set in the form of `PyTablesDataset` instance wrapping the PyTables database.
         val_dataset: Validation set in the form of `PyTablesDataset` instance wrapping the PyTables database.
         test_dataset: Test set in the form of `PyTablesDataset` instance wrapping the PyTables database.
-        known_apps_database_enum: Dictionary that maps the database integer labels (different to those from `encoder`) of known applications to their names.
-        unknown_apps_database_enum: Dictionary that maps the database integer labels (different to those from `encoder`) of unknown applications to their names.
         known_app_counts: Known application counts in the train, validation, and test sets.
         unknown_app_counts: Unknown application counts in the validation and test sets.
-        collate_fn: Collate function used for creating batches in dataloaders.
-        encoder: Scikit-learn [`LabelEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) used to encode class names into integers. It is fitted during the initialization of the dataset.
-        flowstats_scaler: Scaler for flow statistics. It is fitted during the initialization of the dataset.
-        psizes_scaler: Scaler for packet sizes.
-        ipt_scaler: Scaler for inter-packet times.
-        flowstats_quantiles: Quantiles of flow statistics used for clipping.
         train_dataloader: Iterable PyTorch [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) for training.
         train_dataloader_sampler: Sampler used for iterating the training dataloader. Either [`RandomSampler`](https://pytorch.org/docs/stable/data.html#torch.utils.data.RandomSampler) or [`SequentialSampler`](https://pytorch.org/docs/stable/data.html#torch.utils.data.SequentialSampler).
+        train_dataloader_drop_last: Whether to drop the last incomplete batch when iterating the training dataloader.
         val_dataloader: Iterable PyTorch [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) for validation.
         test_dataloader: Iterable PyTorch [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) for testing.
     """
-    name: str
-    size: str
     data_root: str
+    size: str
+    silent: bool = False
+
+    name: str
     database_filename: str
     database_path: str
     servicemap_path: str
     statistics_path: str
     bucket_url: str
     metadata: DatasetMetadata
+    available_classes: list[str]
     available_dates: list[str]
     time_periods: dict[str, list[str]]
     default_train_period_name: str
     default_test_period_name: str
-    time_periods_gen: bool = False
-    silent: bool = False
 
     dataset_config: Optional[DatasetConfig] = None
     class_info: Optional[ClassInfo] = None
@@ -123,25 +118,19 @@ class CesnetDataset():
     train_dataset: Optional[PyTablesDataset] = None
     val_dataset: Optional[PyTablesDataset] = None
     test_dataset: Optional[PyTablesDataset] = None
-    known_apps_database_enum: Optional[dict[int, str]] = None
-    unknown_apps_database_enum: Optional[dict[int, str]] = None
     known_app_counts: Optional[pd.DataFrame] = None
     unknown_app_counts: Optional[pd.DataFrame] = None
-
-    collate_fn: Optional[Callable] = None
-    encoder: Optional[LabelEncoder] = None
-    flowstats_scaler: Scaler = None
-    psizes_scaler: Scaler = None
-    ipt_scaler: Scaler = None
-    flowstats_quantiles: Optional[np.ndarray] = None
-
     train_dataloader: Optional[DataLoader] = None
     train_dataloader_sampler: Optional[Sampler] = None
     train_dataloader_drop_last: bool = True
     val_dataloader: Optional[DataLoader] = None
     test_dataloader: Optional[DataLoader] = None
 
-    def __init__(self, data_root: str, size: str = "S", skip_dataset_read_at_init: bool = False, silent: bool = False) -> None:
+    _collate_fn: Optional[Callable] = None
+    _tables_app_enum: dict[int, str]
+    _tables_cat_enum: dict[int, str]
+
+    def __init__(self, data_root: str, size: str = "S", database_checks_at_init: bool = False, silent: bool = False) -> None:
         self.silent = silent
         self.metadata = load_metadata(self.name)
         self.size = size
@@ -159,24 +148,31 @@ class CesnetDataset():
             os.makedirs(self.data_root)
         if not self._is_downloaded():
             self._download()
-        if not skip_dataset_read_at_init:
+        if database_checks_at_init:
             with tb.open_file(self.database_path, mode="r") as database:
                 tables_paths = list(map(lambda x: x._v_pathname, iter(database.get_node(f"/flows"))))
                 num_samples = 0
                 for p in tables_paths:
-                    num_samples += len(database.get_node(p))
+                    table = database.get_node(p)
+                    assert isinstance(table, tb.Table)
+                    if self._tables_app_enum != {v: k for k, v in dict(table.get_enum(APP_COLUMN)).items()}:
+                        raise ValueError(f"Found mismatch between _tables_app_enum and the PyTables database enum in table {p}. Please report this issue.")
+                    if self._tables_cat_enum != {v: k for k, v in dict(table.get_enum(CATEGORY_COLUMN)).items()}:
+                        raise ValueError(f"Found mismatch between _tables_cat_enum and the PyTables database enum in table {p}. Please report this issue.")
+                    num_samples += len(table)
                 if self.size == "ORIG" and num_samples != self.metadata.available_samples:
                     raise ValueError(f"Expected {self.metadata.available_samples} samples, but got {num_samples} in the database. Please delete the data root folder, update cesnet-datazoo, and redownload the dataset.")
                 if self.size != "ORIG" and num_samples != DATASET_SIZES[self.size]:
                     raise ValueError(f"Expected {DATASET_SIZES[self.size]} samples, but got {num_samples} in the database. Please delete the data root folder, update cesnet-datazoo, and redownload the dataset.")
-            self.available_dates = list(map(lambda x: x.removeprefix("/flows/D"), tables_paths))
-        else:
-            self.available_dates = []
-        if self.time_periods_gen:
-            self._generate_time_periods()
+                if self.available_dates != list(map(lambda x: x.removeprefix("/flows/D"), tables_paths)):
+                    raise ValueError(f"Found mismatch between available_dates and the dates available in the PyTables database. Please report this issue.")
         # Add all available dates as single date time periods
         for d in self.available_dates:
             self.time_periods[d] = [d]
+        available_applications = sorted([app for app in pd.read_csv(self.servicemap_path, index_col="Tag").index if not is_background_app(app)])
+        if len(available_applications) != self.metadata.application_count:
+            raise ValueError(f"Found {len(available_applications)} applications in the servicemap (omitting background traffic classes), but expected {self.metadata.application_count}. Please report this issue.")
+        self.available_classes = available_applications + self.metadata.background_traffic_classes
 
     def set_dataset_config_and_initialize(self, dataset_config: DatasetConfig, disable_indices_cache: bool = False) -> None:
         """
@@ -208,6 +204,8 @@ class CesnetDataset():
         """
         if self.dataset_config is None:
             raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting train dataloader")
+        if not self.dataset_config.need_train_set:
+            raise ValueError("Train dataloader is not available when need_train_set is false")
         assert self.train_dataset
         if self.train_dataloader:
             return self.train_dataloader
@@ -230,7 +228,7 @@ class CesnetDataset():
             self.train_dataset,
             num_workers=self.dataset_config.train_workers,
             worker_init_fn=worker_init_fn,
-            collate_fn=self.collate_fn,
+            collate_fn=self._collate_fn,
             persistent_workers=self.dataset_config.train_workers > 0,
             batch_size=None,
             sampler=batch_sampler,)
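Beyond the rename to the private _collate_fn, note the batching pattern this construction relies on (unchanged here): the object passed as sampler is a BatchSampler, and batch_size=None disables PyTorch's automatic batching, so each fetch hands a whole list of indices to the dataset at once, which suits PyTables-style bulk row reads. A standalone sketch of the same wiring; the dataset, collate function, and batch size are placeholders:

    from torch.utils.data import BatchSampler, DataLoader, RandomSampler

    # batch_sampler yields lists of indices; with batch_size=None, each list is passed
    # directly to dataset.__getitem__ instead of being split into single-item fetches
    batch_sampler = BatchSampler(RandomSampler(dataset), batch_size=192, drop_last=True)
    loader = DataLoader(dataset, sampler=batch_sampler, batch_size=None,
                        collate_fn=collate_fn, num_workers=0)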
@@ -255,8 +253,8 @@ class CesnetDataset():
         """
         if self.dataset_config is None:
             raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting validation dataloader")
-        if self.dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
-            raise ValueError("Validation dataloader is not available when using no-validation")
+        if not self.dataset_config.need_val_set:
+            raise ValueError("Validation dataloader is not available when need_val_set is false")
         assert self.val_dataset is not None
         if self.val_dataloader:
             return self.val_dataloader
@@ -265,7 +263,7 @@ class CesnetDataset():
             self.val_dataset,
             num_workers=self.dataset_config.val_workers,
             worker_init_fn=worker_init_fn,
-            collate_fn=self.collate_fn,
+            collate_fn=self._collate_fn,
             persistent_workers=self.dataset_config.val_workers > 0,
             batch_size=None,
             sampler=batch_sampler,)
@@ -294,8 +292,8 @@ class CesnetDataset():
         """
         if self.dataset_config is None:
             raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting test dataloader")
-        if self.dataset_config.no_test_set:
-            raise ValueError("Test dataloader is not available when no_test_set is true")
+        if not self.dataset_config.need_test_set:
+            raise ValueError("Test dataloader is not available when need_test_set is false")
         assert self.test_dataset is not None
         if self.test_dataloader:
             return self.test_dataloader
@@ -304,7 +302,7 @@ class CesnetDataset():
             self.test_dataset,
             num_workers=self.dataset_config.test_workers,
             worker_init_fn=worker_init_fn,
-            collate_fn=self.collate_fn,
+            collate_fn=self._collate_fn,
             persistent_workers=False,
             batch_size=None,
             sampler=batch_sampler,)
@@ -336,7 +334,7 @@ class CesnetDataset():
         Returns:
             Train data as a dataframe.
         """
-        self._check_before_dataframe()
+        self._check_before_dataframe(check_train=True)
         assert self.dataset_config is not None and self.train_dataset is not None
         if len(self.train_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
             warnings.warn(f"Train set has {len(self.train_dataset)} samples, consider using get_train_dataloader() instead")
@@ -369,7 +367,7 @@ class CesnetDataset():
         Returns:
             Validation data as a dataframe.
         """
-        self._check_before_dataframe(check_no_val=True)
+        self._check_before_dataframe(check_val=True)
         assert self.dataset_config is not None and self.val_dataset is not None
         if len(self.val_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
             warnings.warn(f"Validation set has {len(self.val_dataset)} samples, consider using get_val_dataloader() instead")
@@ -398,7 +396,7 @@ class CesnetDataset():
         Returns:
             Test data as a dataframe.
         """
-        self._check_before_dataframe(check_no_test=True)
+        self._check_before_dataframe(check_test=True)
         assert self.dataset_config is not None and self.test_dataset is not None
         if len(self.test_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
             warnings.warn(f"Test set has {len(self.test_dataset)} samples, consider using get_test_dataloader() instead")
@@ -436,12 +434,18 @@ class CesnetDataset():
             batch_size: Number of samples per batch for loading data.
             disabled_apps: List of applications to exclude from the statistics.
         """
-        flowstats_features = self.metadata.flowstats_features + self.metadata.packet_histogram_features + self.metadata.tcp_features
+        if disabled_apps:
+            bad_disabled_apps = [a for a in disabled_apps if a not in self.available_classes]
+            if len(bad_disabled_apps) > 0:
+                raise ValueError(f"Bad applications in disabled_apps {bad_disabled_apps}. Use applications available in dataset.available_classes")
         if not os.path.exists(self.statistics_path):
             os.mkdir(self.statistics_path)
         compute_dataset_statistics(database_path=self.database_path,
+                                   tables_app_enum=self._tables_app_enum,
+                                   tables_cat_enum=self._tables_cat_enum,
                                    output_dir=self.statistics_path,
-                                   flowstats_features=flowstats_features,
+                                   packet_histograms=self.metadata.packet_histograms,
+                                   flowstats_features_boolean=self.metadata.flowstats_features_boolean,
                                    protocol=self.metadata.protocol,
                                    extra_fields=not self.name.startswith("CESNET-TLS22"),
                                    disabled_apps=disabled_apps if disabled_apps is not None else [],
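The added guard validates disabled_apps against available_classes up front, so a misspelled class name now fails fast instead of being silently ignored. A hypothetical call (the enclosing method's name is not shown in this hunk and is assumed here):

    # Hypothetical: a tag missing from dataset.available_classes now raises ValueError
    # before any statistics are computed
    dataset.compute_dataset_statistics(disabled_apps=["no-such-app"])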
@@ -489,174 +493,193 @@ class CesnetDataset():
         self.train_dataset = None
         self.val_dataset = None
         self.test_dataset = None
-        self.known_apps_database_enum = None
-        self.unknown_apps_database_enum = None
         self.known_app_counts = None
         self.unknown_app_counts = None
-
-        self.collate_fn = None
-        self.encoder = None
-        self.flowstats_scaler = None
-        self.psizes_scaler = None
-        self.ipt_scaler = None
-        self.flowstats_quantiles = None
-
         self.train_dataloader = None
         self.train_dataloader_sampler = None
         self.train_dataloader_drop_last = True
         self.val_dataloader = None
         self.test_dataloader = None
+        self._collate_fn = None
 
-    def _check_before_dataframe(self, check_no_val: bool = False, check_no_test: bool = False) -> None:
+    def _check_before_dataframe(self, check_train: bool = False, check_val: bool = False, check_test: bool = False) -> None:
         if self.dataset_config is None:
             raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting a dataframe")
-        if self.dataset_config.return_torch:
-            raise ValueError("Dataframes are not available when return_torch is set. Use a dataloader instead.")
-        if check_no_val and self.dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
-            raise ValueError("Validation dataframe is not available when using no-validation")
-        if check_no_test and self.dataset_config.no_test_set:
-            raise ValueError("Test dataframe is not available when no_test_set is true")
+        if self.dataset_config.return_tensors:
+            raise ValueError("Dataframes are not available when return_tensors is set. Use a dataloader instead.")
+        if check_train and not self.dataset_config.need_train_set:
+            raise ValueError("Train dataframe is not available when need_train_set is false")
+        if check_val and not self.dataset_config.need_val_set:
+            raise ValueError("Validation dataframe is not available when need_val_set is false")
+        if check_test and not self.dataset_config.need_test_set:
+            raise ValueError("Test dataframe is not available when need_test_set is false")
 
     def _initialize_train_val_test(self, disable_indices_cache: bool = False) -> None:
         assert self.dataset_config is not None
         dataset_config = self.dataset_config
         servicemap = pd.read_csv(dataset_config.servicemap_path, index_col="Tag")
-        # Initialize train and test indices
-        train_indices, train_unknown_indices, encoder, known_apps_database_enum, unknown_apps_database_enum = init_or_load_train_indices(dataset_config=dataset_config,
-                                                                                                                                         servicemap=servicemap,
-                                                                                                                                         disable_indices_cache=disable_indices_cache,)
-        if self.dataset_config.no_test_set:
-            test_known_indices = np.empty((0,3), dtype=np.int64)
-            test_unknown_indices = np.empty((0,3), dtype=np.int64)
-            test_data_path = None
+        # Initialize train set
+        if dataset_config.need_train_set:
+            train_indices, train_unknown_indices, known_apps, unknown_apps = init_or_load_train_indices(dataset_config=dataset_config,
+                                                                                                        tables_app_enum=self._tables_app_enum,
+                                                                                                        servicemap=servicemap,
+                                                                                                        disable_indices_cache=disable_indices_cache,)
+            # Date weight sampling of train indices
+            if dataset_config.train_dates_weigths is not None:
+                assert dataset_config.train_size != "all"
+                if dataset_config.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
+                    # requested number of samples is train_size + val_known_size when using the split-from-train validation approach
+                    assert dataset_config.val_known_size != "all"
+                    num_samples = dataset_config.train_size + dataset_config.val_known_size
+                else:
+                    num_samples = dataset_config.train_size
+                if num_samples > len(train_indices):
+                    raise ValueError(f"Requested number of samples for weight sampling ({num_samples}) is larger than the number of available train samples ({len(train_indices)})")
+                train_indices = date_weight_sample_train_indices(dataset_config=dataset_config, train_indices=train_indices, num_samples=num_samples)
+        elif dataset_config.apps_selection == AppSelection.FIXED:
+            known_apps = dataset_config.apps_selection_fixed_known
+            unknown_apps = dataset_config.apps_selection_fixed_unknown
+            train_indices = np.zeros((0,3), dtype=np.int64)
+            train_unknown_indices = np.zeros((0,3), dtype=np.int64)
         else:
-            test_known_indices, test_unknown_indices, test_data_path = init_or_load_test_indices(dataset_config=dataset_config,
-                                                                                                 known_apps_database_enum=known_apps_database_enum,
-                                                                                                 unknown_apps_database_enum=unknown_apps_database_enum,
+            raise ValueError("Either need train set or the fixed application selection")
+        # Initialize validation set
+        if dataset_config.need_val_set:
+            if dataset_config.val_approach == ValidationApproach.VALIDATION_DATES:
+                val_known_indices, val_unknown_indices, val_data_path = init_or_load_val_indices(dataset_config=dataset_config,
+                                                                                                 known_apps=known_apps,
+                                                                                                 unknown_apps=unknown_apps,
+                                                                                                 tables_app_enum=self._tables_app_enum,
                                                                                                  disable_indices_cache=disable_indices_cache,)
-        # Date weight sampling of train indices
-        if dataset_config.train_dates_weigths is not None:
-            assert dataset_config.train_size != "all"
-            if dataset_config.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
-                # requested number of samples is train_size + val_known_size when using the split-from-train validation approach
-                assert dataset_config.val_known_size != "all"
-                num_samples = dataset_config.train_size + dataset_config.val_known_size
-            else:
-                num_samples = dataset_config.train_size
-            if num_samples > len(train_indices):
-                raise ValueError(f"Requested number of samples for weight sampling ({num_samples}) is larger than the number of available train samples ({len(train_indices)})")
-            train_indices = date_weight_sample_train_indices(dataset_config=dataset_config, train_indices=train_indices, num_samples=num_samples)
-        # Obtain validation indices based on the selected approach
-        if dataset_config.val_approach == ValidationApproach.VALIDATION_DATES:
-            val_known_indices, val_unknown_indices, val_data_path = init_or_load_val_indices(dataset_config=dataset_config,
-                                                                                             known_apps_database_enum=known_apps_database_enum,
-                                                                                             unknown_apps_database_enum=unknown_apps_database_enum,
-                                                                                             disable_indices_cache=disable_indices_cache,)
-        elif dataset_config.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
-            train_val_rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.TRAIN_VAL_SPLIT)
-            val_data_path = dataset_config._get_train_data_path()
-            val_unknown_indices = train_unknown_indices
-            train_labels = train_indices[:, INDICES_LABEL_POS]
-            if dataset_config.train_dates_weigths is not None:
-                assert dataset_config.val_known_size != "all"
-                # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to not enough samples in some train dates
-                if dataset_config.val_known_size > len(train_indices):
-                    raise ValueError(f"Requested validation size ({dataset_config.val_known_size}) is larger than the number of available train samples after weight sampling ({len(train_indices)})")
-                train_indices, val_known_indices = train_test_split(train_indices, test_size=dataset_config.val_known_size, stratify=train_labels, shuffle=True, random_state=train_val_rng)
-                dataset_config.train_size = len(train_indices)
-            elif dataset_config.train_size == "all" and dataset_config.val_known_size == "all":
-                train_indices, val_known_indices = train_test_split(train_indices, test_size=dataset_config.train_val_split_fraction, stratify=train_labels, shuffle=True, random_state=train_val_rng)
-            else:
-                if dataset_config.val_known_size != "all" and dataset_config.train_size != "all" and dataset_config.train_size + dataset_config.val_known_size > len(train_indices):
-                    raise ValueError(f"Requested train size + validation size ({dataset_config.train_size + dataset_config.val_known_size}) is larger than the number of available train samples ({len(train_indices)})")
-                if dataset_config.train_size != "all" and dataset_config.train_size > len(train_indices):
-                    raise ValueError(f"Requested train size ({dataset_config.train_size}) is larger than the number of available train samples ({len(train_indices)})")
-                if dataset_config.val_known_size != "all" and dataset_config.val_known_size > len(train_indices):
-                    raise ValueError(f"Requested validation size ({dataset_config.val_known_size}) is larger than the number of available train samples ({len(train_indices)})")
-                train_indices, val_known_indices = train_test_split(train_indices,
-                                                                    train_size=dataset_config.train_size if dataset_config.train_size != "all" else None,
-                                                                    test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,
-                                                                    stratify=train_labels, shuffle=True, random_state=train_val_rng)
-        elif dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
-            val_known_indices = np.empty((0,3), dtype=np.int64)
-            val_unknown_indices = np.empty((0,3), dtype=np.int64)
+            elif dataset_config.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
+                train_val_rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.TRAIN_VAL_SPLIT)
+                val_data_path = dataset_config._get_train_data_path()
+                val_unknown_indices = train_unknown_indices
+                train_labels = train_indices[:, INDICES_LABEL_POS]
+                if dataset_config.train_dates_weigths is not None:
+                    assert dataset_config.val_known_size != "all"
+                    # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to not enough samples in some train dates
+                    if dataset_config.val_known_size > len(train_indices):
+                        raise ValueError(f"Requested validation size ({dataset_config.val_known_size}) is larger than the number of available train samples after weight sampling ({len(train_indices)})")
+                    train_indices, val_known_indices = train_test_split(train_indices, test_size=dataset_config.val_known_size, stratify=train_labels, shuffle=True, random_state=train_val_rng)
+                    dataset_config.train_size = len(train_indices)
+                elif dataset_config.train_size == "all" and dataset_config.val_known_size == "all":
+                    train_indices, val_known_indices = train_test_split(train_indices, test_size=dataset_config.train_val_split_fraction, stratify=train_labels, shuffle=True, random_state=train_val_rng)
+                else:
+                    if dataset_config.val_known_size != "all" and dataset_config.train_size != "all" and dataset_config.train_size + dataset_config.val_known_size > len(train_indices):
+                        raise ValueError(f"Requested train size + validation size ({dataset_config.train_size + dataset_config.val_known_size}) is larger than the number of available train samples ({len(train_indices)})")
+                    if dataset_config.train_size != "all" and dataset_config.train_size > len(train_indices):
+                        raise ValueError(f"Requested train size ({dataset_config.train_size}) is larger than the number of available train samples ({len(train_indices)})")
+                    if dataset_config.val_known_size != "all" and dataset_config.val_known_size > len(train_indices):
+                        raise ValueError(f"Requested validation size ({dataset_config.val_known_size}) is larger than the number of available train samples ({len(train_indices)})")
+                    train_indices, val_known_indices = train_test_split(train_indices,
+                                                                        train_size=dataset_config.train_size if dataset_config.train_size != "all" else None,
+                                                                        test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,
+                                                                        stratify=train_labels, shuffle=True, random_state=train_val_rng)
+        else:
+            val_known_indices = np.zeros((0,3), dtype=np.int64)
+            val_unknown_indices = np.zeros((0,3), dtype=np.int64)
             val_data_path = None
-        else: assert_never(dataset_config.val_approach)
-
-        # Create class info
-        class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
-        # Load or fit data scalers
-        flowstats_scaler, psizes_scaler, ipt_scaler, flowstats_quantiles = fit_or_load_scalers(dataset_config=dataset_config, train_indices=train_indices)
+        # Initialize test set
+        if dataset_config.need_test_set:
+            test_known_indices, test_unknown_indices, test_data_path = init_or_load_test_indices(dataset_config=dataset_config,
+                                                                                                 known_apps=known_apps,
+                                                                                                 unknown_apps=unknown_apps,
+                                                                                                 tables_app_enum=self._tables_app_enum,
+                                                                                                 disable_indices_cache=disable_indices_cache,)
+        else:
+            test_known_indices = np.zeros((0,3), dtype=np.int64)
+            test_unknown_indices = np.zeros((0,3), dtype=np.int64)
+            test_data_path = None
+        # Fit scalers if needed
+        if (dataset_config.ppi_transform is not None and dataset_config.ppi_transform.needs_fitting or
+                dataset_config.flowstats_transform is not None and dataset_config.flowstats_transform.needs_fitting):
+            if not dataset_config.need_train_set:
+                raise ValueError("Train set is needed to fit the scalers. Provide pre-fitted scalers.")
+            fit_scalers(dataset_config=dataset_config, train_indices=train_indices)
         # Subset dataset indices based on the selected sizes and compute application counts
         dataset_indices = IndicesTuple(train_indices=train_indices, val_known_indices=val_known_indices, val_unknown_indices=val_unknown_indices, test_known_indices=test_known_indices, test_unknown_indices=test_unknown_indices)
         dataset_indices = subset_and_sort_indices(dataset_config=dataset_config, dataset_indices=dataset_indices)
-        known_app_counts = compute_known_app_counts(dataset_indices=dataset_indices, database_enum=known_apps_database_enum)
-        unknown_app_counts = compute_unknown_app_counts(dataset_indices=dataset_indices, database_enum=unknown_apps_database_enum)
+        known_app_counts = compute_known_app_counts(dataset_indices=dataset_indices, tables_app_enum=self._tables_app_enum)
+        unknown_app_counts = compute_unknown_app_counts(dataset_indices=dataset_indices, tables_app_enum=self._tables_app_enum)
         # Combine known and unknown test indices to create a single dataloader
         assert isinstance(dataset_config.test_unknown_size, int)
-        if dataset_config.test_unknown_size > 0 and len(unknown_apps_database_enum) > 0:
+        if dataset_config.test_unknown_size > 0 and len(unknown_apps) > 0:
             test_combined_indices = np.concatenate((dataset_indices.test_known_indices, dataset_indices.test_unknown_indices))
         else:
             test_combined_indices = dataset_indices.test_known_indices
-
+        # Create the encoder and the class info structure
+        encoder = LabelEncoder().fit(known_apps)
+        encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
+        class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps=known_apps, unknown_apps=unknown_apps)
+        encode_labels_with_unknown_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
         # Create train, validation, and test datasets
-        train_dataset = PyTablesDataset(
-            database_path=dataset_config.database_path,
-            tables_paths=dataset_config._get_train_tables_paths(),
-            indices=dataset_indices.train_indices,
-            flowstats_features=dataset_config.flowstats_features,
-            other_fields=self.dataset_config.other_fields,)
-        if dataset_config.no_test_set:
-            test_dataset = None
-        else:
-            assert test_data_path is not None
-            test_dataset = PyTablesDataset(
+        train_dataset = val_dataset = test_dataset = None
+        if dataset_config.need_train_set:
+            train_dataset = PyTablesDataset(
                 database_path=dataset_config.database_path,
-                tables_paths=dataset_config._get_test_tables_paths(),
-                indices=test_combined_indices,
+                tables_paths=dataset_config._get_train_tables_paths(),
+                indices=dataset_indices.train_indices,
+                tables_app_enum=self._tables_app_enum,
+                tables_cat_enum=self._tables_cat_enum,
                 flowstats_features=dataset_config.flowstats_features,
+                flowstats_features_boolean=dataset_config.flowstats_features_boolean,
+                flowstats_features_phist=dataset_config.flowstats_features_phist,
                 other_fields=self.dataset_config.other_fields,
-                preload=dataset_config.preload_test,
-                preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),)
-        if dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
-            val_dataset = None
-        else:
+                ppi_channels=dataset_config.get_ppi_channels(),
+                ppi_transform=dataset_config.ppi_transform,
+                flowstats_transform=dataset_config.flowstats_transform,
+                flowstats_phist_transform=dataset_config.flowstats_phist_transform,
+                target_transform=encode_labels_with_unknown_fn,
+                return_tensors=dataset_config.return_tensors,)
+        if dataset_config.need_val_set:
             assert val_data_path is not None
             val_dataset = PyTablesDataset(
                 database_path=dataset_config.database_path,
                 tables_paths=dataset_config._get_train_tables_paths(),
                 indices=dataset_indices.val_known_indices,
+                tables_app_enum=self._tables_app_enum,
+                tables_cat_enum=self._tables_cat_enum,
                 flowstats_features=dataset_config.flowstats_features,
+                flowstats_features_boolean=dataset_config.flowstats_features_boolean,
+                flowstats_features_phist=dataset_config.flowstats_features_phist,
                 other_fields=self.dataset_config.other_fields,
+                ppi_channels=dataset_config.get_ppi_channels(),
+                ppi_transform=dataset_config.ppi_transform,
+                flowstats_transform=dataset_config.flowstats_transform,
+                flowstats_phist_transform=dataset_config.flowstats_phist_transform,
+                target_transform=encode_labels_with_unknown_fn,
+                return_tensors=dataset_config.return_tensors,
                 preload=dataset_config.preload_val,
                 preload_blob=os.path.join(val_data_path, "preload", f"val_dataset-{dataset_config.val_known_size}.npz"),)
-        collate_fn = partial(pytables_collate_fn,
-                             flowstats_scaler=flowstats_scaler,
-                             flowstats_quantiles=flowstats_quantiles,
-                             psizes_scaler=psizes_scaler,
-                             psizes_max=dataset_config.psizes_max,
-                             ipt_scaler=ipt_scaler,
-                             ipt_min=dataset_config.ipt_min,
-                             ipt_max=dataset_config.ipt_max,
-                             use_push_flags=dataset_config.use_push_flags,
-                             use_packet_histograms=dataset_config.use_packet_histograms,
-                             normalize_packet_histograms=dataset_config.normalize_packet_histograms,
-                             zero_ppi_start=dataset_config.zero_ppi_start,
-                             encoder=encoder,
-                             known_apps=class_info.known_apps,
-                             return_torch=dataset_config.return_torch,)
+        if dataset_config.need_test_set:
+            assert test_data_path is not None
+            test_dataset = PyTablesDataset(
+                database_path=dataset_config.database_path,
+                tables_paths=dataset_config._get_test_tables_paths(),
+                indices=test_combined_indices,
+                tables_app_enum=self._tables_app_enum,
+                tables_cat_enum=self._tables_cat_enum,
+                flowstats_features=dataset_config.flowstats_features,
+                flowstats_features_boolean=dataset_config.flowstats_features_boolean,
+                flowstats_features_phist=dataset_config.flowstats_features_phist,
+                other_fields=self.dataset_config.other_fields,
+                ppi_channels=dataset_config.get_ppi_channels(),
+                ppi_transform=dataset_config.ppi_transform,
+                flowstats_transform=dataset_config.flowstats_transform,
+                flowstats_phist_transform=dataset_config.flowstats_phist_transform,
+                target_transform=encode_labels_with_unknown_fn,
+                return_tensors=dataset_config.return_tensors,
+                preload=dataset_config.preload_test,
+                preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),)
         self.class_info = class_info
         self.dataset_indices = dataset_indices
         self.train_dataset = train_dataset
         self.val_dataset = val_dataset
         self.test_dataset = test_dataset
-        self.known_apps_database_enum = known_apps_database_enum
-        self.unknown_apps_database_enum = unknown_apps_database_enum
         self.known_app_counts = known_app_counts
         self.unknown_app_counts = unknown_app_counts
-        self.collate_fn = collate_fn
-        self.encoder = encoder
-        self.flowstats_scaler = flowstats_scaler
-        self.psizes_scaler = psizes_scaler
-        self.ipt_scaler = ipt_scaler
-        self.flowstats_quantiles = flowstats_quantiles
+        self._collate_fn = collate_fn_simple
+
+def _encode_labels_with_unknown(labels, encoder: LabelEncoder, class_info: ClassInfo):
+    return encoder.transform(np.where(np.isin(labels, class_info.known_apps), labels, UNKNOWN_STR_LABEL))
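This new module-level helper takes over the label handling that previously lived in pytables_collate_fn: labels outside the known set are first mapped to the unknown sentinel, which occupies the extra class id appended to the fitted encoder, and only then integer-encoded. A self-contained illustration; the class names are made up and the sentinel value stands in for the real UNKNOWN_STR_LABEL constant:

    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    UNKNOWN_STR_LABEL = "unknown"                # stand-in for the package constant
    known_apps = ["AppleiCloud", "GoogleDrive"]  # illustrative known classes

    encoder = LabelEncoder().fit(known_apps)
    encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)

    labels = np.array(["GoogleDrive", "SomeNewApp", "AppleiCloud"])
    # anything not in known_apps is replaced by the sentinel before encoding
    encoded = encoder.transform(np.where(np.isin(labels, known_apps), labels, UNKNOWN_STR_LABEL))
    # encoded -> array([1, 2, 0]); the unseen "SomeNewApp" maps to the unknown id 2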