cesnet-datazoo 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cesnet_datazoo/config.py CHANGED
@@ -142,7 +142,7 @@ class DatasetConfig():
142
142
  need_test_set: Use to disable the test set. `Default: True`
143
143
  train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
144
144
  train_dates: Dates used for creating a train set.
145
- train_dates_weigths: To use a non-uniform distribution of samples across train dates.
145
+ train_dates_weights: To use a non-uniform distribution of samples across train dates.
146
146
  val_approach: How a validation set should be created. Either split train data into train and validation or have a separate validation period. `Default: SPLIT_FROM_TRAIN`
147
147
  train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
148
148
  val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
@@ -178,6 +178,7 @@ class DatasetConfig():
178
178
 
179
179
  return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
180
180
  return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
181
+ disable_label_encoding: Whether to disable label encoding and return application names as strings. The original labels of configured unknown classes are preserved. `Default: False`
181
182
  use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
182
183
  use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
183
184
  use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
@@ -219,7 +220,7 @@ class DatasetConfig():
219
220
  need_test_set: bool = True
220
221
  train_period_name: str = ""
221
222
  train_dates: list[str] = field(default_factory=list)
222
- train_dates_weigths: Optional[list[int]] = None
223
+ train_dates_weights: Optional[list[int]] = None
223
224
  val_approach: ValidationApproach = ValidationApproach.SPLIT_FROM_TRAIN
224
225
  train_val_split_fraction: float = 0.2
225
226
  val_period_name: str = ""
@@ -255,6 +256,7 @@ class DatasetConfig():
255
256
 
256
257
  return_other_fields: bool = False
257
258
  return_tensors: bool = False
259
+ disable_label_encoding: bool = False
258
260
  use_packet_histograms: bool = False
259
261
  use_tcp_features: bool = False
260
262
  use_push_flags: bool = False
@@ -369,16 +371,16 @@ class DatasetConfig():
369
371
  raise ValueError("QUIC datasets do not support use_tcp_features")
370
372
  if self.use_push_flags:
371
373
  raise ValueError("QUIC datasets do not support use_push_flags")
372
- # When train_dates_weigths are used, train_size and val_known_size have to be specified
373
- if self.train_dates_weigths is not None:
374
+ # When train_dates_weights are used, train_size and val_known_size have to be specified
375
+ if self.train_dates_weights is not None:
374
376
  if not self.need_train_set:
375
- raise ValueError("train_dates_weigths cannot be specified when need_train_set is false")
376
- if len(self.train_dates_weigths) != len(self.train_dates):
377
- raise ValueError("train_dates_weigths has to have the same length as train_dates")
377
+ raise ValueError("train_dates_weights cannot be specified when need_train_set is false")
378
+ if len(self.train_dates_weights) != len(self.train_dates):
379
+ raise ValueError("train_dates_weights has to have the same length as train_dates")
378
380
  if self.train_size == "all":
379
- raise ValueError("train_size cannot be 'all' when train_dates_weigths are speficied")
381
+ raise ValueError("train_size cannot be 'all' when train_dates_weights are speficied")
380
382
  if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN and self.val_known_size == "all":
381
- raise ValueError("val_known_size cannot be 'all' when train_dates_weigths are speficied and validation_approach is split-from-train")
383
+ raise ValueError("val_known_size cannot be 'all' when train_dates_weights are speficied and validation_approach is split-from-train")
382
384
  # App selection
383
385
  if self.apps_selection == AppSelection.ALL_KNOWN:
384
386
  self.val_unknown_size = 0
@@ -532,7 +532,7 @@ class CesnetDataset():
532
532
  servicemap=servicemap,
533
533
  disable_indices_cache=disable_indices_cache,)
534
534
  # Date weight sampling of train indices
535
- if dataset_config.train_dates_weigths is not None:
535
+ if dataset_config.train_dates_weights is not None:
536
536
  assert dataset_config.train_size != "all"
537
537
  if dataset_config.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
538
538
  # requested number of samples is train_size + val_known_size when using the split-from-train validation approach
@@ -563,7 +563,7 @@ class CesnetDataset():
563
563
  val_data_path = dataset_config._get_train_data_path()
564
564
  val_unknown_indices = train_unknown_indices
565
565
  train_labels = train_indices[INDICES_APP_FIELD]
566
- if dataset_config.train_dates_weigths is not None:
566
+ if dataset_config.train_dates_weights is not None:
567
567
  assert dataset_config.val_known_size != "all"
568
568
  # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to not enough samples in some train dates
569
569
  if dataset_config.val_known_size > len(train_indices):
@@ -619,7 +619,10 @@ class CesnetDataset():
619
619
  encoder = LabelEncoder().fit(known_apps)
620
620
  encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
621
621
  class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps=known_apps, unknown_apps=unknown_apps)
622
- encode_labels_with_unknown_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
622
+ if dataset_config.disable_label_encoding:
623
+ label_encoder_fn = None
624
+ else:
625
+ label_encoder_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
623
626
  # Create train, validation, and test datasets
624
627
  train_dataset = val_dataset = test_dataset = None
625
628
  if dataset_config.need_train_set:
@@ -638,7 +641,7 @@ class CesnetDataset():
638
641
  ppi_transform=dataset_config.ppi_transform,
639
642
  flowstats_transform=dataset_config.flowstats_transform,
640
643
  flowstats_phist_transform=dataset_config.flowstats_phist_transform,
641
- target_transform=encode_labels_with_unknown_fn,
644
+ target_transform=label_encoder_fn,
642
645
  return_tensors=dataset_config.return_tensors,)
643
646
  if dataset_config.need_val_set:
644
647
  assert val_data_path is not None
@@ -657,7 +660,7 @@ class CesnetDataset():
657
660
  ppi_transform=dataset_config.ppi_transform,
658
661
  flowstats_transform=dataset_config.flowstats_transform,
659
662
  flowstats_phist_transform=dataset_config.flowstats_phist_transform,
660
- target_transform=encode_labels_with_unknown_fn,
663
+ target_transform=label_encoder_fn,
661
664
  return_tensors=dataset_config.return_tensors,
662
665
  preload=dataset_config.preload_val,
663
666
  preload_blob=os.path.join(val_data_path, "preload", f"val_dataset-{dataset_config.val_known_size}.npz"),)
@@ -678,7 +681,7 @@ class CesnetDataset():
678
681
  ppi_transform=dataset_config.ppi_transform,
679
682
  flowstats_transform=dataset_config.flowstats_transform,
680
683
  flowstats_phist_transform=dataset_config.flowstats_phist_transform,
681
- target_transform=encode_labels_with_unknown_fn,
684
+ target_transform=label_encoder_fn,
682
685
  return_tensors=dataset_config.return_tensors,
683
686
  preload=dataset_config.preload_test,
684
687
  preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),)
@@ -64,11 +64,11 @@ def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: Indi
64
64
  def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indices: np.ndarray, num_samples: int) -> np.ndarray:
65
65
  rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.DATE_WEIGHT_SAMPLING)
66
66
  indices_per_date = [train_indices[train_indices[INDICES_TABLE_FIELD] == i] for i in np.unique(train_indices[INDICES_TABLE_FIELD])]
67
- weights = np.array(dataset_config.train_dates_weigths)
67
+ weights = np.array(dataset_config.train_dates_weights)
68
68
  weights = weights / weights.sum()
69
69
  samples_per_date = np.ceil((weights * (num_samples))).astype(int)
70
70
  samples_per_date_clipped = np.clip(samples_per_date, a_max=list(map(len, indices_per_date)), a_min=0)
71
- df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.train_dates_weigths, "Requested Samples": samples_per_date, "Available Samples": samples_per_date_clipped})
71
+ df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.train_dates_weights, "Requested Samples": samples_per_date, "Available Samples": samples_per_date_clipped})
72
72
  log.info(f"Weight sampling per date with requsted total number of samples {num_samples} (train_size + val_known_size when using the split-from-train validation approach; train_size otherwise)")
73
73
  for l in df.to_string(index=False).splitlines():
74
74
  log.info(l)
@@ -1,6 +1,7 @@
1
1
  import atexit
2
2
  import logging
3
3
  import os
4
+ import sys
4
5
  import time
5
6
  import warnings
6
7
  from datetime import datetime
@@ -198,7 +199,7 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
198
199
  if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
199
200
  warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
200
201
  "To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
201
- exit()
202
+ sys.exit()
202
203
  elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
203
204
  log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
204
205
  "Disabling these applications")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cesnet-datazoo
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: A toolkit for large network traffic datasets
5
5
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
6
6
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -1,8 +1,8 @@
1
1
  cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- cesnet_datazoo/config.py,sha256=wkpD_OL3gRXX2t0WDfDMsBD2A3vEdAjSm4yXhzsR8T0,38536
2
+ cesnet_datazoo/config.py,sha256=AoYMX_drWqb6K6MhdTQRyvUPf_6kZeL3ie04hUgJzgE,38768
3
3
  cesnet_datazoo/constants.py,sha256=6GhcIyjVnWYrVnxRgTlGuiWRtvwZL1KqyzMJS26ge2E,1481
4
4
  cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
5
- cesnet_datazoo/datasets/cesnet_dataset.py,sha256=V2rBE4Mh2gaMw-NjQ4xHp6ViuuvCdEzB-ymX3CYyfkc,47762
5
+ cesnet_datazoo/datasets/cesnet_dataset.py,sha256=AgxdiEwtJrOug9F8LJcMwttc7NMrligkq7cy4lRmLAs,47817
6
6
  cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
7
7
  cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
8
8
  cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
@@ -16,15 +16,15 @@ cesnet_datazoo/metrics/provider_metrics.py,sha256=zoX0ps8BzEs3ml70g9dWWeLPflNAKU
16
16
  cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
18
18
  cesnet_datazoo/pytables_data/data_scalers.py,sha256=xPL0SCLByDOgKv1Apqi5XQd501mIfsF8FdonmRQ0zzQ,5236
19
- cesnet_datazoo/pytables_data/indices_setup.py,sha256=M5J2BevkQK8fuC22vUauKyKAEVwYg8xRz9JJK8E1VX8,13717
20
- cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=YGbzYKrSklCu3J52Xbdcs3zZsYroBBtP8ulgS1c5Fnw,19431
19
+ cesnet_datazoo/pytables_data/indices_setup.py,sha256=yCYWjkCPIj0en3btnC-C7cte0CqbqMZzOnaVR9jaNes,13717
20
+ cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=Xk5BwKoQPewqL1gj5-EuiA9HfhJPUsYs7lOsG3CEzlQ,19447
21
21
  cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  cesnet_datazoo/utils/class_info.py,sha256=H5UgyRqXIepBJmkLQ1gAIXV4owKSoIllguRiqFTu5XU,2462
23
23
  cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
24
24
  cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
25
25
  cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
26
- cesnet_datazoo-0.1.12.dist-info/licenses/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
27
- cesnet_datazoo-0.1.12.dist-info/METADATA,sha256=WpP33duuGItPTvO3SOXoyTkNPHMZ16WchDpmzY75Pbs,12574
28
- cesnet_datazoo-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
29
- cesnet_datazoo-0.1.12.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
30
- cesnet_datazoo-0.1.12.dist-info/RECORD,,
26
+ cesnet_datazoo-0.1.14.dist-info/licenses/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
27
+ cesnet_datazoo-0.1.14.dist-info/METADATA,sha256=yp-ld51MKd-oTx-z86x0BwbJEvdY2zlc1fIbfYIDKWY,12574
28
+ cesnet_datazoo-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
29
+ cesnet_datazoo-0.1.14.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
30
+ cesnet_datazoo-0.1.14.dist-info/RECORD,,