cesnet-datazoo 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cesnet_datazoo/config.py CHANGED
@@ -142,7 +142,7 @@ class DatasetConfig():
142
142
  need_test_set: Use to disable the test set. `Default: True`
143
143
  train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
144
144
  train_dates: Dates used for creating a train set.
145
- train_dates_weigths: To use a non-uniform distribution of samples across train dates.
145
+ train_dates_weights: To use a non-uniform distribution of samples across train dates.
146
146
  val_approach: How a validation set should be created. Either split train data into train and validation or have a separate validation period. `Default: SPLIT_FROM_TRAIN`
147
147
  train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
148
148
  val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
@@ -219,7 +219,7 @@ class DatasetConfig():
219
219
  need_test_set: bool = True
220
220
  train_period_name: str = ""
221
221
  train_dates: list[str] = field(default_factory=list)
222
- train_dates_weigths: Optional[list[int]] = None
222
+ train_dates_weights: Optional[list[int]] = None
223
223
  val_approach: ValidationApproach = ValidationApproach.SPLIT_FROM_TRAIN
224
224
  train_val_split_fraction: float = 0.2
225
225
  val_period_name: str = ""
@@ -369,16 +369,16 @@ class DatasetConfig():
369
369
  raise ValueError("QUIC datasets do not support use_tcp_features")
370
370
  if self.use_push_flags:
371
371
  raise ValueError("QUIC datasets do not support use_push_flags")
372
- # When train_dates_weigths are used, train_size and val_known_size have to be specified
373
- if self.train_dates_weigths is not None:
372
+ # When train_dates_weights are used, train_size and val_known_size have to be specified
373
+ if self.train_dates_weights is not None:
374
374
  if not self.need_train_set:
375
- raise ValueError("train_dates_weigths cannot be specified when need_train_set is false")
376
- if len(self.train_dates_weigths) != len(self.train_dates):
377
- raise ValueError("train_dates_weigths has to have the same length as train_dates")
375
+ raise ValueError("train_dates_weights cannot be specified when need_train_set is false")
376
+ if len(self.train_dates_weights) != len(self.train_dates):
377
+ raise ValueError("train_dates_weights has to have the same length as train_dates")
378
378
  if self.train_size == "all":
379
- raise ValueError("train_size cannot be 'all' when train_dates_weigths are speficied")
379
+ raise ValueError("train_size cannot be 'all' when train_dates_weights are speficied")
380
380
  if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN and self.val_known_size == "all":
381
- raise ValueError("val_known_size cannot be 'all' when train_dates_weigths are speficied and validation_approach is split-from-train")
381
+ raise ValueError("val_known_size cannot be 'all' when train_dates_weights are speficied and validation_approach is split-from-train")
382
382
  # App selection
383
383
  if self.apps_selection == AppSelection.ALL_KNOWN:
384
384
  self.val_unknown_size = 0
@@ -532,7 +532,7 @@ class CesnetDataset():
532
532
  servicemap=servicemap,
533
533
  disable_indices_cache=disable_indices_cache,)
534
534
  # Date weight sampling of train indices
535
- if dataset_config.train_dates_weigths is not None:
535
+ if dataset_config.train_dates_weights is not None:
536
536
  assert dataset_config.train_size != "all"
537
537
  if dataset_config.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
538
538
  # requested number of samples is train_size + val_known_size when using the split-from-train validation approach
@@ -563,7 +563,7 @@ class CesnetDataset():
563
563
  val_data_path = dataset_config._get_train_data_path()
564
564
  val_unknown_indices = train_unknown_indices
565
565
  train_labels = train_indices[INDICES_APP_FIELD]
566
- if dataset_config.train_dates_weigths is not None:
566
+ if dataset_config.train_dates_weights is not None:
567
567
  assert dataset_config.val_known_size != "all"
568
568
  # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to not enough samples in some train dates
569
569
  if dataset_config.val_known_size > len(train_indices):
@@ -64,11 +64,11 @@ def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: Indi
64
64
  def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indices: np.ndarray, num_samples: int) -> np.ndarray:
65
65
  rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.DATE_WEIGHT_SAMPLING)
66
66
  indices_per_date = [train_indices[train_indices[INDICES_TABLE_FIELD] == i] for i in np.unique(train_indices[INDICES_TABLE_FIELD])]
67
- weights = np.array(dataset_config.train_dates_weigths)
67
+ weights = np.array(dataset_config.train_dates_weights)
68
68
  weights = weights / weights.sum()
69
69
  samples_per_date = np.ceil((weights * (num_samples))).astype(int)
70
70
  samples_per_date_clipped = np.clip(samples_per_date, a_max=list(map(len, indices_per_date)), a_min=0)
71
- df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.train_dates_weigths, "Requested Samples": samples_per_date, "Available Samples": samples_per_date_clipped})
71
+ df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.train_dates_weights, "Requested Samples": samples_per_date, "Available Samples": samples_per_date_clipped})
72
72
  log.info(f"Weight sampling per date with requsted total number of samples {num_samples} (train_size + val_known_size when using the split-from-train validation approach; train_size otherwise)")
73
73
  for l in df.to_string(index=False).splitlines():
74
74
  log.info(l)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cesnet-datazoo
3
- Version: 0.1.11
3
+ Version: 0.1.13
4
4
  Summary: A toolkit for large network traffic datasets
5
5
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
6
6
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -19,7 +19,7 @@ Requires-Dist: cesnet_models
19
19
  Requires-Dist: matplotlib
20
20
  Requires-Dist: numpy
21
21
  Requires-Dist: pandas
22
- Requires-Dist: pydantic!=2.10.*,!=2.9.*,<2.12.0,>=2.0
22
+ Requires-Dist: pydantic!=2.9.*,<2.12.0,>=2.0
23
23
  Requires-Dist: PyYAML
24
24
  Requires-Dist: requests
25
25
  Requires-Dist: scikit-learn
@@ -1,8 +1,8 @@
1
1
  cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- cesnet_datazoo/config.py,sha256=wkpD_OL3gRXX2t0WDfDMsBD2A3vEdAjSm4yXhzsR8T0,38536
2
+ cesnet_datazoo/config.py,sha256=4lE5pnUUhJkQ2KuEhekzUo3NWSzSKOsusiBWbGMX2yU,38536
3
3
  cesnet_datazoo/constants.py,sha256=6GhcIyjVnWYrVnxRgTlGuiWRtvwZL1KqyzMJS26ge2E,1481
4
4
  cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
5
- cesnet_datazoo/datasets/cesnet_dataset.py,sha256=V2rBE4Mh2gaMw-NjQ4xHp6ViuuvCdEzB-ymX3CYyfkc,47762
5
+ cesnet_datazoo/datasets/cesnet_dataset.py,sha256=lcQ3ovsKE3sEgrYhx-JaDbeyu7UkXNhsZRAPpZAS6-g,47762
6
6
  cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
7
7
  cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
8
8
  cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
@@ -16,15 +16,15 @@ cesnet_datazoo/metrics/provider_metrics.py,sha256=zoX0ps8BzEs3ml70g9dWWeLPflNAKU
16
16
  cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
18
18
  cesnet_datazoo/pytables_data/data_scalers.py,sha256=xPL0SCLByDOgKv1Apqi5XQd501mIfsF8FdonmRQ0zzQ,5236
19
- cesnet_datazoo/pytables_data/indices_setup.py,sha256=M5J2BevkQK8fuC22vUauKyKAEVwYg8xRz9JJK8E1VX8,13717
19
+ cesnet_datazoo/pytables_data/indices_setup.py,sha256=yCYWjkCPIj0en3btnC-C7cte0CqbqMZzOnaVR9jaNes,13717
20
20
  cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=YGbzYKrSklCu3J52Xbdcs3zZsYroBBtP8ulgS1c5Fnw,19431
21
21
  cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  cesnet_datazoo/utils/class_info.py,sha256=H5UgyRqXIepBJmkLQ1gAIXV4owKSoIllguRiqFTu5XU,2462
23
23
  cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
24
24
  cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
25
25
  cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
26
- cesnet_datazoo-0.1.11.dist-info/licenses/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
27
- cesnet_datazoo-0.1.11.dist-info/METADATA,sha256=NBfTvdZUASh2-Et2p9nExhveoHkVaWtvZSLbojZqiDw,12583
28
- cesnet_datazoo-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
29
- cesnet_datazoo-0.1.11.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
30
- cesnet_datazoo-0.1.11.dist-info/RECORD,,
26
+ cesnet_datazoo-0.1.13.dist-info/licenses/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
27
+ cesnet_datazoo-0.1.13.dist-info/METADATA,sha256=AKyK8HNdpysKMUz5xqKL6TSXMgbVAPsCCytalVg7sWA,12574
28
+ cesnet_datazoo-0.1.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
29
+ cesnet_datazoo-0.1.13.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
30
+ cesnet_datazoo-0.1.13.dist-info/RECORD,,