cesnet-datazoo 0.1.13__tar.gz → 0.1.15__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (35)
  1. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/PKG-INFO +1 -1
  2. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/config.py +5 -1
  3. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/cesnet_dataset.py +10 -5
  4. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/pytables_data/pytables_dataset.py +2 -1
  5. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo.egg-info/PKG-INFO +1 -1
  6. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/pyproject.toml +1 -1
  7. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/LICENCE +0 -0
  8. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/README.md +0 -0
  9. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/__init__.py +0 -0
  10. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/constants.py +0 -0
  11. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/__init__.py +0 -0
  12. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/datasets.py +0 -0
  13. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/datasets_constants.py +0 -0
  14. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/loaders.py +0 -0
  15. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
  16. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
  17. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/metadata/metadata.csv +0 -0
  18. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/datasets/statistics.py +0 -0
  19. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/metrics/__init__.py +0 -0
  20. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/metrics/classification_report.py +0 -0
  21. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/metrics/provider_metrics.py +0 -0
  22. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/pytables_data/__init__.py +0 -0
  23. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
  24. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/pytables_data/data_scalers.py +0 -0
  25. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/pytables_data/indices_setup.py +0 -0
  26. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/utils/__init__.py +0 -0
  27. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/utils/class_info.py +0 -0
  28. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/utils/download.py +0 -0
  29. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/utils/fileutils.py +0 -0
  30. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo/utils/random.py +0 -0
  31. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo.egg-info/SOURCES.txt +0 -0
  32. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
  33. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo.egg-info/requires.txt +0 -0
  34. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/cesnet_datazoo.egg-info/top_level.txt +0 -0
  35. {cesnet_datazoo-0.1.13 → cesnet_datazoo-0.1.15}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cesnet-datazoo
3
- Version: 0.1.13
3
+ Version: 0.1.15
4
4
  Summary: A toolkit for large network traffic datasets
5
5
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
6
6
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -166,7 +166,8 @@ class DatasetConfig():
166
166
  val_workers: Number of workers for loading validation data. `0` means that the data will be loaded in the main process. `Default: 1`
167
167
  batch_size: Number of samples per batch. `Default: 192`
168
168
  test_batch_size: Number of samples per batch for loading validation and test data. `Default: 2048`
169
- preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: False`
169
+ preload_train: Whether to dump the train set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: False`
170
+ preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. `Default: False`
170
171
  preload_test: Whether to dump the test set with `numpy.savez_compressed` and preload it in future runs. `Default: False`
171
172
  train_size: Size of the train set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
172
173
  val_known_size: Size of the validation set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
@@ -178,6 +179,7 @@ class DatasetConfig():
178
179
 
179
180
  return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
180
181
  return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
182
+ disable_label_encoding: Whether to disable label encoding and return application names as strings. The original labels of configured unknown classes are preserved. `Default: False`
181
183
  use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
182
184
  use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
183
185
  use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
@@ -243,6 +245,7 @@ class DatasetConfig():
243
245
  val_workers: int = 1
244
246
  batch_size: int = 192
245
247
  test_batch_size: int = 2048
248
+ preload_train: bool = False
246
249
  preload_val: bool = False
247
250
  preload_test: bool = False
248
251
  train_size: int | Literal["all"] = "all"
@@ -255,6 +258,7 @@ class DatasetConfig():
255
258
 
256
259
  return_other_fields: bool = False
257
260
  return_tensors: bool = False
261
+ disable_label_encoding: bool = False
258
262
  use_packet_histograms: bool = False
259
263
  use_tcp_features: bool = False
260
264
  use_push_flags: bool = False
@@ -619,7 +619,10 @@ class CesnetDataset():
619
619
  encoder = LabelEncoder().fit(known_apps)
620
620
  encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
621
621
  class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps=known_apps, unknown_apps=unknown_apps)
622
- encode_labels_with_unknown_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
622
+ if dataset_config.disable_label_encoding:
623
+ label_encoder_fn = None
624
+ else:
625
+ label_encoder_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
623
626
  # Create train, validation, and test datasets
624
627
  train_dataset = val_dataset = test_dataset = None
625
628
  if dataset_config.need_train_set:
@@ -638,8 +641,10 @@ class CesnetDataset():
638
641
  ppi_transform=dataset_config.ppi_transform,
639
642
  flowstats_transform=dataset_config.flowstats_transform,
640
643
  flowstats_phist_transform=dataset_config.flowstats_phist_transform,
641
- target_transform=encode_labels_with_unknown_fn,
642
- return_tensors=dataset_config.return_tensors,)
644
+ target_transform=label_encoder_fn,
645
+ return_tensors=dataset_config.return_tensors,
646
+ preload=dataset_config.preload_train,
647
+ preload_blob=os.path.join(dataset_config._get_train_data_path(), "preload", f"train_dataset-{dataset_config.train_size}.npz"),)
643
648
  if dataset_config.need_val_set:
644
649
  assert val_data_path is not None
645
650
  val_dataset = PyTablesDataset(
@@ -657,7 +662,7 @@ class CesnetDataset():
657
662
  ppi_transform=dataset_config.ppi_transform,
658
663
  flowstats_transform=dataset_config.flowstats_transform,
659
664
  flowstats_phist_transform=dataset_config.flowstats_phist_transform,
660
- target_transform=encode_labels_with_unknown_fn,
665
+ target_transform=label_encoder_fn,
661
666
  return_tensors=dataset_config.return_tensors,
662
667
  preload=dataset_config.preload_val,
663
668
  preload_blob=os.path.join(val_data_path, "preload", f"val_dataset-{dataset_config.val_known_size}.npz"),)
@@ -678,7 +683,7 @@ class CesnetDataset():
678
683
  ppi_transform=dataset_config.ppi_transform,
679
684
  flowstats_transform=dataset_config.flowstats_transform,
680
685
  flowstats_phist_transform=dataset_config.flowstats_phist_transform,
681
- target_transform=encode_labels_with_unknown_fn,
686
+ target_transform=label_encoder_fn,
682
687
  return_tensors=dataset_config.return_tensors,
683
688
  preload=dataset_config.preload_test,
684
689
  preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),)
@@ -1,6 +1,7 @@
1
1
  import atexit
2
2
  import logging
3
3
  import os
4
+ import sys
4
5
  import time
5
6
  import warnings
6
7
  from datetime import datetime
@@ -198,7 +199,7 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
198
199
  if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
199
200
  warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
200
201
  "To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
201
- exit()
202
+ sys.exit()
202
203
  elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
203
204
  log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
204
205
  "Disabling these applications")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cesnet-datazoo
3
- Version: 0.1.13
3
+ Version: 0.1.15
4
4
  Summary: A toolkit for large network traffic datasets
5
5
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
6
6
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "cesnet-datazoo"
7
- version = "0.1.13"
7
+ version = "0.1.15"
8
8
  authors = [
9
9
  {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
10
10
  {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},
File without changes