cesnet-datazoo 0.1.1__tar.gz → 0.1.3__tar.gz

This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (35)
  1. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/PKG-INFO +9 -6
  2. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/README.md +8 -5
  3. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/config.py +24 -19
  4. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/constants.py +1 -0
  5. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/datasets/cesnet_dataset.py +10 -10
  6. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/metrics/classification_report.py +2 -2
  7. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/pytables_data/indices_setup.py +4 -1
  8. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/pytables_data/pytables_dataset.py +9 -5
  9. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/utils/class_info.py +0 -2
  10. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo.egg-info/PKG-INFO +9 -6
  11. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/pyproject.toml +1 -1
  12. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/LICENCE +0 -0
  13. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/__init__.py +0 -0
  14. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/datasets/__init__.py +0 -0
  15. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/datasets/datasets.py +0 -0
  16. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/datasets/datasets_constants.py +0 -0
  17. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/datasets/loaders.py +0 -0
  18. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
  19. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
  20. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/datasets/metadata/metadata.csv +0 -0
  21. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/datasets/statistics.py +0 -0
  22. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/metrics/__init__.py +0 -0
  23. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/metrics/provider_metrics.py +0 -0
  24. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/pytables_data/__init__.py +0 -0
  25. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
  26. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/pytables_data/data_scalers.py +0 -0
  27. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/utils/__init__.py +0 -0
  28. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/utils/download.py +0 -0
  29. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/utils/fileutils.py +0 -0
  30. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo/utils/random.py +0 -0
  31. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo.egg-info/SOURCES.txt +0 -0
  32. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
  33. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo.egg-info/requires.txt +0 -0
  34. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/cesnet_datazoo.egg-info/top_level.txt +0 -0
  35. {cesnet-datazoo-0.1.1 → cesnet-datazoo-0.1.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cesnet-datazoo
- Version: 0.1.1
+ Version: 0.1.3
  Summary: A toolkit for large network traffic datasets
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -43,7 +43,7 @@ Requires-Dist: twine; extra == "dev"
  </p>

  [![](https://img.shields.io/badge/license-BSD-blue.svg)](https://github.com/CESNET/cesnet-datazoo/blob/main/LICENCE)
- [![](https://img.shields.io/badge/docs-mkdocs_material-blue.svg)](https://cesnet.github.io/cesnet-datazoo/)
+ [![](https://img.shields.io/badge/docs-cesnet--datazoo-blue.svg)](https://cesnet.github.io/cesnet-datazoo/)
  [![](https://img.shields.io/badge/python->=3.10-blue.svg)](https://pypi.org/project/cesnet-datazoo/)
  [![](https://img.shields.io/pypi/v/cesnet-datazoo)](https://pypi.org/project/cesnet-datazoo/)

@@ -58,9 +58,12 @@ The goal of this project is to provide tools for working with large network traf
  - Built on suitable data structures for experiments with large datasets. There are several caching mechanisms to make repeated runs faster, for example, when searching for the best model configuration.
  - Datasets are offered in multiple sizes to give users an option to start the experiments at a smaller scale (also faster dataset download, disk space, etc.). The default is the `S` size containing 25 million samples.

- ## Datasets
+ :brain: :brain: See a related project [CESNET Models](https://github.com/CESNET/cesnet-models) providing pre-trained neural networks for traffic classification. :brain: :brain:
+
+ :notebook: :notebook: Example Jupyter notebooks are included in a separate [CESNET Traffic Classification Examples](https://github.com/CESNET/cesnet-tcexamples) repo. :notebook: :notebook:

- The package is able to handle the following datasets:
+ ## Datasets
+ The following datasets are available in the `cesnet-datazoo` package:

  | Name | CESNET-TLS22 | CESNET-QUIC22 | CESNET-TLS-Year22 |
  | ---- | ------------ | ------------- | ----------------- |
@@ -120,6 +123,6 @@ See more examples in the [documentation](https://cesnet.github.io/cesnet-datazoo
  Jan Luxemburk and Karel Hynek <br>
  CoNEXT Workshop on Explainable and Safety Bounded, Fidelitous, Machine Learning for Networking (SAFE), 2023

- ### Acknowledgements
+ ## Acknowledgments

- This project was supported by the Ministry of the Interior of the Czech Republic, grant No. VJ02010024: Flow-Based Encrypted Traffic Analysis.
+ This project was supported by the Ministry of the Interior of the Czech Republic, grant No. VJ02010024: Flow-Based Encrypted Traffic Analysis.
@@ -3,7 +3,7 @@
  </p>

  [![](https://img.shields.io/badge/license-BSD-blue.svg)](https://github.com/CESNET/cesnet-datazoo/blob/main/LICENCE)
- [![](https://img.shields.io/badge/docs-mkdocs_material-blue.svg)](https://cesnet.github.io/cesnet-datazoo/)
+ [![](https://img.shields.io/badge/docs-cesnet--datazoo-blue.svg)](https://cesnet.github.io/cesnet-datazoo/)
  [![](https://img.shields.io/badge/python->=3.10-blue.svg)](https://pypi.org/project/cesnet-datazoo/)
  [![](https://img.shields.io/pypi/v/cesnet-datazoo)](https://pypi.org/project/cesnet-datazoo/)

@@ -18,9 +18,12 @@ The goal of this project is to provide tools for working with large network traf
  - Built on suitable data structures for experiments with large datasets. There are several caching mechanisms to make repeated runs faster, for example, when searching for the best model configuration.
  - Datasets are offered in multiple sizes to give users an option to start the experiments at a smaller scale (also faster dataset download, disk space, etc.). The default is the `S` size containing 25 million samples.

- ## Datasets
+ :brain: :brain: See a related project [CESNET Models](https://github.com/CESNET/cesnet-models) providing pre-trained neural networks for traffic classification. :brain: :brain:
+
+ :notebook: :notebook: Example Jupyter notebooks are included in a separate [CESNET Traffic Classification Examples](https://github.com/CESNET/cesnet-tcexamples) repo. :notebook: :notebook:

- The package is able to handle the following datasets:
+ ## Datasets
+ The following datasets are available in the `cesnet-datazoo` package:

  | Name | CESNET-TLS22 | CESNET-QUIC22 | CESNET-TLS-Year22 |
  | ---- | ------------ | ------------- | ----------------- |
@@ -80,6 +83,6 @@ See more examples in the [documentation](https://cesnet.github.io/cesnet-datazoo
  Jan Luxemburk and Karel Hynek <br>
  CoNEXT Workshop on Explainable and Safety Bounded, Fidelitous, Machine Learning for Networking (SAFE), 2023

- ### Acknowledgements
+ ## Acknowledgments

- This project was supported by the Ministry of the Interior of the Czech Republic, grant No. VJ02010024: Flow-Based Encrypted Traffic Analysis.
+ This project was supported by the Ministry of the Interior of the Czech Republic, grant No. VJ02010024: Flow-Based Encrypted Traffic Analysis.
@@ -133,7 +133,7 @@ class DatasetConfig():

      Attributes:
          need_train_set: Use to disable the train set. `Default: True`
-         need_val_set: Use to disable the validation set. When `need_train_set` is false, the validation set will also be disabled. `Default: True`
+         need_val_set: Use to disable the validation set. `Default: True`
          need_test_set: Use to disable the test set. `Default: True`
          train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
          train_dates: Dates used for creating a train set.
@@ -161,7 +161,7 @@ class DatasetConfig():
          val_workers: Number of workers for loading validation data. `0` means that the data will be loaded in the main process. `Default: 1`
          batch_size: Number of samples per batch. `Default: 192`
          test_batch_size: Number of samples per batch for loading validation and test data. `Default: 2048`
-         preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: True`
+         preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: False`
          preload_test: Whether to dump the test set with `numpy.savez_compressed` and preload it in future runs. `Default: False`
          train_size: Size of the train set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
          val_known_size: Size of the validation set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
@@ -238,7 +238,7 @@ class DatasetConfig():
      val_workers: int = 1
      batch_size: int = 192
      test_batch_size: int = 2048
-     preload_val: bool = True
+     preload_val: bool = False
      preload_test: bool = False
      train_size: int | Literal["all"] = "all"
      val_known_size: int | Literal["all"] = "all"
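Note on the two `config.py` hunks above: the `preload_val` default flips from `True` to `False`, so caching the validation set to disk is now opt-in. A minimal sketch of opting back in — the dataset class, its constructor arguments, and the `set_dataset_config_and_initialize` call follow the package's documented usage, but treat the exact names here as assumptions rather than guarantees:

```python
from cesnet_datazoo.config import DatasetConfig
from cesnet_datazoo.datasets import CESNET_QUIC22

# Initialize a dataset (the path and size are illustrative)
dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")

config = DatasetConfig(
    dataset=dataset,
    preload_val=True,  # the 0.1.1 behavior; since 0.1.3 the default is False
)
dataset.set_dataset_config_and_initialize(config)
```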
@@ -268,7 +268,6 @@ class DatasetConfig():
          self.database_path = dataset.database_path

          if not self.need_train_set:
-             self.need_val_set = False
              if self.apps_selection != AppSelection.FIXED:
                  raise ValueError("Application selection has to be fixed when need_train_set is false")
              if (len(self.train_dates) > 0 or self.train_period_name != ""):
@@ -299,17 +298,24 @@ class DatasetConfig():
          self.test_period_name = dataset.default_test_period_name
          self.test_dates = dataset.time_periods[dataset.default_test_period_name]
          # Configure val dates
-         if (not self.need_val_set or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
-             raise ValueError("val_dates and val_period_name cannot be specified when need_val_set is false or the validation approach is split-from-train")
-         if self.val_approach == ValidationApproach.VALIDATION_DATES:
-             if len(self.val_dates) > 0 and self.val_period_name == "":
-                 raise ValueError("val_period_name has to be specified when val_dates are set")
-             if len(self.val_dates) == 0 and self.val_period_name != "":
-                 if self.val_period_name not in dataset.time_periods:
-                     raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
-                 self.val_dates = dataset.time_periods[self.val_period_name]
-             if len(self.val_dates) == 0 and self.val_period_name == "":
-                 raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when the validation approach is validation-dates")
+         if not self.need_val_set:
+             if len(self.val_dates) > 0 or self.val_period_name != "" or self.val_approach != ValidationApproach.SPLIT_FROM_TRAIN:
+                 raise ValueError("val_dates, val_period_name, and val_approach cannot be specified when need_val_set is false")
+         else:
+             if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
+                 if len(self.val_dates) > 0 or self.val_period_name != "":
+                     raise ValueError("val_dates and val_period_name cannot be specified when the validation approach is split-from-train")
+                 if not self.need_train_set:
+                     raise ValueError("Cannot use the split-from-train validation approach when need_train_set is false. Either use the validation-dates approach or set need_val_set to false.")
+             elif self.val_approach == ValidationApproach.VALIDATION_DATES:
+                 if len(self.val_dates) > 0 and self.val_period_name == "":
+                     raise ValueError("val_period_name has to be specified when val_dates are set")
+                 if len(self.val_dates) == 0 and self.val_period_name != "":
+                     if self.val_period_name not in dataset.time_periods:
+                         raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
+                     self.val_dates = dataset.time_periods[self.val_period_name]
+                 if len(self.val_dates) == 0 and self.val_period_name == "":
+                     raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when the validation approach is validation-dates")
          # Check if train, val, and test dates are available in the dataset
          bad_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
          bad_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
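The rewritten block above separates the `need_val_set` check from the choice of validation approach and adds an explicit error for combining split-from-train with a disabled train set. A hedged sketch of the three configurations the new logic accepts, reusing the `dataset` instance from the earlier sketch (enum members come from the hunk; the remaining constructor arguments and the example period name are assumptions):

```python
from cesnet_datazoo.config import DatasetConfig, ValidationApproach

# 1) Split validation data off the train set (the default approach);
#    val_dates and val_period_name must stay unset, and a train set is required.
cfg1 = DatasetConfig(dataset=dataset, val_approach=ValidationApproach.SPLIT_FROM_TRAIN)

# 2) Use dedicated validation dates; a val_period_name known to the dataset
#    fills val_dates from dataset.time_periods (period name is hypothetical).
cfg2 = DatasetConfig(dataset=dataset,
                     val_approach=ValidationApproach.VALIDATION_DATES,
                     val_period_name="W-2022-44")

# 3) No validation set at all; leave every val_* option at its default.
cfg3 = DatasetConfig(dataset=dataset, need_val_set=False)
```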
@@ -326,12 +332,11 @@ class DatasetConfig():
          # Check time order of train, val, and test periods
          train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
          test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
-         if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
+         if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
              warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
          if self.val_approach == ValidationApproach.VALIDATION_DATES:
-             # Train dates are guaranteed to be set
              val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
-             if min(val_dates) <= max(train_dates):
+             if len(train_dates) > 0 and min(val_dates) <= max(train_dates):
                  warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
              if len(test_dates) > 0 and min(test_dates) <= max(val_dates):
                  warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
@@ -475,7 +480,7 @@ class DatasetConfig():

      def _get_val_tables_paths(self) -> list[str]:
          if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
-             return list(map(lambda t: f"/flows/D{t}", self.train_dates))
+             return self._get_train_tables_paths()
          return list(map(lambda t: f"/flows/D{t}", self.val_dates))

      def _get_test_tables_paths(self) -> list[str]:
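In 0.1.1, `_get_val_tables_paths` rebuilt the train table paths inline; 0.1.3 delegates to `_get_train_tables_paths()`, so split-from-train validation always reads from exactly the tables the train set uses. The mapping itself is simple — a standalone illustration with hypothetical dates:

```python
# Dates are stored as YYYYMMDD strings; each maps to a PyTables table path.
train_dates = ["20220418", "20220419"]
paths = [f"/flows/D{d}" for d in train_dates]
assert paths == ["/flows/D20220418", "/flows/D20220419"]
```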
@@ -21,6 +21,7 @@ SELECTED_TCP_FLAGS = ["FLAG_CWR", "FLAG_CWR_REV", "FLAG_ECE", "FLAG_ECE_REV", "F
  PHIST_BIN_COUNT = 8

  # Column names
+ ID_COLUMN = "ID"
  APP_COLUMN = "APP"
  CATEGORY_COLUMN = "CATEGORY"
  PPI_COLUMN = "PPI"
@@ -28,7 +28,7 @@ from cesnet_datazoo.pytables_data.indices_setup import (IndicesTuple, compute_kn
                                                       date_weight_sample_train_indices,
                                                       init_or_load_test_indices,
                                                       init_or_load_train_indices,
-                                                      init_or_load_val_indices,
+                                                      init_or_load_val_indices, no_indices,
                                                       subset_and_sort_indices)
  from cesnet_datazoo.pytables_data.pytables_dataset import PyTablesDataset, worker_init_fn
  from cesnet_datazoo.utils.class_info import ClassInfo, create_class_info
@@ -537,10 +537,10 @@ class CesnetDataset():
              raise ValueError(f"Requested number of samples for weight sampling ({num_samples}) is larger than the number of available train samples ({len(train_indices)})")
          train_indices = date_weight_sample_train_indices(dataset_config=dataset_config, train_indices=train_indices, num_samples=num_samples)
      elif dataset_config.apps_selection == AppSelection.FIXED:
-         known_apps = dataset_config.apps_selection_fixed_known
-         unknown_apps = dataset_config.apps_selection_fixed_unknown
-         train_indices = np.zeros((0,3), dtype=np.int64)
-         train_unknown_indices = np.zeros((0,3), dtype=np.int64)
+         known_apps = sorted(dataset_config.apps_selection_fixed_known)
+         unknown_apps = sorted(dataset_config.apps_selection_fixed_unknown)
+         train_indices = no_indices()
+         train_unknown_indices = no_indices()
      else:
          raise ValueError("Either need train set or the fixed application selection")
      # Initialize validation set
@@ -577,8 +577,8 @@ class CesnetDataset():
                  test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,
                  stratify=train_labels, shuffle=True, random_state=train_val_rng)
      else:
-         val_known_indices = np.zeros((0,3), dtype=np.int64)
-         val_unknown_indices = np.zeros((0,3), dtype=np.int64)
+         val_known_indices = no_indices()
+         val_unknown_indices = no_indices()
          val_data_path = None
      # Initialize test set
      if dataset_config.need_test_set:
@@ -588,8 +588,8 @@ class CesnetDataset():
              tables_app_enum=self._tables_app_enum,
              disable_indices_cache=disable_indices_cache,)
      else:
-         test_known_indices = np.zeros((0,3), dtype=np.int64)
-         test_unknown_indices = np.zeros((0,3), dtype=np.int64)
+         test_known_indices = no_indices()
+         test_unknown_indices = no_indices()
          test_data_path = None
      # Fit scalers if needed
      if (dataset_config.ppi_transform is not None and dataset_config.ppi_transform.needs_fitting or
@@ -636,7 +636,7 @@ class CesnetDataset():
      assert val_data_path is not None
      val_dataset = PyTablesDataset(
          database_path=dataset_config.database_path,
-         tables_paths=dataset_config._get_train_tables_paths(),
+         tables_paths=dataset_config._get_val_tables_paths(),
          indices=dataset_indices.val_known_indices,
          tables_app_enum=self._tables_app_enum,
          tables_cat_enum=self._tables_cat_enum,
@@ -8,8 +8,8 @@ from cesnet_datazoo.utils.class_info import ClassInfo

  def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.ndarray, labels: list[int], class_info: ClassInfo, digits: int = 2, zero_division: int = 0) -> tuple[str, dict[str, float]]:
      p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
-                                                   labels=labels,
-                                                   zero_division=zero_division)
+                                                  labels=labels,
+                                                  zero_division=zero_division)
      sc_p, sc_r, sc_f1 = per_app_provider_metrics(cm, class_info=class_info)
      predicted_unknown = cm[:, -1]
      with np.errstate(divide="ignore", invalid="ignore"):
@@ -132,7 +132,7 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str
          np.save(os.path.join(val_data_path, "val_known_indices.npy"), val_known_indices)
          np.save(os.path.join(val_data_path, "val_unknown_indices.npy"), val_unknown_indices)
      else:
-         val_known_indices = np.load(os.path.join(val_data_path, "val_known_indices.npu"))
+         val_known_indices = np.load(os.path.join(val_data_path, "val_known_indices.npy"))
          val_unknown_indices = np.load(os.path.join(val_data_path, "val_unknown_indices.npy"))
      return val_known_indices, val_unknown_indices, val_data_path

@@ -162,3 +162,6 @@ def init_train_data(train_data_path: str):
  def init_test_data(test_data_path: str):
      os.makedirs(test_data_path, exist_ok=True)
      os.makedirs(os.path.join(test_data_path, "preload"), exist_ok=True)
+
+ def no_indices() -> np.ndarray:
+     return np.zeros((0,3), dtype=np.int64)
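The new `no_indices` helper gives the `(0, 3)` int64 placeholder used throughout `cesnet_dataset.py` (see the hunks above) a single definition and a name. A small sketch of why the shape matters: the empty array keeps the same column layout as populated index arrays, so downstream code needs no special cases (what the three columns mean beyond `INDICES_TABLE_POS` and `INDICES_INDEX_POS` from `constants.py` is not spelled out in this diff):

```python
import numpy as np

def no_indices() -> np.ndarray:
    return np.zeros((0, 3), dtype=np.int64)

empty = no_indices()
assert empty.shape == (0, 3) and empty.dtype == np.int64
# Concatenating with real (n, 3) index arrays works without branching:
real = np.array([[0, 42, 7]], dtype=np.int64)  # hypothetical single index row
assert np.concatenate([empty, real]).shape == (1, 3)
```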
@@ -16,7 +16,8 @@ from typing_extensions import assert_never

  from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
                                     TrainDataParams)
- from cesnet_datazoo.constants import APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN
+ from cesnet_datazoo.constants import (APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN,
+                                       QUIC_SNI_COLUMN, TLS_SNI_COLUMN)
  from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
                                                       split_apps_topx_with_provider_groups)

@@ -66,6 +67,7 @@ class PyTablesDataset(Dataset):
          self.target_transform = target_transform
          self.return_tensors = return_tensors
          self.return_all_fields = return_all_fields
+         self.sni_column = TLS_SNI_COLUMN if TLS_SNI_COLUMN in self.other_fields else QUIC_SNI_COLUMN if QUIC_SNI_COLUMN in self.other_fields else None

          self.preload = preload
          self.preload_blob = preload_blob
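The added `sni_column` line is a chained conditional expression: prefer the TLS SNI field when it is among the requested `other_fields`, fall back to the QUIC SNI field, otherwise `None`. The same logic written as a standalone function — the string values of the two constants here are assumptions; the real values live in `cesnet_datazoo.constants`:

```python
from typing import Optional

TLS_SNI_COLUMN = "TLS_SNI"    # assumed value
QUIC_SNI_COLUMN = "QUIC_SNI"  # assumed value

def pick_sni_column(other_fields: list[str]) -> Optional[str]:
    # First matching SNI field wins; None means no SNI column was requested.
    if TLS_SNI_COLUMN in other_fields:
        return TLS_SNI_COLUMN
    if QUIC_SNI_COLUMN in other_fields:
        return QUIC_SNI_COLUMN
    return None

assert pick_sni_column(["QUIC_SNI", "DURATION"]) == "QUIC_SNI"
assert pick_sni_column(["DURATION"]) is None
```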
@@ -179,7 +181,7 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
      start_time = time.time()
      for i, table_path in enumerate(train_data_params.train_tables_paths):
          all_app_labels[i] = train_tables[i].read(field=APP_COLUMN)
-         log.info(f"Reading app column for train table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+         log.info(f"Reading app column for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
          app_counts = app_counts.add(pd.Series(all_app_labels[i]).value_counts(), fill_value=0)
      database.close()
      # Handle disabled apps and apps with less than min_samples_per_app samples
@@ -223,13 +225,15 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
      else:
          known_apps = train_data_params.apps_selection_fixed_known
          unknown_apps = train_data_params.apps_selection_fixed_unknown
+     known_apps = sorted(known_apps)
+     unknown_apps = sorted(unknown_apps)
      known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
      unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]

      train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
      rng.shuffle(train_known_indices)
      rng.shuffle(train_unknown_indices)
-     log.info(f"Processing train indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+     log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
      return train_known_indices, train_unknown_indices, known_apps, unknown_apps

  def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
@@ -240,7 +244,7 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, tabl
      start_time = time.time()
      for i, table_path in enumerate(test_data_params.test_tables_paths):
          base_labels[i] = test_tables[i].read(field=APP_COLUMN)
-         log.info(f"Reading app column for test table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+         log.info(f"Reading app column for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
          base_indices[i] = np.arange(len(test_tables[i]))
      database.close()
      known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
@@ -248,7 +252,7 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, tabl
      test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
      rng.shuffle(test_known_indices)
      rng.shuffle(test_unknown_indices)
-     log.info(f"Processing test indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+     log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
      return test_known_indices, test_unknown_indices

  def load_database(database_path: str, tables_paths: Optional[list[str]] = None, mode: str = "r") -> tuple[tb.File, dict[int, Any]]: # dict[int, tb.Table]
@@ -23,8 +23,6 @@ class ClassInfo:
      categories_mapping: dict[str, Optional[str]]

  def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps: list[str], unknown_apps: list[str]) -> ClassInfo:
-     known_apps = sorted(known_apps)
-     unknown_apps = sorted(unknown_apps)
      target_names_arr = encoder.classes_
      assert known_apps == list(target_names_arr[:-1])
      group_matrix = np.array([[a == b or
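This removal pairs with the `sorted()` calls added in `cesnet_dataset.py` and `pytables_data/pytables_dataset.py` above: the app lists are now sorted once at selection time, and `create_class_info` relies on its inputs arriving pre-sorted — the assert ties `known_apps` to the order stored in the label encoder. A sketch of that invariant with hypothetical app names (the encoder here is built by hand; the package's own encoder construction is not shown in this diff):

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

known_apps = sorted(["spotify", "discord", "google-www"])
encoder = LabelEncoder()
# Known apps in sorted order, with a trailing class for unknown traffic:
encoder.classes_ = np.array(known_apps + ["unknown"])
assert known_apps == list(encoder.classes_[:-1])  # the invariant asserted above
```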
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cesnet-datazoo
- Version: 0.1.1
+ Version: 0.1.3
  Summary: A toolkit for large network traffic datasets
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -43,7 +43,7 @@ Requires-Dist: twine; extra == "dev"
  </p>

  [![](https://img.shields.io/badge/license-BSD-blue.svg)](https://github.com/CESNET/cesnet-datazoo/blob/main/LICENCE)
- [![](https://img.shields.io/badge/docs-mkdocs_material-blue.svg)](https://cesnet.github.io/cesnet-datazoo/)
+ [![](https://img.shields.io/badge/docs-cesnet--datazoo-blue.svg)](https://cesnet.github.io/cesnet-datazoo/)
  [![](https://img.shields.io/badge/python->=3.10-blue.svg)](https://pypi.org/project/cesnet-datazoo/)
  [![](https://img.shields.io/pypi/v/cesnet-datazoo)](https://pypi.org/project/cesnet-datazoo/)

@@ -58,9 +58,12 @@ The goal of this project is to provide tools for working with large network traf
  - Built on suitable data structures for experiments with large datasets. There are several caching mechanisms to make repeated runs faster, for example, when searching for the best model configuration.
  - Datasets are offered in multiple sizes to give users an option to start the experiments at a smaller scale (also faster dataset download, disk space, etc.). The default is the `S` size containing 25 million samples.

- ## Datasets
+ :brain: :brain: See a related project [CESNET Models](https://github.com/CESNET/cesnet-models) providing pre-trained neural networks for traffic classification. :brain: :brain:
+
+ :notebook: :notebook: Example Jupyter notebooks are included in a separate [CESNET Traffic Classification Examples](https://github.com/CESNET/cesnet-tcexamples) repo. :notebook: :notebook:

- The package is able to handle the following datasets:
+ ## Datasets
+ The following datasets are available in the `cesnet-datazoo` package:

  | Name | CESNET-TLS22 | CESNET-QUIC22 | CESNET-TLS-Year22 |
  | ---- | ------------ | ------------- | ----------------- |
@@ -120,6 +123,6 @@ See more examples in the [documentation](https://cesnet.github.io/cesnet-datazoo
  Jan Luxemburk and Karel Hynek <br>
  CoNEXT Workshop on Explainable and Safety Bounded, Fidelitous, Machine Learning for Networking (SAFE), 2023

- ### Acknowledgements
+ ## Acknowledgments

- This project was supported by the Ministry of the Interior of the Czech Republic, grant No. VJ02010024: Flow-Based Encrypted Traffic Analysis.
+ This project was supported by the Ministry of the Interior of the Czech Republic, grant No. VJ02010024: Flow-Based Encrypted Traffic Analysis.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "cesnet-datazoo"
- version = "0.1.1"
+ version = "0.1.3"
  authors = [
      {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
      {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},