cesnet-datazoo 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/PKG-INFO +3 -6
  2. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/README.md +2 -2
  3. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/config.py +30 -25
  4. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/constants.py +1 -0
  5. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/cesnet_dataset.py +10 -10
  6. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/metadata/metadata.csv +1 -1
  7. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/data_scalers.py +3 -37
  8. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/indices_setup.py +4 -1
  9. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/pytables_dataset.py +9 -5
  10. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/class_info.py +0 -2
  11. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/PKG-INFO +3 -6
  12. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/requires.txt +0 -3
  13. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/pyproject.toml +1 -4
  14. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/LICENCE +0 -0
  15. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/__init__.py +0 -0
  16. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/__init__.py +0 -0
  17. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/datasets.py +0 -0
  18. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/datasets_constants.py +0 -0
  19. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/loaders.py +0 -0
  20. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
  21. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
  22. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/datasets/statistics.py +0 -0
  23. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/metrics/__init__.py +0 -0
  24. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/metrics/classification_report.py +0 -0
  25. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/metrics/provider_metrics.py +0 -0
  26. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/__init__.py +0 -0
  27. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
  28. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/__init__.py +0 -0
  29. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/download.py +0 -0
  30. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/fileutils.py +0 -0
  31. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo/utils/random.py +0 -0
  32. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/SOURCES.txt +0 -0
  33. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
  34. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/cesnet_datazoo.egg-info/top_level.txt +0 -0
  35. {cesnet-datazoo-0.1.0 → cesnet-datazoo-0.1.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cesnet-datazoo
- Version: 0.1.0
+ Version: 0.1.2
  Summary: A toolkit for large network traffic datasets
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -29,10 +29,7 @@ Requires-Dist: tables>=3.8.0
  Requires-Dist: torch>=1.10
  Requires-Dist: tqdm
  Provides-Extra: dev
- Requires-Dist: black; extra == "dev"
  Requires-Dist: build; extra == "dev"
- Requires-Dist: jupyterlab; extra == "dev"
- Requires-Dist: lightgbm; extra == "dev"
  Requires-Dist: mkdocs-autorefs; extra == "dev"
  Requires-Dist: mkdocs-material-extensions; extra == "dev"
  Requires-Dist: mkdocs-material; extra == "dev"
@@ -57,7 +54,7 @@ The goal of this project is to provide tools for working with large network traf
  - Extensive configuration options for:
      - Selection of train, validation, and test periods.
      - Selection of application classes and splitting classes between *known* and *unknown*.
-     - Feature scaling.
+     - Data transformations, such as feature scaling.
  - Built on suitable data structures for experiments with large datasets. There are several caching mechanisms to make repeated runs faster, for example, when searching for the best model configuration.
  - Datasets are offered in multiple sizes to give users an option to start the experiments at a smaller scale (also faster dataset download, disk space, etc.). The default is the `S` size containing 25 million samples.

@@ -72,7 +69,7 @@ The package is able to handle the following datasets:
  | _Collection duration_ | 2 weeks | 4 weeks | 1 year |
  | _Collection period_ | 4.10.2021 - 17.10.2021 | 31.10.2022 - 27.11.2022 | 1.1.2022 - 31.12.2022 |
  | _Application count_ | 191 | 102 | 180 |
- | _Available samples_ | 141720670 | 153226273 | 507739073 |
+ | _Available samples_ | 141392195 | 153226273 | 507739073 |
  | _Available dataset sizes_ | XS, S, M, L | XS, S, M, L | XS, S, M, L |
  | _Cite_ | [https://doi.org/10.1016/j.comnet.2022.109467](https://doi.org/10.1016/j.comnet.2022.109467) | [https://doi.org/10.1016/j.dib.2023.108888](https://doi.org/10.1016/j.dib.2023.108888) | |
  | _Zenodo URL_ | [https://zenodo.org/record/7965515](https://zenodo.org/record/7965515) | [https://zenodo.org/record/7963302](https://zenodo.org/record/7963302) | |
@@ -14,7 +14,7 @@ The goal of this project is to provide tools for working with large network traf
  - Extensive configuration options for:
      - Selection of train, validation, and test periods.
      - Selection of application classes and splitting classes between *known* and *unknown*.
-     - Feature scaling.
+     - Data transformations, such as feature scaling.
  - Built on suitable data structures for experiments with large datasets. There are several caching mechanisms to make repeated runs faster, for example, when searching for the best model configuration.
  - Datasets are offered in multiple sizes to give users an option to start the experiments at a smaller scale (also faster dataset download, disk space, etc.). The default is the `S` size containing 25 million samples.

@@ -29,7 +29,7 @@ The package is able to handle the following datasets:
  | _Collection duration_ | 2 weeks | 4 weeks | 1 year |
  | _Collection period_ | 4.10.2021 - 17.10.2021 | 31.10.2022 - 27.11.2022 | 1.1.2022 - 31.12.2022 |
  | _Application count_ | 191 | 102 | 180 |
- | _Available samples_ | 141720670 | 153226273 | 507739073 |
+ | _Available samples_ | 141392195 | 153226273 | 507739073 |
  | _Available dataset sizes_ | XS, S, M, L | XS, S, M, L | XS, S, M, L |
  | _Cite_ | [https://doi.org/10.1016/j.comnet.2022.109467](https://doi.org/10.1016/j.comnet.2022.109467) | [https://doi.org/10.1016/j.dib.2023.108888](https://doi.org/10.1016/j.dib.2023.108888) | |
  | _Zenodo URL_ | [https://zenodo.org/record/7965515](https://zenodo.org/record/7965515) | [https://zenodo.org/record/7963302](https://zenodo.org/record/7963302) | |
@@ -113,7 +113,7 @@ class DatasetConfig():

  - Train, validation, test sets (dates, sizes, validation approach).
  - Application selection — either the standard closed-world setting (only *known* classes) or the open-world setting (*known* and *unknown* classes).
- - Feature scaling. See the [data features][features] page for more information. DOCS_TODO
+ - Data transformations. See the [transforms][transforms] page for more information.
  - Dataloader options like batch sizes, order of loading, or number of workers.

  When initializing this class, pass a [`CesnetDataset`][datasets.cesnet_dataset.CesnetDataset] instance to be configured and the desired configuration. Available options are [here][config.DatasetConfig--configuration-options].
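For orientation, a minimal sketch of the initialization flow this docstring describes, assuming the `CESNET_QUIC22` class from `cesnet_datazoo.datasets`, a hypothetical data root, and an assumed initialization hook; exact signatures may differ from the package API:

```python
from cesnet_datazoo.config import DatasetConfig
from cesnet_datazoo.datasets import CESNET_QUIC22

# Hypothetical data root; size picks one of the XS/S/M/L dataset variants.
dataset = CESNET_QUIC22("/data/CESNET-QUIC22", size="XS")

# Configure the dataset; unset options fall back to the defaults listed below.
config = DatasetConfig(dataset=dataset)
dataset.set_dataset_config_and_initialize(config)  # assumed initialization hook
```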
@@ -133,7 +133,7 @@ class DatasetConfig():

  Attributes:
      need_train_set: Use to disable the train set. `Default: True`
-     need_val_set: Use to disable the validation set. When `need_train_set` is false, the validation set will also be disabled. `Default: True`
+     need_val_set: Use to disable the validation set. `Default: True`
      need_test_set: Use to disable the test set. `Default: True`
      train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
      train_dates: Dates used for creating a train set.
@@ -161,7 +161,7 @@ class DatasetConfig():
      val_workers: Number of workers for loading validation data. `0` means that the data will be loaded in the main process. `Default: 1`
      batch_size: Number of samples per batch. `Default: 192`
      test_batch_size: Number of samples per batch for loading validation and test data. `Default: 2048`
-     preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: True`
+     preload_val: Whether to dump the validation set with `numpy.savez_compressed` and preload it in future runs. Useful when running a lot of experiments with the same dataset configuration. `Default: False`
      preload_test: Whether to dump the test set with `numpy.savez_compressed` and preload it in future runs. `Default: False`
      train_size: Size of the train set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
      val_known_size: Size of the validation set. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets]. `Default: all`
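The `preload_val`/`preload_test` mechanism described here is a `numpy.savez_compressed` round-trip; a minimal sketch, with an illustrative file name and array keys rather than the package's internal layout:

```python
import numpy as np

val_data = np.random.rand(100, 8).astype(np.float32)  # stand-in for the assembled set
val_labels = np.random.randint(0, 10, size=100)

# First run: dump the validation set to a compressed archive.
np.savez_compressed("val-preload.npz", data=val_data, labels=val_labels)

# Later runs: load the archive instead of re-reading the PyTables database.
cached = np.load("val-preload.npz")
val_data, val_labels = cached["data"], cached["labels"]
```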
@@ -176,10 +176,10 @@ class DatasetConfig():
      use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
      use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
      use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
-     fit_scalers_samples: Fraction of train samples used for fitting feature scalers, if float. The absolute number of samples otherwise. `Default: 0.25` DOCS_TODO
-     ppi_transform: Transform function for PPI sequences. `Default: None` DOCS_TODO
-     flowstats_transform: Transform function for flow statistics. `Default: None`
-     flowstats_phist_transform: Transform function for packet histograms. `Default: None`
+     fit_scalers_samples: Used when a scaling transformation is configured and requires fitting. Fraction of train samples used for fitting, if float. The absolute number of samples otherwise. `Default: 0.25`
+     ppi_transform: Transform function for PPI sequences. See the [transforms][transforms] page for more information. `Default: None`
+     flowstats_transform: Transform function for flow statistics. See the [transforms][transforms] page for more information. `Default: None`
+     flowstats_phist_transform: Transform function for packet histograms. See the [transforms][transforms] page for more information. `Default: None`

  # How to configure train, validation, and test sets
  There are three options for how to define train/validation/test dates.
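A hedged sketch of wiring scaling transforms into these fields; the `cesnet_datazoo.transforms` import path and the no-argument constructors are assumptions inferred from the `ClipAndScalePPI` and `ClipAndScaleFlowstats` names appearing later in this diff:

```python
from cesnet_datazoo.config import DatasetConfig
from cesnet_datazoo.transforms import ClipAndScalePPI, ClipAndScaleFlowstats  # assumed path

config = DatasetConfig(
    dataset=dataset,                               # a CesnetDataset instance
    ppi_transform=ClipAndScalePPI(),               # scales packet sizes and inter-packet times
    flowstats_transform=ClipAndScaleFlowstats(),   # clips at quantiles, then scales
    fit_scalers_samples=0.25,                      # fraction of train samples used for fitting
)
```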
@@ -238,7 +238,7 @@ class DatasetConfig():
      val_workers: int = 1
      batch_size: int = 192
      test_batch_size: int = 2048
-     preload_val: bool = True
+     preload_val: bool = False
      preload_test: bool = False
      train_size: int | Literal["all"] = "all"
      val_known_size: int | Literal["all"] = "all"
@@ -268,7 +268,6 @@ class DatasetConfig():
      self.database_path = dataset.database_path

      if not self.need_train_set:
-         self.need_val_set = False
          if self.apps_selection != AppSelection.FIXED:
              raise ValueError("Application selection has to be fixed when need_train_set is false")
          if (len(self.train_dates) > 0 or self.train_period_name != ""):
@@ -281,7 +280,7 @@ class DatasetConfig():
          if self.train_period_name not in dataset.time_periods:
              raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
          self.train_dates = dataset.time_periods[self.train_period_name]
-     if len(self.train_dates) == 0 and self.test_period_name == "":
+     if len(self.train_dates) == 0 and self.train_period_name == "":
          self.train_period_name = dataset.default_train_period_name
          self.train_dates = dataset.time_periods[dataset.default_train_period_name]
      # Configure test dates
@@ -299,17 +298,24 @@ class DatasetConfig():
          self.test_period_name = dataset.default_test_period_name
          self.test_dates = dataset.time_periods[dataset.default_test_period_name]
      # Configure val dates
-     if (not self.need_val_set or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
-         raise ValueError("val_dates and val_period_name cannot be specified when need_val_set is false or the validation approach is split-from-train")
-     if self.val_approach == ValidationApproach.VALIDATION_DATES:
-         if len(self.val_dates) > 0 and self.val_period_name == "":
-             raise ValueError("val_period_name has to be specified when val_dates are set")
-         if len(self.val_dates) == 0 and self.val_period_name != "":
-             if self.val_period_name not in dataset.time_periods:
-                 raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
-             self.val_dates = dataset.time_periods[self.val_period_name]
-         if len(self.val_dates) == 0 and self.val_period_name == "":
-             raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when the validation approach is validation-dates")
+     if not self.need_val_set:
+         if len(self.val_dates) > 0 or self.val_period_name != "" or self.val_approach != ValidationApproach.SPLIT_FROM_TRAIN:
+             raise ValueError("val_dates, val_period_name, and val_approach cannot be specified when need_val_set is false")
+     else:
+         if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
+             if len(self.val_dates) > 0 or self.val_period_name != "":
+                 raise ValueError("val_dates and val_period_name cannot be specified when the validation approach is split-from-train")
+             if not self.need_train_set:
+                 raise ValueError("Cannot use the split-from-train validation approach when need_train_set is false. Either use the validation-dates approach or set need_val_set to false.")
+         elif self.val_approach == ValidationApproach.VALIDATION_DATES:
+             if len(self.val_dates) > 0 and self.val_period_name == "":
+                 raise ValueError("val_period_name has to be specified when val_dates are set")
+             if len(self.val_dates) == 0 and self.val_period_name != "":
+                 if self.val_period_name not in dataset.time_periods:
+                     raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
+                 self.val_dates = dataset.time_periods[self.val_period_name]
+             if len(self.val_dates) == 0 and self.val_period_name == "":
+                 raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when the validation approach is validation-dates")
      # Check if train, val, and test dates are available in the dataset
      bad_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
      bad_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
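Two hedged configuration sketches matching the branches above; the field names come straight from this diff, while the period name is hypothetical:

```python
from cesnet_datazoo.config import DatasetConfig, ValidationApproach

# a) Carve the validation set out of the train data; val_dates and
#    val_period_name must stay unset, per the checks above.
config_split = DatasetConfig(
    dataset=dataset,
    val_approach=ValidationApproach.SPLIT_FROM_TRAIN,
)

# b) Use dedicated validation dates; val_period_name must name a period
#    available in dataset.time_periods, exactly as the checks above enforce.
config_dates = DatasetConfig(
    dataset=dataset,
    val_approach=ValidationApproach.VALIDATION_DATES,
    val_period_name="W-2022-47",  # hypothetical period name
)
```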
@@ -326,12 +332,11 @@ class DatasetConfig():
      # Check time order of train, val, and test periods
      train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
      test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
-     if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
+     if len(train_dates) > 0 and len(test_dates) > 0 and min(test_dates) <= max(train_dates):
          warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
      if self.val_approach == ValidationApproach.VALIDATION_DATES:
-         # Train dates are guaranteed to be set
          val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
-         if min(val_dates) <= max(train_dates):
+         if len(train_dates) > 0 and min(val_dates) <= max(train_dates):
              warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
          if len(test_dates) > 0 and min(test_dates) <= max(val_dates):
              warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
@@ -475,7 +480,7 @@ class DatasetConfig():

  def _get_val_tables_paths(self) -> list[str]:
      if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
-         return list(map(lambda t: f"/flows/D{t}", self.train_dates))
+         return self._get_train_tables_paths()
      return list(map(lambda t: f"/flows/D{t}", self.val_dates))

  def _get_test_tables_paths(self) -> list[str]:
@@ -21,6 +21,7 @@ SELECTED_TCP_FLAGS = ["FLAG_CWR", "FLAG_CWR_REV", "FLAG_ECE", "FLAG_ECE_REV", "F
  PHIST_BIN_COUNT = 8

  # Column names
+ ID_COLUMN = "ID"
  APP_COLUMN = "APP"
  CATEGORY_COLUMN = "CATEGORY"
  PPI_COLUMN = "PPI"
@@ -28,7 +28,7 @@ from cesnet_datazoo.pytables_data.indices_setup import (IndicesTuple, compute_kn
                                                           date_weight_sample_train_indices,
                                                           init_or_load_test_indices,
                                                           init_or_load_train_indices,
-                                                          init_or_load_val_indices,
+                                                          init_or_load_val_indices, no_indices,
                                                           subset_and_sort_indices)
  from cesnet_datazoo.pytables_data.pytables_dataset import PyTablesDataset, worker_init_fn
  from cesnet_datazoo.utils.class_info import ClassInfo, create_class_info
@@ -537,10 +537,10 @@ class CesnetDataset():
              raise ValueError(f"Requested number of samples for weight sampling ({num_samples}) is larger than the number of available train samples ({len(train_indices)})")
          train_indices = date_weight_sample_train_indices(dataset_config=dataset_config, train_indices=train_indices, num_samples=num_samples)
      elif dataset_config.apps_selection == AppSelection.FIXED:
-         known_apps = dataset_config.apps_selection_fixed_known
-         unknown_apps = dataset_config.apps_selection_fixed_unknown
-         train_indices = np.zeros((0,3), dtype=np.int64)
-         train_unknown_indices = np.zeros((0,3), dtype=np.int64)
+         known_apps = sorted(dataset_config.apps_selection_fixed_known)
+         unknown_apps = sorted(dataset_config.apps_selection_fixed_unknown)
+         train_indices = no_indices()
+         train_unknown_indices = no_indices()
      else:
          raise ValueError("Either need train set or the fixed application selection")
      # Initialize validation set
@@ -577,8 +577,8 @@ class CesnetDataset():
              test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,
              stratify=train_labels, shuffle=True, random_state=train_val_rng)
      else:
-         val_known_indices = np.zeros((0,3), dtype=np.int64)
-         val_unknown_indices = np.zeros((0,3), dtype=np.int64)
+         val_known_indices = no_indices()
+         val_unknown_indices = no_indices()
          val_data_path = None
      # Initialize test set
      if dataset_config.need_test_set:
@@ -588,8 +588,8 @@ class CesnetDataset():
              tables_app_enum=self._tables_app_enum,
              disable_indices_cache=disable_indices_cache,)
      else:
-         test_known_indices = np.zeros((0,3), dtype=np.int64)
-         test_unknown_indices = np.zeros((0,3), dtype=np.int64)
+         test_known_indices = no_indices()
+         test_unknown_indices = no_indices()
          test_data_path = None
      # Fit scalers if needed
      if (dataset_config.ppi_transform is not None and dataset_config.ppi_transform.needs_fitting or
@@ -636,7 +636,7 @@ class CesnetDataset():
      assert val_data_path is not None
      val_dataset = PyTablesDataset(
          database_path=dataset_config.database_path,
-         tables_paths=dataset_config._get_train_tables_paths(),
+         tables_paths=dataset_config._get_val_tables_paths(),
          indices=dataset_indices.val_known_indices,
          tables_app_enum=self._tables_app_enum,
          tables_cat_enum=self._tables_cat_enum,
@@ -1,4 +1,4 @@
  Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic classes,PPI features,Flowstats features,Flowstats features boolean,Packet histograms,TCP features,Other fields,Cite,Zenodo URL,Related papers
- CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION",,,"FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
+ CESNET-TLS22,TLS,2022,2021,2 weeks,141392195,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION",,,"FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
  CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
  CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
@@ -17,18 +17,6 @@ from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_gene
  log = logging.getLogger(__name__)


- def get_scaler_attrs(scaler: StandardScaler | RobustScaler | MinMaxScaler) -> dict[str, list[float]]:
-     if isinstance(scaler, StandardScaler):
-         assert hasattr(scaler, "mean_") and scaler.mean_ is not None and hasattr(scaler, "scale_") and scaler.scale_ is not None
-         scaler_attrs = {"mean_": scaler.mean_.tolist(), "scale_": scaler.scale_.tolist()}
-     elif isinstance(scaler, RobustScaler):
-         assert hasattr(scaler, "center_") and hasattr(scaler, "scale_")
-         scaler_attrs = {"center_": scaler.center_.tolist(), "scale_": scaler.scale_.tolist()}
-     elif isinstance(scaler, MinMaxScaler):
-         assert hasattr(scaler, "min_") and hasattr(scaler, "scale_")
-         scaler_attrs = {"min_": scaler.min_.tolist(), "scale_": scaler.scale_.tolist()}
-     return scaler_attrs
-
  def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> None:
      # Define indices for fitting scalers
      if isinstance(dataset_config.fit_scalers_samples, int) and dataset_config.fit_scalers_samples > len(train_indices):
@@ -48,6 +36,7 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non

      clip_and_scale_ppi_transform = dataset_config.ppi_transform # TODO Fix after transforms composing is implemented
      clip_and_scale_flowstats_transform = dataset_config.flowstats_transform
+     train_data_path = dataset_config._get_train_data_path()

      # Fit the ClipAndScalePPI transform
      if clip_and_scale_ppi_transform is not None and clip_and_scale_ppi_transform.needs_fitting:
@@ -70,6 +59,7 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non
          train_psizes = np.concatenate((train_psizes, [0]))
          clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
          clip_and_scale_ppi_transform.needs_fitting = False
+         json.dump(clip_and_scale_ppi_transform.to_dict(), open(os.path.join(train_data_path, "transforms", "ppi-transform.json"), "w"), indent=4)

      # Fit the ClipAndScaleFlowstats transform
      if clip_and_scale_flowstats_transform is not None and clip_and_scale_flowstats_transform.needs_fitting:
@@ -82,29 +72,5 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non
          clip_and_scale_flowstats_transform.flowstats_scaler.fit(train_flowstats)
          clip_and_scale_flowstats_transform.flowstats_quantiles = flowstats_quantiles.tolist()
          clip_and_scale_flowstats_transform.needs_fitting = False
-
+         json.dump(clip_and_scale_flowstats_transform.to_dict(), open(os.path.join(train_data_path, "transforms", "flowstats-transform.json"), "w"), indent=4)
      log.info(f"Reading data and fitting scalers took {time.time() - start_time:.2f} seconds")
-     train_data_path = dataset_config._get_train_data_path()
-     if clip_and_scale_ppi_transform is not None:
-         ppi_transform_path = os.path.join(train_data_path, "transforms", "ppi-transform.json")
-         ppi_transform_dict = {
-             "psizes_scaler_enum": str(clip_and_scale_ppi_transform._psizes_scaler_enum),
-             "psizes_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.psizes_scaler),
-             "pszies_min": clip_and_scale_ppi_transform.pszies_min,
-             "psizes_max": clip_and_scale_ppi_transform.psizes_max,
-             "ipt_scaler_enum": str(clip_and_scale_ppi_transform._ipt_scaler_enum),
-             "ipt_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.ipt_scaler),
-             "ipt_min": clip_and_scale_ppi_transform.ipt_min,
-             "ipt_max": clip_and_scale_ppi_transform.ipt_max,
-         }
-         json.dump(ppi_transform_dict, open(ppi_transform_path, "w"), indent=4)
-     if clip_and_scale_flowstats_transform is not None:
-         assert clip_and_scale_flowstats_transform.flowstats_quantiles is not None
-         flowstats_transform_path = os.path.join(train_data_path, "transforms", "flowstats-transform.json")
-         flowstats_transform_dict = {
-             "flowstats_scaler_enum": str(clip_and_scale_flowstats_transform._flowstats_scaler_enum),
-             "flowstats_scaler_attrs": get_scaler_attrs(clip_and_scale_flowstats_transform.flowstats_scaler),
-             "flowstats_quantiles": clip_and_scale_flowstats_transform.flowstats_quantiles,
-             "quantile_clip": clip_and_scale_flowstats_transform.quantile_clip,
-         }
-         json.dump(flowstats_transform_dict, open(flowstats_transform_path, "w"), indent=4)
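The new serialization path dumps `transform.to_dict()` straight to JSON. A sketch of the same write with an explicit context manager (the diff's bare `open(...)` relies on garbage collection to close the file); `train_data_path` and the fitted transform come from the surrounding function:

```python
import json
import os

path = os.path.join(train_data_path, "transforms", "ppi-transform.json")
with open(path, "w") as f:
    # to_dict() is the method the 0.1.2 code relies on per the + lines above.
    json.dump(clip_and_scale_ppi_transform.to_dict(), f, indent=4)
```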
@@ -132,7 +132,7 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str
          np.save(os.path.join(val_data_path, "val_known_indices.npy"), val_known_indices)
          np.save(os.path.join(val_data_path, "val_unknown_indices.npy"), val_unknown_indices)
      else:
-         val_known_indices = np.load(os.path.join(val_data_path, "val_known_indices.npu"))
+         val_known_indices = np.load(os.path.join(val_data_path, "val_known_indices.npy"))
          val_unknown_indices = np.load(os.path.join(val_data_path, "val_unknown_indices.npy"))
      return val_known_indices, val_unknown_indices, val_data_path
@@ -162,3 +162,6 @@ def init_train_data(train_data_path: str):
  def init_test_data(test_data_path: str):
      os.makedirs(test_data_path, exist_ok=True)
      os.makedirs(os.path.join(test_data_path, "preload"), exist_ok=True)
+
+ def no_indices() -> np.ndarray:
+     return np.zeros((0,3), dtype=np.int64)
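The new `no_indices()` helper replaces the repeated `np.zeros((0,3), dtype=np.int64)` literals seen earlier in this diff. A self-contained sketch of its shape contract; reading the three columns as table/row bookkeeping follows from the `INDICES_TABLE_POS`/`INDICES_INDEX_POS` constants imported elsewhere and is our interpretation, not documented behavior:

```python
import numpy as np

def no_indices() -> np.ndarray:
    # Empty placeholder with zero rows and the three int64 columns that
    # real index arrays carry, so it concatenates cleanly with them.
    return np.zeros((0, 3), dtype=np.int64)

# Concatenating with a populated (N, 3) index array preserves the shape.
assert np.concatenate([no_indices(), np.ones((4, 3), dtype=np.int64)]).shape == (4, 3)
```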
@@ -16,7 +16,8 @@ from typing_extensions import assert_never

  from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
                                     TrainDataParams)
- from cesnet_datazoo.constants import APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN
+ from cesnet_datazoo.constants import (APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN,
+                                       QUIC_SNI_COLUMN, TLS_SNI_COLUMN)
  from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
                                                       split_apps_topx_with_provider_groups)
@@ -66,6 +67,7 @@ class PyTablesDataset(Dataset):
      self.target_transform = target_transform
      self.return_tensors = return_tensors
      self.return_all_fields = return_all_fields
+     self.sni_column = TLS_SNI_COLUMN if TLS_SNI_COLUMN in self.other_fields else QUIC_SNI_COLUMN if QUIC_SNI_COLUMN in self.other_fields else None

      self.preload = preload
      self.preload_blob = preload_blob
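The added `sni_column` line is a chained conditional expression; a self-contained long form of the same selection logic, with the column values taken from the metadata table earlier in this diff:

```python
from typing import Optional

TLS_SNI_COLUMN = "TLS_SNI"    # values per the Other fields column in metadata.csv
QUIC_SNI_COLUMN = "QUIC_SNI"

def pick_sni_column(other_fields: list[str]) -> Optional[str]:
    if TLS_SNI_COLUMN in other_fields:
        return TLS_SNI_COLUMN       # TLS datasets expose TLS_SNI
    if QUIC_SNI_COLUMN in other_fields:
        return QUIC_SNI_COLUMN      # QUIC datasets expose QUIC_SNI
    return None                     # dataset has no SNI field

assert pick_sni_column(["ID", "QUIC_SNI", "QUIC_USERAGENT"]) == "QUIC_SNI"
```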
@@ -179,7 +181,7 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
      start_time = time.time()
      for i, table_path in enumerate(train_data_params.train_tables_paths):
          all_app_labels[i] = train_tables[i].read(field=APP_COLUMN)
-         log.info(f"Reading app column for train table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+         log.info(f"Reading app column for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
          app_counts = app_counts.add(pd.Series(all_app_labels[i]).value_counts(), fill_value=0)
      database.close()
      # Handle disabled apps and apps with less than min_samples_per_app samples
@@ -223,13 +225,15 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
      else:
          known_apps = train_data_params.apps_selection_fixed_known
          unknown_apps = train_data_params.apps_selection_fixed_unknown
+     known_apps = sorted(known_apps)
+     unknown_apps = sorted(unknown_apps)
      known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
      unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]

      train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
      rng.shuffle(train_known_indices)
      rng.shuffle(train_unknown_indices)
-     log.info(f"Processing train indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+     log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
      return train_known_indices, train_unknown_indices, known_apps, unknown_apps

  def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
@@ -240,7 +244,7 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, tabl
      start_time = time.time()
      for i, table_path in enumerate(test_data_params.test_tables_paths):
          base_labels[i] = test_tables[i].read(field=APP_COLUMN)
-         log.info(f"Reading app column for test table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+         log.info(f"Reading app column for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
          base_indices[i] = np.arange(len(test_tables[i]))
      database.close()
      known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
@@ -248,7 +252,7 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, tabl
      test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
      rng.shuffle(test_known_indices)
      rng.shuffle(test_unknown_indices)
-     log.info(f"Processing test indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+     log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
      return test_known_indices, test_unknown_indices

  def load_database(database_path: str, tables_paths: Optional[list[str]] = None, mode: str = "r") -> tuple[tb.File, dict[int, Any]]: # dict[int, tb.Table]
@@ -23,8 +23,6 @@ class ClassInfo:
      categories_mapping: dict[str, Optional[str]]

  def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps: list[str], unknown_apps: list[str]) -> ClassInfo:
-     known_apps = sorted(known_apps)
-     unknown_apps = sorted(unknown_apps)
      target_names_arr = encoder.classes_
      assert known_apps == list(target_names_arr[:-1])
      group_matrix = np.array([[a == b or
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cesnet-datazoo
- Version: 0.1.0
+ Version: 0.1.2
  Summary: A toolkit for large network traffic datasets
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -29,10 +29,7 @@ Requires-Dist: tables>=3.8.0
  Requires-Dist: torch>=1.10
  Requires-Dist: tqdm
  Provides-Extra: dev
- Requires-Dist: black; extra == "dev"
  Requires-Dist: build; extra == "dev"
- Requires-Dist: jupyterlab; extra == "dev"
- Requires-Dist: lightgbm; extra == "dev"
  Requires-Dist: mkdocs-autorefs; extra == "dev"
  Requires-Dist: mkdocs-material-extensions; extra == "dev"
  Requires-Dist: mkdocs-material; extra == "dev"
@@ -57,7 +54,7 @@ The goal of this project is to provide tools for working with large network traf
  - Extensive configuration options for:
      - Selection of train, validation, and test periods.
      - Selection of application classes and splitting classes between *known* and *unknown*.
-     - Feature scaling.
+     - Data transformations, such as feature scaling.
  - Built on suitable data structures for experiments with large datasets. There are several caching mechanisms to make repeated runs faster, for example, when searching for the best model configuration.
  - Datasets are offered in multiple sizes to give users an option to start the experiments at a smaller scale (also faster dataset download, disk space, etc.). The default is the `S` size containing 25 million samples.

@@ -72,7 +69,7 @@ The package is able to handle the following datasets:
  | _Collection duration_ | 2 weeks | 4 weeks | 1 year |
  | _Collection period_ | 4.10.2021 - 17.10.2021 | 31.10.2022 - 27.11.2022 | 1.1.2022 - 31.12.2022 |
  | _Application count_ | 191 | 102 | 180 |
- | _Available samples_ | 141720670 | 153226273 | 507739073 |
+ | _Available samples_ | 141392195 | 153226273 | 507739073 |
  | _Available dataset sizes_ | XS, S, M, L | XS, S, M, L | XS, S, M, L |
  | _Cite_ | [https://doi.org/10.1016/j.comnet.2022.109467](https://doi.org/10.1016/j.comnet.2022.109467) | [https://doi.org/10.1016/j.dib.2023.108888](https://doi.org/10.1016/j.dib.2023.108888) | |
  | _Zenodo URL_ | [https://zenodo.org/record/7965515](https://zenodo.org/record/7965515) | [https://zenodo.org/record/7963302](https://zenodo.org/record/7963302) | |
@@ -12,10 +12,7 @@ torch>=1.10
  tqdm

  [dev]
- black
  build
- jupyterlab
- lightgbm
  mkdocs-autorefs
  mkdocs-material-extensions
  mkdocs-material
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "cesnet-datazoo"
- version = "0.1.0"
+ version = "0.1.2"
  authors = [
      {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
      {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},
@@ -45,10 +45,7 @@ dependencies = [

  [project.optional-dependencies]
  dev = [
-     "black",
      "build",
-     "jupyterlab",
-     "lightgbm",
      "mkdocs-autorefs",
      "mkdocs-material-extensions",
      "mkdocs-material",