cesnet-datazoo 0.0.10__tar.gz → 0.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/PKG-INFO +3 -3
  2. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/README.md +2 -2
  3. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/config.py +105 -62
  4. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/cesnet_dataset.py +63 -26
  5. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/datasets.py +8 -8
  6. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/statistics.py +44 -36
  7. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/metrics/classification_report.py +16 -16
  8. cesnet-datazoo-0.0.10/cesnet_datazoo/metrics/superclass_metrics.py → cesnet-datazoo-0.0.12/cesnet_datazoo/metrics/provider_metrics.py +9 -8
  9. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/pytables_dataset.py +8 -7
  10. cesnet-datazoo-0.0.12/cesnet_datazoo/utils/class_info.py +50 -0
  11. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/PKG-INFO +3 -3
  12. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/SOURCES.txt +1 -1
  13. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/pyproject.toml +1 -1
  14. cesnet-datazoo-0.0.10/cesnet_datazoo/utils/class_info.py +0 -46
  15. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/LICENCE +0 -0
  16. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/__init__.py +0 -0
  17. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/constants.py +0 -0
  18. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/__init__.py +0 -0
  19. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/loaders.py +0 -0
  20. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
  21. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
  22. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/datasets/metadata/metadata.csv +0 -0
  23. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/metrics/__init__.py +0 -0
  24. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/__init__.py +0 -0
  25. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
  26. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/pytables_data/indices_setup.py +0 -0
  27. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/__init__.py +0 -0
  28. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/download.py +0 -0
  29. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/fileutils.py +0 -0
  30. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo/utils/random.py +0 -0
  31. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
  32. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/requires.txt +0 -0
  33. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/cesnet_datazoo.egg-info/top_level.txt +0 -0
  34. {cesnet-datazoo-0.0.10 → cesnet-datazoo-0.0.12}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cesnet-datazoo
- Version: 0.0.10
+ Version: 0.0.12
  Summary: A toolkit for large network traffic datasets
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -102,8 +102,8 @@ dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
  dataset_config = DatasetConfig(
  dataset=dataset,
  apps_selection=AppSelection.ALL_KNOWN,
- train_period="W-2022-44",
- test_period="W-2022-45",
+ train_period_name="W-2022-44",
+ test_period_name="W-2022-45",
  )
  dataset.set_dataset_config_and_initialize(dataset_config)
  train_dataframe = dataset.get_train_df()
@@ -60,8 +60,8 @@ dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
  dataset_config = DatasetConfig(
  dataset=dataset,
  apps_selection=AppSelection.ALL_KNOWN,
- train_period="W-2022-44",
- test_period="W-2022-45",
+ train_period_name="W-2022-44",
+ test_period_name="W-2022-45",
  )
  dataset.set_dataset_config_and_initialize(dataset_config)
  train_dataframe = dataset.get_train_df()
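The PKG-INFO and README examples above change only the names of the period parameters (`train_period` becomes `train_period_name`, `test_period` becomes `test_period_name`). For reference, a minimal end-to-end sketch of the 0.0.12 usage; the import paths are assumed from the package layout in the file list above:

    from cesnet_datazoo.config import AppSelection, DatasetConfig
    from cesnet_datazoo.datasets import CESNET_QUIC22

    # Downloads (or reuses) the XS variant of CESNET-QUIC22 under the given data root
    dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
    dataset_config = DatasetConfig(
        dataset=dataset,
        apps_selection=AppSelection.ALL_KNOWN,
        train_period_name="W-2022-44",  # was train_period in 0.0.10
        test_period_name="W-2022-45",   # was test_period in 0.0.10
    )
    dataset.set_dataset_config_and_initialize(dataset_config)
    train_dataframe = dataset.get_train_df()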
@@ -4,11 +4,14 @@ import dataclasses
  import hashlib
  import json
  import os
+ import warnings
  from dataclasses import InitVar, field
+ from datetime import datetime
  from enum import Enum
  from typing import TYPE_CHECKING, Literal, Optional

  import yaml
+ from pydantic import model_validator
  from pydantic.dataclasses import dataclass
  from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

@@ -21,10 +24,15 @@ if TYPE_CHECKING:
  Scaler = RobustScaler | StandardScaler | MinMaxScaler | None

  class ScalerEnum(Enum):
- ROBUST = "robust"
+ """Available scalers for flow statistics, packet sizes, and inter-packet times."""
  STANDARD = "standard"
+ """Standardize features by removing the mean and scaling to unit variance - [`sklearn.preprocessing.StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)."""
+ ROBUST = "robust"
+ """Robust scaling with the median and the interquartile range - [`sklearn.preprocessing.RobustScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)."""
  MINMAX = "minmax"
+ """Scaling to a (0, 1) range - [`sklearn.preprocessing.MinMaxScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)."""
  NO_SCALER = "no-scaler"
+ """No scaling."""
  def __str__(self): return self.value

  class Protocol(Enum):
@@ -39,7 +47,7 @@ class ValidationApproach(Enum):
  Scikit-learn [`train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
  is used to create a random stratified validation set. The fraction of validation samples is defined in `train_val_split_fraction`."""
  VALIDATION_DATES = "validation-dates"
- """Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `val_period`."""
+ """Use separate validation dates to create a validation set. Validation dates need to be specified in `val_dates`, and the name of the validation period in `val_period_name`."""
  NO_VALIDATION = "no-validation"
  """Do not use validation. The validation dataloader and dataframe will not be available."""
  def __str__(self): return self.value
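The `VALIDATION_DATES` docstring now points to `val_period_name`. A hedged sketch of selecting this approach, reusing the `dataset` instance from the README example above (period names are taken from the CESNET_QUIC22 time periods shown later in this diff):

    from cesnet_datazoo.config import DatasetConfig, ValidationApproach

    dataset_config = DatasetConfig(
        dataset=dataset,
        val_approach=ValidationApproach.VALIDATION_DATES,
        train_period_name="W-2022-44",
        val_period_name="W-2022-45",
        test_period_name="W-2022-46",
    )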
@@ -53,7 +61,8 @@ class AppSelection(Enum):
  ALL_KNOWN = "all-known"
  """Use all applications as *known*."""
  TOPX_KNOWN = "topx-known"
- """Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*."""
+ """Use the first X (`apps_selection_topx`) most frequent (with the most samples) applications as *known*, and the rest as *unknown*.
+ Applications with the same provider are never separated, i.e., all applications of a given provider are either *known* or *unknown*."""
  EXPLICIT_UNKNOWN = "explicit-unknown"
  """Use the provided list of applications (`apps_selection_explicit_unknown`) as *unknown*, and the rest as *known*."""
  LONGTERM_FIXED = "longterm-fixed"
@@ -90,7 +99,7 @@ class DataLoaderOrder(Enum):
  @dataclass(frozen=True)
  class TrainDataParams():
  database_filename: str
- train_period: str
+ train_period_name: str
  train_tables_paths: list[str]
  apps_selection: AppSelection
  apps_selection_topx: int
@@ -103,7 +112,7 @@ class TrainDataParams():
  @dataclass(frozen=True)
  class TestDataParams():
  database_filename: str
- test_period: str
+ test_period_name: str
  test_tables_paths: list[str]
  known_apps_database_enum: dict[int, str]
  unknown_apps_database_enum: dict[int, str]
@@ -125,23 +134,24 @@ class DatasetConfig():

  Attributes:
  dataset: The dataset instance to be configured
- data_root: Extracted from the dataset instance
- database_filename: Extracted from the dataset instance
- database_path: Extracted from the dataset instance
- servicemap_path: Extracted from the dataset instance
- flowstats_features: Extracted from the `dataset.metadata.flowstats_features`
+ data_root: Taken from the dataset instance
+ database_filename: Taken from the dataset instance
+ database_path: Taken from the dataset instance
+ servicemap_path: Taken from the dataset instance
+ flowstats_features: Taken from `dataset.metadata.flowstats_features`

  # Configuration options

  Attributes:
- train_period: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
+ train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
  train_dates: Dates used for creating a train set.
  train_dates_weigths: To use a non-uniform distribution of samples across train dates.
  val_approach: How a validation set should be created. Either split train data into train and validation, have a separate validation period, or no validation at all. `Default: SPLIT_FROM_TRAIN`
  train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
- val_period: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
+ val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
  val_dates: Dates used for creating a validation set.
- test_period: Name of the test period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
+ no_test_set: Disable the test set. `Default: False`
+ test_period_name: Name of the test period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
  test_dates: Dates used for creating a test set.

  apps_selection: How to select application classes. `Default: ALL_KNOWN`
@@ -171,6 +181,7 @@ class DatasetConfig():
  return_ips: Use for IP-based classification. Dataloaders will return data in this tuple format `((SRC_IP, DST_IP, SRC_PORT, DST_PORT), LABELS)`. Dataframes are not available when this option is used. `Default: False`
  return_torch: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
  use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
+ normalize_packet_histograms: Whether to normalize packet histograms. If true, bins contain fractions instead of absolute numbers. `Default: True`
  use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
  use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
  zero_ppi_start: Zeroing out the first N packets of each packet sequence. `Default: 0`
@@ -184,18 +195,18 @@ class DatasetConfig():
  ipt_max: Max clip inter-packet times before scaling. `Default: 15000`

  # How to configure train, validation, and test sets
- There are three options for how to define train/validation/test __time periods and dates__.
+ There are three options for how to define train/validation/test dates.

- 1. Specify a predefined time period (`train_period`, `val_period`, or `test_period`) available in `dataset.time_periods` and leave the list of dates (`train_dates`, `val_dates`, or `test_dates`) empty.
- 2. Name a custom time period and provide a list of dates. The dates are checked against `dataset.available_dates`.
- 3. Leave everything empty and use the dataset's defaults `dataset.default_train_period` and `dataset.default_test_period`.
+ 1. Choose a predefined time period (`train_period_name`, `val_period_name`, or `test_period_name`) available in `dataset.time_periods` and leave the list of dates (`train_dates`, `val_dates`, or `test_dates`) empty.
+ 2. Provide a list of dates and a name for the time period. The dates are checked against `dataset.available_dates`.
+ 3. Do not specify anything and use the dataset's defaults `dataset.default_train_period_name` and `dataset.default_test_period_name`.

- There are two options for configuring __sizes__ of train/validation/test sets.
+ There are two options for configuring sizes of train/validation/test sets.

  1. Select an appropriate dataset size (default is `S`) when creating the [`CesnetDataset`][datasets.cesnet_dataset.CesnetDataset] instance and leave `train_size`, `val_known_size`, and `test_known_size` with their default `all` value.
  This will create train/validation/test sets with all samples available in the selected dataset size (of course, depending on the selected dates and validation approach).
  2. Provide exact sizes in `train_size`, `val_known_size`, and `test_known_size`. This will create train/validation/test sets of the given sizes by doing a random subset.
- This is especially useful when using the `ORIG` dataset size and want to run smaller experiments.
+ This is especially useful when using the `ORIG` dataset size and want to control the size of experiments.

  !!! tip Validation set
  The default approach for creating a validation set is to randomly split the train data into train and validation. The second approach is to define separate validation dates. See [ValidationApproach][config.ValidationApproach].
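A sketch of the three ways to define the dates, matching the updated docstring above; `dataset` is assumed to be an initialized CesnetDataset instance and the example dates come from the CESNET_QUIC22 time periods shown later in this diff:

    # 1. A predefined period from dataset.time_periods
    cfg1 = DatasetConfig(dataset=dataset, train_period_name="W-2022-44", test_period_name="W-2022-45")

    # 2. A custom-named period with an explicit list of dates (checked against dataset.available_dates)
    cfg2 = DatasetConfig(
        dataset=dataset,
        train_period_name="my-train-week",
        train_dates=["20221107", "20221108", "20221109"],
        test_period_name="my-test-day",
        test_dates=["20221114"],
    )

    # 3. Nothing specified, falling back to dataset.default_train_period_name and dataset.default_test_period_name
    cfg3 = DatasetConfig(dataset=dataset)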
@@ -208,14 +219,15 @@ class DatasetConfig():
  servicemap_path: str = field(init=False)
  flowstats_features: list[str] = field(init=False)

- train_period: str = ""
+ train_period_name: str = ""
  train_dates: list[str] = field(default_factory=list)
  train_dates_weigths: Optional[list[int]] = None
  val_approach: ValidationApproach = ValidationApproach.SPLIT_FROM_TRAIN
  train_val_split_fraction: float = 0.2
- val_period: str = ""
+ val_period_name: str = ""
  val_dates: list[str] = field(default_factory=list)
- test_period: str = ""
+ no_test_set: bool = False
+ test_period_name: str = ""
  test_dates: list[str] = field(default_factory=list)

  apps_selection: AppSelection = AppSelection.ALL_KNOWN
@@ -245,6 +257,7 @@ class DatasetConfig():
  return_ips: bool = False
  return_torch: bool = False
  use_packet_histograms: bool = True
+ normalize_packet_histograms: bool = True
  use_tcp_features: bool = True
  use_push_flags: bool = False
  zero_ppi_start: int = 0
@@ -267,52 +280,67 @@ class DatasetConfig():
  self.database_path = dataset.database_path
  self.flowstats_features = dataset.metadata.flowstats_features

- # Configure train dates and period
- if len(self.train_dates) > 0 and self.train_period == "":
- raise ValueError("train_period has to be specified when train_dates are set")
- if len(self.train_dates) == 0 and self.train_period != "":
- if self.train_period not in dataset.time_periods:
- raise ValueError(f"Unknown train_period {self.train_period}. Use time period available in dataset.time_periods")
- self.train_dates = dataset.time_periods[self.train_period]
- if len(self.train_dates) == 0 and self.test_period == "":
- self.train_period = dataset.default_train_period
- self.train_dates = dataset.time_periods[dataset.default_train_period]
- # Configure test dates and period
- if len(self.test_dates) > 0 and self.test_period == "":
- raise ValueError("test_period has to be specified when test_dates are set")
- if len(self.test_dates) == 0 and self.test_period != "":
- if self.test_period not in dataset.time_periods:
- raise ValueError(f"Unknown test_period {self.test_period}. Use time period available in dataset.time_periods")
- self.test_dates = dataset.time_periods[self.test_period]
- if len(self.test_dates) == 0 and self.test_period == "":
- self.test_period = dataset.default_test_period
- self.test_dates = dataset.time_periods[dataset.default_test_period]
- # Configure val dates and period
- if (self.val_approach == ValidationApproach.NO_VALIDATION or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period != ""):
- raise ValueError("val_dates and val_period cannot be specified when val_approach is no-validation or split-from-train")
+ # Configure train dates
+ if len(self.train_dates) > 0 and self.train_period_name == "":
+ raise ValueError("train_period_name has to be specified when train_dates are set")
+ if len(self.train_dates) == 0 and self.train_period_name != "":
+ if self.train_period_name not in dataset.time_periods:
+ raise ValueError(f"Unknown train_period_name {self.train_period_name}. Use time period available in dataset.time_periods")
+ self.train_dates = dataset.time_periods[self.train_period_name]
+ if len(self.train_dates) == 0 and self.test_period_name == "":
+ self.train_period_name = dataset.default_train_period_name
+ self.train_dates = dataset.time_periods[dataset.default_train_period_name]
+ # Configure test dates
+ if self.no_test_set:
+ if (len(self.test_dates) > 0 or self.test_period_name != ""):
+ raise ValueError("test_dates and test_period_name cannot be specified when no_test_set is true")
+ else:
+ if len(self.test_dates) > 0 and self.test_period_name == "":
+ raise ValueError("test_period_name has to be specified when test_dates are set")
+ if len(self.test_dates) == 0 and self.test_period_name != "":
+ if self.test_period_name not in dataset.time_periods:
+ raise ValueError(f"Unknown test_period_name {self.test_period_name}. Use time period available in dataset.time_periods")
+ self.test_dates = dataset.time_periods[self.test_period_name]
+ if len(self.test_dates) == 0 and self.test_period_name == "":
+ self.test_period_name = dataset.default_test_period_name
+ self.test_dates = dataset.time_periods[dataset.default_test_period_name]
+ # Configure val dates
+ if (self.val_approach == ValidationApproach.NO_VALIDATION or self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN) and (len(self.val_dates) > 0 or self.val_period_name != ""):
+ raise ValueError("val_dates and val_period_name cannot be specified when val_approach is no-validation or split-from-train")
  if self.val_approach == ValidationApproach.VALIDATION_DATES:
- if len(self.val_dates) > 0 and self.val_period == "":
- raise ValueError("val_period has to be specified when val_dates are set")
- if len(self.val_dates) == 0 and self.val_period != "":
- if self.val_period not in dataset.time_periods:
- raise ValueError(f"Unknown val_period {self.val_period}. Use time period available in dataset.time_periods")
- self.val_dates = dataset.time_periods[self.val_period]
- if len(self.val_dates) == 0 and self.val_period == "":
- raise ValueError("val_period and val_dates (or val_period from dataset.time_periods) have to be specified when val_approach is validation-dates")
+ if len(self.val_dates) > 0 and self.val_period_name == "":
+ raise ValueError("val_period_name has to be specified when val_dates are set")
+ if len(self.val_dates) == 0 and self.val_period_name != "":
+ if self.val_period_name not in dataset.time_periods:
+ raise ValueError(f"Unknown val_period_name {self.val_period_name}. Use time period available in dataset.time_periods")
+ self.val_dates = dataset.time_periods[self.val_period_name]
+ if len(self.val_dates) == 0 and self.val_period_name == "":
+ raise ValueError("val_period_name and val_dates (or val_period_name from dataset.time_periods) have to be specified when val_approach is validation-dates")
  # Check if train, val, and test dates are available in the dataset
  if dataset.available_dates:
  unknown_train_dates = [t for t in self.train_dates if t not in dataset.available_dates]
  unknown_val_dates = [t for t in self.val_dates if t not in dataset.available_dates]
  unknown_test_dates = [t for t in self.test_dates if t not in dataset.available_dates]
  if len(unknown_train_dates) > 0:
- raise ValueError(f"Unknown train dates {unknown_train_dates}. Use dates available in dataset.available_dates" \
- + f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else "")
+ raise ValueError(f"Unknown train dates {unknown_train_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
+ + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
  if len(unknown_val_dates) > 0:
- raise ValueError(f"Unknown validation dates {unknown_val_dates}. Use dates available in dataset.available_dates" \
- + f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else "")
+ raise ValueError(f"Unknown validation dates {unknown_val_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
+ + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
  if len(unknown_test_dates) > 0:
- raise ValueError(f"Unknown test dates {unknown_test_dates}. Use dates available in dataset.available_dates" \
- + f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else "")
+ raise ValueError(f"Unknown test dates {unknown_test_dates}. Use dates available in dataset.available_dates (collection period {dataset.metadata.collection_period})" \
+ + (f". These dates are missing from the dataset collection period {dataset.metadata.missing_dates_in_collection_period}" if dataset.metadata.missing_dates_in_collection_period else ""))
+ # Check time order of train, val, and test periods
+ train_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.train_dates]
+ test_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.test_dates]
+ if not self.no_test_set and min(test_dates) <= max(train_dates):
+ warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
+ if self.val_approach == ValidationApproach.VALIDATION_DATES:
+ val_dates = [datetime.strptime(date_str, "%Y%m%d").date() for date_str in self.val_dates]
+ if min(val_dates) <= max(train_dates):
+ warnings.warn(f"Some validation dates ({min(val_dates).strftime('%Y%m%d')}) are before or equal to the last train date ({max(train_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
+ if not self.no_test_set and min(test_dates) <= max(val_dates):
+ warnings.warn(f"Some test dates ({min(test_dates).strftime('%Y%m%d')}) are before or equal to the last validation date ({max(val_dates).strftime('%Y%m%d')}). This might lead to improper evaluation and should be avoided.")
  # Configure features
  if dataset.metadata.protocol == Protocol.TLS and self.use_tcp_features:
  self.flowstats_features = self.flowstats_features + SELECTED_TCP_FLAGS
@@ -446,7 +474,7 @@ class DatasetConfig():
  def _get_train_data_params(self) -> TrainDataParams:
  return TrainDataParams(
  database_filename=self.database_filename,
- train_period=self.train_period,
+ train_period_name=self.train_period_name,
  train_tables_paths=self._get_train_tables_paths(),
  apps_selection=self.apps_selection,
  apps_selection_topx=self.apps_selection_topx,
@@ -460,7 +488,7 @@ class DatasetConfig():
  assert self.val_approach == ValidationApproach.VALIDATION_DATES
  val_data_params = TestDataParams(
  database_filename=self.database_filename,
- test_period=self.val_period,
+ test_period_name=self.val_period_name,
  test_tables_paths=self._get_val_tables_paths(),
  known_apps_database_enum=known_apps_database_enum,
  unknown_apps_database_enum=unknown_apps_database_enum,)
@@ -472,7 +500,7 @@ class DatasetConfig():
  def _get_test_data_params_and_path(self, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> tuple[TestDataParams, str]:
  test_data_params = TestDataParams(
  database_filename=self.database_filename,
- test_period=self.test_period,
+ test_period_name=self.test_period_name,
  test_tables_paths=self._get_test_tables_paths(),
  known_apps_database_enum=known_apps_database_enum,
  unknown_apps_database_enum=unknown_apps_database_enum,)
@@ -481,6 +509,21 @@ class DatasetConfig():
  test_data_path = os.path.join(self.data_root, "test-data", f"{params_hash}_{self.random_state}")
  return test_data_params, test_data_path

+ @model_validator(mode="before")
+ @classmethod
+ def check_deprecated_args(cls, values):
+ kwargs = values.kwargs
+ if "train_period" in kwargs:
+ warnings.warn("train_period is deprecated. Use train_period_name instead.")
+ kwargs["train_period_name"] = kwargs["train_period"]
+ if "val_period" in kwargs:
+ warnings.warn("val_period is deprecated. Use val_period_name instead.")
+ kwargs["val_period_name"] = kwargs["val_period"]
+ if "test_period" in kwargs:
+ warnings.warn("test_period is deprecated. Use test_period_name instead.")
+ kwargs["test_period_name"] = kwargs["test_period"]
+ return values
+
  def __str__(self):
  _process_tag = yaml.emitter.Emitter.process_tag
  _ignore_aliases = yaml.Dumper.ignore_aliases
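The `check_deprecated_args` validator above keeps the 0.0.10 keyword arguments usable while steering users to the new names. A hedged migration sketch (the exact handling of the leftover old keyword depends on the pydantic dataclass configuration, so prefer the new names directly):

    # 0.0.10 style: now warns "train_period is deprecated. Use train_period_name instead."
    # and the validator copies the value into train_period_name.
    # 0.0.12 style, preferred:
    dataset_config = DatasetConfig(
        dataset=dataset,
        train_period_name="W-2022-44",
        test_period_name="W-2022-45",
    )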
@@ -30,7 +30,7 @@ from cesnet_datazoo.pytables_data.indices_setup import (IndicesTuple, compute_kn
  from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, fit_or_load_scalers,
  pytables_collate_fn,
  pytables_ip_collate_fn, worker_init_fn)
- from cesnet_datazoo.utils.class_info import ClassInfo, create_superclass_structures
+ from cesnet_datazoo.utils.class_info import ClassInfo, create_class_info
  from cesnet_datazoo.utils.download import resumable_download, simple_download
  from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator

@@ -74,8 +74,8 @@ class CesnetDataset():
  metadata: Additional [dataset metadata][metadata].
  available_dates: List of all available dates in the dataset.
  time_periods: Predefined time periods. Each time period is a list of dates.
- default_train_period: Default time period for training.
- default_test_period: Default time period for testing.
+ default_train_period_name: Default time period for training.
+ default_test_period_name: Default time period for testing.

  The following attributes are initialized when [`set_dataset_config_and_initialize`][datasets.cesnet_dataset.CesnetDataset.set_dataset_config_and_initialize] is called.

@@ -111,8 +111,8 @@ class CesnetDataset():
  metadata: DatasetMetadata
  available_dates: list[str]
  time_periods: dict[str, list[str]]
- default_train_period: str
- default_test_period: str
+ default_train_period_name: str
+ default_test_period_name: str
  time_periods_gen: bool = False
  silent: bool = False

@@ -165,13 +165,16 @@ class CesnetDataset():
  num_samples += len(database.get_node(p))
  if self.size == "ORIG" and num_samples != self.metadata.available_samples:
  raise ValueError(f"Expected {self.metadata.available_samples} samples, but got {num_samples} in the database. Please delete the data root folder, update cesnet-datazoo, and redownload the dataset.")
- elif num_samples != DATASET_SIZES[self.size]:
+ if self.size != "ORIG" and num_samples != DATASET_SIZES[self.size]:
  raise ValueError(f"Expected {DATASET_SIZES[self.size]} samples, but got {num_samples} in the database. Please delete the data root folder, update cesnet-datazoo, and redownload the dataset.")
  self.available_dates = list(map(lambda x: x.removeprefix("/flows/D"), tables_paths))
  else:
  self.available_dates = []
  if self.time_periods_gen:
  self._generate_time_periods()
+ # Add all available dates as single date time periods
+ for d in self.available_dates:
+ self.time_periods[d] = [d]

  def set_dataset_config_and_initialize(self, dataset_config: DatasetConfig) -> None:
  """
@@ -249,9 +252,9 @@ class CesnetDataset():
  """
  if self.dataset_config is None:
  raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting validaion dataloader")
- assert self.val_dataset is not None
  if self.dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
  raise ValueError("Validation dataloader is not available when using no-validation")
+ assert self.val_dataset is not None
  if self.val_dataloader:
  return self.val_dataloader
  batch_sampler = BatchSampler(sampler=SequentialSampler(self.val_dataset), batch_size=self.dataset_config.test_batch_size, drop_last=False)
@@ -288,6 +291,8 @@ class CesnetDataset():
  """
  if self.dataset_config is None:
  raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting test dataloader")
+ if self.dataset_config.no_test_set:
+ raise ValueError("Test dataloader is not available when no_test_set is true")
  assert self.test_dataset is not None
  if self.test_dataloader:
  return self.test_dataloader
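The new `no_test_set` option allows a train/validation-only configuration; the test dataloader then raises instead of returning an empty set. A sketch, assuming an initialized `dataset` instance:

    dataset_config = DatasetConfig(
        dataset=dataset,
        train_period_name="W-2022-44",
        no_test_set=True,              # test_period_name and test_dates must stay unset
    )
    dataset.set_dataset_config_and_initialize(dataset_config)
    train_df = dataset.get_train_df()  # available as usual
    dataset.get_test_dataloader()      # raises ValueError: "Test dataloader is not available when no_test_set is true"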
@@ -358,7 +363,7 @@ class CesnetDataset():
  Returns:
  Validation data as a dataframe.
  """
- self._check_before_dataframe()
+ self._check_before_dataframe(check_no_val=True)
  assert self.dataset_config is not None and self.val_dataset is not None
  if len(self.val_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
  warnings.warn(f"Validation set has ({len(self.val_dataset)} samples), consider using get_val_dataloader() instead")
@@ -384,13 +389,31 @@ class CesnetDataset():
  Returns:
  Test data as a dataframe.
  """
- self._check_before_dataframe()
+ self._check_before_dataframe(check_no_test=True)
  assert self.dataset_config is not None and self.test_dataset is not None
  if len(self.test_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
  warnings.warn(f"Test set has ({len(self.test_dataset)} samples), consider using get_test_dataloader() instead")
  feature_names = self.dataset_config.get_feature_names(flatten_ppi=flatten_ppi)
  return create_df_from_dataloader(dataloader=self.get_test_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi, silent=self.silent)

+ def get_num_classes(self) -> int:
+ """Returns the number of classes in the current configuration of the dataset."""
+ if self.class_info is None:
+ raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting the number of classes")
+ return self.class_info.num_classes
+
+ def get_known_apps(self) -> list[str]:
+ """Returns the list of known applications in the current configuration of the dataset."""
+ if self.class_info is None:
+ raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting known apps")
+ return self.class_info.known_apps
+
+ def get_unknown_apps(self) -> list[str]:
+ """Returns the list of unknown applications in the current configuration of the dataset."""
+ if self.class_info is None:
+ raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting unknown apps")
+ return self.class_info.unknown_apps
+
  def compute_dataset_statistics(self, num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, disabled_apps: Optional[list[str]] = None)-> None:
  """
  Computes dataset statistics and saves them to the `statistics_path` folder.
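The three new helper methods expose the resolved class structure once the configuration is applied. A short usage sketch, assuming an initialized `dataset` instance:

    dataset.set_dataset_config_and_initialize(dataset_config)
    num_classes = dataset.get_num_classes()    # number of target classes in the current configuration
    known_apps = dataset.get_known_apps()      # list[str] of known applications
    unknown_apps = dataset.get_unknown_apps()  # list[str] of applications treated as unknown
    print(f"{num_classes} classes, {len(known_apps)} known and {len(unknown_apps)} unknown apps")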
@@ -401,8 +424,6 @@ class CesnetDataset():
  batch_size: Number of samples per batch for loading data.
  disabled_apps: List of applications to exclude from the statistics.
  """
- if self.name.startswith("CESNET-TLS22"):
- raise NotImplementedError("Dataset statistics are not supported for CESNET_TLS22")
  flowstats_features = self.metadata.flowstats_features + self.metadata.packet_histogram_features + self.metadata.tcp_features
  if not os.path.exists(self.statistics_path):
  os.mkdir(self.statistics_path)
@@ -410,6 +431,7 @@ class CesnetDataset():
  output_dir=self.statistics_path,
  flowstats_features=flowstats_features,
  protocol=self.metadata.protocol,
+ extra_fields=not self.name.startswith("CESNET-TLS22"),
  disabled_apps=disabled_apps if disabled_apps is not None else [],
  num_samples=num_samples,
  num_workers=num_workers,
@@ -471,13 +493,17 @@ class CesnetDataset():
  self.val_dataloader = None
  self.test_dataloader = None

- def _check_before_dataframe(self) -> None:
+ def _check_before_dataframe(self, check_no_val: bool = False, check_no_test: bool = False) -> None:
  if self.dataset_config is None:
  raise ValueError("Dataset is not initialized, use set_dataset_config_and_initialize() before getting a dataframe")
  if self.dataset_config.return_ips:
  raise ValueError("Dataframes are not available when return_ips is set. Use a dataloader instead.")
  if self.dataset_config.return_torch:
  raise ValueError("Dataframes are not available when return_torch is set. Use a dataloader instead.")
+ if check_no_val and self.dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
+ raise ValueError("Validation dataframe is not available when using no-validation")
+ if check_no_test and self.dataset_config.no_test_set:
+ raise ValueError("Test dataframe is not available when no_test_set is true")

  def _initialize_train_val_test(self) -> None:
  assert self.dataset_config is not None
@@ -485,7 +511,12 @@ class CesnetDataset():
  servicemap = pd.read_csv(dataset_config.servicemap_path, index_col="Tag")
  # Initialize train and test indices
  train_indices, train_unknown_indices, encoder, known_apps_database_enum, unknown_apps_database_enum = init_or_load_train_indices(dataset_config=dataset_config, servicemap=servicemap)
- test_known_indices, test_unknown_indices, test_data_path = init_or_load_test_indices(dataset_config=dataset_config, known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
+ if self.dataset_config.no_test_set:
+ test_known_indices = np.empty((0,3), dtype=np.int64)
+ test_unknown_indices = np.empty((0,3), dtype=np.int64)
+ test_data_path = None
+ else:
+ test_known_indices, test_unknown_indices, test_data_path = init_or_load_test_indices(dataset_config=dataset_config, known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
  # Date weight sampling of train indices
  if dataset_config.train_dates_weigths is not None:
  assert dataset_config.train_size != "all"
@@ -527,13 +558,13 @@ class CesnetDataset():
  test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,
  stratify=train_labels, shuffle=True, random_state=train_val_rng)
  elif dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
- val_data_path = None
  val_known_indices = np.empty((0,3), dtype=np.int64)
  val_unknown_indices = np.empty((0,3), dtype=np.int64)
+ val_data_path = None
  else: assert_never(dataset_config.val_approach)

  # Create class info
- class_info = create_superclass_structures(servicemap=servicemap, target_names=list(encoder.classes_))
+ class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
  # Load or fit data scalers
  flowstats_scaler, flowstats_quantiles, ipt_scaler, psizes_scaler = fit_or_load_scalers(dataset_config=dataset_config, train_indices=train_indices)
  # Subset dataset indices based on the selected sizes and compute application counts
@@ -555,16 +586,21 @@ class CesnetDataset():
  indices=dataset_indices.train_indices,
  flowstats_features=dataset_config.flowstats_features,
  return_ips=dataset_config.return_ips,)
- test_dataset = PyTablesDataset(
- database_path=dataset_config.database_path,
- tables_paths=dataset_config._get_test_tables_paths(),
- indices=test_combined_indices,
- flowstats_features=dataset_config.flowstats_features,
- preload=dataset_config.preload_test,
- preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),
- return_ips=dataset_config.return_ips,)
- val_dataset = None
- if dataset_config.val_approach != ValidationApproach.NO_VALIDATION:
+ if dataset_config.no_test_set:
+ test_dataset = None
+ else:
+ assert test_data_path is not None
+ test_dataset = PyTablesDataset(
+ database_path=dataset_config.database_path,
+ tables_paths=dataset_config._get_test_tables_paths(),
+ indices=test_combined_indices,
+ flowstats_features=dataset_config.flowstats_features,
+ preload=dataset_config.preload_test,
+ preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),
+ return_ips=dataset_config.return_ips,)
+ if dataset_config.val_approach == ValidationApproach.NO_VALIDATION:
+ val_dataset = None
+ else:
  assert val_data_path is not None
  val_dataset = PyTablesDataset(
  database_path=dataset_config.database_path,
@@ -579,7 +615,6 @@ class CesnetDataset():
  collate_fn = pytables_ip_collate_fn
  else:
  collate_fn = partial(pytables_collate_fn, # type: ignore
- use_packet_histograms=dataset_config.use_packet_histograms,
  flowstats_scaler=flowstats_scaler,
  flowstats_quantiles=flowstats_quantiles,
  psizes_scaler=psizes_scaler,
@@ -588,6 +623,8 @@ class CesnetDataset():
  ipt_min=dataset_config.ipt_min,
  ipt_max=dataset_config.ipt_max,
  use_push_flags=dataset_config.use_push_flags,
+ use_packet_histograms=dataset_config.use_packet_histograms,
+ normalize_packet_histograms=dataset_config.normalize_packet_histograms,
  zero_ppi_start=dataset_config.zero_ppi_start,
  encoder=encoder,
  known_apps=class_info.known_apps,
@@ -10,8 +10,8 @@ class CESNET_TLS22(CesnetDataset):
  "W-2021-40": ["20211004", "20211005", "20211006", "20211007", "20211008", "20211009", "20211010"],
  "W-2021-41": ["20211011", "20211012", "20211013", "20211014", "20211015", "20211016", "20211017"],
  }
- default_train_period = "W-2021-40"
- default_test_period = "W-2021-41"
+ default_train_period_name = "W-2021-40"
+ default_test_period_name = "W-2021-41"

  class CESNET_QUIC22(CesnetDataset):
  """Dataset class for [CESNET-QUIC22][cesnet-quic22]."""
@@ -24,11 +24,11 @@ class CESNET_QUIC22(CesnetDataset):
  "W-2022-46": ["20221114", "20221115", "20221116", "20221117", "20221118", "20221119", "20221120"],
  "W-2022-47": ["20221121", "20221122", "20221123", "20221124", "20221125", "20221126", "20221127"],
  "W45-47": ["20221107", "20221108", "20221109", "20221110", "20221111", "20221112", "20221113",
- "20221114", "20221115", "20221116", "20221117", "20221118", "20221119", "20221120",
- "20221121", "20221122", "20221123", "20221124", "20221125", "20221126", "20221127"],
+ "20221114", "20221115", "20221116", "20221117", "20221118", "20221119", "20221120",
+ "20221121", "20221122", "20221123", "20221124", "20221125", "20221126", "20221127"],
  }
- default_train_period = "W-2022-44"
- default_test_period = "W-2022-45"
+ default_train_period_name = "W-2022-44"
+ default_test_period_name = "W-2022-45"

  class CESNET_TLS_Year22(CesnetDataset):
  """Dataset class for [CESNET-TLS-Year22][cesnet-tls-year22]."""
@@ -37,5 +37,5 @@ class CESNET_TLS_Year22(CesnetDataset):
  bucket_url = "https://liberouter.org/datazoo/download?bucket=cesnet-tls-year22"
  time_periods = {f"W-2022-{week}": [] for week in range(1, 53)} | {f"M-2022-{month}": [] for month in range(1, 13)}
  time_periods_gen = True
- default_train_period = "M-2022-9"
- default_test_period = "M-2022-10"
+ default_train_period_name = "M-2022-9"
+ default_test_period_name = "M-2022-10"
@@ -16,7 +16,7 @@ from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, FLOWEND_REASO
  PHISTS_FEATURES, PPI_COLUMN, SIZE_POS)
  from cesnet_datazoo.pytables_data.indices_setup import sort_indices
  from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, list_all_tables,
- worker_init_fn)
+ load_database, worker_init_fn)


  def pick_quic_fields(batch):
@@ -26,23 +26,27 @@ def pick_quic_fields(batch):
  batch["QUIC_VERSION"],
  )

- def pick_stats_fields(batch, flowstats_features: list[str]):
+ def pick_stats_fields(batch):
  return (
  batch[PPI_COLUMN],
  batch["DURATION"],
  batch["PACKETS"] + batch["PACKETS_REV"],
  batch["BYTES"] + batch["BYTES_REV"],
+ batch[APP_COLUMN],
+ batch[CATEGORY_COLUMN],
+ )
+
+ def pick_extra_fields(batch, flowstats_features: list[str]):
+ return (
  batch["DST_ASN"],
  batch[PHISTS_FEATURES],
  batch[[f for f in FLOWEND_REASON_FEATURES if f in flowstats_features]],
- batch[APP_COLUMN],
- batch[CATEGORY_COLUMN],
  )

  def simple_collate_fn(batch):
  return batch

- def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
+ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, extra_fields: bool, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
  stats_pdf_path = os.path.join(output_dir, "dataset-statistics.pdf")
  stats_csv_path = os.path.join(output_dir, "dataset-statistics.csv")
  categories_csv_path = os.path.join(output_dir, "categories.csv")
@@ -70,8 +74,8 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
  packet_sizes_counter = Counter()
  if not silent:
  print(f"Reading data from {database_path} for statistics")
- table_names = list_all_tables(database_path)
- stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=table_names, flowstats_features=flowstats_features, disabled_apps=disabled_apps, indices=None, return_all_fields=True)
+ table_paths = list_all_tables(database_path)
+ stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=table_paths, flowstats_features=flowstats_features, disabled_apps=disabled_apps, indices=None, return_all_fields=True)
  if num_samples != "all":
  subset_indices = np.random.randint(low=0, high=len(stats_dataset.indices), size=num_samples)
  stats_dataset.indices = sort_indices(stats_dataset.indices[subset_indices])
@@ -89,7 +93,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
  stats_dataset.pytables_worker_init()

  for batch, batch_idx in tqdm(stats_dloader, total=len(stats_dloader), disable=silent):
- ppi, duration, packets_total, bytes_total, asn, phist, flowend_reason, app, cat = pick_stats_fields(batch, flowstats_features=flowstats_features)
+ ppi, duration, packets_total, bytes_total, app, cat = pick_stats_fields(batch)
  # Saving feature values for distribution plots
  feature_duration.append(duration)
  feature_packets_total.append(packets_total)
@@ -97,8 +101,6 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
  packet_sizes_counter.update(ppi[:, SIZE_POS, :].flatten())
  # Aggregating features for value_counts
  app_series = app_series.add(pd.Series(app).value_counts(), fill_value=0)
- asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
- flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
  # Grouping features per categories
  df1 = pd.DataFrame(data={"cat": cat, "BYTES_TOTAL": bytes_total})
  flow_counts = df1["cat"].value_counts().rename("FLOW_COUNT")
@@ -110,9 +112,12 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
  quic_sni_series = quic_sni_series.add(pd.Series(sni).str.decode("utf-8").value_counts(), fill_value=0)
  quic_ua_series = quic_ua_series.add(pd.Series(user_agent).str.decode("utf-8").value_counts(), fill_value=0)
  quic_version_series = quic_version_series.add(pd.Series(quic_version).value_counts(), fill_value=0)
- # Aggregate PHISTS
- df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=PHISTS_FEATURES)
- df_phist = df_phist.add(df2, fill_value=0)
+ if extra_fields:
+ asn, phist, flowend_reason = pick_extra_fields(batch, flowstats_features=flowstats_features)
+ asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
+ flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
+ df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=PHISTS_FEATURES)
+ df_phist = df_phist.add(df2, fill_value=0)
  feature_duration = np.concatenate(feature_duration)
  feature_packets_total = np.concatenate(feature_packets_total)
  feature_bytes_total = np.concatenate(feature_bytes_total)
@@ -123,9 +128,12 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
  # Flow statistics distribution output
  df_flowstats = pd.DataFrame(data={"FLOW DURATION": feature_duration, "FLOW BYTE VOLUME": feature_bytes_total, "FLOW LENGTH": feature_packets_total}).describe()
  df_flowstats.to_csv(stats_csv_path)
- # Categories tikzpicture and csv output
- stats_dataset.pytables_worker_init() # to get access to cat enum; TODO implement better
- df_categories.index = df_categories.index.map(stats_dataset.get_cat_enum())
+ # Categories tikzpicture and csv output; first, get the categories and applications enum
+ temp_database, temp_tables = load_database(database_path=database_path, tables_paths=table_paths[:1])
+ cat_enum = temp_tables[0].get_enum(CATEGORY_COLUMN)
+ app_enum = temp_tables[0].get_enum(APP_COLUMN)
+ temp_database.close()
+ df_categories.index = df_categories.index.map(cat_enum)
  df_categories = df_categories.drop("default", errors="ignore")
  df_categories["FLOW_PERC"] = df_categories["FLOW_COUNT"] / sum(df_categories["FLOW_COUNT"]) * 100
  df_categories["BYTES_PERC"] = df_categories["BYTES_TOTAL"] / sum(df_categories["BYTES_TOTAL"]) * 100
@@ -139,20 +147,9 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
  # Application distribution output
  app_df = pd.DataFrame({"COUNT": app_series.sort_values(ascending=False).astype("int64")})
  app_df["PERC"] = (app_df["COUNT"] / app_df["COUNT"].sum() * 100).round(3)
- app_df.index = app_df.index.map(stats_dataset.get_app_enum())
+ app_df.index = app_df.index.map(app_enum)
  app_df.index.name = "LABEL"
  app_df.to_csv(app_path)
- # ASN distribution output
- asn_df = pd.DataFrame({"COUNT": asn_series.sort_values(ascending=False).astype("int64")})
- asn_df["PERC"] = (asn_df["COUNT"] / asn_df["COUNT"].sum() * 100).round(3)
- asn_df.index.name = "DESTINATION ASN"
- asn_df.to_csv(asn_path)
- # Flow end reason output
- flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
- flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
- flow_endreason_df.index.name = "FLOW ENDREASON"
- flow_endreason_df.index = pd.Index([f for f in FLOWEND_REASON_FEATURES if f in flowstats_features])
- flow_endreason_df.to_csv(flow_endreason_path)
  # Packet sizes histogram output
  packet_sizes_df = pd.DataFrame({"COUNT": pd.Series(packet_sizes_counter)}).sort_index()
  packet_sizes_df["PERC"] = (packet_sizes_df["COUNT"] / packet_sizes_df["COUNT"].sum() * 100).round(3)
@@ -168,13 +165,25 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
  quic_version_df.index = quic_version_df.index.map(hex)
  quic_version_df.index.name = "QUIC VERSION"
  quic_version_df.to_csv(quic_version_path)
- # PHIST output
- df_phist.index.name = "BINS"
- df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), PHISTS_FEATURES))
- df_phist = df_phist.astype("int64")
- for i, column in zip((1, 3, 5, 7), df_phist.columns):
- df_phist.insert(i, column + " PERC", (df_phist[column] / df_phist[column].sum() * 100).round(3))
- df_phist.to_csv(phist_path)
+ if extra_fields:
+ # ASN distribution output
+ asn_df = pd.DataFrame({"COUNT": asn_series.sort_values(ascending=False).astype("int64")})
+ asn_df["PERC"] = (asn_df["COUNT"] / asn_df["COUNT"].sum() * 100).round(3)
+ asn_df.index.name = "DESTINATION ASN"
+ asn_df.to_csv(asn_path)
+ # Flow end reason output
+ flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
+ flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
+ flow_endreason_df.index.name = "FLOW ENDREASON"
+ flow_endreason_df.index = pd.Index([f for f in FLOWEND_REASON_FEATURES if f in flowstats_features])
+ flow_endreason_df.to_csv(flow_endreason_path)
+ # PHIST output
+ df_phist.index.name = "BINS"
+ df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), PHISTS_FEATURES))
+ df_phist = df_phist.astype("int64")
+ for i, column in zip((1, 3, 5, 7), df_phist.columns):
+ df_phist.insert(i, column + " PERC", (df_phist[column] / df_phist[column].sum() * 100).round(3))
+ df_phist.to_csv(phist_path)

  # Dataset stats figure
  axes: Any
@@ -232,5 +241,4 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
  ax4.set_xlabel("Bytes")

  plt.tight_layout()
- fig.show()
  fig.savefig(stats_pdf_path, bbox_inches="tight")
@@ -1,8 +1,8 @@
  import numpy as np
  from sklearn.metrics import accuracy_score, precision_recall_fscore_support

- from cesnet_datazoo.metrics.superclass_metrics import (per_app_superclass_metrics,
- superclass_accuracies)
+ from cesnet_datazoo.metrics.provider_metrics import (per_app_provider_metrics,
+ provider_accuracies)
  from cesnet_datazoo.utils.class_info import ClassInfo


@@ -10,23 +10,23 @@ def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.
  p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
  labels=labels,
  zero_division=zero_division)
- sc_p, sc_r, sc_f1 = per_app_superclass_metrics(cm, class_info=class_info)
+ sc_p, sc_r, sc_f1 = per_app_provider_metrics(cm, class_info=class_info)
  predicted_unknown = cm[:, -1]
  with np.errstate(divide="ignore", invalid="ignore"):
  predicted_unknown_perc = predicted_unknown / s
  predicted_unknown_perc = np.nan_to_num(predicted_unknown_perc)
- headers = ["precision (sc)", "recall (sc)", "f1-score (sc)", "pred unknown", "support"]
+ headers = ["precision (pr)", "recall (pr)", "f1-score (pr)", "pred unknown", "support"]
  headers_fmt = "{:>{width}} {:>15} {:>15} {:>15} {:>15} {:>9}\n"
- width = max(max(len(cn) for cn in class_info.target_names), len("failed superclass acc"))
+ width = max(max(len(cn) for cn in class_info.target_names), len("failed provider acc"))
  report = headers_fmt.format("", *headers, width=width)
  report += "\n"
- row_fmt_superclass = "{:>{width}} " + 3 * " {:>7.{digits}f} ({:.{digits}f}) " + " {:>7} ({:.{digits}f}) " + "{:>9}\n"
+ row_fmt_provider = "{:>{width}} " + 3 * " {:>7.{digits}f} ({:.{digits}f}) " + " {:>7} ({:.{digits}f}) " + "{:>9}\n"
  row_fmt = "{:>{width}} " + 3 * " {:>7.{digits}f} " + " {:>7} ({:.{digits}f}) " + "{:>9}\n"
  rows = zip(map(class_info.target_names.__getitem__, labels), p, sc_p, r, sc_r, f1, sc_f1, predicted_unknown, predicted_unknown_perc, s) # type: ignore
  for row in rows:
  app, p_, _, r_, _, f1_, _, u_, up_, s_ = row
- if class_info.has_superclass[app]:
- report += row_fmt_superclass.format(*row, width=width, digits=digits)
+ if class_info.has_provider[app]:
+ report += row_fmt_provider.format(*row, width=width, digits=digits)
  else:
  report += row_fmt.format(app, p_, r_, f1_, u_, up_, s_, width=width, digits=digits)
  report += "\n"
@@ -40,26 +40,26 @@ def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.
  avg_sc_f1 = np.average(np.where(np.isnan([np.nan if x is None else x for x in sc_f1]), f1, sc_f1)[:-1])
  row_avg = [avg_p, avg_sc_p, avg_r, avg_sc_r, avg_f1, avg_sc_f1, predicted_unknown_sum, samples_sum]

- headers_avg = ["precision (sc)", "recall (sc)", "f1-score (sc)", "pred unknown", "support"]
+ headers_avg = ["precision (pr)", "recall (pr)", "f1-score (pr)", "pred unknown", "support"]
  row_fmt_avg = "{:>{width}} " + 3 * " {:>6.{digits}} ({:.{digits}f}) " + "{:>15} " + "{:>9}\n"
  digits = 3 # show more precise averages
  report += headers_fmt.format("", *headers_avg, width=width)
  report += row_fmt_avg.format("macro avg", *row_avg, width=width, digits=digits)

  acc = accuracy_score(y_true, y_pred)
- superclass_acc, failed_superclass_acc = superclass_accuracies(y_true, y_pred, class_info=class_info)
+ provider_acc, failed_provider_acc = provider_accuracies(y_true, y_pred, class_info=class_info)

  row_fmt_acc = "{:>{width}} {:>15} {:>15} {:>7.{digits}f}\n"
  report += row_fmt_acc.format("acc", "", "", acc, width=width, digits=digits)
- report += row_fmt_acc.format("superclass acc", "", "", superclass_acc, width=width, digits=digits)
- report += row_fmt_acc.format("failed superclass acc", "", "", failed_superclass_acc, width=width, digits=digits)
+ report += row_fmt_acc.format("provider acc", "", "", provider_acc, width=width, digits=digits)
+ report += row_fmt_acc.format("failed provider acc", "", "", failed_provider_acc, width=width, digits=digits)
  metrics = {
  "Test/Accuracy": acc,
- "Test/Superclass Accuracy": superclass_acc,
- "Test/Failed Superclass Accuracy": failed_superclass_acc,
+ "Test/Provider Accuracy": provider_acc,
+ "Test/Failed Provider Accuracy": failed_provider_acc,
  "Test/Fscore": avg_f1,
- "Test/Superclass Fscore": avg_sc_f1,
+ "Test/Provider Fscore": avg_sc_f1,
  "Test/Recall": avg_r,
- "Test/Superclass Recall": avg_sc_r,
+ "Test/Provider Recall": avg_sc_r,
  }
  return report, metrics
@@ -3,18 +3,19 @@ import numpy as np
  from cesnet_datazoo.utils.class_info import ClassInfo
 
 
- def superclass_accuracies(y_true: np.ndarray, y_pred: np.ndarray, class_info: ClassInfo) -> tuple[float, float]:
-     y_true_sc = class_info.superclass_mapping_arr[y_true]
-     y_pred_sc = class_info.superclass_mapping_arr[y_pred]
+ def provider_accuracies(y_true: np.ndarray, y_pred: np.ndarray, class_info: ClassInfo) -> tuple[float, float]:
+     provider_mapping_arr = np.array(list(class_info.provider_mapping.values()))
+     y_true_sc = provider_mapping_arr[y_true]
+     y_pred_sc = provider_mapping_arr[y_pred]
      mistakes = y_true != y_pred
-     superclass_acc = (y_true_sc == y_pred_sc).sum() / len(y_true_sc)
-     failed_superclass_acc = (y_true_sc[mistakes] == y_pred_sc[mistakes]).sum() / mistakes.sum()
-     return superclass_acc, failed_superclass_acc
+     provider_acc = (y_true_sc == y_pred_sc).sum() / len(y_true_sc)
+     failed_provider_acc = (y_true_sc[mistakes] == y_pred_sc[mistakes]).sum() / mistakes.sum()
+     return provider_acc, failed_provider_acc
 
- def per_app_superclass_metrics(cm, class_info: ClassInfo):
+ def per_app_provider_metrics(cm, class_info: ClassInfo):
      metrics = []
      for i, app in enumerate(class_info.target_names):
-         if not class_info.has_superclass[app]:
+         if not class_info.has_provider[app]:
              metrics.append((None, None, None))
              continue
          group = class_info.group_matrix[i]
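As a side note on the renamed metrics (illustration only, with made-up labels and a plain mapping array rather than a ClassInfo object): the provider-level accuracies compare labels after collapsing each app to its provider, so a prediction that hits the wrong app of the right provider still counts at the provider level, and the "failed" variant measures how often outright mistakes at least stay within the provider.

import numpy as np

# Hypothetical label space: apps 0 and 1 share a provider, app 2 stands alone.
provider_mapping_arr = np.array(["google", "google", "spotify"])
y_true = np.array([0, 1, 2, 2])
y_pred = np.array([1, 1, 2, 0])   # first sample: wrong app, same provider

y_true_pr = provider_mapping_arr[y_true]
y_pred_pr = provider_mapping_arr[y_pred]
mistakes = y_true != y_pred

provider_acc = (y_true_pr == y_pred_pr).sum() / len(y_true_pr)                             # 0.75
failed_provider_acc = (y_true_pr[mistakes] == y_pred_pr[mistakes]).sum() / mistakes.sum()  # 0.5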
@@ -140,7 +140,7 @@ def pytables_collate_fn(batch: tuple,
                          flowstats_scaler: Scaler, flowstats_quantiles: pd.Series,
                          psizes_scaler: Scaler, psizes_max: int,
                          ipt_scaler: Scaler, ipt_min: int, ipt_max: int,
-                         use_push_flags: bool, use_packet_histograms: bool, zero_ppi_start: int,
+                         use_push_flags: bool, use_packet_histograms: bool, normalize_packet_histograms: bool, zero_ppi_start: int,
                          encoder: LabelEncoder, known_apps: list[str], return_torch: bool = False):
      x_ppi, x_flowstats, labels = batch
      x_ppi = x_ppi.transpose(0, 2, 1)
@@ -164,12 +164,13 @@ def pytables_collate_fn(batch: tuple,
 
      if use_packet_histograms:
          x_phist = structured_to_unstructured(x_flowstats[PHISTS_FEATURES], dtype="float32")
-         src_sizes_pkt_count = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]
-         dst_sizes_pkt_count = x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)].sum(axis=1)[:, np.newaxis]
-         np.divide(x_phist[:, :PHIST_BIN_COUNT], src_sizes_pkt_count, out=x_phist[:, :PHIST_BIN_COUNT], where=src_sizes_pkt_count != 0)
-         np.divide(x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], dst_sizes_pkt_count, out=x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count != 0)
-         np.divide(x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], src_sizes_pkt_count - 1, out=x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], where=src_sizes_pkt_count > 1)
-         np.divide(x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], dst_sizes_pkt_count - 1, out=x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count > 1)
+         if normalize_packet_histograms:
+             src_sizes_pkt_count = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]
+             dst_sizes_pkt_count = x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)].sum(axis=1)[:, np.newaxis]
+             np.divide(x_phist[:, :PHIST_BIN_COUNT], src_sizes_pkt_count, out=x_phist[:, :PHIST_BIN_COUNT], where=src_sizes_pkt_count != 0)
+             np.divide(x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], dst_sizes_pkt_count, out=x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count != 0)
+             np.divide(x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], src_sizes_pkt_count - 1, out=x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], where=src_sizes_pkt_count > 1)
+             np.divide(x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], dst_sizes_pkt_count - 1, out=x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count > 1)
          x_flowstats = structured_to_unstructured(drop_fields(x_flowstats, PHISTS_FEATURES), dtype="float32")
          x_flowstats = np.concatenate([x_flowstats, x_phist], axis=1)
      else:
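The normalization that is now gated behind normalize_packet_histograms turns the four bin-count histograms into per-flow fractions. Roughly, as a standalone numpy sketch with a made-up bin count and a single flow (not the collate function itself; only the source-direction blocks are shown):

import numpy as np

PHIST_BIN_COUNT = 4  # illustrative; the real value lives in cesnet_datazoo.constants

# One flow: src-size bins, dst-size bins, src-IPT bins, dst-IPT bins.
x_phist = np.array([[2., 1., 1., 0.,  3., 0., 1., 0.,  1., 2., 0., 0.,  0., 3., 0., 0.]])
src_cnt = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]   # packets sent
# Size histograms are divided by the packet count, IPT histograms by count - 1
# (there is one fewer inter-packet gap than packets); zero counts are left untouched.
np.divide(x_phist[:, :PHIST_BIN_COUNT], src_cnt,
          out=x_phist[:, :PHIST_BIN_COUNT], where=src_cnt != 0)
np.divide(x_phist[:, 2 * PHIST_BIN_COUNT:3 * PHIST_BIN_COUNT], src_cnt - 1,
          out=x_phist[:, 2 * PHIST_BIN_COUNT:3 * PHIST_BIN_COUNT], where=src_cnt > 1)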
@@ -0,0 +1,50 @@
+ from dataclasses import dataclass
+ from typing import Any, Optional
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.preprocessing import LabelEncoder
+
+ from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROVIDER_COLUMN
+
+
+ @dataclass()
+ class ClassInfo:
+     target_names: list[str]
+     num_classes: int
+     known_apps: list[str]
+     unknown_apps: list[str]
+     unknown_class_label: int
+     group_matrix: np.ndarray
+     has_provider: dict[str, bool]
+     provider_mapping: dict[str, str]
+     provider_members: dict[str, list[str]]
+     categories_mapping: dict[str, Optional[str]]
+
+ def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> ClassInfo:
+     known_apps = sorted(known_apps_database_enum.values())
+     unknown_apps = sorted(unknown_apps_database_enum.values())
+     target_names_arr = encoder.classes_
+     assert known_apps == list(target_names_arr[:-1])
+     group_matrix = np.array([[a == b or
+                               (a in servicemap.index and b in servicemap.index and
+                                not pd.isnull(servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN]) and not pd.isnull(servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN]) and
+                                servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN] == servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN])
+                               for a in target_names_arr] for b in target_names_arr])
+     has_provider = {app: app in servicemap.index and not pd.isnull(servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN]) for app in target_names_arr}
+     provider_mapping = {app: servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN] if has_provider[app] else app for app in target_names_arr}
+     providers = sorted({provider_mapping[app] for app in target_names_arr if has_provider[app]})
+     provider_members = {p: [app for app in target_names_arr if provider_mapping[app] == p] for p in providers}
+     categories_mapping = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr}
+     return ClassInfo(
+         target_names=list(target_names_arr),
+         num_classes=len(known_apps),
+         known_apps=known_apps,
+         unknown_apps=unknown_apps,
+         unknown_class_label=len(known_apps),
+         group_matrix=group_matrix,
+         has_provider=has_provider,
+         provider_mapping=provider_mapping,
+         provider_members=provider_members,
+         categories_mapping=categories_mapping,
+     )
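A hedged sketch of how the new constructor could be exercised on a toy service map (the app names, providers, categories, and the trailing "unknown" label below are invented for illustration; only the imported names come from the package):

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROVIDER_COLUMN
from cesnet_datazoo.utils.class_info import create_class_info

# Toy servicemap indexed by app tag, with provider and category columns.
servicemap = pd.DataFrame(
    {SERVICEMAP_PROVIDER_COLUMN: ["google", "google", None],
     SERVICEMAP_CATEGORY_COLUMN: ["mail", "search", "music"]},
    index=["gmail", "google-search", "spotify"])

encoder = LabelEncoder()
# Known apps in sorted order, with the extra class for unknown traffic appended last.
encoder.classes_ = np.array(["gmail", "google-search", "spotify", "unknown"])

class_info = create_class_info(
    servicemap=servicemap,
    encoder=encoder,
    known_apps_database_enum={0: "gmail", 1: "google-search", 2: "spotify"},
    unknown_apps_database_enum={3: "facebook"},
)
# class_info.provider_mapping maps gmail and google-search to "google";
# class_info.num_classes == 3 and class_info.unknown_class_label == 3.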
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cesnet-datazoo
- Version: 0.0.10
+ Version: 0.0.12
  Summary: A toolkit for large network traffic datasets
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -102,8 +102,8 @@ dataset = CESNET_QUIC22("/datasets/CESNET-QUIC22/", size="XS")
  dataset_config = DatasetConfig(
      dataset=dataset,
      apps_selection=AppSelection.ALL_KNOWN,
-     train_period="W-2022-44",
-     test_period="W-2022-45",
+     train_period_name="W-2022-44",
+     test_period_name="W-2022-45",
  )
  dataset.set_dataset_config_and_initialize(dataset_config)
  train_dataframe = dataset.get_train_df()
@@ -19,7 +19,7 @@ cesnet_datazoo/datasets/metadata/dataset_metadata.py
  cesnet_datazoo/datasets/metadata/metadata.csv
  cesnet_datazoo/metrics/__init__.py
  cesnet_datazoo/metrics/classification_report.py
- cesnet_datazoo/metrics/superclass_metrics.py
+ cesnet_datazoo/metrics/provider_metrics.py
  cesnet_datazoo/pytables_data/__init__.py
  cesnet_datazoo/pytables_data/apps_split.py
  cesnet_datazoo/pytables_data/indices_setup.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "cesnet-datazoo"
- version = "0.0.10"
+ version = "0.0.12"
  authors = [
      {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
      {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},
@@ -1,46 +0,0 @@
- from dataclasses import dataclass
-
- import numpy as np
- import pandas as pd
-
- from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROVIDER_COLUMN
-
-
- @dataclass()
- class ClassInfo:
-     target_names: list[str]
-     known_apps: list[str]
-     group_matrix: np.ndarray
-     superclass_members: dict[str, list[str]]
-     has_superclass: dict[str, bool]
-     superclass_mapping: dict[str, str]
-     superclass_mapping_arr: np.ndarray
-     categories_mapping: dict[str, str]
-
-     def get_num_classes(self):
-         return len(self.known_apps)
-
- def create_superclass_structures(servicemap: pd.DataFrame, target_names: list[str]) -> ClassInfo:
-     known_apps = target_names[:-1]
-     target_names_arr = np.array(target_names)
-     group_matrix = np.array([[
-         a in servicemap.index and b in servicemap.index and
-         not pd.isnull(servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN]) and not pd.isnull(servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN]) and
-         servicemap.loc[a, SERVICEMAP_PROVIDER_COLUMN] == servicemap.loc[b, SERVICEMAP_PROVIDER_COLUMN]
-         for a in target_names_arr] for b in target_names_arr])
-     has_superclass = {app: app in servicemap.index and not pd.isnull(servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN]) for app in target_names_arr}
-     superclass_mapping: dict[str, str] = {app: servicemap.loc[app, SERVICEMAP_PROVIDER_COLUMN] if has_superclass[app] else app for app in target_names_arr} # type: ignore
-     superclass_mapping_arr = np.array(list(superclass_mapping.values()))
-     superclass_members = {superclass: servicemap.loc[servicemap[SERVICEMAP_PROVIDER_COLUMN] == superclass].index.to_list()
-                           for superclass in servicemap.loc[:, SERVICEMAP_PROVIDER_COLUMN].dropna().unique()}
-     categories_mapping: dict[str, str] = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr} # type: ignore
-     return ClassInfo(
-         target_names=target_names,
-         known_apps=known_apps,
-         group_matrix=group_matrix,
-         superclass_members=superclass_members,
-         has_superclass=has_superclass,
-         superclass_mapping=superclass_mapping,
-         superclass_mapping_arr=superclass_mapping_arr,
-         categories_mapping=categories_mapping,
-     )