cesnet-datazoo 0.1.3__tar.gz → 0.1.4__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (35)
  1. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/PKG-INFO +1 -1
  2. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/config.py +6 -2
  3. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/constants.py +5 -3
  4. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/datasets/cesnet_dataset.py +5 -2
  5. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +1 -1
  6. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/datasets/statistics.py +1 -1
  7. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/metrics/classification_report.py +12 -13
  8. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/metrics/provider_metrics.py +6 -6
  9. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/pytables_data/indices_setup.py +29 -18
  10. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/pytables_data/pytables_dataset.py +49 -26
  11. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo.egg-info/PKG-INFO +1 -1
  12. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/pyproject.toml +1 -1
  13. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/LICENCE +0 -0
  14. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/README.md +0 -0
  15. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/__init__.py +0 -0
  16. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/datasets/__init__.py +0 -0
  17. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/datasets/datasets.py +0 -0
  18. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/datasets/datasets_constants.py +0 -0
  19. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/datasets/loaders.py +0 -0
  20. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
  21. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/datasets/metadata/metadata.csv +0 -0
  22. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/metrics/__init__.py +0 -0
  23. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/pytables_data/__init__.py +0 -0
  24. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
  25. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/pytables_data/data_scalers.py +0 -0
  26. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/utils/__init__.py +0 -0
  27. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/utils/class_info.py +0 -0
  28. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/utils/download.py +0 -0
  29. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/utils/fileutils.py +0 -0
  30. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo/utils/random.py +0 -0
  31. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo.egg-info/SOURCES.txt +0 -0
  32. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
  33. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo.egg-info/requires.txt +0 -0
  34. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/cesnet_datazoo.egg-info/top_level.txt +0 -0
  35. {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.1.3
+Version: 0.1.4
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -14,8 +14,9 @@ import yaml
 from pydantic import model_validator
 from pydantic.dataclasses import dataclass

-from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, SELECTED_TCP_FLAGS,
-                                      TCP_PPI_CHANNELS, UDP_PPI_CHANNELS)
+from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, QUIC_SNI_COLUMN,
+                                      SELECTED_TCP_FLAGS, TCP_PPI_CHANNELS, TLS_SNI_COLUMN,
+                                      UDP_PPI_CHANNELS)

 if TYPE_CHECKING:
     from cesnet_datazoo.datasets.cesnet_dataset import CesnetDataset
@@ -128,6 +129,7 @@ class DatasetConfig():
         flowstats_features_boolean: Taken from `dataset.metadata.flowstats_features_boolean`.
         flowstats_features_phist: Taken from `dataset.metadata.packet_histograms` if `use_packet_histograms` is true, otherwise an empty list.
         other_fields: Taken from `dataset.metadata.other_fields` if `return_other_fields` is true, otherwise an empty list.
+        sni_column: Database column with SNI domains, can be None for datasets without SNI domains.

     # Configuration options

@@ -343,6 +345,8 @@ class DatasetConfig():
         # Configure features
         self.flowstats_features = dataset.metadata.flowstats_features
         self.flowstats_features_boolean = dataset.metadata.flowstats_features_boolean
+        sni_column = TLS_SNI_COLUMN if dataset.metadata.protocol == Protocol.TLS else QUIC_SNI_COLUMN
+        self.sni_column = sni_column if sni_column in dataset.metadata.other_fields else None
         self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
         if self.use_packet_histograms:
             if len(dataset.metadata.packet_histograms) == 0:
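The config now resolves which SNI column a dataset carries once, at setup time: TLS datasets get the TLS SNI column, QUIC datasets the QUIC one, and the choice is kept only when that column is actually present among the dataset's other fields. A minimal sketch of the same decision; the column string values are assumptions, since the constants' definitions are not part of this diff:

    from typing import Optional

    TLS_SNI_COLUMN = "TLS_SNI"    # assumed values; the real ones live in cesnet_datazoo.constants
    QUIC_SNI_COLUMN = "QUIC_SNI"

    def resolve_sni_column(protocol: str, other_fields: list[str]) -> Optional[str]:
        # Hypothetical helper mirroring the new DatasetConfig logic: pick the
        # protocol's SNI column, keep it only if the dataset stores that field.
        candidate = TLS_SNI_COLUMN if protocol == "TLS" else QUIC_SNI_COLUMN
        return candidate if candidate in other_fields else None

    print(resolve_sni_column("TLS", ["TLS_SNI", "CATEGORY"]))  # -> TLS_SNI
    print(resolve_sni_column("QUIC", ["CATEGORY"]))            # -> None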
@@ -39,6 +39,8 @@ DEFAULT_BACKGROUND_CLASS = "default-background"
 GOOGLE_BACKGROUND_CLASS = "google-background"

 # Indices
-INDICES_TABLE_POS = 0
-INDICES_INDEX_POS = 1
-INDICES_LABEL_POS = 2
+INDICES_TABLE_FIELD = "TABLE"
+INDICES_INDEX_FIELD = "INDEX"
+INDICES_APP_FIELD = "APP"
+INDICES_SNI_FIELD = "SNI"
+INDICES_DTYPE = [(INDICES_TABLE_FIELD, "int32"), (INDICES_INDEX_FIELD, "int32"), (INDICES_APP_FIELD, "int32"), (INDICES_SNI_FIELD, "U50")]
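This is the central change of the release: the index arrays switch from plain integer matrices addressed by column position to NumPy structured arrays addressed by field name, which is what allows a string SNI field to sit next to the integer fields in a single array. A minimal sketch of how such records behave (sample values are made up):

    import numpy as np

    INDICES_DTYPE = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]

    indices = np.array([(0, 12, 3, "example.org"),
                        (1, 5, 7, "")], dtype=INDICES_DTYPE)

    print(indices["APP"])                         # [3 7] -- replaces indices[:, INDICES_LABEL_POS]
    print(indices[indices["TABLE"] == 0]["SNI"])  # ['example.org']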
@@ -16,7 +16,7 @@ from torch.utils.data import BatchSampler, DataLoader, RandomSampler, Sampler, S
 from typing_extensions import assert_never

 from cesnet_datazoo.config import AppSelection, DataLoaderOrder, DatasetConfig, ValidationApproach
-from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES, INDICES_LABEL_POS,
+from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES, INDICES_APP_FIELD,
                                       SERVICEMAP_FILE, UNKNOWN_STR_LABEL)
 from cesnet_datazoo.datasets.loaders import collate_fn_simple, create_df_from_dataloader
 from cesnet_datazoo.datasets.metadata.dataset_metadata import DatasetMetadata, load_metadata
@@ -555,7 +555,7 @@ class CesnetDataset():
         train_val_rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.TRAIN_VAL_SPLIT)
         val_data_path = dataset_config._get_train_data_path()
         val_unknown_indices = train_unknown_indices
-        train_labels = train_indices[:, INDICES_LABEL_POS]
+        train_labels = train_indices[INDICES_APP_FIELD]
         if dataset_config.train_dates_weigths is not None:
             assert dataset_config.val_known_size != "all"
             # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to no enough samples in some train dates
@@ -626,6 +626,7 @@ class CesnetDataset():
            flowstats_features_boolean=dataset_config.flowstats_features_boolean,
            flowstats_features_phist=dataset_config.flowstats_features_phist,
            other_fields=self.dataset_config.other_fields,
+           sni_column=self.dataset_config.sni_column,
            ppi_channels=dataset_config.get_ppi_channels(),
            ppi_transform=dataset_config.ppi_transform,
            flowstats_transform=dataset_config.flowstats_transform,
@@ -644,6 +645,7 @@ class CesnetDataset():
            flowstats_features_boolean=dataset_config.flowstats_features_boolean,
            flowstats_features_phist=dataset_config.flowstats_features_phist,
            other_fields=self.dataset_config.other_fields,
+           sni_column=self.dataset_config.sni_column,
            ppi_channels=dataset_config.get_ppi_channels(),
            ppi_transform=dataset_config.ppi_transform,
            flowstats_transform=dataset_config.flowstats_transform,
@@ -664,6 +666,7 @@ class CesnetDataset():
            flowstats_features_boolean=dataset_config.flowstats_features_boolean,
            flowstats_features_phist=dataset_config.flowstats_features_phist,
            other_fields=self.dataset_config.other_fields,
+           sni_column=self.dataset_config.sni_column,
            ppi_channels=dataset_config.get_ppi_channels(),
            ppi_transform=dataset_config.ppi_transform,
            flowstats_transform=dataset_config.flowstats_transform,
@@ -39,5 +39,5 @@ class DatasetMetadata():
 metadata_df = pd.read_csv(os.path.join(os.path.dirname(__file__), "metadata.csv"), index_col="Name", keep_default_na=False)
 def load_metadata(dataset_name: str) -> DatasetMetadata:
     d = metadata_df.loc[dataset_name].to_dict()
-    d = {k.replace(" ", "_").lower(): v for k, v in d.items()}
+    d = {k.replace(" ", "_").lower(): v for k, v in d.items()} # type: ignore
     return DatasetMetadata(**d)
@@ -92,7 +92,7 @@ def compute_dataset_statistics(database_path: str,
                                    tables_paths=table_paths,
                                    indices=None,
                                    disabled_apps=disabled_apps,
-                                   return_all_fields=True,
+                                   return_raw_fields=True,
                                    flowstats_features=[],
                                    flowstats_features_boolean=[],
                                    flowstats_features_phist=[],
@@ -1,13 +1,12 @@
 import numpy as np
 from sklearn.metrics import accuracy_score, precision_recall_fscore_support

-from cesnet_datazoo.metrics.provider_metrics import (per_app_provider_metrics,
-                                                     provider_accuracies)
+from cesnet_datazoo.metrics.provider_metrics import per_app_provider_metrics, provider_accuracies
 from cesnet_datazoo.utils.class_info import ClassInfo


-def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.ndarray, labels: list[int], class_info: ClassInfo, digits: int = 2, zero_division: int = 0) -> tuple[str, dict[str, float]]:
-    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
+def better_classification_report(test_labels: np.ndarray, preds: np.ndarray, cm: np.ndarray, labels: list[int], class_info: ClassInfo, digits: int = 2, zero_division: int = 0) -> tuple[str, dict[str, float]]:
+    p, r, f1, s = precision_recall_fscore_support(test_labels, preds,
                                                   labels=labels,
                                                   zero_division=zero_division)
     sc_p, sc_r, sc_f1 = per_app_provider_metrics(cm, class_info=class_info)
@@ -46,20 +45,20 @@ def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.
     report += headers_fmt.format("", *headers_avg, width=width)
     report += row_fmt_avg.format("macro avg", *row_avg, width=width, digits=digits)

-    acc = accuracy_score(y_true, y_pred)
-    provider_acc, failed_provider_acc = provider_accuracies(y_true, y_pred, class_info=class_info)
+    acc = accuracy_score(test_labels, preds)
+    provider_acc, failed_provider_acc = provider_accuracies(test_labels, preds, class_info=class_info)

     row_fmt_acc = "{:>{width}} {:>15} {:>15} {:>7.{digits}f}\n"
     report += row_fmt_acc.format("acc", "", "", acc, width=width, digits=digits)
     report += row_fmt_acc.format("provider acc", "", "", provider_acc, width=width, digits=digits)
     report += row_fmt_acc.format("failed provider acc", "", "", failed_provider_acc, width=width, digits=digits)
     metrics = {
-        "Test/Accuracy": acc,
-        "Test/Provider Accuracy": provider_acc,
-        "Test/Failed Provider Accuracy": failed_provider_acc,
-        "Test/Fscore": avg_f1,
-        "Test/Provider Fscore": avg_sc_f1,
-        "Test/Recall": avg_r,
-        "Test/Provider Recall": avg_sc_r,
+        "test/acc": acc,
+        "test/provider-acc": provider_acc,
+        "test/failed-provider-acc": failed_provider_acc,
+        "test/fscore": avg_f1,
+        "test/provider-fscore": avg_sc_f1,
+        "test/recall": avg_r,
+        "test/provider-recall": avg_sc_r,
     }
     return report, metrics
@@ -3,13 +3,13 @@ import numpy as np
 from cesnet_datazoo.utils.class_info import ClassInfo


-def provider_accuracies(y_true: np.ndarray, y_pred: np.ndarray, class_info: ClassInfo) -> tuple[float, float]:
+def provider_accuracies(true_labels: np.ndarray, preds: np.ndarray, class_info: ClassInfo) -> tuple[float, float]:
     provider_mapping_arr = np.array(list(class_info.provider_mapping.values()))
-    y_true_sc = provider_mapping_arr[y_true]
-    y_pred_sc = provider_mapping_arr[y_pred]
-    mistakes = y_true != y_pred
-    provider_acc = (y_true_sc == y_pred_sc).sum() / len(y_true_sc)
-    failed_provider_acc = (y_true_sc[mistakes] == y_pred_sc[mistakes]).sum() / mistakes.sum()
+    true_labels_provider = provider_mapping_arr[true_labels]
+    preds_provider = provider_mapping_arr[preds]
+    mistakes = true_labels != preds
+    provider_acc = (true_labels_provider == preds_provider).sum() / len(true_labels_provider)
+    failed_provider_acc = (true_labels_provider[mistakes] == preds_provider[mistakes]).sum() / mistakes.sum()
     return provider_acc, failed_provider_acc

 def per_app_provider_metrics(cm, class_info: ClassInfo):
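The provider metrics are only renamed in this release, but the computation is worth spelling out: both labels and predictions are mapped to their provider before comparison, and failed_provider_acc restricts the comparison to flows that were misclassified at the application level. A toy example with a made-up provider mapping:

    import numpy as np

    # Hypothetical mapping: apps 0 and 1 belong to provider 0, apps 2 and 3 to provider 1.
    provider_mapping_arr = np.array([0, 0, 1, 1])

    true_labels = np.array([0, 1, 2, 3])
    preds = np.array([1, 1, 3, 0])  # flows 0, 2, and 3 are app-level mistakes

    true_providers = provider_mapping_arr[true_labels]
    pred_providers = provider_mapping_arr[preds]
    mistakes = true_labels != preds
    provider_acc = (true_providers == pred_providers).mean()                            # 3/4 = 0.75
    failed_provider_acc = (true_providers[mistakes] == pred_providers[mistakes]).mean() # 2/3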
@@ -1,6 +1,7 @@
 import dataclasses
 import logging
 import os
+import time
 import warnings
 from collections import namedtuple
 from enum import Enum
@@ -9,7 +10,8 @@ import numpy as np
 import pandas as pd

 from cesnet_datazoo.config import DatasetConfig
-from cesnet_datazoo.constants import INDICES_INDEX_POS, INDICES_LABEL_POS, INDICES_TABLE_POS
+from cesnet_datazoo.constants import (INDICES_APP_FIELD, INDICES_DTYPE, INDICES_INDEX_FIELD,
+                                      INDICES_TABLE_FIELD)
 from cesnet_datazoo.pytables_data.pytables_dataset import init_test_indices, init_train_indices
 from cesnet_datazoo.utils.fileutils import yaml_dump, yaml_load
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
@@ -21,8 +23,8 @@ IndicesTuple = namedtuple("IndicesTuple", ["train_indices", "val_known_indices",


 def sort_indices(indices: np.ndarray) -> np.ndarray:
-    idxs = np.argsort(indices[:, INDICES_INDEX_POS])
-    res = idxs[np.argsort(indices[idxs, INDICES_TABLE_POS], kind="stable")]
+    idxs = np.argsort(indices[INDICES_INDEX_FIELD])
+    res = idxs[np.argsort(indices[idxs][INDICES_TABLE_FIELD], kind="stable")]
     return indices[res]

 def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: IndicesTuple) -> IndicesTuple:
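sort_indices keeps the classic two-pass argsort idiom, only switching to field access: sort by the secondary key (INDEX) first, then stable-sort by the primary key (TABLE), giving (TABLE, INDEX) ordering. On the structured dtype this is equivalent to np.lexsort; a small illustration of the equivalence, not code the package ships:

    import numpy as np

    dtype = [("TABLE", "int32"), ("INDEX", "int32")]
    ind = np.array([(1, 4), (0, 9), (1, 2), (0, 1)], dtype=dtype)

    # Two-pass idiom from sort_indices: secondary key first, then stable primary key.
    idxs = np.argsort(ind["INDEX"])
    res = idxs[np.argsort(ind[idxs]["TABLE"], kind="stable")]

    # np.lexsort takes the PRIMARY key last and produces the same ordering.
    assert (res == np.lexsort((ind["INDEX"], ind["TABLE"]))).all()
    print(ind[res])  # [(0, 1) (0, 9) (1, 2) (1, 4)]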
@@ -61,7 +63,7 @@ def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: Indi

 def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indices: np.ndarray, num_samples: int) -> np.ndarray:
     rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.DATE_WEIGHT_SAMPLING)
-    indices_per_date = [train_indices[train_indices[:, INDICES_TABLE_POS] == i] for i in np.unique(train_indices[:, INDICES_TABLE_POS])]
+    indices_per_date = [train_indices[train_indices[INDICES_TABLE_FIELD] == i] for i in np.unique(train_indices[INDICES_TABLE_FIELD])]
     weights = np.array(dataset_config.train_dates_weigths)
     weights = weights / weights.sum()
     samples_per_date = np.ceil((weights * (num_samples))).astype(int)
@@ -77,7 +79,7 @@
     return sampled_train_indices

 def indices_to_app_counts(indices: np.ndarray, tables_app_enum: dict[int, str]) -> pd.Series:
-    app_counts = pd.Series(indices[:, INDICES_LABEL_POS]).value_counts()
+    app_counts = pd.Series(indices[INDICES_APP_FIELD]).value_counts()
     app_counts.index = app_counts.index.map(lambda x: tables_app_enum[x])
     return app_counts

@@ -99,23 +101,26 @@ def init_or_load_train_indices(dataset_config: DatasetConfig, tables_app_enum: d
     init_train_data(train_data_path)
     if not os.path.isfile(os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE)):
         log.info("Processing train indices")
+        start_time = time.time()
         train_data_params = dataset_config._get_train_data_params()
         train_known_indices, train_unknown_indices, known_apps, unknown_apps = init_train_indices(train_data_params=train_data_params,
                                                                                                   database_path=dataset_config.database_path,
                                                                                                   tables_app_enum=tables_app_enum,
+                                                                                                  sni_column=dataset_config.sni_column,
                                                                                                   servicemap=servicemap,
                                                                                                   rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TRAIN_INDICES))
         if not disable_indices_cache:
             yaml_dump({k: str(v) if isinstance(v, Enum) else list(v) if isinstance(v, tuple) else v for k, v in dataclasses.asdict(train_data_params).items()}, os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE))
             yaml_dump(known_apps, os.path.join(train_data_path, "known_apps.yaml"))
             yaml_dump(unknown_apps, os.path.join(train_data_path, "unknown_apps.yaml"))
-            np.save(os.path.join(train_data_path, "train_known_indices.npy"), train_known_indices)
-            np.save(os.path.join(train_data_path, "train_unknown_indices.npy"), train_unknown_indices)
+            np.savez_compressed(os.path.join(train_data_path, "train_indices.npz"), train_known_indices=train_known_indices, train_unknown_indices=train_unknown_indices)
+        log.info(f"Processing indices took {time.time() - start_time:.2f} seconds")
     else:
         known_apps = yaml_load(os.path.join(train_data_path, "known_apps.yaml"))
         unknown_apps = yaml_load(os.path.join(train_data_path, "unknown_apps.yaml"))
-        train_known_indices = np.load(os.path.join(train_data_path, "train_known_indices.npy"))
-        train_unknown_indices = np.load(os.path.join(train_data_path, "train_unknown_indices.npy"))
+        loaded = np.load(os.path.join(train_data_path, "train_indices.npz"))
+        train_known_indices = loaded["train_known_indices"]
+        train_unknown_indices = loaded["train_unknown_indices"]
     return train_known_indices, train_unknown_indices, known_apps, unknown_apps

 def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
@@ -123,17 +128,20 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str
     init_test_data(val_data_path)
     if not os.path.isfile(os.path.join(val_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing validation indices")
+        start_time = time.time()
         val_known_indices, val_unknown_indices = init_test_indices(test_data_params=val_data_params,
                                                                    database_path=dataset_config.database_path,
                                                                    tables_app_enum=tables_app_enum,
+                                                                   sni_column=dataset_config.sni_column,
                                                                    rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_VAL_INIDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(val_data_params), os.path.join(val_data_path, TEST_DATA_PARAMS_FILE))
-            np.save(os.path.join(val_data_path, "val_known_indices.npy"), val_known_indices)
-            np.save(os.path.join(val_data_path, "val_unknown_indices.npy"), val_unknown_indices)
+            np.savez_compressed(os.path.join(val_data_path, "val_indices.npz"), val_known_indices=val_known_indices, val_unknown_indices=val_unknown_indices)
+        log.info(f"Processing indices took {time.time() - start_time:.2f} seconds")
     else:
-        val_known_indices = np.load(os.path.join(val_data_path, "val_known_indices.npy"))
-        val_unknown_indices = np.load(os.path.join(val_data_path, "val_unknown_indices.npy"))
+        loaded = np.load(os.path.join(val_data_path, "val_indices.npz"))
+        val_known_indices = loaded["val_known_indices"]
+        val_unknown_indices = loaded["val_unknown_indices"]
     return val_known_indices, val_unknown_indices, val_data_path

 def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
@@ -141,17 +149,20 @@ def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[st
     init_test_data(test_data_path)
     if not os.path.isfile(os.path.join(test_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing test indices")
+        start_time = time.time()
        test_known_indices, test_unknown_indices = init_test_indices(test_data_params=test_data_params,
                                                                     database_path=dataset_config.database_path,
                                                                     tables_app_enum=tables_app_enum,
+                                                                    sni_column=dataset_config.sni_column,
                                                                     rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TEST_INDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(test_data_params), os.path.join(test_data_path, TEST_DATA_PARAMS_FILE))
-            np.save(os.path.join(test_data_path, "test_known_indices.npy"), test_known_indices)
-            np.save(os.path.join(test_data_path, "test_unknown_indices.npy"), test_unknown_indices)
+            np.savez_compressed(os.path.join(test_data_path, "test_indices.npz"), test_known_indices=test_known_indices, test_unknown_indices=test_unknown_indices)
+        log.info(f"Processing indices took {time.time() - start_time:.2f} seconds")
     else:
-        test_known_indices = np.load(os.path.join(test_data_path, "test_known_indices.npy"))
-        test_unknown_indices = np.load(os.path.join(test_data_path, "test_unknown_indices.npy"))
+        loaded = np.load(os.path.join(test_data_path, "test_indices.npz"))
+        test_known_indices = loaded["test_known_indices"]
+        test_unknown_indices = loaded["test_unknown_indices"]
     return test_known_indices, test_unknown_indices, test_data_path

 def init_train_data(train_data_path: str):
@@ -164,4 +175,4 @@ def init_test_data(test_data_path: str):
     os.makedirs(os.path.join(test_data_path, "preload"), exist_ok=True)

 def no_indices() -> np.ndarray:
-    return np.zeros((0,3), dtype=np.int64)
+    return np.empty(shape=(0,), dtype=INDICES_DTYPE)
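Across the train, validation, and test branches above, the cached indices also move from one .npy file per array to a single compressed .npz archive per split, which matters now that each record carries a U50 SNI string. A minimal sketch of the same save/load pattern (paths and array contents are illustrative):

    import numpy as np

    known = np.arange(10, dtype="int32")
    unknown = np.arange(3, dtype="int32")

    # One compressed archive instead of two .npy files; keyword names become the keys.
    np.savez_compressed("train_indices.npz", train_known_indices=known, train_unknown_indices=unknown)

    loaded = np.load("train_indices.npz")  # NpzFile; member arrays are decompressed on access
    assert (loaded["train_known_indices"] == known).all()
    assert (loaded["train_unknown_indices"] == unknown).all()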
@@ -16,8 +16,8 @@ from typing_extensions import assert_never

 from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
                                    TrainDataParams)
-from cesnet_datazoo.constants import (APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN,
-                                      QUIC_SNI_COLUMN, TLS_SNI_COLUMN)
+from cesnet_datazoo.constants import (APP_COLUMN, INDICES_DTYPE, INDICES_INDEX_FIELD,
+                                      INDICES_TABLE_FIELD, PPI_COLUMN)
 from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
                                                      split_apps_topx_with_provider_groups)
@@ -36,12 +36,13 @@ class PyTablesDataset(Dataset):
                 flowstats_features_phist: list[str],
                 other_fields: list[str],
                 ppi_channels: list[int],
+                sni_column: Optional[str] = None,
                 ppi_transform: Optional[Callable] = None,
                 flowstats_transform: Optional[Callable] = None,
                 flowstats_phist_transform: Optional[Callable] = None,
                 target_transform: Optional[Callable] = None,
                 return_tensors: bool = False,
-                return_all_fields: bool = False,
+                return_raw_fields: bool = False,
                 preload: bool = False,
                 preload_blob: Optional[str] = None,
                 disabled_apps: Optional[list[str]] = None,):
@@ -60,14 +61,14 @@
         self.flowstats_features_boolean = flowstats_features_boolean
         self.flowstats_features_phist = flowstats_features_phist
         self.other_fields = other_fields
+        self.sni_column = sni_column
         self.ppi_channels = ppi_channels
         self.ppi_transform = ppi_transform
         self.flowstats_transform = flowstats_transform
         self.flowstats_phist_transform = flowstats_phist_transform
         self.target_transform = target_transform
         self.return_tensors = return_tensors
-        self.return_all_fields = return_all_fields
-        self.sni_column = TLS_SNI_COLUMN if TLS_SNI_COLUMN in self.other_fields else QUIC_SNI_COLUMN if QUIC_SNI_COLUMN in self.other_fields else None
+        self.return_raw_fields = return_raw_fields

         self.preload = preload
         self.preload_blob = preload_blob
@@ -78,7 +79,7 @@
             batch_data = self.data[batch_idx]
         else:
             batch_data = load_data_from_tables(tables=self.tables, indices=self.indices[batch_idx], data_dtype=self.data_dtype)
-        if self.return_all_fields:
+        if self.return_raw_fields:
             return (batch_data, batch_idx)

         # Prepare data
@@ -173,16 +174,21 @@ def worker_init_fn(worker_id):
     dataset = worker_info.dataset
     dataset.pytables_worker_init(worker_id)

-def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
+def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], sni_column: Optional[str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
     database, train_tables = load_database(database_path, tables_paths=train_data_params.train_tables_paths)
     inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
-    all_app_labels = {}
+    all_labels = {}
+    all_sni_domains = {}
     app_counts = pd.Series(dtype="int64")
     start_time = time.time()
     for i, table_path in enumerate(train_data_params.train_tables_paths):
-        all_app_labels[i] = train_tables[i].read(field=APP_COLUMN)
-        log.info(f"Reading app column for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
-        app_counts = app_counts.add(pd.Series(all_app_labels[i]).value_counts(), fill_value=0)
+        all_labels[i] = train_tables[i].read(field=APP_COLUMN)
+        if sni_column is not None:
+            all_sni_domains[i] = train_tables[i].read(field=sni_column)
+        else:
+            all_sni_domains[i] = np.full_like(all_labels[i], "", dtype="U1")
+        log.info(f"Reading app and SNI columns for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+        app_counts = app_counts.add(pd.Series(all_labels[i]).value_counts(), fill_value=0)
     database.close()
     # Handle disabled apps and apps with less than min_samples_per_app samples
     if len(train_data_params.disabled_apps) > 0:
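When a dataset has no SNI column, the loop still needs SNI arrays of the right length so the downstream zip lines up; np.full_like with a dtype override produces an empty-string array shaped like the labels. A tiny sketch:

    import numpy as np

    labels = np.array([3, 7, 7, 1], dtype="int32")

    # Same shape as the labels, but single-character unicode strings, all empty.
    placeholder_snis = np.full_like(labels, "", dtype="U1")
    assert placeholder_snis.shape == labels.shape
    assert placeholder_snis.dtype == np.dtype("U1")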
@@ -202,8 +208,9 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     # Base indices are indices of samples that are not disabled and have enough samples
     base_indices = {}
     for i, table_path in enumerate(train_data_params.train_tables_paths):
-        base_indices[i] = np.nonzero(np.isin(all_app_labels[i], disabled_apps_ids, invert=True))[0]
-    base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in all_app_labels.items()}
+        base_indices[i] = np.nonzero(np.isin(all_labels[i], disabled_apps_ids, invert=True))[0]
+    base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in all_labels.items()}
+    base_sni_domains = {table_id: arr[base_indices[table_id]] for table_id, arr in all_sni_domains.items()}
     # Apps selection
     if train_data_params.apps_selection != AppSelection.FIXED:
         app_counts = app_counts[[app for app in app_counts.index.tolist() if app not in disabled_apps_ids]]
@@ -230,26 +237,38 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
     unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]

-    train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
+    train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices,
+                                                                      base_labels=base_labels,
+                                                                      base_sni_domains=base_sni_domains,
+                                                                      known_apps_ids=known_apps_ids,
+                                                                      unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(train_known_indices)
     rng.shuffle(train_unknown_indices)
-    log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
     return train_known_indices, train_unknown_indices, known_apps, unknown_apps

-def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
+def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], sni_column: Optional[str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
     database, test_tables = load_database(database_path, tables_paths=test_data_params.test_tables_paths)
     inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
     base_labels = {}
+    base_sni_domains = {}
     base_indices = {}
     start_time = time.time()
     for i, table_path in enumerate(test_data_params.test_tables_paths):
         base_labels[i] = test_tables[i].read(field=APP_COLUMN)
-        log.info(f"Reading app column for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+        if sni_column is not None:
+            base_sni_domains[i] = test_tables[i].read(field=sni_column)
+        else:
+            base_sni_domains[i] = np.full_like(base_labels[i], "", dtype="U1")
+        log.info(f"Reading app and SNI columns for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
         base_indices[i] = np.arange(len(test_tables[i]))
     database.close()
     known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
     unknown_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.unknown_apps]
-    test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
+    test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices,
+                                                                    base_labels=base_labels,
+                                                                    base_sni_domains=base_sni_domains,
+                                                                    known_apps_ids=known_apps_ids,
+                                                                    unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(test_known_indices)
     rng.shuffle(test_unknown_indices)
     log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
@@ -271,28 +290,32 @@ def list_all_tables(database_path: str) -> list[str]:
     with tb.open_file(database_path, mode="r") as database:
         return list(map(lambda x: x._v_pathname, iter(database.get_node(f"/flows"))))

-def convert_dict_indices(base_indices: dict[int, np.ndarray], base_labels: dict[int, np.ndarray], known_apps_ids: list[int], unknown_apps_ids: list[int]) -> tuple[np.ndarray, np.ndarray]:
+def convert_dict_indices(base_indices: dict[int, np.ndarray], base_labels: dict[int, np.ndarray], base_sni_domains: dict[int, np.ndarray], known_apps_ids: list[int], unknown_apps_ids: list[int]) -> tuple[np.ndarray, np.ndarray]:
     is_known = {table_id: np.isin(table_arr, known_apps_ids) for table_id, table_arr in base_labels.items()}
     is_unknown = {table_id: np.isin(table_arr, unknown_apps_ids) for table_id, table_arr in base_labels.items()}
     known_indices_dict = {table_id: table_arr[is_known[table_id]] for table_id, table_arr in base_indices.items()}
     unknown_indices_dict = {table_id: table_arr[is_unknown[table_id]] for table_id, table_arr in base_indices.items()}
     known_labels_dict = {table_id: table_arr[is_known[table_id]] for table_id, table_arr in base_labels.items()}
     unknown_labels_dict = {table_id: table_arr[is_unknown[table_id]] for table_id, table_arr in base_labels.items()}
-    known_indices = np.column_stack((
+    known_sni_domains_dict = {table_id: table_arr[is_known[table_id]] for table_id, table_arr in base_sni_domains.items()}
+    unknown_sni_domains_dict = {table_id: table_arr[is_unknown[table_id]] for table_id, table_arr in base_sni_domains.items()}
+    known_indices = np.array(list(zip(
         np.concatenate([[table_id] * table_arr.sum() for table_id, table_arr in is_known.items()]),
         np.concatenate(list(known_indices_dict.values())),
-        np.concatenate(list(known_labels_dict.values()))))
-    unknown_indices = np.column_stack((
+        np.concatenate(list(known_labels_dict.values())),
+        np.concatenate(list(known_sni_domains_dict.values())))), dtype=INDICES_DTYPE)
+    unknown_indices = np.array(list(zip(
         np.concatenate([[table_id] * table_arr.sum() for table_id, table_arr in is_unknown.items()]),
         np.concatenate(list(unknown_indices_dict.values())),
-        np.concatenate(list(unknown_labels_dict.values()))))
+        np.concatenate(list(unknown_labels_dict.values())),
+        np.concatenate(list(unknown_sni_domains_dict.values())))), dtype=INDICES_DTYPE)
     return known_indices, unknown_indices

 def load_data_from_tables(tables, indices: np.ndarray, data_dtype: np.dtype) -> np.ndarray:
-    sorted_indices = indices[indices[:, INDICES_TABLE_POS].argsort(kind="stable")]
-    unique_tables, split_bounderies = np.unique(sorted_indices[:, INDICES_TABLE_POS], return_index=True)
+    sorted_indices = indices[indices[INDICES_TABLE_FIELD].argsort(kind="stable")]
+    unique_tables, split_bounderies = np.unique(sorted_indices[INDICES_TABLE_FIELD], return_index=True)
     indices_per_table = np.split(sorted_indices, split_bounderies[1:])
     data = np.zeros(len(indices), dtype=data_dtype)
     for table_id, table_indices in zip(unique_tables, indices_per_table):
-        data[np.where(indices[:, INDICES_TABLE_POS] == table_id)[0]] = tables[table_id].read_coordinates(table_indices[:, INDICES_INDEX_POS])
+        data[np.where(indices[INDICES_TABLE_FIELD] == table_id)[0]] = tables[table_id].read_coordinates(table_indices[INDICES_INDEX_FIELD])
     return data
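A plausible reason for swapping np.column_stack for np.array(list(zip(...)), dtype=INDICES_DTYPE) in convert_dict_indices: column_stack coerces all columns to one common dtype (here everything would become strings), while zipping the columns into row tuples lets NumPy build proper mixed int/string structured records. A condensed sketch with made-up values:

    import numpy as np

    INDICES_DTYPE = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]

    tables = np.array([0, 0, 1])
    rows = np.array([5, 9, 2])
    apps = np.array([3, 3, 7])
    snis = np.array(["a.example", "b.example", ""])

    # Each zipped tuple becomes one structured record with named, typed fields.
    records = np.array(list(zip(tables, rows, apps, snis)), dtype=INDICES_DTYPE)
    print(records["SNI"][records["TABLE"] == 0])  # ['a.example' 'b.example']
    print(records[["TABLE", "INDEX"]])            # field subsets still work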
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.1.3
+Version: 0.1.4
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "cesnet-datazoo"
-version = "0.1.3"
+version = "0.1.4"
 authors = [
     {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
     {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},