cesnet-datazoo 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- cesnet_datazoo/config.py +6 -2
- cesnet_datazoo/constants.py +5 -3
- cesnet_datazoo/datasets/cesnet_dataset.py +5 -2
- cesnet_datazoo/datasets/metadata/dataset_metadata.py +1 -1
- cesnet_datazoo/datasets/statistics.py +1 -1
- cesnet_datazoo/metrics/classification_report.py +12 -13
- cesnet_datazoo/metrics/provider_metrics.py +6 -6
- cesnet_datazoo/pytables_data/indices_setup.py +29 -18
- cesnet_datazoo/pytables_data/pytables_dataset.py +49 -26
- {cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/METADATA +1 -1
- {cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/RECORD +14 -14
- {cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/LICENCE +0 -0
- {cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/WHEEL +0 -0
- {cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/top_level.txt +0 -0
cesnet_datazoo/config.py
CHANGED
@@ -14,8 +14,9 @@ import yaml
 
 from pydantic import model_validator
 from pydantic.dataclasses import dataclass
 
-from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN,
-                                      TCP_PPI_CHANNELS,
+from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, QUIC_SNI_COLUMN,
+                                      SELECTED_TCP_FLAGS, TCP_PPI_CHANNELS, TLS_SNI_COLUMN,
+                                      UDP_PPI_CHANNELS)
 
 if TYPE_CHECKING:
     from cesnet_datazoo.datasets.cesnet_dataset import CesnetDataset
@@ -128,6 +129,7 @@ class DatasetConfig():
         flowstats_features_boolean: Taken from `dataset.metadata.flowstats_features_boolean`.
         flowstats_features_phist: Taken from `dataset.metadata.packet_histograms` if `use_packet_histograms` is true, otherwise an empty list.
         other_fields: Taken from `dataset.metadata.other_fields` if `return_other_fields` is true, otherwise an empty list.
+        sni_column: Database column with SNI domains, can be None for datasets without SNI domains.
 
     # Configuration options
 
@@ -343,6 +345,8 @@ class DatasetConfig():
         # Configure features
         self.flowstats_features = dataset.metadata.flowstats_features
         self.flowstats_features_boolean = dataset.metadata.flowstats_features_boolean
+        sni_column = TLS_SNI_COLUMN if dataset.metadata.protocol == Protocol.TLS else QUIC_SNI_COLUMN
+        self.sni_column = sni_column if sni_column in dataset.metadata.other_fields else None
         self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
         if self.use_packet_histograms:
             if len(dataset.metadata.packet_histograms) == 0:
cesnet_datazoo/constants.py
CHANGED
@@ -39,6 +39,8 @@ DEFAULT_BACKGROUND_CLASS = "default-background"
 GOOGLE_BACKGROUND_CLASS = "google-background"
 
 # Indices
-
-
-
+INDICES_TABLE_FIELD = "TABLE"
+INDICES_INDEX_FIELD = "INDEX"
+INDICES_APP_FIELD = "APP"
+INDICES_SNI_FIELD = "SNI"
+INDICES_DTYPE = [(INDICES_TABLE_FIELD, "int32"), (INDICES_INDEX_FIELD, "int32"), (INDICES_APP_FIELD, "int32"), (INDICES_SNI_FIELD, "U50")]
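The new constants define a numpy structured dtype for the index arrays used throughout this release. A minimal sketch of how such an array behaves (toy values; field access by name mirrors how the later hunks use the INDICES_*_FIELD constants):

import numpy as np

INDICES_DTYPE = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]
indices = np.array([(0, 12, 3, "example.com"), (1, 7, 5, "")], dtype=INDICES_DTYPE)

print(indices["TABLE"])   # [0 1] -- whole column selected by field name
print(indices[0]["SNI"])  # example.com; "U50" truncates strings longer than 50 characters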
cesnet_datazoo/datasets/cesnet_dataset.py
CHANGED
@@ -16,7 +16,7 @@ from torch.utils.data import BatchSampler, DataLoader, RandomSampler, Sampler, S
 from typing_extensions import assert_never
 
 from cesnet_datazoo.config import AppSelection, DataLoaderOrder, DatasetConfig, ValidationApproach
-from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES,
+from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES, INDICES_APP_FIELD,
                                       SERVICEMAP_FILE, UNKNOWN_STR_LABEL)
 from cesnet_datazoo.datasets.loaders import collate_fn_simple, create_df_from_dataloader
 from cesnet_datazoo.datasets.metadata.dataset_metadata import DatasetMetadata, load_metadata
@@ -555,7 +555,7 @@ class CesnetDataset():
         train_val_rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.TRAIN_VAL_SPLIT)
         val_data_path = dataset_config._get_train_data_path()
         val_unknown_indices = train_unknown_indices
-        train_labels = train_indices[
+        train_labels = train_indices[INDICES_APP_FIELD]
         if dataset_config.train_dates_weigths is not None:
             assert dataset_config.val_known_size != "all"
             # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to no enough samples in some train dates
@@ -626,6 +626,7 @@ class CesnetDataset():
             flowstats_features_boolean=dataset_config.flowstats_features_boolean,
             flowstats_features_phist=dataset_config.flowstats_features_phist,
             other_fields=self.dataset_config.other_fields,
+            sni_column=self.dataset_config.sni_column,
             ppi_channels=dataset_config.get_ppi_channels(),
             ppi_transform=dataset_config.ppi_transform,
             flowstats_transform=dataset_config.flowstats_transform,
@@ -644,6 +645,7 @@ class CesnetDataset():
             flowstats_features_boolean=dataset_config.flowstats_features_boolean,
             flowstats_features_phist=dataset_config.flowstats_features_phist,
             other_fields=self.dataset_config.other_fields,
+            sni_column=self.dataset_config.sni_column,
             ppi_channels=dataset_config.get_ppi_channels(),
             ppi_transform=dataset_config.ppi_transform,
             flowstats_transform=dataset_config.flowstats_transform,
@@ -664,6 +666,7 @@ class CesnetDataset():
             flowstats_features_boolean=dataset_config.flowstats_features_boolean,
             flowstats_features_phist=dataset_config.flowstats_features_phist,
             other_fields=self.dataset_config.other_fields,
+            sni_column=self.dataset_config.sni_column,
             ppi_channels=dataset_config.get_ppi_channels(),
             ppi_transform=dataset_config.ppi_transform,
             flowstats_transform=dataset_config.flowstats_transform,
cesnet_datazoo/datasets/metadata/dataset_metadata.py
CHANGED
@@ -39,5 +39,5 @@ class DatasetMetadata():
 metadata_df = pd.read_csv(os.path.join(os.path.dirname(__file__), "metadata.csv"), index_col="Name", keep_default_na=False)
 def load_metadata(dataset_name: str) -> DatasetMetadata:
     d = metadata_df.loc[dataset_name].to_dict()
-    d = {k.replace(" ", "_").lower(): v for k, v in d.items()}
+    d = {k.replace(" ", "_").lower(): v for k, v in d.items()}  # type: ignore
     return DatasetMetadata(**d)
cesnet_datazoo/datasets/statistics.py
CHANGED
@@ -92,7 +92,7 @@ def compute_dataset_statistics(database_path: str,
         tables_paths=table_paths,
         indices=None,
         disabled_apps=disabled_apps,
-
+        return_raw_fields=True,
         flowstats_features=[],
         flowstats_features_boolean=[],
         flowstats_features_phist=[],
cesnet_datazoo/metrics/classification_report.py
CHANGED
@@ -1,13 +1,12 @@
 import numpy as np
 from sklearn.metrics import accuracy_score, precision_recall_fscore_support
 
-from cesnet_datazoo.metrics.provider_metrics import
-    provider_accuracies)
+from cesnet_datazoo.metrics.provider_metrics import per_app_provider_metrics, provider_accuracies
 from cesnet_datazoo.utils.class_info import ClassInfo
 
 
-def better_classification_report(
-    p, r, f1, s = precision_recall_fscore_support(
+def better_classification_report(test_labels: np.ndarray, preds: np.ndarray, cm: np.ndarray, labels: list[int], class_info: ClassInfo, digits: int = 2, zero_division: int = 0) -> tuple[str, dict[str, float]]:
+    p, r, f1, s = precision_recall_fscore_support(test_labels, preds,
                                                   labels=labels,
                                                   zero_division=zero_division)
     sc_p, sc_r, sc_f1 = per_app_provider_metrics(cm, class_info=class_info)
@@ -46,20 +45,20 @@ def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.
     report += headers_fmt.format("", *headers_avg, width=width)
     report += row_fmt_avg.format("macro avg", *row_avg, width=width, digits=digits)
 
-    acc = accuracy_score(
-    provider_acc, failed_provider_acc = provider_accuracies(
+    acc = accuracy_score(test_labels, preds)
+    provider_acc, failed_provider_acc = provider_accuracies(test_labels, preds, class_info=class_info)
 
     row_fmt_acc = "{:>{width}} {:>15} {:>15} {:>7.{digits}f}\n"
     report += row_fmt_acc.format("acc", "", "", acc, width=width, digits=digits)
     report += row_fmt_acc.format("provider acc", "", "", provider_acc, width=width, digits=digits)
     report += row_fmt_acc.format("failed provider acc", "", "", failed_provider_acc, width=width, digits=digits)
     metrics = {
-        "
-        "
-        "
-        "
-        "
-        "
-        "
+        "test/acc": acc,
+        "test/provider-acc": provider_acc,
+        "test/failed-provider-acc": failed_provider_acc,
+        "test/fscore": avg_f1,
+        "test/provider-fscore": avg_sc_f1,
+        "test/recall": avg_r,
+        "test/provider-recall": avg_sc_r,
     }
     return report, metrics
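For reference, a minimal sketch of the sklearn call that better_classification_report builds on (toy labels; `labels=` pins the class order of the returned arrays and `zero_division=0` silences warnings for classes with no predictions):

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

test_labels = np.array([0, 0, 1, 2])
preds = np.array([0, 1, 1, 2])
p, r, f1, s = precision_recall_fscore_support(test_labels, preds, labels=[0, 1, 2], zero_division=0)
print(accuracy_score(test_labels, preds))  # 0.75
print(p, r, s)  # per-class arrays, e.g. support s == [2 1 1]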
cesnet_datazoo/metrics/provider_metrics.py
CHANGED
@@ -3,13 +3,13 @@ import numpy as np
 from cesnet_datazoo.utils.class_info import ClassInfo
 
 
-def provider_accuracies(
+def provider_accuracies(true_labels: np.ndarray, preds: np.ndarray, class_info: ClassInfo) -> tuple[float, float]:
     provider_mapping_arr = np.array(list(class_info.provider_mapping.values()))
-
-
-    mistakes =
-    provider_acc = (
-    failed_provider_acc = (
+    true_labels_provider = provider_mapping_arr[true_labels]
+    preds_provider = provider_mapping_arr[preds]
+    mistakes = true_labels != preds
+    provider_acc = (true_labels_provider == preds_provider).sum() / len(true_labels_provider)
+    failed_provider_acc = (true_labels_provider[mistakes] == preds_provider[mistakes]).sum() / mistakes.sum()
     return provider_acc, failed_provider_acc
 
 def per_app_provider_metrics(cm, class_info: ClassInfo):
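A worked toy example of the provider_accuracies logic above (the mapping and labels are made up): provider accuracy counts a prediction as correct when it lands on any app of the same provider, and failed provider accuracy restricts that to misclassified samples.

import numpy as np

provider_mapping_arr = np.array([0, 0, 1])   # apps 0 and 1 share a provider, app 2 does not
true_labels = np.array([0, 1, 2, 2])
preds = np.array([1, 1, 2, 0])

true_provider = provider_mapping_arr[true_labels]   # [0 0 1 1]
pred_provider = provider_mapping_arr[preds]         # [0 0 1 0]
mistakes = true_labels != preds                     # [True False False True]
provider_acc = (true_provider == pred_provider).sum() / len(true_provider)                         # 3/4 = 0.75
failed_provider_acc = (true_provider[mistakes] == pred_provider[mistakes]).sum() / mistakes.sum()  # 1/2 = 0.5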
cesnet_datazoo/pytables_data/indices_setup.py
CHANGED
@@ -1,6 +1,7 @@
 import dataclasses
 import logging
 import os
+import time
 import warnings
 from collections import namedtuple
 from enum import Enum
@@ -9,7 +10,8 @@ import numpy as np
 import pandas as pd
 
 from cesnet_datazoo.config import DatasetConfig
-from cesnet_datazoo.constants import
+from cesnet_datazoo.constants import (INDICES_APP_FIELD, INDICES_DTYPE, INDICES_INDEX_FIELD,
+                                      INDICES_TABLE_FIELD)
 from cesnet_datazoo.pytables_data.pytables_dataset import init_test_indices, init_train_indices
 from cesnet_datazoo.utils.fileutils import yaml_dump, yaml_load
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
@@ -21,8 +23,8 @@ IndicesTuple = namedtuple("IndicesTuple", ["train_indices", "val_known_indices",
 
 
 def sort_indices(indices: np.ndarray) -> np.ndarray:
-    idxs = np.argsort(indices[
-    res = idxs[np.argsort(indices[idxs
+    idxs = np.argsort(indices[INDICES_INDEX_FIELD])
+    res = idxs[np.argsort(indices[idxs][INDICES_TABLE_FIELD], kind="stable")]
     return indices[res]
 
 def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: IndicesTuple) -> IndicesTuple:
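A minimal sketch of the two-pass stable sort used by sort_indices above, on a toy structured array (field names match the new constants; the data is made up). Sorting by INDEX first and then stably by TABLE yields rows grouped by table with row indices ascending inside each table:

import numpy as np

INDICES_DTYPE = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]
indices = np.array([(1, 5, 0, "a.com"), (0, 9, 1, "b.com"), (1, 2, 0, "c.com"), (0, 3, 2, "d.com")],
                   dtype=INDICES_DTYPE)

# First argsort by row index, then a stable argsort by table id preserves
# the per-table INDEX order established by the first pass.
idxs = np.argsort(indices["INDEX"])
res = idxs[np.argsort(indices[idxs]["TABLE"], kind="stable")]
print(indices[res])
# [(0, 3, 2, 'd.com') (0, 9, 1, 'b.com') (1, 2, 0, 'c.com') (1, 5, 0, 'a.com')]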
@@ -61,7 +63,7 @@ subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: Indi
 
 def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indices: np.ndarray, num_samples: int) -> np.ndarray:
     rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.DATE_WEIGHT_SAMPLING)
-    indices_per_date = [train_indices[train_indices[
+    indices_per_date = [train_indices[train_indices[INDICES_TABLE_FIELD] == i] for i in np.unique(train_indices[INDICES_TABLE_FIELD])]
     weights = np.array(dataset_config.train_dates_weigths)
     weights = weights / weights.sum()
     samples_per_date = np.ceil((weights * (num_samples))).astype(int)
@@ -77,7 +79,7 @@ date_weight_sample_train_indice
     return sampled_train_indices
 
 def indices_to_app_counts(indices: np.ndarray, tables_app_enum: dict[int, str]) -> pd.Series:
-    app_counts = pd.Series(indices[
+    app_counts = pd.Series(indices[INDICES_APP_FIELD]).value_counts()
     app_counts.index = app_counts.index.map(lambda x: tables_app_enum[x])
     return app_counts
 
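A toy run of the date-weighting math above (hypothetical weights and sample count): per-date quotas are the normalized weights times the requested total, rounded up.

import numpy as np

weights = np.array([3, 1])           # e.g. dataset_config.train_dates_weigths
weights = weights / weights.sum()    # [0.75 0.25]
samples_per_date = np.ceil(weights * 1000).astype(int)
print(samples_per_date)              # [750 250] -- np.ceil can overshoot num_samples slightly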
@@ -99,23 +101,26 @@ def init_or_load_train_indices(dataset_config: DatasetConfig, tables_app_enum: d
     init_train_data(train_data_path)
     if not os.path.isfile(os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE)):
         log.info("Processing train indices")
+        start_time = time.time()
         train_data_params = dataset_config._get_train_data_params()
         train_known_indices, train_unknown_indices, known_apps, unknown_apps = init_train_indices(train_data_params=train_data_params,
                                                                                                   database_path=dataset_config.database_path,
                                                                                                   tables_app_enum=tables_app_enum,
+                                                                                                  sni_column=dataset_config.sni_column,
                                                                                                   servicemap=servicemap,
                                                                                                   rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TRAIN_INDICES))
         if not disable_indices_cache:
             yaml_dump({k: str(v) if isinstance(v, Enum) else list(v) if isinstance(v, tuple) else v for k, v in dataclasses.asdict(train_data_params).items()}, os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE))
             yaml_dump(known_apps, os.path.join(train_data_path, "known_apps.yaml"))
             yaml_dump(unknown_apps, os.path.join(train_data_path, "unknown_apps.yaml"))
-            np.
-
+            np.savez_compressed(os.path.join(train_data_path, "train_indices.npz"), train_known_indices=train_known_indices, train_unknown_indices=train_unknown_indices)
+        log.info(f"Processing indices took {time.time() - start_time:.2f} seconds")
     else:
         known_apps = yaml_load(os.path.join(train_data_path, "known_apps.yaml"))
         unknown_apps = yaml_load(os.path.join(train_data_path, "unknown_apps.yaml"))
-
-
+        loaded = np.load(os.path.join(train_data_path, "train_indices.npz"))
+        train_known_indices = loaded["train_known_indices"]
+        train_unknown_indices = loaded["train_unknown_indices"]
     return train_known_indices, train_unknown_indices, known_apps, unknown_apps
 
 def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
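A minimal sketch of the .npz caching pattern this release introduces (file name and arrays are made up; structured dtypes survive the savez_compressed/load round trip, including the new "U50" SNI field):

import numpy as np

INDICES_DTYPE = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]
known = np.array([(0, 1, 2, "a.com")], dtype=INDICES_DTYPE)
unknown = np.array([(0, 7, 9, "b.com")], dtype=INDICES_DTYPE)

np.savez_compressed("train_indices.npz", train_known_indices=known, train_unknown_indices=unknown)
loaded = np.load("train_indices.npz")          # NpzFile, indexed by the keyword names above
assert (loaded["train_known_indices"] == known).all()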
@@ -123,17 +128,20 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str
     init_test_data(val_data_path)
     if not os.path.isfile(os.path.join(val_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing validation indices")
+        start_time = time.time()
         val_known_indices, val_unknown_indices = init_test_indices(test_data_params=val_data_params,
                                                                    database_path=dataset_config.database_path,
                                                                    tables_app_enum=tables_app_enum,
+                                                                   sni_column=dataset_config.sni_column,
                                                                    rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_VAL_INIDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(val_data_params), os.path.join(val_data_path, TEST_DATA_PARAMS_FILE))
-            np.
-
+            np.savez_compressed(os.path.join(val_data_path, "val_indices.npz"), val_known_indices=val_known_indices, val_unknown_indices=val_unknown_indices)
+        log.info(f"Processing indices took {time.time() - start_time:.2f} seconds")
     else:
-
-
+        loaded = np.load(os.path.join(val_data_path, "val_indices.npz"))
+        val_known_indices = loaded["val_known_indices"]
+        val_unknown_indices = loaded["val_unknown_indices"]
     return val_known_indices, val_unknown_indices, val_data_path
 
 def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
@@ -141,17 +149,20 @@ def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[st
     init_test_data(test_data_path)
    if not os.path.isfile(os.path.join(test_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing test indices")
+        start_time = time.time()
         test_known_indices, test_unknown_indices = init_test_indices(test_data_params=test_data_params,
                                                                      database_path=dataset_config.database_path,
                                                                      tables_app_enum=tables_app_enum,
+                                                                     sni_column=dataset_config.sni_column,
                                                                      rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TEST_INDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(test_data_params), os.path.join(test_data_path, TEST_DATA_PARAMS_FILE))
-            np.
-
+            np.savez_compressed(os.path.join(test_data_path, "test_indices.npz"), test_known_indices=test_known_indices, test_unknown_indices=test_unknown_indices)
+        log.info(f"Processing indices took {time.time() - start_time:.2f} seconds")
     else:
-
-
+        loaded = np.load(os.path.join(test_data_path, "test_indices.npz"))
+        test_known_indices = loaded["test_known_indices"]
+        test_unknown_indices = loaded["test_unknown_indices"]
     return test_known_indices, test_unknown_indices, test_data_path
 
 def init_train_data(train_data_path: str):
@@ -164,4 +175,4 @@ def init_test_data(test_data_path: str):
     os.makedirs(os.path.join(test_data_path, "preload"), exist_ok=True)
 
 def no_indices() -> np.ndarray:
-    return np.
+    return np.empty(shape=(0,), dtype=INDICES_DTYPE)
cesnet_datazoo/pytables_data/pytables_dataset.py
CHANGED
@@ -16,8 +16,8 @@ from typing_extensions import assert_never
 
 from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
                                    TrainDataParams)
-from cesnet_datazoo.constants import (APP_COLUMN,
-
+from cesnet_datazoo.constants import (APP_COLUMN, INDICES_DTYPE, INDICES_INDEX_FIELD,
+                                      INDICES_TABLE_FIELD, PPI_COLUMN)
 from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
                                                      split_apps_topx_with_provider_groups)
 
@@ -36,12 +36,13 @@ class PyTablesDataset(Dataset):
                  flowstats_features_phist: list[str],
                  other_fields: list[str],
                  ppi_channels: list[int],
+                 sni_column: Optional[str] = None,
                  ppi_transform: Optional[Callable] = None,
                  flowstats_transform: Optional[Callable] = None,
                  flowstats_phist_transform: Optional[Callable] = None,
                  target_transform: Optional[Callable] = None,
                  return_tensors: bool = False,
-
+                 return_raw_fields: bool = False,
                  preload: bool = False,
                  preload_blob: Optional[str] = None,
                  disabled_apps: Optional[list[str]] = None,):
@@ -60,14 +61,14 @@ class PyTablesDataset(Dataset):
         self.flowstats_features_boolean = flowstats_features_boolean
         self.flowstats_features_phist = flowstats_features_phist
         self.other_fields = other_fields
+        self.sni_column = sni_column
         self.ppi_channels = ppi_channels
         self.ppi_transform = ppi_transform
         self.flowstats_transform = flowstats_transform
         self.flowstats_phist_transform = flowstats_phist_transform
         self.target_transform = target_transform
         self.return_tensors = return_tensors
-        self.
-        self.sni_column = TLS_SNI_COLUMN if TLS_SNI_COLUMN in self.other_fields else QUIC_SNI_COLUMN if QUIC_SNI_COLUMN in self.other_fields else None
+        self.return_raw_fields = return_raw_fields
 
         self.preload = preload
         self.preload_blob = preload_blob
@@ -78,7 +79,7 @@ class PyTablesDataset(Dataset):
             batch_data = self.data[batch_idx]
         else:
             batch_data = load_data_from_tables(tables=self.tables, indices=self.indices[batch_idx], data_dtype=self.data_dtype)
-        if self.
+        if self.return_raw_fields:
             return (batch_data, batch_idx)
 
         # Prepare data
@@ -173,16 +174,21 @@ def worker_init_fn(worker_id):
     dataset = worker_info.dataset
     dataset.pytables_worker_init(worker_id)
 
-def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
+def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], sni_column: Optional[str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
     database, train_tables = load_database(database_path, tables_paths=train_data_params.train_tables_paths)
     inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
-
+    all_labels = {}
+    all_sni_domains = {}
     app_counts = pd.Series(dtype="int64")
     start_time = time.time()
     for i, table_path in enumerate(train_data_params.train_tables_paths):
-
-
-
+        all_labels[i] = train_tables[i].read(field=APP_COLUMN)
+        if sni_column is not None:
+            all_sni_domains[i] = train_tables[i].read(field=sni_column)
+        else:
+            all_sni_domains[i] = np.full_like(all_labels[i], "", dtype="U1")
+        log.info(f"Reading app and SNI columns for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+        app_counts = app_counts.add(pd.Series(all_labels[i]).value_counts(), fill_value=0)
     database.close()
     # Handle disabled apps and apps with less than min_samples_per_app samples
     if len(train_data_params.disabled_apps) > 0:
@@ -202,8 +208,9 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     # Base indices are indices of samples that are not disabled and have enough samples
     base_indices = {}
     for i, table_path in enumerate(train_data_params.train_tables_paths):
-        base_indices[i] = np.nonzero(np.isin(
-    base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in
+        base_indices[i] = np.nonzero(np.isin(all_labels[i], disabled_apps_ids, invert=True))[0]
+    base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in all_labels.items()}
+    base_sni_domains = {table_id: arr[base_indices[table_id]] for table_id, arr in all_sni_domains.items()}
     # Apps selection
     if train_data_params.apps_selection != AppSelection.FIXED:
         app_counts = app_counts[[app for app in app_counts.index.tolist() if app not in disabled_apps_ids]]
@@ -230,26 +237,38 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
     unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]
 
-    train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices,
+    train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices,
+                                                                      base_labels=base_labels,
+                                                                      base_sni_domains=base_sni_domains,
+                                                                      known_apps_ids=known_apps_ids,
+                                                                      unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(train_known_indices)
     rng.shuffle(train_unknown_indices)
-    log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
     return train_known_indices, train_unknown_indices, known_apps, unknown_apps
 
-def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
+def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], sni_column: Optional[str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
     database, test_tables = load_database(database_path, tables_paths=test_data_params.test_tables_paths)
     inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
     base_labels = {}
+    base_sni_domains = {}
     base_indices = {}
     start_time = time.time()
     for i, table_path in enumerate(test_data_params.test_tables_paths):
         base_labels[i] = test_tables[i].read(field=APP_COLUMN)
-
+        if sni_column is not None:
+            base_sni_domains[i] = test_tables[i].read(field=sni_column)
+        else:
+            base_sni_domains[i] = np.full_like(base_labels[i], "", dtype="U1")
+        log.info(f"Reading app and SNI columns for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
         base_indices[i] = np.arange(len(test_tables[i]))
     database.close()
     known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
     unknown_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.unknown_apps]
-    test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices,
+    test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices,
+                                                                    base_labels=base_labels,
+                                                                    base_sni_domains=base_sni_domains,
+                                                                    known_apps_ids=known_apps_ids,
+                                                                    unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(test_known_indices)
     rng.shuffle(test_unknown_indices)
     log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
@@ -271,28 +290,32 @@ def list_all_tables(database_path: str) -> list[str]:
     with tb.open_file(database_path, mode="r") as database:
         return list(map(lambda x: x._v_pathname, iter(database.get_node(f"/flows"))))
 
-def convert_dict_indices(base_indices: dict[int, np.ndarray], base_labels: dict[int, np.ndarray], known_apps_ids: list[int], unknown_apps_ids: list[int]) -> tuple[np.ndarray, np.ndarray]:
+def convert_dict_indices(base_indices: dict[int, np.ndarray], base_labels: dict[int, np.ndarray], base_sni_domains: dict[int, np.ndarray], known_apps_ids: list[int], unknown_apps_ids: list[int]) -> tuple[np.ndarray, np.ndarray]:
     is_known = {table_id: np.isin(table_arr, known_apps_ids) for table_id, table_arr in base_labels.items()}
     is_unknown = {table_id: np.isin(table_arr, unknown_apps_ids) for table_id, table_arr in base_labels.items()}
     known_indices_dict = {table_id: table_arr[is_known[table_id]] for table_id, table_arr in base_indices.items()}
     unknown_indices_dict = {table_id: table_arr[is_unknown[table_id]] for table_id, table_arr in base_indices.items()}
     known_labels_dict = {table_id: table_arr[is_known[table_id]] for table_id, table_arr in base_labels.items()}
     unknown_labels_dict = {table_id: table_arr[is_unknown[table_id]] for table_id, table_arr in base_labels.items()}
-
+    known_sni_domains_dict = {table_id: table_arr[is_known[table_id]] for table_id, table_arr in base_sni_domains.items()}
+    unknown_sni_domains_dict = {table_id: table_arr[is_unknown[table_id]] for table_id, table_arr in base_sni_domains.items()}
+    known_indices = np.array(list(zip(
         np.concatenate([[table_id] * table_arr.sum() for table_id, table_arr in is_known.items()]),
         np.concatenate(list(known_indices_dict.values())),
-        np.concatenate(list(known_labels_dict.values()))
-
+        np.concatenate(list(known_labels_dict.values())),
+        np.concatenate(list(known_sni_domains_dict.values())))), dtype=INDICES_DTYPE)
+    unknown_indices = np.array(list(zip(
         np.concatenate([[table_id] * table_arr.sum() for table_id, table_arr in is_unknown.items()]),
         np.concatenate(list(unknown_indices_dict.values())),
-        np.concatenate(list(unknown_labels_dict.values()))
+        np.concatenate(list(unknown_labels_dict.values())),
+        np.concatenate(list(unknown_sni_domains_dict.values())))), dtype=INDICES_DTYPE)
     return known_indices, unknown_indices
 
 def load_data_from_tables(tables, indices: np.ndarray, data_dtype: np.dtype) -> np.ndarray:
-    sorted_indices = indices[indices[
-    unique_tables, split_bounderies = np.unique(sorted_indices[
+    sorted_indices = indices[indices[INDICES_TABLE_FIELD].argsort(kind="stable")]
+    unique_tables, split_bounderies = np.unique(sorted_indices[INDICES_TABLE_FIELD], return_index=True)
     indices_per_table = np.split(sorted_indices, split_bounderies[1:])
     data = np.zeros(len(indices), dtype=data_dtype)
     for table_id, table_indices in zip(unique_tables, indices_per_table):
-        data[np.where(indices[
+        data[np.where(indices[INDICES_TABLE_FIELD] == table_id)[0]] = tables[table_id].read_coordinates(table_indices[INDICES_INDEX_FIELD])
     return data
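A small sketch of the per-table grouping strategy in load_data_from_tables above, using plain numpy arrays in place of PyTables tables (toy data; field names follow the new constants):

import numpy as np

INDICES_DTYPE = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]
indices = np.array([(1, 0, 0, ""), (0, 2, 0, ""), (1, 1, 0, ""), (0, 0, 0, "")], dtype=INDICES_DTYPE)

# Group requested rows by table: stable-sort by table id, then split at the first
# occurrence of each table id, so each chunk holds one table's row coordinates.
sorted_indices = indices[indices["TABLE"].argsort(kind="stable")]
unique_tables, split_boundaries = np.unique(sorted_indices["TABLE"], return_index=True)
indices_per_table = np.split(sorted_indices, split_boundaries[1:])
for table_id, table_indices in zip(unique_tables, indices_per_table):
    print(table_id, table_indices["INDEX"])   # 0 [2 0]  then  1 [0 1]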
{cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.1.3
+Version: 0.1.4
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
{cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/RECORD
CHANGED
@@ -1,30 +1,30 @@
 cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/config.py,sha256=
-cesnet_datazoo/constants.py,sha256=
+cesnet_datazoo/config.py,sha256=x8bugBZmBZ9PNd0D5TNHLPHbvx4ZTCQGwQzXPypenjc,38406
+cesnet_datazoo/constants.py,sha256=6GhcIyjVnWYrVnxRgTlGuiWRtvwZL1KqyzMJS26ge2E,1481
 cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
-cesnet_datazoo/datasets/cesnet_dataset.py,sha256=
+cesnet_datazoo/datasets/cesnet_dataset.py,sha256=DtC597aRm4n8rlbVknG25yd9rsCqRG94jn7xMxZoC1g,46635
 cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
 cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
 cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
-cesnet_datazoo/datasets/statistics.py,sha256=
+cesnet_datazoo/datasets/statistics.py,sha256=DfeCq-o7ML8u2Wg_AlAaarEBZ5oulCJz4S7enGswXRg,15137
 cesnet_datazoo/datasets/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=
+cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=o0rHXZ9S5NjjboGiXRQkBoZ7kYKlweQMRsMSAQm1EPE,1623
 cesnet_datazoo/datasets/metadata/metadata.csv,sha256=lG1Wz7Rr66pG2hWnMqoERIN_oX53DpAmlRZLw3T2p34,2175
 cesnet_datazoo/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/metrics/classification_report.py,sha256=
-cesnet_datazoo/metrics/provider_metrics.py,sha256=
+cesnet_datazoo/metrics/classification_report.py,sha256=kqVW35uEctTiWpMqxhWzOmmDkV4p3yEFLMRqLn_R6AU,3981
+cesnet_datazoo/metrics/provider_metrics.py,sha256=zoX0ps8BzEs3ml70g9dWWeLPflNAKUGYOEFYqdBbNY4,1374
 cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
 cesnet_datazoo/pytables_data/data_scalers.py,sha256=ednTRVl-sjrFLX6vwzCuPLJDpFuwNWDlJz7msV3yM9M,5083
-cesnet_datazoo/pytables_data/indices_setup.py,sha256=
-cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=
+cesnet_datazoo/pytables_data/indices_setup.py,sha256=M5J2BevkQK8fuC22vUauKyKAEVwYg8xRz9JJK8E1VX8,13717
+cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=NkN0PKKUIiAhrGFM9OCR0s48TH66JLzZLiaIOE6d7AE,19413
 cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cesnet_datazoo/utils/class_info.py,sha256=H5UgyRqXIepBJmkLQ1gAIXV4owKSoIllguRiqFTu5XU,2462
 cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
 cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
 cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
+cesnet_datazoo-0.1.4.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
+cesnet_datazoo-0.1.4.dist-info/METADATA,sha256=8HeN2mch9VTCpeBr66ZgwrdJyrS53y4INsiU0Rhgcts,12964
+cesnet_datazoo-0.1.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+cesnet_datazoo-0.1.4.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
+cesnet_datazoo-0.1.4.dist-info/RECORD,,
{cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/LICENCE
File without changes
{cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/WHEEL
File without changes
{cesnet_datazoo-0.1.3.dist-info → cesnet_datazoo-0.1.4.dist-info}/top_level.txt
File without changes