cesnet-datazoo 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/PKG-INFO +1 -1
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/config.py +6 -2
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/constants.py +5 -3
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/cesnet_dataset.py +11 -6
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +1 -1
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/statistics.py +1 -1
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/metrics/classification_report.py +12 -13
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/metrics/provider_metrics.py +6 -6
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/data_scalers.py +13 -11
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/indices_setup.py +29 -18
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/pytables_dataset.py +51 -30
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/PKG-INFO +1 -1
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/pyproject.toml +1 -1
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/LICENCE +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/README.md +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/__init__.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/__init__.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/datasets.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/datasets_constants.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/loaders.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/metadata/metadata.csv +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/metrics/__init__.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/__init__.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/__init__.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/class_info.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/download.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/fileutils.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/random.py +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/SOURCES.txt +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/requires.txt +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/top_level.txt +0 -0
- {cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/setup.cfg +0 -0
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.1.3
+Version: 0.1.5
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/config.py
RENAMED
@@ -14,8 +14,9 @@ import yaml
 from pydantic import model_validator
 from pydantic.dataclasses import dataclass
 
-from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN,
-                                      TCP_PPI_CHANNELS,
+from cesnet_datazoo.constants import (PHIST_BIN_COUNT, PPI_MAX_LEN, QUIC_SNI_COLUMN,
+                                      SELECTED_TCP_FLAGS, TCP_PPI_CHANNELS, TLS_SNI_COLUMN,
+                                      UDP_PPI_CHANNELS)
 
 if TYPE_CHECKING:
     from cesnet_datazoo.datasets.cesnet_dataset import CesnetDataset
@@ -128,6 +129,7 @@ class DatasetConfig():
         flowstats_features_boolean: Taken from `dataset.metadata.flowstats_features_boolean`.
         flowstats_features_phist: Taken from `dataset.metadata.packet_histograms` if `use_packet_histograms` is true, otherwise an empty list.
         other_fields: Taken from `dataset.metadata.other_fields` if `return_other_fields` is true, otherwise an empty list.
+        sni_column: Database column with SNI domains, can be None for datasets without SNI domains.
 
     # Configuration options
@@ -343,6 +345,8 @@ class DatasetConfig():
         # Configure features
         self.flowstats_features = dataset.metadata.flowstats_features
         self.flowstats_features_boolean = dataset.metadata.flowstats_features_boolean
+        sni_column = TLS_SNI_COLUMN if dataset.metadata.protocol == Protocol.TLS else QUIC_SNI_COLUMN
+        self.sni_column = sni_column if sni_column in dataset.metadata.other_fields else None
        self.other_fields = dataset.metadata.other_fields if self.return_other_fields else []
         if self.use_packet_histograms:
             if len(dataset.metadata.packet_histograms) == 0:
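The sni_column resolution added above is small but easy to misread: the protocol picks a candidate column, and the column is only kept if the dataset actually exposes it. A minimal standalone sketch of that behavior; the constant values here are assumed for illustration only, the real ones live in cesnet_datazoo.constants:

    # Assumed values for illustration; not the package's actual constants
    TLS_SNI_COLUMN = "TLS_SNI"
    QUIC_SNI_COLUMN = "QUIC_SNI"

    def resolve_sni_column(protocol: str, other_fields: list[str]):
        # Mirrors the added config logic: pick the protocol's SNI column,
        # but only keep it if the dataset exposes it in other_fields
        sni_column = TLS_SNI_COLUMN if protocol == "TLS" else QUIC_SNI_COLUMN
        return sni_column if sni_column in other_fields else None

    print(resolve_sni_column("TLS", ["TLS_SNI", "FLOW_ID"]))  # TLS_SNI
    print(resolve_sni_column("QUIC", ["FLOW_ID"]))            # None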
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/constants.py
RENAMED
@@ -39,6 +39,8 @@ DEFAULT_BACKGROUND_CLASS = "default-background"
 GOOGLE_BACKGROUND_CLASS = "google-background"
 
 # Indices
-
-
-
+INDICES_TABLE_FIELD = "TABLE"
+INDICES_INDEX_FIELD = "INDEX"
+INDICES_APP_FIELD = "APP"
+INDICES_SNI_FIELD = "SNI"
+INDICES_DTYPE = [(INDICES_TABLE_FIELD, "int32"), (INDICES_INDEX_FIELD, "int32"), (INDICES_APP_FIELD, "int32"), (INDICES_SNI_FIELD, "U50")]
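These structured-field constants replace bare integer column positions, so index rows now carry named fields, including the new SNI field. A small standalone sketch of how an array with this dtype behaves, with toy values:

    import numpy as np

    # Same dtype as INDICES_DTYPE above
    dtype = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]
    indices = np.array([(0, 12, 3, "example.org"),
                        (0, 57, 5, "cdn.example.com"),
                        (1, 4, 3, "example.org")], dtype=dtype)

    print(indices["APP"])                  # [3 5 3] -- column access by field name
    print(indices[indices["TABLE"] == 0])  # boolean masks select whole rows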
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/cesnet_dataset.py
RENAMED
@@ -10,14 +10,16 @@ import numpy as np
 import pandas as pd
 import tables as tb
 import torch
+from numpy.lib.recfunctions import repack_fields
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, Sampler, SequentialSampler
 from typing_extensions import assert_never
 
 from cesnet_datazoo.config import AppSelection, DataLoaderOrder, DatasetConfig, ValidationApproach
-from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES,
-                                      SERVICEMAP_FILE,
+from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES, INDICES_APP_FIELD,
+                                      INDICES_INDEX_FIELD, INDICES_TABLE_FIELD, SERVICEMAP_FILE,
+                                      UNKNOWN_STR_LABEL)
 from cesnet_datazoo.datasets.loaders import collate_fn_simple, create_df_from_dataloader
 from cesnet_datazoo.datasets.metadata.dataset_metadata import DatasetMetadata, load_metadata
 from cesnet_datazoo.datasets.statistics import compute_dataset_statistics
@@ -555,7 +557,7 @@ class CesnetDataset():
         train_val_rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.TRAIN_VAL_SPLIT)
         val_data_path = dataset_config._get_train_data_path()
         val_unknown_indices = train_unknown_indices
-        train_labels = train_indices[
+        train_labels = train_indices[INDICES_APP_FIELD]
         if dataset_config.train_dates_weigths is not None:
             assert dataset_config.val_known_size != "all"
             # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to not enough samples in some train dates
@@ -619,13 +621,14 @@ class CesnetDataset():
         train_dataset = PyTablesDataset(
             database_path=dataset_config.database_path,
             tables_paths=dataset_config._get_train_tables_paths(),
-            indices=dataset_indices.train_indices,
+            indices=repack_fields(dataset_indices.train_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
             tables_app_enum=self._tables_app_enum,
             tables_cat_enum=self._tables_cat_enum,
             flowstats_features=dataset_config.flowstats_features,
             flowstats_features_boolean=dataset_config.flowstats_features_boolean,
             flowstats_features_phist=dataset_config.flowstats_features_phist,
             other_fields=self.dataset_config.other_fields,
+            sni_column=self.dataset_config.sni_column,
             ppi_channels=dataset_config.get_ppi_channels(),
             ppi_transform=dataset_config.ppi_transform,
             flowstats_transform=dataset_config.flowstats_transform,
@@ -637,13 +640,14 @@ class CesnetDataset():
         val_dataset = PyTablesDataset(
             database_path=dataset_config.database_path,
             tables_paths=dataset_config._get_val_tables_paths(),
-            indices=dataset_indices.val_known_indices,
+            indices=repack_fields(dataset_indices.val_known_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
             tables_app_enum=self._tables_app_enum,
             tables_cat_enum=self._tables_cat_enum,
             flowstats_features=dataset_config.flowstats_features,
             flowstats_features_boolean=dataset_config.flowstats_features_boolean,
             flowstats_features_phist=dataset_config.flowstats_features_phist,
             other_fields=self.dataset_config.other_fields,
+            sni_column=self.dataset_config.sni_column,
             ppi_channels=dataset_config.get_ppi_channels(),
             ppi_transform=dataset_config.ppi_transform,
             flowstats_transform=dataset_config.flowstats_transform,
@@ -657,13 +661,14 @@ class CesnetDataset():
         test_dataset = PyTablesDataset(
             database_path=dataset_config.database_path,
             tables_paths=dataset_config._get_test_tables_paths(),
-            indices=test_combined_indices,
+            indices=repack_fields(test_combined_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
             tables_app_enum=self._tables_app_enum,
             tables_cat_enum=self._tables_cat_enum,
             flowstats_features=dataset_config.flowstats_features,
             flowstats_features_boolean=dataset_config.flowstats_features_boolean,
             flowstats_features_phist=dataset_config.flowstats_features_phist,
             other_fields=self.dataset_config.other_fields,
+            sni_column=self.dataset_config.sni_column,
             ppi_channels=dataset_config.get_ppi_channels(),
             ppi_transform=dataset_config.ppi_transform,
             flowstats_transform=dataset_config.flowstats_transform,
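All three PyTablesDataset constructions now select only the TABLE and INDEX fields of the structured indices array and wrap the result in repack_fields. Since numpy 1.16, multi-field selection returns a view that keeps the parent record's itemsize and field offsets; repack_fields copies the view into a packed two-field layout, presumably why the diff applies it before handing the indices over. A minimal sketch of the difference:

    import numpy as np
    from numpy.lib.recfunctions import repack_fields

    dtype = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]
    indices = np.zeros(3, dtype=dtype)

    view = indices[["TABLE", "INDEX"]]
    print(view.dtype.itemsize)    # 212 -- still the full record size (4 + 4 + 4 + 200 bytes)
    packed = repack_fields(view)
    print(packed.dtype.itemsize)  # 8 -- two contiguous int32 fields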
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/metadata/dataset_metadata.py
RENAMED
@@ -39,5 +39,5 @@ class DatasetMetadata():
 metadata_df = pd.read_csv(os.path.join(os.path.dirname(__file__), "metadata.csv"), index_col="Name", keep_default_na=False)
 def load_metadata(dataset_name: str) -> DatasetMetadata:
     d = metadata_df.loc[dataset_name].to_dict()
-    d = {k.replace(" ", "_").lower(): v for k, v in d.items()}
+    d = {k.replace(" ", "_").lower(): v for k, v in d.items()} # type: ignore
     return DatasetMetadata(**d)
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/statistics.py
RENAMED
@@ -92,7 +92,7 @@ def compute_dataset_statistics(database_path: str,
         tables_paths=table_paths,
         indices=None,
         disabled_apps=disabled_apps,
-
+        return_raw_fields=True,
         flowstats_features=[],
         flowstats_features_boolean=[],
         flowstats_features_phist=[],
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/metrics/classification_report.py
RENAMED
@@ -1,13 +1,12 @@
 import numpy as np
 from sklearn.metrics import accuracy_score, precision_recall_fscore_support
 
-from cesnet_datazoo.metrics.provider_metrics import
-                                                    provider_accuracies)
+from cesnet_datazoo.metrics.provider_metrics import per_app_provider_metrics, provider_accuracies
 from cesnet_datazoo.utils.class_info import ClassInfo
 
 
-def better_classification_report(
-    p, r, f1, s = precision_recall_fscore_support(
+def better_classification_report(test_labels: np.ndarray, preds: np.ndarray, cm: np.ndarray, labels: list[int], class_info: ClassInfo, digits: int = 2, zero_division: int = 0) -> tuple[str, dict[str, float]]:
+    p, r, f1, s = precision_recall_fscore_support(test_labels, preds,
                                                   labels=labels,
                                                   zero_division=zero_division)
     sc_p, sc_r, sc_f1 = per_app_provider_metrics(cm, class_info=class_info)
@@ -46,20 +45,20 @@ def better_classification_report(y_true: np.ndarray, y_pred: np.ndarray, cm: np.
     report += headers_fmt.format("", *headers_avg, width=width)
     report += row_fmt_avg.format("macro avg", *row_avg, width=width, digits=digits)
 
-    acc = accuracy_score(
-    provider_acc, failed_provider_acc = provider_accuracies(
+    acc = accuracy_score(test_labels, preds)
+    provider_acc, failed_provider_acc = provider_accuracies(test_labels, preds, class_info=class_info)
 
     row_fmt_acc = "{:>{width}} {:>15} {:>15} {:>7.{digits}f}\n"
     report += row_fmt_acc.format("acc", "", "", acc, width=width, digits=digits)
     report += row_fmt_acc.format("provider acc", "", "", provider_acc, width=width, digits=digits)
     report += row_fmt_acc.format("failed provider acc", "", "", failed_provider_acc, width=width, digits=digits)
     metrics = {
-        "
-        "
-        "
-        "
-        "
-        "
-        "
+        "test/acc": acc,
+        "test/provider-acc": provider_acc,
+        "test/failed-provider-acc": failed_provider_acc,
+        "test/fscore": avg_f1,
+        "test/provider-fscore": avg_sc_f1,
+        "test/recall": avg_r,
+        "test/provider-recall": avg_sc_r,
     }
     return report, metrics
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/metrics/provider_metrics.py
RENAMED
@@ -3,13 +3,13 @@ import numpy as np
 from cesnet_datazoo.utils.class_info import ClassInfo
 
 
-def provider_accuracies(
+def provider_accuracies(true_labels: np.ndarray, preds: np.ndarray, class_info: ClassInfo) -> tuple[float, float]:
     provider_mapping_arr = np.array(list(class_info.provider_mapping.values()))
-
-
-    mistakes =
-    provider_acc = (
-    failed_provider_acc = (
+    true_labels_provider = provider_mapping_arr[true_labels]
+    preds_provider = provider_mapping_arr[preds]
+    mistakes = true_labels != preds
+    provider_acc = (true_labels_provider == preds_provider).sum() / len(true_labels_provider)
+    failed_provider_acc = (true_labels_provider[mistakes] == preds_provider[mistakes]).sum() / mistakes.sum()
     return provider_acc, failed_provider_acc
 
 def per_app_provider_metrics(cm, class_info: ClassInfo):
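The filled-in provider_accuracies maps app labels to provider labels and reports (a) how often the predicted provider is right overall and (b) how often it is still right among misclassified flows. A self-contained toy rerun of the same logic, with a hypothetical mapping and .mean() standing in for the .sum()/len division above:

    import numpy as np

    provider_mapping_arr = np.array([0, 0, 1])  # apps 0 and 1 -> provider 0, app 2 -> provider 1
    true_labels = np.array([0, 1, 2, 2])
    preds       = np.array([1, 1, 2, 0])        # first flow confuses two apps of one provider

    true_provider = provider_mapping_arr[true_labels]   # [0 0 1 1]
    pred_provider = provider_mapping_arr[preds]         # [0 0 1 0]
    mistakes = true_labels != preds

    print((true_provider == pred_provider).mean())                      # 0.75
    print((true_provider[mistakes] == pred_provider[mistakes]).mean())  # 0.5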
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/data_scalers.py
RENAMED
@@ -46,18 +46,20 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non
     data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
     padding_mask = data_ppi[:, DIR_POS] == 0 # Mask of padded packets
     # Fit IPT scaler
-
-
-
-
-
-
+    if clip_and_scale_ppi_transform.ipt_scaler:
+        train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
+        train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
+        if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
+            # Let zero be the minimum for minmax scaling
+            train_ipt = np.concatenate((train_ipt, [0]))
+        clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
     # Fit packet sizes scaler
-
-
-
-
-
+    if clip_and_scale_ppi_transform.psizes_scaler:
+        train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
+        train_psizes[padding_mask] = np.nan
+        if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
+            train_psizes = np.concatenate((train_psizes, [0]))
+        clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
     clip_and_scale_ppi_transform.needs_fitting = False
     json.dump(clip_and_scale_ppi_transform.to_dict(), open(os.path.join(train_data_path, "transforms", "ppi-transform.json"), "w"), indent=4)
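Two details of the restored scaler-fitting code are worth spelling out: padded packets are turned into NaN because scikit-learn scalers (0.20 and later) disregard NaNs when fitting, and a literal 0 is appended before fitting a MinMaxScaler so that zero becomes the observed minimum. A small demonstration of both:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    values = np.array([3.0, 7.0, np.nan, 5.0])   # NaN = padded packet, ignored by fit
    values = np.concatenate((values, [0]))       # pin the minimum at zero
    scaler = MinMaxScaler().fit(values.reshape(-1, 1))
    print(scaler.data_min_, scaler.data_max_)    # [0.] [7.]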
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/indices_setup.py
RENAMED
@@ -1,6 +1,7 @@
 import dataclasses
 import logging
 import os
+import time
 import warnings
 from collections import namedtuple
 from enum import Enum
@@ -9,7 +10,8 @@ import numpy as np
 import pandas as pd
 
 from cesnet_datazoo.config import DatasetConfig
-from cesnet_datazoo.constants import
+from cesnet_datazoo.constants import (INDICES_APP_FIELD, INDICES_DTYPE, INDICES_INDEX_FIELD,
+                                      INDICES_TABLE_FIELD)
 from cesnet_datazoo.pytables_data.pytables_dataset import init_test_indices, init_train_indices
 from cesnet_datazoo.utils.fileutils import yaml_dump, yaml_load
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
@@ -21,8 +23,8 @@ IndicesTuple = namedtuple("IndicesTuple", ["train_indices", "val_known_indices",
 
 
 def sort_indices(indices: np.ndarray) -> np.ndarray:
-    idxs = np.argsort(indices[
-    res = idxs[np.argsort(indices[idxs
+    idxs = np.argsort(indices[INDICES_INDEX_FIELD])
+    res = idxs[np.argsort(indices[idxs][INDICES_TABLE_FIELD], kind="stable")]
     return indices[res]
 
 def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: IndicesTuple) -> IndicesTuple:
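The restored sort_indices is a two-key sort: by INDEX first, then by TABLE, where the second argsort must be stable (kind="stable") so that the INDEX ordering from the first pass survives within each table. A toy check:

    import numpy as np

    dtype = [("TABLE", "int32"), ("INDEX", "int32")]
    indices = np.array([(1, 5), (0, 9), (1, 2), (0, 3)], dtype=dtype)

    idxs = np.argsort(indices["INDEX"])                            # sort by INDEX
    res = idxs[np.argsort(indices[idxs]["TABLE"], kind="stable")]  # then by TABLE, stably
    print(indices[res])  # [(0, 3) (0, 9) (1, 2) (1, 5)]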
@@ -61,7 +63,7 @@ def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: Indi
 
 def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indices: np.ndarray, num_samples: int) -> np.ndarray:
     rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.DATE_WEIGHT_SAMPLING)
-    indices_per_date = [train_indices[train_indices[
+    indices_per_date = [train_indices[train_indices[INDICES_TABLE_FIELD] == i] for i in np.unique(train_indices[INDICES_TABLE_FIELD])]
     weights = np.array(dataset_config.train_dates_weigths)
     weights = weights / weights.sum()
     samples_per_date = np.ceil((weights * (num_samples))).astype(int)
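Note that the per-date sample counts are computed with a ceiling after normalizing the weights, so they can add up to slightly more than num_samples; per the comment in cesnet_dataset.py above, val_known_size is kept and the train size is allowed to shrink instead. A quick arithmetic check with assumed weights:

    import numpy as np

    weights = np.array([2, 1, 1])  # assumed train_dates_weigths, for illustration
    weights = weights / weights.sum()
    samples_per_date = np.ceil(weights * 999).astype(int)
    print(samples_per_date, samples_per_date.sum())  # [500 250 250] 1000 (> 999)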
@@ -77,7 +79,7 @@ def date_weight_sample_train_indice
     return sampled_train_indices
 
 def indices_to_app_counts(indices: np.ndarray, tables_app_enum: dict[int, str]) -> pd.Series:
-    app_counts = pd.Series(indices[
+    app_counts = pd.Series(indices[INDICES_APP_FIELD]).value_counts()
     app_counts.index = app_counts.index.map(lambda x: tables_app_enum[x])
     return app_counts
@@ -99,23 +101,26 @@ def init_or_load_train_indices(dataset_config: DatasetConfig, tables_app_enum: d
     init_train_data(train_data_path)
     if not os.path.isfile(os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE)):
         log.info("Processing train indices")
+        start_time = time.time()
         train_data_params = dataset_config._get_train_data_params()
         train_known_indices, train_unknown_indices, known_apps, unknown_apps = init_train_indices(train_data_params=train_data_params,
                                                                                                   database_path=dataset_config.database_path,
                                                                                                   tables_app_enum=tables_app_enum,
+                                                                                                  sni_column=dataset_config.sni_column,
                                                                                                   servicemap=servicemap,
                                                                                                   rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TRAIN_INDICES))
         if not disable_indices_cache:
             yaml_dump({k: str(v) if isinstance(v, Enum) else list(v) if isinstance(v, tuple) else v for k, v in dataclasses.asdict(train_data_params).items()}, os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE))
             yaml_dump(known_apps, os.path.join(train_data_path, "known_apps.yaml"))
             yaml_dump(unknown_apps, os.path.join(train_data_path, "unknown_apps.yaml"))
-            np.
-
+            np.savez_compressed(os.path.join(train_data_path, "train_indices.npz"), train_known_indices=train_known_indices, train_unknown_indices=train_unknown_indices)
+        log.info(f"Processing indices took {time.time() - start_time:.2f} seconds")
     else:
         known_apps = yaml_load(os.path.join(train_data_path, "known_apps.yaml"))
         unknown_apps = yaml_load(os.path.join(train_data_path, "unknown_apps.yaml"))
-
-
+        loaded = np.load(os.path.join(train_data_path, "train_indices.npz"))
+        train_known_indices = loaded["train_known_indices"]
+        train_unknown_indices = loaded["train_unknown_indices"]
     return train_known_indices, train_unknown_indices, known_apps, unknown_apps
 
 def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
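The index caches are now written with np.savez_compressed and read back with np.load. Structured dtypes round-trip through .npz unchanged, so the named index fields survive the cache. A minimal round-trip sketch:

    import os, tempfile
    import numpy as np

    dtype = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]
    known, unknown = np.zeros(4, dtype=dtype), np.zeros(2, dtype=dtype)

    path = os.path.join(tempfile.mkdtemp(), "train_indices.npz")
    np.savez_compressed(path, train_known_indices=known, train_unknown_indices=unknown)
    loaded = np.load(path)
    assert loaded["train_known_indices"].dtype == np.dtype(dtype)  # named fields preserved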
@@ -123,17 +128,20 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str
     init_test_data(val_data_path)
     if not os.path.isfile(os.path.join(val_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing validation indices")
+        start_time = time.time()
         val_known_indices, val_unknown_indices = init_test_indices(test_data_params=val_data_params,
                                                                    database_path=dataset_config.database_path,
                                                                    tables_app_enum=tables_app_enum,
+                                                                   sni_column=dataset_config.sni_column,
                                                                    rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_VAL_INIDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(val_data_params), os.path.join(val_data_path, TEST_DATA_PARAMS_FILE))
-            np.
-
+            np.savez_compressed(os.path.join(val_data_path, "val_indices.npz"), val_known_indices=val_known_indices, val_unknown_indices=val_unknown_indices)
+        log.info(f"Processing indices took {time.time() - start_time:.2f} seconds")
     else:
-
-
+        loaded = np.load(os.path.join(val_data_path, "val_indices.npz"))
+        val_known_indices = loaded["val_known_indices"]
+        val_unknown_indices = loaded["val_unknown_indices"]
     return val_known_indices, val_unknown_indices, val_data_path
 
 def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
@@ -141,17 +149,20 @@ def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[st
     init_test_data(test_data_path)
     if not os.path.isfile(os.path.join(test_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing test indices")
+        start_time = time.time()
         test_known_indices, test_unknown_indices = init_test_indices(test_data_params=test_data_params,
                                                                      database_path=dataset_config.database_path,
                                                                      tables_app_enum=tables_app_enum,
+                                                                     sni_column=dataset_config.sni_column,
                                                                      rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TEST_INDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(test_data_params), os.path.join(test_data_path, TEST_DATA_PARAMS_FILE))
-            np.
-
+            np.savez_compressed(os.path.join(test_data_path, "test_indices.npz"), test_known_indices=test_known_indices, test_unknown_indices=test_unknown_indices)
+        log.info(f"Processing indices took {time.time() - start_time:.2f} seconds")
     else:
-
-
+        loaded = np.load(os.path.join(test_data_path, "test_indices.npz"))
+        test_known_indices = loaded["test_known_indices"]
+        test_unknown_indices = loaded["test_unknown_indices"]
     return test_known_indices, test_unknown_indices, test_data_path
 
 def init_train_data(train_data_path: str):
@@ -164,4 +175,4 @@ def init_test_data(test_data_path: str):
     os.makedirs(os.path.join(test_data_path, "preload"), exist_ok=True)
 
 def no_indices() -> np.ndarray:
-    return np.
+    return np.empty(shape=(0,), dtype=INDICES_DTYPE)
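no_indices now returns an empty array that already carries the full structured dtype, so empty splits concatenate cleanly with real index arrays. For illustration:

    import numpy as np

    dtype = [("TABLE", "int32"), ("INDEX", "int32"), ("APP", "int32"), ("SNI", "U50")]
    empty = np.empty(shape=(0,), dtype=dtype)
    print(len(empty), empty["APP"].shape)                           # 0 (0,)
    print(len(np.concatenate([empty, np.zeros(2, dtype=dtype)])))   # 2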
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/pytables_dataset.py
RENAMED
@@ -16,8 +16,8 @@ from typing_extensions import assert_never
 
 from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
                                    TrainDataParams)
-from cesnet_datazoo.constants import (APP_COLUMN,
-
+from cesnet_datazoo.constants import (APP_COLUMN, INDICES_DTYPE, INDICES_INDEX_FIELD,
+                                      INDICES_TABLE_FIELD, PPI_COLUMN)
 from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
                                                      split_apps_topx_with_provider_groups)
@@ -36,12 +36,13 @@ class PyTablesDataset(Dataset):
                  flowstats_features_phist: list[str],
                  other_fields: list[str],
                  ppi_channels: list[int],
+                 sni_column: Optional[str] = None,
                  ppi_transform: Optional[Callable] = None,
                  flowstats_transform: Optional[Callable] = None,
                  flowstats_phist_transform: Optional[Callable] = None,
                  target_transform: Optional[Callable] = None,
                  return_tensors: bool = False,
-
+                 return_raw_fields: bool = False,
                  preload: bool = False,
                  preload_blob: Optional[str] = None,
                  disabled_apps: Optional[list[str]] = None,):
@@ -60,14 +61,14 @@ class PyTablesDataset(Dataset):
         self.flowstats_features_boolean = flowstats_features_boolean
         self.flowstats_features_phist = flowstats_features_phist
         self.other_fields = other_fields
+        self.sni_column = sni_column
         self.ppi_channels = ppi_channels
         self.ppi_transform = ppi_transform
         self.flowstats_transform = flowstats_transform
         self.flowstats_phist_transform = flowstats_phist_transform
         self.target_transform = target_transform
         self.return_tensors = return_tensors
-        self.
-        self.sni_column = TLS_SNI_COLUMN if TLS_SNI_COLUMN in self.other_fields else QUIC_SNI_COLUMN if QUIC_SNI_COLUMN in self.other_fields else None
+        self.return_raw_fields = return_raw_fields
 
         self.preload = preload
         self.preload_blob = preload_blob
@@ -78,7 +79,7 @@ class PyTablesDataset(Dataset):
             batch_data = self.data[batch_idx]
         else:
             batch_data = load_data_from_tables(tables=self.tables, indices=self.indices[batch_idx], data_dtype=self.data_dtype)
-        if self.
+        if self.return_raw_fields:
             return (batch_data, batch_idx)
 
         # Prepare data
@@ -157,11 +158,9 @@ class PyTablesDataset(Dataset):
         for i in range(len(tables)):
             base_labels[i] = tables[i].read(field=APP_COLUMN)
             base_indices[i] = np.nonzero(np.isin(base_labels[i], disabled_apps_ids, invert=True))[0]
-        indices = np.
+        indices = np.array(list(zip(
             np.concatenate([[table_id] * len(base_indices[table_id]) for table_id in tables]),
-            np.concatenate(list(base_indices.values())),
-            np.concatenate(list(base_labels.values()))
-        )).astype(np.int32)
+            np.concatenate(list(base_indices.values())))), dtype=[field for field in INDICES_DTYPE if field[0] in [INDICES_INDEX_FIELD, INDICES_TABLE_FIELD]])
         self.indices = indices
         database.close()
@@ -173,16 +172,21 @@ def worker_init_fn(worker_id):
     dataset = worker_info.dataset
     dataset.pytables_worker_init(worker_id)
 
-def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
+def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], sni_column: Optional[str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
     database, train_tables = load_database(database_path, tables_paths=train_data_params.train_tables_paths)
     inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
-
+    all_labels = {}
+    all_sni_domains = {}
     app_counts = pd.Series(dtype="int64")
     start_time = time.time()
     for i, table_path in enumerate(train_data_params.train_tables_paths):
-
-
-
+        all_labels[i] = train_tables[i].read(field=APP_COLUMN)
+        if sni_column is not None:
+            all_sni_domains[i] = train_tables[i].read(field=sni_column)
+        else:
+            all_sni_domains[i] = np.full_like(all_labels[i], "", dtype="U1")
+        log.info(f"Reading app and SNI columns for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
+        app_counts = app_counts.add(pd.Series(all_labels[i]).value_counts(), fill_value=0)
     database.close()
     # Handle disabled apps and apps with less than min_samples_per_app samples
     if len(train_data_params.disabled_apps) > 0:
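When a dataset has no SNI column, the loop substitutes per-table arrays of empty strings. The explicit dtype="U1" in np.full_like matters here: without it, the call would try to cast the empty string into the integer dtype inherited from the label array and fail. A two-line illustration:

    import numpy as np

    labels = np.array([3, 5, 3], dtype="int32")
    sni = np.full_like(labels, "", dtype="U1")  # same shape as labels, empty strings
    print(sni, sni.dtype)                       # ['' '' ''] <U1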
@@ -202,8 +206,9 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     # Base indices are indices of samples that are not disabled and have enough samples
     base_indices = {}
     for i, table_path in enumerate(train_data_params.train_tables_paths):
-        base_indices[i] = np.nonzero(np.isin(
-        base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in
+        base_indices[i] = np.nonzero(np.isin(all_labels[i], disabled_apps_ids, invert=True))[0]
+    base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in all_labels.items()}
+    base_sni_domains = {table_id: arr[base_indices[table_id]] for table_id, arr in all_sni_domains.items()}
     # Apps selection
     if train_data_params.apps_selection != AppSelection.FIXED:
         app_counts = app_counts[[app for app in app_counts.index.tolist() if app not in disabled_apps_ids]]
@@ -230,26 +235,38 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
     unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]
 
-    train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices,
+    train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices,
+                                                                      base_labels=base_labels,
+                                                                      base_sni_domains=base_sni_domains,
+                                                                      known_apps_ids=known_apps_ids,
+                                                                      unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(train_known_indices)
     rng.shuffle(train_unknown_indices)
-    log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
     return train_known_indices, train_unknown_indices, known_apps, unknown_apps
 
-def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
+def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], sni_column: Optional[str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
     database, test_tables = load_database(database_path, tables_paths=test_data_params.test_tables_paths)
     inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
     base_labels = {}
+    base_sni_domains = {}
     base_indices = {}
     start_time = time.time()
     for i, table_path in enumerate(test_data_params.test_tables_paths):
         base_labels[i] = test_tables[i].read(field=APP_COLUMN)
-
+        if sni_column is not None:
+            base_sni_domains[i] = test_tables[i].read(field=sni_column)
+        else:
+            base_sni_domains[i] = np.full_like(base_labels[i], "", dtype="U1")
+        log.info(f"Reading app and SNI columns for table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
         base_indices[i] = np.arange(len(test_tables[i]))
     database.close()
     known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
     unknown_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.unknown_apps]
-    test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices,
+    test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices,
+                                                                    base_labels=base_labels,
+                                                                    base_sni_domains=base_sni_domains,
+                                                                    known_apps_ids=known_apps_ids,
+                                                                    unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(test_known_indices)
     rng.shuffle(test_unknown_indices)
     log.info(f"Processing indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
@@ -271,28 +288,32 @@ def list_all_tables(database_path: str) -> list[str]:
     with tb.open_file(database_path, mode="r") as database:
         return list(map(lambda x: x._v_pathname, iter(database.get_node(f"/flows"))))
 
-def convert_dict_indices(base_indices: dict[int, np.ndarray], base_labels: dict[int, np.ndarray], known_apps_ids: list[int], unknown_apps_ids: list[int]) -> tuple[np.ndarray, np.ndarray]:
+def convert_dict_indices(base_indices: dict[int, np.ndarray], base_labels: dict[int, np.ndarray], base_sni_domains: dict[int, np.ndarray], known_apps_ids: list[int], unknown_apps_ids: list[int]) -> tuple[np.ndarray, np.ndarray]:
     is_known = {table_id: np.isin(table_arr, known_apps_ids) for table_id, table_arr in base_labels.items()}
     is_unknown = {table_id: np.isin(table_arr, unknown_apps_ids) for table_id, table_arr in base_labels.items()}
     known_indices_dict = {table_id: table_arr[is_known[table_id]] for table_id, table_arr in base_indices.items()}
     unknown_indices_dict = {table_id: table_arr[is_unknown[table_id]] for table_id, table_arr in base_indices.items()}
     known_labels_dict = {table_id: table_arr[is_known[table_id]] for table_id, table_arr in base_labels.items()}
     unknown_labels_dict = {table_id: table_arr[is_unknown[table_id]] for table_id, table_arr in base_labels.items()}
-
+    known_sni_domains_dict = {table_id: table_arr[is_known[table_id]] for table_id, table_arr in base_sni_domains.items()}
+    unknown_sni_domains_dict = {table_id: table_arr[is_unknown[table_id]] for table_id, table_arr in base_sni_domains.items()}
+    known_indices = np.array(list(zip(
         np.concatenate([[table_id] * table_arr.sum() for table_id, table_arr in is_known.items()]),
         np.concatenate(list(known_indices_dict.values())),
-        np.concatenate(list(known_labels_dict.values()))
-
+        np.concatenate(list(known_labels_dict.values())),
+        np.concatenate(list(known_sni_domains_dict.values())))), dtype=INDICES_DTYPE)
+    unknown_indices = np.array(list(zip(
         np.concatenate([[table_id] * table_arr.sum() for table_id, table_arr in is_unknown.items()]),
         np.concatenate(list(unknown_indices_dict.values())),
-        np.concatenate(list(unknown_labels_dict.values()))
+        np.concatenate(list(unknown_labels_dict.values())),
+        np.concatenate(list(unknown_sni_domains_dict.values())))), dtype=INDICES_DTYPE)
     return known_indices, unknown_indices
 
 def load_data_from_tables(tables, indices: np.ndarray, data_dtype: np.dtype) -> np.ndarray:
-    sorted_indices = indices[indices[
-    unique_tables, split_bounderies = np.unique(sorted_indices[
+    sorted_indices = indices[indices[INDICES_TABLE_FIELD].argsort(kind="stable")]
+    unique_tables, split_bounderies = np.unique(sorted_indices[INDICES_TABLE_FIELD], return_index=True)
     indices_per_table = np.split(sorted_indices, split_bounderies[1:])
     data = np.zeros(len(indices), dtype=data_dtype)
     for table_id, table_indices in zip(unique_tables, indices_per_table):
-        data[np.where(indices[
+        data[np.where(indices[INDICES_TABLE_FIELD] == table_id)[0]] = tables[table_id].read_coordinates(table_indices[INDICES_INDEX_FIELD])
     return data
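load_data_from_tables groups the requested rows by table with a stable sort, then uses np.unique(..., return_index=True) plus np.split to carve out one contiguous block of coordinates per table, so each PyTables table is read once per batch. A toy version of just the grouping step:

    import numpy as np

    dtype = [("TABLE", "int32"), ("INDEX", "int32")]
    indices = np.array([(1, 5), (0, 9), (1, 2), (0, 3)], dtype=dtype)

    sorted_indices = indices[indices["TABLE"].argsort(kind="stable")]
    unique_tables, split_boundaries = np.unique(sorted_indices["TABLE"], return_index=True)
    for table_id, rows in zip(unique_tables, np.split(sorted_indices, split_boundaries[1:])):
        print(table_id, rows["INDEX"])  # 0 [9 3]  then  1 [5 2]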
{cesnet-datazoo-0.1.3 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.1.3
+Version: 0.1.5
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|