cesnet-datazoo 0.0.17__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- cesnet_datazoo/config.py +173 -168
- cesnet_datazoo/constants.py +4 -6
- cesnet_datazoo/datasets/cesnet_dataset.py +200 -177
- cesnet_datazoo/datasets/datasets.py +22 -2
- cesnet_datazoo/datasets/datasets_constants.py +670 -0
- cesnet_datazoo/datasets/loaders.py +3 -0
- cesnet_datazoo/datasets/metadata/dataset_metadata.py +6 -5
- cesnet_datazoo/datasets/metadata/metadata.csv +4 -4
- cesnet_datazoo/datasets/statistics.py +36 -16
- cesnet_datazoo/pytables_data/data_scalers.py +68 -154
- cesnet_datazoo/pytables_data/indices_setup.py +29 -33
- cesnet_datazoo/pytables_data/pytables_dataset.py +99 -122
- cesnet_datazoo/utils/class_info.py +7 -5
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/METADATA +2 -1
- cesnet_datazoo-0.1.0.dist-info/RECORD +30 -0
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/WHEEL +1 -1
- cesnet_datazoo-0.0.17.dist-info/RECORD +0 -29
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/LICENCE +0 -0
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/top_level.txt +0 -0
cesnet_datazoo/datasets/loaders.py
@@ -6,6 +6,9 @@ from tqdm import tqdm
 from cesnet_datazoo.constants import APP_COLUMN
 
 
+def collate_fn_simple(batch):
+    return batch
+
 def load_from_dataloader(dataloader: DataLoader, silent: bool = False) -> tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
     other_fields = []
     data_ppi = []
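The identity collate matters because the statistics DataLoader below runs with `batch_size=None` and a `BatchSampler`: each dataset item is already a complete batch read from PyTables, so PyTorch's default collate (which stacks samples into tensors) must be bypassed. A minimal standalone sketch of the pattern; `ToyDataset` is hypothetical and stands in for `PyTablesDataset`:

```python
import numpy as np
from torch.utils.data import BatchSampler, DataLoader, Dataset, SequentialSampler

def collate_fn_simple(batch):
    return batch  # the batch is already assembled; pass it through unchanged

class ToyDataset(Dataset):
    # With batch_size=None and a BatchSampler, __getitem__ receives a list of
    # indices and returns a whole batch (here a plain ndarray, as a PyTables read would).
    def __getitem__(self, batch_idx):
        return np.asarray(batch_idx)
    def __len__(self):
        return 10

sampler = BatchSampler(SequentialSampler(range(10)), batch_size=4, drop_last=False)
loader = DataLoader(ToyDataset(), batch_size=None, sampler=sampler, collate_fn=collate_fn_simple)
for batch in loader:
    print(type(batch), batch.shape)  # <class 'numpy.ndarray'> (4,), (4,), (2,)
```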
cesnet_datazoo/datasets/metadata/dataset_metadata.py
@@ -18,18 +18,19 @@ class DatasetMetadata():
     collection_period: str
     missing_dates_in_collection_period: list[str]
     application_count: int
-    …
-    …
-    packet_histogram_features: list[str]
+    background_traffic_classes: list[str]
+    ppi_features: list[str]
     flowstats_features: list[str]
+    flowstats_features_boolean: list[str]
+    packet_histograms: list[str]
     tcp_features: list[str]
     other_fields: list[str]
     cite: str
     zenodo_url: str
     related_papers: list[str]
 
-    @field_validator("available_dataset_sizes", "missing_dates_in_collection_period", "…
-                     "…
+    @field_validator("available_dataset_sizes", "missing_dates_in_collection_period", "background_traffic_classes", "ppi_features",
+                     "flowstats_features", "flowstats_features_boolean", "packet_histograms", "tcp_features", "other_fields", "related_papers", mode="before")
     @classmethod
     def parse_string_to_list(cls, v: str, info: ValidationInfo) -> list[str]:
         l = list(map(str.strip, v.split(","))) if v else []
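For context on the extended `@field_validator` list: with `mode="before"`, the raw CSV cell (a comma-separated string) is split and stripped before pydantic validates the `list[str]` type. A simplified, self-contained sketch of the same mechanism (not the package's full `DatasetMetadata` model):

```python
from pydantic import BaseModel, ValidationInfo, field_validator

class MetadataSketch(BaseModel):
    ppi_features: list[str]
    packet_histograms: list[str]

    @field_validator("ppi_features", "packet_histograms", mode="before")
    @classmethod
    def parse_string_to_list(cls, v: str, info: ValidationInfo) -> list[str]:
        # Runs before type validation: "IPT, DIR, SIZE" -> ["IPT", "DIR", "SIZE"]
        return list(map(str.strip, v.split(","))) if v else []

m = MetadataSketch(ppi_features="IPT, DIR, SIZE", packet_histograms="")
print(m.ppi_features)       # ['IPT', 'DIR', 'SIZE']
print(m.packet_histograms)  # []
```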
cesnet_datazoo/datasets/metadata/metadata.csv
@@ -1,4 +1,4 @@
-Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic,…
-CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE"…
-CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","…
-CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","…
+Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic classes,PPI features,Flowstats features,Flowstats features boolean,Packet histograms,TCP features,Other fields,Cite,Zenodo URL,Related papers
+CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION",,,"FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
+CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
+CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
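The list-valued cells in the new CSV are quoted comma-separated strings, which pandas keeps as single strings; splitting them is left to the consumer (the validator above). A hedged reading sketch using the new header's column names:

```python
import pandas as pd

df = pd.read_csv("metadata.csv")  # quoted cells like "IPT, DIR, SIZE" stay one string
row = df.loc[df["Name"] == "CESNET-QUIC22"].iloc[0]
ppi_features = [f.strip() for f in row["PPI features"].split(",")]
print(ppi_features)  # ['IPT', 'DIR', 'SIZE']
```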
cesnet_datazoo/datasets/statistics.py
@@ -1,4 +1,3 @@
-import logging
 import os
 from collections import Counter
 from typing import Any, Literal
@@ -12,8 +11,9 @@ from torch.utils.data import BatchSampler, DataLoader, SequentialSampler
 from tqdm import tqdm
 
 from cesnet_datazoo.config import Protocol
-from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, …
-                                      …
+from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, IPT_POS, PPI_COLUMN, SIZE_POS,
+                                      UDP_PPI_CHANNELS)
+from cesnet_datazoo.datasets.loaders import collate_fn_simple
 from cesnet_datazoo.pytables_data.indices_setup import sort_indices
 from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, list_all_tables,
                                                            load_database, worker_init_fn)
@@ -36,17 +36,25 @@ def pick_stats_fields(batch):
         batch[CATEGORY_COLUMN],
     )
 
-def pick_extra_fields(batch, …
+def pick_extra_fields(batch, packet_histograms: list[str], flow_endreason_features: list[str]):
     return (
         batch["DST_ASN"],
-        batch[…
-        batch[…
+        batch[packet_histograms],
+        batch[flow_endreason_features],
     )
 
-def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe…
-…
-…
-…
+def compute_dataset_statistics(database_path: str,
+                               tables_app_enum: dict[int, str],
+                               tables_cat_enum: dict[int, str],
+                               output_dir: str,
+                               packet_histograms: list[str],
+                               flowstats_features_boolean: list[str],
+                               protocol: Protocol, extra_fields: bool,
+                               disabled_apps: list[str],
+                               num_samples: int | Literal["all"] = 10_000_000,
+                               num_workers: int = 4,
+                               batch_size: int = 4096,
+                               silent: bool = False):
     stats_pdf_path = os.path.join(output_dir, "dataset-statistics.pdf")
     stats_csv_path = os.path.join(output_dir, "dataset-statistics.csv")
     categories_csv_path = os.path.join(output_dir, "categories.csv")
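An illustrative call of the reworked `compute_dataset_statistics` signature. The path and enum contents below are hypothetical placeholders (in real use they come from the dataset object), and the `Protocol` member name is an assumption:

```python
from cesnet_datazoo.config import Protocol
from cesnet_datazoo.datasets.statistics import compute_dataset_statistics

compute_dataset_statistics(
    database_path="datasets/CESNET-QUIC22/database.h5",          # hypothetical path
    tables_app_enum={0: "google-www", 1: "default-background"},  # placeholder id -> app mapping
    tables_cat_enum={0: "web", 1: "background"},                 # placeholder id -> category mapping
    output_dir="stats-out",
    packet_histograms=["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"],
    flowstats_features_boolean=["FLOW_ENDREASON_IDLE", "FLOW_ENDREASON_ACTIVE", "FLOW_ENDREASON_OTHER"],
    protocol=Protocol.QUIC,  # assumed member name
    extra_fields=True,
    disabled_apps=[],
    num_samples=1_000_000,
)
```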
@@ -74,10 +82,22 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe…
     feature_bytes_total = []
     packet_sizes_counter = Counter()
     ipt_counter = Counter()
+    flow_endreason_features = [f for f in flowstats_features_boolean if f.startswith("FLOW_ENDREASON")]
     if not silent:
         print(f"Reading data from {database_path} for statistics")
     table_paths = list_all_tables(database_path)
-    stats_dataset = PyTablesDataset(database_path=database_path, …
+    stats_dataset = PyTablesDataset(database_path=database_path,
+                                    tables_app_enum=tables_app_enum,
+                                    tables_cat_enum=tables_cat_enum,
+                                    tables_paths=table_paths,
+                                    indices=None,
+                                    disabled_apps=disabled_apps,
+                                    return_all_fields=True,
+                                    flowstats_features=[],
+                                    flowstats_features_boolean=[],
+                                    flowstats_features_phist=[],
+                                    other_fields=[],
+                                    ppi_channels=UDP_PPI_CHANNELS,)
     if num_samples != "all":
         subset_indices = np.random.randint(low=0, high=len(stats_dataset.indices), size=num_samples)
         stats_dataset.indices = sort_indices(stats_dataset.indices[subset_indices])
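Two details of the subsampling are worth noting: `np.random.randint` draws row positions with replacement, and `sort_indices` reorders the drawn rows so the HDF5 tables are read near-sequentially, which is much faster than random access. A toy stand-in; the two-column `(table, row)` layout is an assumption based on the `INDICES_*` constants elsewhere in this diff:

```python
import numpy as np

indices = np.array([[1, 50], [0, 7], [1, 3], [0, 99]])  # (table_id, row_id) pairs
subset = indices[np.random.randint(low=0, high=len(indices), size=3)]  # with replacement
order = np.lexsort((subset[:, 1], subset[:, 0]))  # by table, then row: sequential reads
print(subset[order])
```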
@@ -87,7 +107,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe…
                                    pin_memory=False,
                                    num_workers=num_workers,
                                    worker_init_fn=worker_init_fn,
-                                   collate_fn=…
+                                   collate_fn=collate_fn_simple,
                                    persistent_workers=False,
                                    batch_size=None,
                                    sampler=stats_batch_sampler)
@@ -116,10 +136,10 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe…
             quic_ua_series = quic_ua_series.add(pd.Series(user_agent).str.decode("utf-8").value_counts(), fill_value=0)
             quic_version_series = quic_version_series.add(pd.Series(quic_version).value_counts(), fill_value=0)
         if extra_fields:
-            asn, phist, flowend_reason = pick_extra_fields(batch, …
+            asn, phist, flowend_reason = pick_extra_fields(batch, packet_histograms=packet_histograms, flow_endreason_features=flow_endreason_features)
             asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
             flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
-            df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=…
+            df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=packet_histograms)
             df_phist = df_phist.add(df2, fill_value=0)
     feature_duration = np.concatenate(feature_duration)
     feature_packets_total = np.concatenate(feature_packets_total)
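On the `zip(*np.split(...))` expression: the per-flow packet histograms are stored as one concatenated vector covering the four histogram types, so after summing over flows, splitting into 4 and zipping yields one row per bin with a column per histogram. A toy check; the 8-bins-per-histogram figure is an assumption:

```python
import numpy as np
import pandas as pd

packet_histograms = ["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"]
sums = np.arange(32)  # stand-in for structured_to_unstructured(phist).sum(axis=0)
df2 = pd.DataFrame(data=zip(*np.split(sums, 4)), columns=packet_histograms)
print(df2.shape)  # (8, 4): one row per bin, one column per histogram type
```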
@@ -183,11 +203,11 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe…
     flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
     flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
     flow_endreason_df.index.name = "FLOW ENDREASON"
-    flow_endreason_df.index = pd.Index(…
+    flow_endreason_df.index = pd.Index(flow_endreason_features)
     flow_endreason_df.to_csv(flow_endreason_path)
     # PHIST output
     df_phist.index.name = "BINS"
-    df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), …
+    df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), packet_histograms))
     df_phist = df_phist.astype("int64")
     for i, column in zip((1, 3, 5, 7), df_phist.columns):
         df_phist.insert(i, column + " PERC", (df_phist[column] / df_phist[column].sum() * 100).round(3))
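Why the insert positions are `(1, 3, 5, 7)`: each `DataFrame.insert` shifts later columns right by one, so these positions interleave a PERC column directly after each of the four COUNT columns. A toy demonstration of the same mechanics:

```python
import pandas as pd

df = pd.DataFrame({"A": [2, 8], "B": [5, 5], "C": [1, 3], "D": [4, 4]})
for i, column in zip((1, 3, 5, 7), list(df.columns)):  # snapshot of the original columns
    df.insert(i, column + " PERC", (df[column] / df[column].sum() * 100).round(3))
print(list(df.columns))  # ['A', 'A PERC', 'B', 'B PERC', 'C', 'C PERC', 'D', 'D PERC']
```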
cesnet_datazoo/pytables_data/data_scalers.py
@@ -1,30 +1,23 @@
-…
-…
+import json
 import logging
 import os
 import time
 import warnings
-from typing import TYPE_CHECKING
 
 import numpy as np
+from cesnet_models.transforms import ClipAndScaleFlowstats, ClipAndScalePPI
 from numpy.lib.recfunctions import structured_to_unstructured
 from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
-from typing_extensions import assert_never
 
-from cesnet_datazoo.config import DatasetConfig…
-from cesnet_datazoo.constants import …
-                                     PPI_COLUMN, SIZE_POS)
+from cesnet_datazoo.config import DatasetConfig
+from cesnet_datazoo.constants import DIR_POS, FLOWSTATS_NO_CLIP, IPT_POS, PPI_COLUMN, SIZE_POS
 from cesnet_datazoo.pytables_data.pytables_dataset import load_data_from_tables, load_database
-from cesnet_datazoo.utils.fileutils import pickle_dump, pickle_load
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
 
-if TYPE_CHECKING:
-    from cesnet_datazoo.datasets.cesnet_dataset import CesnetDataset
-
 log = logging.getLogger(__name__)
 
-…
-…
+
+def get_scaler_attrs(scaler: StandardScaler | RobustScaler | MinMaxScaler) -> dict[str, list[float]]:
     if isinstance(scaler, StandardScaler):
         assert hasattr(scaler, "mean_") and scaler.mean_ is not None and hasattr(scaler, "scale_") and scaler.scale_ is not None
         scaler_attrs = {"mean_": scaler.mean_.tolist(), "scale_": scaler.scale_.tolist()}
@@ -36,97 +29,8 @@ def get_scaler_attrs(scaler: Scaler) -> dict[str, list[float]]
     scaler_attrs = {"min_": scaler.min_.tolist(), "scale_": scaler.scale_.tolist()}
     return scaler_attrs
 
-def set_scaler_attrs(…
-…
-    if isinstance(scaler, StandardScaler):
-        assert "mean_" in scaler_attrs and "scale_" in scaler_attrs
-        scaler.mean_ = np.array(scaler_attrs["mean_"])
-        scaler.scale_ = np.array(scaler_attrs["scale_"])
-    elif isinstance(scaler, RobustScaler):
-        assert "center_" in scaler_attrs and "scale_" in scaler_attrs
-        scaler.center_ = np.array(scaler_attrs["center_"])
-        scaler.scale_ = np.array(scaler_attrs["scale_"])
-    elif isinstance(scaler, MinMaxScaler):
-        assert "min_" in scaler_attrs and "scale_" in scaler_attrs
-        scaler.min_ = np.array(scaler_attrs["min_"])
-        scaler.scale_ = np.array(scaler_attrs["scale_"])
-
-def save_scalers_attrs_as_dict(dataset: CesnetDataset) -> dict:
-    assert dataset.flowstats_scaler is not None or dataset.psizes_scaler is not None or dataset.ipt_scaler is not None
-    scalers_dict = {}
-    if dataset.flowstats_scaler is not None:
-        scalers_dict["flowstats_scaler_attrs"] = get_scaler_attrs(dataset.flowstats_scaler)
-    if dataset.psizes_scaler is not None:
-        scalers_dict["psizes_scaler_attrs"] = get_scaler_attrs(dataset.psizes_scaler)
-    if dataset.ipt_scaler is not None:
-        scalers_dict["ipt_scaler_attrs"] = get_scaler_attrs(dataset.ipt_scaler)
-    assert dataset.flowstats_quantiles is not None
-    scalers_dict["flowstats_quantiles"] = dataset.flowstats_quantiles.tolist()
-    return scalers_dict
-
-def fit_or_load_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> tuple[Scaler, Scaler, Scaler, np.ndarray]:
-    # Load the scalers from pickled files if scalers_attrs are not provided
-    if dataset_config.scalers_attrs is None:
-        train_data_path = dataset_config._get_train_data_path()
-        flowstats_scaler_path = os.path.join(train_data_path, "stand", f"flowstats_scaler-{dataset_config.flowstats_scaler}-q{dataset_config.flowstats_clip}.pickle")
-        psizes_sizes_scaler_path = os.path.join(train_data_path, "stand", f"psizes_scaler-{dataset_config.psizes_scaler}-psizes_max{dataset_config.psizes_max}.pickle")
-        ipt_scaler_path = os.path.join(train_data_path, "stand", f"ipt_scaler-{dataset_config.ipt_scaler}-ipt_min{dataset_config.ipt_min}-ipt_max{dataset_config.ipt_max}.pickle")
-        flowstats_quantiles_path = os.path.join(train_data_path, "stand", f"flowstats_quantiles-q{dataset_config.flowstats_clip}.pickle")
-        if os.path.isfile(flowstats_scaler_path) and os.path.isfile(flowstats_quantiles_path) and os.path.isfile(ipt_scaler_path) and os.path.isfile(psizes_sizes_scaler_path):
-            flowstats_scaler = pickle_load(flowstats_scaler_path)
-            psizes_scaler = pickle_load(psizes_sizes_scaler_path)
-            ipt_scaler = pickle_load(ipt_scaler_path)
-            flowstats_quantiles = pickle_load(flowstats_quantiles_path)
-            return flowstats_scaler, psizes_scaler, ipt_scaler, flowstats_quantiles
-    # Initialize the scalers classes based on the config
-    if dataset_config.flowstats_scaler == ScalerEnum.ROBUST:
-        flowstats_scaler = RobustScaler()
-    elif dataset_config.flowstats_scaler == ScalerEnum.STANDARD:
-        flowstats_scaler = StandardScaler()
-    elif dataset_config.flowstats_scaler == ScalerEnum.MINMAX:
-        flowstats_scaler = MinMaxScaler()
-    elif dataset_config.flowstats_scaler == ScalerEnum.NO_SCALER:
-        flowstats_scaler = None
-    else: assert_never(dataset_config.flowstats_scaler)
-    if dataset_config.ipt_scaler == ScalerEnum.ROBUST:
-        ipt_scaler = RobustScaler()
-    elif dataset_config.ipt_scaler == ScalerEnum.STANDARD:
-        ipt_scaler = StandardScaler()
-    elif dataset_config.ipt_scaler == ScalerEnum.MINMAX:
-        ipt_scaler = MinMaxScaler()
-    elif dataset_config.ipt_scaler == ScalerEnum.NO_SCALER:
-        ipt_scaler = None
-    else: assert_never(dataset_config.ipt_scaler)
-    if dataset_config.psizes_scaler == ScalerEnum.ROBUST:
-        psizes_scaler = RobustScaler()
-    elif dataset_config.psizes_scaler == ScalerEnum.STANDARD:
-        psizes_scaler = StandardScaler()
-    elif dataset_config.psizes_scaler == ScalerEnum.MINMAX:
-        psizes_scaler = MinMaxScaler()
-    elif dataset_config.psizes_scaler == ScalerEnum.NO_SCALER:
-        psizes_scaler = None
-    else: assert_never(dataset_config.psizes_scaler)
-    # Load scalers learned attributes from config if provided
-    if dataset_config.scalers_attrs is not None:
-        if "flowstats_scaler_attrs" in dataset_config.scalers_attrs:
-            if flowstats_scaler is not None:
-                set_scaler_attrs(flowstats_scaler, dataset_config.scalers_attrs["flowstats_scaler_attrs"])
-            else:
-                warnings.warn("Ignoring flowstats_scaler_attrs because flowstats_scaler is None")
-        if "psizes_scaler_attrs" in dataset_config.scalers_attrs:
-            if psizes_scaler is not None:
-                set_scaler_attrs(psizes_scaler, dataset_config.scalers_attrs["psizes_scaler_attrs"])
-            else:
-                warnings.warn("Ignoring psizes_scaler_attrs because psizes_scaler is None")
-        if "ipt_scaler_attrs" in dataset_config.scalers_attrs:
-            if ipt_scaler is not None:
-                set_scaler_attrs(ipt_scaler, dataset_config.scalers_attrs["ipt_scaler_attrs"])
-            else:
-                warnings.warn("Ignoring ipt_scaler_attrs because ipt_scaler is None")
-        assert "flowstats_quantiles" in dataset_config.scalers_attrs
-        flowstats_quantiles = np.array(dataset_config.scalers_attrs["flowstats_quantiles"])
-        return flowstats_scaler, psizes_scaler, ipt_scaler, flowstats_quantiles
-    # If the scalers are not loaded at this point, fit them
+def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> None:
+    # Define indices for fitting scalers
     if isinstance(dataset_config.fit_scalers_samples, int) and dataset_config.fit_scalers_samples > len(train_indices):
         warnings.warn(f"The number of samples for fitting scalers ({dataset_config.fit_scalers_samples}) is larger than the number of train samples ({len(train_indices)}), using the number of train samples instead")
         dataset_config.fit_scalers_samples = len(train_indices)
@@ -136,61 +40,71 @@ def fit_or_load_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray…
     else:
         num_samples = dataset_config.fit_scalers_samples
     fit_scalers_indices = train_indices[fit_scalers_rng.choice(len(train_indices), size=num_samples, replace=False)]
-    flowstats_quantiles = fit_scalers(…
-                  database_path=dataset_config.database_path,
-                  train_tables_paths=dataset_config._get_train_tables_paths(),
-                  fit_scalers_indices=fit_scalers_indices,
-                  flowstats_scaler=flowstats_scaler,
-                  psizes_scaler=psizes_scaler,
-                  ipt_scaler=ipt_scaler,
-                  flowstats_quantile_clip=dataset_config.flowstats_clip,
-                  ipt_min=dataset_config.ipt_min,
-                  ipt_max=dataset_config.ipt_max,
-                  psizes_max=dataset_config.psizes_max)
-    pickle_dump(flowstats_scaler, flowstats_scaler_path)
-    pickle_dump(psizes_scaler, psizes_sizes_scaler_path)
-    pickle_dump(ipt_scaler, ipt_scaler_path)
-    pickle_dump(flowstats_quantiles, flowstats_quantiles_path)
-    return flowstats_scaler, psizes_scaler, ipt_scaler, flowstats_quantiles
-
-def fit_scalers(database_path: str,
-                train_tables_paths: list[str],
-                fit_scalers_indices: np.ndarray,
-                flowstats_scaler: Scaler,
-                psizes_scaler: Scaler,
-                ipt_scaler: Scaler,
-                flowstats_quantile_clip: float,
-                ipt_min: int,
-                ipt_max: int,
-                psizes_max: int) -> np.ndarray:
+    # Load data
     start_time = time.time()
-    database, tables = load_database(database_path, tables_paths=…
+    database, tables = load_database(dataset_config.database_path, tables_paths=dataset_config._get_train_tables_paths())
     data = load_data_from_tables(tables=tables, indices=fit_scalers_indices, data_dtype=tables[0].dtype)
     database.close()
-    …
-    …
-    …
-    …
-    …
-    if …
-    …
-    …
-    …
-    …
+
+    clip_and_scale_ppi_transform = dataset_config.ppi_transform # TODO Fix after transforms composing is implemented
+    clip_and_scale_flowstats_transform = dataset_config.flowstats_transform
+
+    # Fit the ClipAndScalePPI transform
+    if clip_and_scale_ppi_transform is not None and clip_and_scale_ppi_transform.needs_fitting:
+        assert isinstance(clip_and_scale_ppi_transform, ClipAndScalePPI)
+        data_ppi = data[PPI_COLUMN].astype("float32")
+        ppi_channels = data_ppi.shape[1]
+        data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
+        padding_mask = data_ppi[:, DIR_POS] == 0 # Mask of padded packets
+        # Fit IPT scaler
+        train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
+        train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
+        if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
+            # Let zero be the minimum for minmax scaling
             train_ipt = np.concatenate((train_ipt, [0]))
-        ipt_scaler.fit(train_ipt.reshape(-1, 1))
-    …
-        train_psizes = data_ppi[:, SIZE_POS].clip(max=psizes_max, min=…
+        clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
+        # Fit packet sizes scaler
+        train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
        train_psizes[padding_mask] = np.nan
-        if isinstance(psizes_scaler, MinMaxScaler):
+        if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
            train_psizes = np.concatenate((train_psizes, [0]))
-        psizes_scaler.fit(train_psizes.reshape(-1, 1))
-    …
-    …
-    …
-    …
-    …
+        clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
+        clip_and_scale_ppi_transform.needs_fitting = False
+
+    # Fit the ClipAndScaleFlowstats transform
+    if clip_and_scale_flowstats_transform is not None and clip_and_scale_flowstats_transform.needs_fitting:
+        assert isinstance(clip_and_scale_flowstats_transform, ClipAndScaleFlowstats)
+        train_flowstats = structured_to_unstructured(data[dataset_config.flowstats_features])
+        flowstats_quantiles = np.quantile(train_flowstats, q=clip_and_scale_flowstats_transform.quantile_clip, axis=0)
+        idx_no_clip = [dataset_config.flowstats_features.index(f) for f in FLOWSTATS_NO_CLIP]
+        flowstats_quantiles[idx_no_clip] = np.inf # Disable clipping for features with "fixed" range
        train_flowstats = train_flowstats.clip(max=flowstats_quantiles)
-    flowstats_scaler.fit(train_flowstats)
+        clip_and_scale_flowstats_transform.flowstats_scaler.fit(train_flowstats)
+        clip_and_scale_flowstats_transform.flowstats_quantiles = flowstats_quantiles.tolist()
+        clip_and_scale_flowstats_transform.needs_fitting = False
+
     log.info(f"Reading data and fitting scalers took {time.time() - start_time:.2f} seconds")
-    …
+    train_data_path = dataset_config._get_train_data_path()
+    if clip_and_scale_ppi_transform is not None:
+        ppi_transform_path = os.path.join(train_data_path, "transforms", "ppi-transform.json")
+        ppi_transform_dict = {
+            "psizes_scaler_enum": str(clip_and_scale_ppi_transform._psizes_scaler_enum),
+            "psizes_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.psizes_scaler),
+            "pszies_min": clip_and_scale_ppi_transform.pszies_min,
+            "psizes_max": clip_and_scale_ppi_transform.psizes_max,
+            "ipt_scaler_enum": str(clip_and_scale_ppi_transform._ipt_scaler_enum),
+            "ipt_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.ipt_scaler),
+            "ipt_min": clip_and_scale_ppi_transform.ipt_min,
+            "ipt_max": clip_and_scale_ppi_transform.ipt_max,
+        }
+        json.dump(ppi_transform_dict, open(ppi_transform_path, "w"), indent=4)
+    if clip_and_scale_flowstats_transform is not None:
+        assert clip_and_scale_flowstats_transform.flowstats_quantiles is not None
+        flowstats_transform_path = os.path.join(train_data_path, "transforms", "flowstats-transform.json")
+        flowstats_transform_dict = {
+            "flowstats_scaler_enum": str(clip_and_scale_flowstats_transform._flowstats_scaler_enum),
+            "flowstats_scaler_attrs": get_scaler_attrs(clip_and_scale_flowstats_transform.flowstats_scaler),
+            "flowstats_quantiles": clip_and_scale_flowstats_transform.flowstats_quantiles,
+            "quantile_clip": clip_and_scale_flowstats_transform.quantile_clip,
+        }
+        json.dump(flowstats_transform_dict, open(flowstats_transform_path, "w"), indent=4)
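The pickled-scaler round trip is thus replaced by JSON attribute dumps. For the reverse direction, a hedged sketch of rebuilding a fitted scaler from `flowstats-transform.json`, mirroring the removed `set_scaler_attrs` logic; treat the key layout as following this diff, not as the package's official loading API:

```python
import json

import numpy as np
from sklearn.preprocessing import StandardScaler

with open("transforms/flowstats-transform.json") as f:
    d = json.load(f)

scaler = StandardScaler()  # pick the class matching d["flowstats_scaler_enum"]
attrs = d["flowstats_scaler_attrs"]
scaler.mean_ = np.array(attrs["mean_"])    # as the removed set_scaler_attrs did
scaler.scale_ = np.array(attrs["scale_"])
# scaler.transform(X) now standardizes with the training-time statistics
```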
cesnet_datazoo/pytables_data/indices_setup.py
@@ -7,13 +7,11 @@ from enum import Enum
 
 import numpy as np
 import pandas as pd
-from sklearn.preprocessing import LabelEncoder
 
 from cesnet_datazoo.config import DatasetConfig
-from cesnet_datazoo.constants import …
-                                     UNKNOWN_STR_LABEL)
+from cesnet_datazoo.constants import INDICES_INDEX_POS, INDICES_LABEL_POS, INDICES_TABLE_POS
 from cesnet_datazoo.pytables_data.pytables_dataset import init_test_indices, init_train_indices
-from cesnet_datazoo.utils.fileutils import …
+from cesnet_datazoo.utils.fileutils import yaml_dump, yaml_load
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
 
 log = logging.getLogger(__name__)
@@ -78,59 +76,56 @@ def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indice…
     sampled_train_indices = np.concatenate(sampled_indicies_per_date)
     return sampled_train_indices
 
-def indices_to_app_counts(indices: np.ndarray, …
+def indices_to_app_counts(indices: np.ndarray, tables_app_enum: dict[int, str]) -> pd.Series:
     app_counts = pd.Series(indices[:, INDICES_LABEL_POS]).value_counts()
-    app_counts.index = app_counts.index.map(lambda x: …
+    app_counts.index = app_counts.index.map(lambda x: tables_app_enum[x])
     return app_counts
 
-def compute_known_app_counts(dataset_indices: IndicesTuple, …
-    train_app_counts = indices_to_app_counts(dataset_indices.train_indices, …
-    val_known_app_counts = indices_to_app_counts(dataset_indices.val_known_indices, …
-    test_known_app_counts = indices_to_app_counts(dataset_indices.test_known_indices, …
+def compute_known_app_counts(dataset_indices: IndicesTuple, tables_app_enum: dict[int, str]) -> pd.DataFrame:
+    train_app_counts = indices_to_app_counts(dataset_indices.train_indices, tables_app_enum)
+    val_known_app_counts = indices_to_app_counts(dataset_indices.val_known_indices, tables_app_enum)
+    test_known_app_counts = indices_to_app_counts(dataset_indices.test_known_indices, tables_app_enum)
     df = pd.DataFrame(data={"Train": train_app_counts, "Validation": val_known_app_counts, "Test": test_known_app_counts}).fillna(0).astype("int64")
     return df
 
-def compute_unknown_app_counts(dataset_indices: IndicesTuple, …
-    val_unknown_app_counts = indices_to_app_counts(dataset_indices.val_unknown_indices, …
-    test_unknown_app_counts = indices_to_app_counts(dataset_indices.test_unknown_indices, …
+def compute_unknown_app_counts(dataset_indices: IndicesTuple, tables_app_enum: dict[int, str]) -> pd.DataFrame:
+    val_unknown_app_counts = indices_to_app_counts(dataset_indices.val_unknown_indices, tables_app_enum)
+    test_unknown_app_counts = indices_to_app_counts(dataset_indices.test_unknown_indices, tables_app_enum)
     df = pd.DataFrame(data={"Validation": val_unknown_app_counts, "Test": test_unknown_app_counts}).fillna(0).astype("int64")
     return df
 
-def init_or_load_train_indices(dataset_config: DatasetConfig, servicemap: pd.DataFrame, disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, …
+def init_or_load_train_indices(dataset_config: DatasetConfig, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
     train_data_path = dataset_config._get_train_data_path()
     init_train_data(train_data_path)
     if not os.path.isfile(os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE)):
         log.info("Processing train indices")
         train_data_params = dataset_config._get_train_data_params()
-        train_known_indices, train_unknown_indices, …
-        …
-        …
-        …
-        …
-        encoder = LabelEncoder().fit(list(known_apps_database_enum.values()))
-        encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
+        train_known_indices, train_unknown_indices, known_apps, unknown_apps = init_train_indices(train_data_params=train_data_params,
+                                                                                                  database_path=dataset_config.database_path,
+                                                                                                  tables_app_enum=tables_app_enum,
+                                                                                                  servicemap=servicemap,
+                                                                                                  rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TRAIN_INDICES))
         if not disable_indices_cache:
             yaml_dump({k: str(v) if isinstance(v, Enum) else list(v) if isinstance(v, tuple) else v for k, v in dataclasses.asdict(train_data_params).items()}, os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE))
-            yaml_dump(…
-            yaml_dump(…
-            pickle_dump(encoder, os.path.join(train_data_path, "encoder.pickle"))
+            yaml_dump(known_apps, os.path.join(train_data_path, "known_apps.yaml"))
+            yaml_dump(unknown_apps, os.path.join(train_data_path, "unknown_apps.yaml"))
             np.save(os.path.join(train_data_path, "train_known_indices.npy"), train_known_indices)
             np.save(os.path.join(train_data_path, "train_unknown_indices.npy"), train_unknown_indices)
     else:
-        …
-        …
-        encoder = pickle_load(os.path.join(train_data_path, "encoder.pickle"))
+        known_apps = yaml_load(os.path.join(train_data_path, "known_apps.yaml"))
+        unknown_apps = yaml_load(os.path.join(train_data_path, "unknown_apps.yaml"))
         train_known_indices = np.load(os.path.join(train_data_path, "train_known_indices.npy"))
        train_unknown_indices = np.load(os.path.join(train_data_path, "train_unknown_indices.npy"))
-    return train_known_indices, train_unknown_indices, …
+    return train_known_indices, train_unknown_indices, known_apps, unknown_apps
 
-def init_or_load_val_indices(dataset_config: DatasetConfig, …
-    val_data_params, val_data_path = dataset_config._get_val_data_params_and_path(…
+def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
+    val_data_params, val_data_path = dataset_config._get_val_data_params_and_path(known_apps=known_apps, unknown_apps=unknown_apps)
     init_test_data(val_data_path)
     if not os.path.isfile(os.path.join(val_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing validation indices")
         val_known_indices, val_unknown_indices = init_test_indices(test_data_params=val_data_params,
                                                                    database_path=dataset_config.database_path,
+                                                                   tables_app_enum=tables_app_enum,
                                                                    rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_VAL_INIDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(val_data_params), os.path.join(val_data_path, TEST_DATA_PARAMS_FILE))
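The pickled `LabelEncoder` is gone; the known/unknown application lists are now plain YAML. If an encoder with the old semantics is still needed, it can be rebuilt from `known_apps.yaml` the same way the removed lines did; a sketch, where the exact value of `UNKNOWN_STR_LABEL` is an assumption:

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

UNKNOWN_STR_LABEL = "unknown"  # assumed value of the constant
known_apps = ["facebook-web", "google-www", "spotify"]  # as loaded from known_apps.yaml

encoder = LabelEncoder().fit(known_apps)
encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)  # unknown = last class
print(encoder.transform(["spotify", "unknown"]))  # [2 3]
```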
@@ -141,13 +136,14 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps_database…
     val_unknown_indices = np.load(os.path.join(val_data_path, "val_unknown_indices.npy"))
     return val_known_indices, val_unknown_indices, val_data_path
 
-def init_or_load_test_indices(dataset_config: DatasetConfig, …
-    test_data_params, test_data_path = dataset_config._get_test_data_params_and_path(…
+def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
+    test_data_params, test_data_path = dataset_config._get_test_data_params_and_path(known_apps=known_apps, unknown_apps=unknown_apps)
     init_test_data(test_data_path)
     if not os.path.isfile(os.path.join(test_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing test indices")
         test_known_indices, test_unknown_indices = init_test_indices(test_data_params=test_data_params,
                                                                      database_path=dataset_config.database_path,
+                                                                     tables_app_enum=tables_app_enum,
                                                                      rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TEST_INDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(test_data_params), os.path.join(test_data_path, TEST_DATA_PARAMS_FILE))
@@ -160,7 +156,7 @@ def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps_database…
 
 def init_train_data(train_data_path: str):
     os.makedirs(train_data_path, exist_ok=True)
-    os.makedirs(os.path.join(train_data_path, "…
+    os.makedirs(os.path.join(train_data_path, "transforms"), exist_ok=True)
     os.makedirs(os.path.join(train_data_path, "preload"), exist_ok=True)
 
 def init_test_data(test_data_path: str):