cesnet-datazoo 0.0.16__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only. Removed lines that the registry's diff view truncated are marked with `…`.
- cesnet_datazoo/config.py +174 -167
- cesnet_datazoo/constants.py +4 -6
- cesnet_datazoo/datasets/cesnet_dataset.py +200 -172
- cesnet_datazoo/datasets/datasets.py +22 -2
- cesnet_datazoo/datasets/datasets_constants.py +670 -0
- cesnet_datazoo/datasets/loaders.py +3 -0
- cesnet_datazoo/datasets/metadata/dataset_metadata.py +6 -5
- cesnet_datazoo/datasets/metadata/metadata.csv +4 -4
- cesnet_datazoo/datasets/statistics.py +36 -16
- cesnet_datazoo/pytables_data/data_scalers.py +110 -0
- cesnet_datazoo/pytables_data/indices_setup.py +29 -33
- cesnet_datazoo/pytables_data/pytables_dataset.py +103 -229
- cesnet_datazoo/utils/class_info.py +7 -5
- cesnet_datazoo/utils/download.py +6 -1
- {cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/METADATA +2 -1
- cesnet_datazoo-0.1.0.dist-info/RECORD +30 -0
- {cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/WHEEL +1 -1
- cesnet_datazoo-0.0.16.dist-info/RECORD +0 -28
- {cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/LICENCE +0 -0
- {cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/top_level.txt +0 -0
cesnet_datazoo/datasets/loaders.py:

```diff
@@ -6,6 +6,9 @@ from tqdm import tqdm
 from cesnet_datazoo.constants import APP_COLUMN
 
 
+def collate_fn_simple(batch):
+    return batch
+
 def load_from_dataloader(dataloader: DataLoader, silent: bool = False) -> tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
     other_fields = []
     data_ppi = []
```
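The new `collate_fn_simple` is an identity collate: the statistics loader (see the statistics.py hunks below) builds its `DataLoader` with `batch_size=None` and a `BatchSampler`, so each item the loader yields is already a fully assembled batch, and PyTorch's default collation (which would try to convert the structured NumPy batch to tensors) must be bypassed. A minimal sketch of the pattern; `ToyBatchedDataset` is hypothetical, not part of the package:

```python
# With batch_size=None and a BatchSampler, __getitem__ receives a whole list
# of indices and returns the whole batch, so the identity collate function
# just passes it through unchanged.
import numpy as np
from torch.utils.data import BatchSampler, DataLoader, Dataset, SequentialSampler

def collate_fn_simple(batch):
    return batch

class ToyBatchedDataset(Dataset):
    def __len__(self):
        return 100

    def __getitem__(self, batch_idx):       # a list of indices, not one index
        return np.asarray(batch_idx) * 2    # return the whole batch at once

sampler = BatchSampler(SequentialSampler(range(100)), batch_size=10, drop_last=False)
loader = DataLoader(ToyBatchedDataset(), batch_size=None, sampler=sampler,
                    collate_fn=collate_fn_simple)
print(next(iter(loader)))  # array([ 0,  2,  4, ..., 18])
```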
cesnet_datazoo/datasets/metadata/dataset_metadata.py:

```diff
@@ -18,18 +18,19 @@ class DatasetMetadata():
     collection_period: str
     missing_dates_in_collection_period: list[str]
     application_count: int
-    …
-    …
-    packet_histogram_features: list[str]
+    background_traffic_classes: list[str]
+    ppi_features: list[str]
     flowstats_features: list[str]
+    flowstats_features_boolean: list[str]
+    packet_histograms: list[str]
     tcp_features: list[str]
     other_fields: list[str]
     cite: str
     zenodo_url: str
     related_papers: list[str]
 
-    @field_validator("available_dataset_sizes", "missing_dates_in_collection_period", "…
-    …"
+    @field_validator("available_dataset_sizes", "missing_dates_in_collection_period", "background_traffic_classes", "ppi_features",
+                     "flowstats_features", "flowstats_features_boolean", "packet_histograms", "tcp_features", "other_fields", "related_papers", mode="before")
    @classmethod
     def parse_string_to_list(cls, v: str, info: ValidationInfo) -> list[str]:
         l = list(map(str.strip, v.split(","))) if v else []
```
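These fields are loaded from metadata.csv (next hunk), where list-valued columns are stored as comma-separated strings; the `mode="before"` validator converts them before pydantic type-checks the field. A cut-down sketch of the pattern (`ToyMetadata` is illustrative, not part of the package):

```python
from pydantic import BaseModel, field_validator

class ToyMetadata(BaseModel):
    ppi_features: list[str]

    # Runs before type validation, so the raw CSV cell (a string) is accepted
    @field_validator("ppi_features", mode="before")
    @classmethod
    def parse_string_to_list(cls, v: str) -> list[str]:
        return list(map(str.strip, v.split(","))) if v else []

print(ToyMetadata(ppi_features="IPT, DIR, SIZE").ppi_features)
# ['IPT', 'DIR', 'SIZE']
```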
cesnet_datazoo/datasets/metadata/metadata.csv:

```diff
@@ -1,4 +1,4 @@
-Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic,…
-CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE"…
-CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","…
-CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","…
+Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic classes,PPI features,Flowstats features,Flowstats features boolean,Packet histograms,TCP features,Other fields,Cite,Zenodo URL,Related papers
+CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION",,,"FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
+CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
+CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
```
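The new header adds one column per list-typed `DatasetMetadata` field. A possible way to inspect it with pandas (the path is a placeholder; `keep_default_na=False` keeps empty cells as empty strings instead of NaN):

```python
import pandas as pd

df = pd.read_csv("cesnet_datazoo/datasets/metadata/metadata.csv", keep_default_na=False)
quic = df[df["Name"] == "CESNET-QUIC22"].iloc[0]
# List-valued cells are comma-separated strings, as handled by parse_string_to_list
print([s.strip() for s in quic["Packet histograms"].split(",")])
# ['PHIST_SRC_SIZES', 'PHIST_DST_SIZES', 'PHIST_SRC_IPT', 'PHIST_DST_IPT']
```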
cesnet_datazoo/datasets/statistics.py:

```diff
@@ -1,4 +1,3 @@
-import logging
 import os
 from collections import Counter
 from typing import Any, Literal
@@ -12,8 +11,9 @@ from torch.utils.data import BatchSampler, DataLoader, SequentialSampler
 from tqdm import tqdm
 
 from cesnet_datazoo.config import Protocol
-from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN,…
-…
+from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, IPT_POS, PPI_COLUMN, SIZE_POS,
+                                      UDP_PPI_CHANNELS)
+from cesnet_datazoo.datasets.loaders import collate_fn_simple
 from cesnet_datazoo.pytables_data.indices_setup import sort_indices
 from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, list_all_tables,
                                                            load_database, worker_init_fn)
```
```diff
@@ -36,17 +36,25 @@ def pick_stats_fields(batch):
         batch[CATEGORY_COLUMN],
     )
 
-def pick_extra_fields(batch,…
+def pick_extra_fields(batch, packet_histograms: list[str], flow_endreason_features: list[str]):
     return (
         batch["DST_ASN"],
-        batch[…
-        batch[…
+        batch[packet_histograms],
+        batch[flow_endreason_features],
     )
 
-def …
-…
-…
-…
+def compute_dataset_statistics(database_path: str,
+                               tables_app_enum: dict[int, str],
+                               tables_cat_enum: dict[int, str],
+                               output_dir: str,
+                               packet_histograms: list[str],
+                               flowstats_features_boolean: list[str],
+                               protocol: Protocol, extra_fields: bool,
+                               disabled_apps: list[str],
+                               num_samples: int | Literal["all"] = 10_000_000,
+                               num_workers: int = 4,
+                               batch_size: int = 4096,
+                               silent: bool = False):
     stats_pdf_path = os.path.join(output_dir, "dataset-statistics.pdf")
     stats_csv_path = os.path.join(output_dir, "dataset-statistics.csv")
     categories_csv_path = os.path.join(output_dir, "categories.csv")
```
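For orientation, a hypothetical call of the new signature might look as follows; every value below (paths, enum contents) is a placeholder rather than something shipped with the package, and `Protocol.QUIC` is an assumed member of the `Protocol` enum imported above:

```python
from cesnet_datazoo.config import Protocol
from cesnet_datazoo.datasets.statistics import compute_dataset_statistics

compute_dataset_statistics(
    database_path="/data/CESNET-QUIC22/database.h5",   # placeholder path
    tables_app_enum={0: "app-a", 1: "app-b"},          # toy label enums
    tables_cat_enum={0: "category-a"},
    output_dir="/tmp/quic22-stats",
    packet_histograms=["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"],
    flowstats_features_boolean=["FLOW_ENDREASON_IDLE", "FLOW_ENDREASON_ACTIVE", "FLOW_ENDREASON_OTHER"],
    protocol=Protocol.QUIC,                            # assumed enum member
    extra_fields=True,
    disabled_apps=[],
    num_samples=1_000_000,
)
```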
```diff
@@ -74,10 +82,22 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
     feature_bytes_total = []
     packet_sizes_counter = Counter()
     ipt_counter = Counter()
+    flow_endreason_features = [f for f in flowstats_features_boolean if f.startswith("FLOW_ENDREASON")]
     if not silent:
         print(f"Reading data from {database_path} for statistics")
     table_paths = list_all_tables(database_path)
-    stats_dataset = PyTablesDataset(database_path=database_path,…
+    stats_dataset = PyTablesDataset(database_path=database_path,
+                                    tables_app_enum=tables_app_enum,
+                                    tables_cat_enum=tables_cat_enum,
+                                    tables_paths=table_paths,
+                                    indices=None,
+                                    disabled_apps=disabled_apps,
+                                    return_all_fields=True,
+                                    flowstats_features=[],
+                                    flowstats_features_boolean=[],
+                                    flowstats_features_phist=[],
+                                    other_fields=[],
+                                    ppi_channels=UDP_PPI_CHANNELS,)
     if num_samples != "all":
         subset_indices = np.random.randint(low=0, high=len(stats_dataset.indices), size=num_samples)
         stats_dataset.indices = sort_indices(stats_dataset.indices[subset_indices])
```
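The subsampling above draws indices with replacement (`np.random.randint`) and then re-sorts them with `sort_indices`, presumably so that the PyTables reads stay close to on-disk order. A plain NumPy analogy of that access pattern:

```python
# Sketch only: sort_indices operates on the dataset's structured index array,
# but the idea is the same as sorting a random subset of row numbers so that
# HDF5 rows are read sequentially instead of in random-access order.
import numpy as np

all_rows = np.arange(1_000_000)          # stand-in for stats_dataset.indices
subset = all_rows[np.random.randint(low=0, high=len(all_rows), size=10_000)]
subset.sort()                            # sequential reads are far cheaper
```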
```diff
@@ -87,7 +107,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
                                    pin_memory=False,
                                    num_workers=num_workers,
                                    worker_init_fn=worker_init_fn,
-                                   collate_fn=…
+                                   collate_fn=collate_fn_simple,
                                    persistent_workers=False,
                                    batch_size=None,
                                    sampler=stats_batch_sampler)
```
```diff
@@ -116,10 +136,10 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
             quic_ua_series = quic_ua_series.add(pd.Series(user_agent).str.decode("utf-8").value_counts(), fill_value=0)
             quic_version_series = quic_version_series.add(pd.Series(quic_version).value_counts(), fill_value=0)
         if extra_fields:
-            asn, phist, flowend_reason = pick_extra_fields(batch,…
+            asn, phist, flowend_reason = pick_extra_fields(batch, packet_histograms=packet_histograms, flow_endreason_features=flow_endreason_features)
             asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
             flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
-            df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=…
+            df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=packet_histograms)
             df_phist = df_phist.add(df2, fill_value=0)
     feature_duration = np.concatenate(feature_duration)
     feature_packets_total = np.concatenate(feature_packets_total)
```
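The `np.split(..., 4)` line packs the four packet histograms, which are summed over the batch as one flat vector, into a bins-by-histograms DataFrame. A toy illustration of that reshaping:

```python
# 4 histograms x 3 bins, flattened; np.split yields one array per histogram,
# and zip(*...) transposes them so each DataFrame row is one bin.
import numpy as np
import pandas as pd

phist_sums = np.arange(12)  # toy values
columns = ["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"]
df2 = pd.DataFrame(data=zip(*np.split(phist_sums, 4)), columns=columns)
print(df2)  # 3 rows (bins) x 4 columns (histograms)
```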
```diff
@@ -183,11 +203,11 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
         flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
         flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
         flow_endreason_df.index.name = "FLOW ENDREASON"
-        flow_endreason_df.index = pd.Index(…
+        flow_endreason_df.index = pd.Index(flow_endreason_features)
         flow_endreason_df.to_csv(flow_endreason_path)
         # PHIST output
         df_phist.index.name = "BINS"
-        df_phist.columns = list(map(lambda x: x.upper().replace("_", " "),…
+        df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), packet_histograms))
         df_phist = df_phist.astype("int64")
         for i, column in zip((1, 3, 5, 7), df_phist.columns):
             df_phist.insert(i, column + " PERC", (df_phist[column] / df_phist[column].sum() * 100).round(3))
```
cesnet_datazoo/pytables_data/data_scalers.py (new file):

```diff
@@ -0,0 +1,110 @@
+import json
+import logging
+import os
+import time
+import warnings
+
+import numpy as np
+from cesnet_models.transforms import ClipAndScaleFlowstats, ClipAndScalePPI
+from numpy.lib.recfunctions import structured_to_unstructured
+from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
+
+from cesnet_datazoo.config import DatasetConfig
+from cesnet_datazoo.constants import DIR_POS, FLOWSTATS_NO_CLIP, IPT_POS, PPI_COLUMN, SIZE_POS
+from cesnet_datazoo.pytables_data.pytables_dataset import load_data_from_tables, load_database
+from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
+
+log = logging.getLogger(__name__)
+
+
+def get_scaler_attrs(scaler: StandardScaler | RobustScaler | MinMaxScaler) -> dict[str, list[float]]:
+    if isinstance(scaler, StandardScaler):
+        assert hasattr(scaler, "mean_") and scaler.mean_ is not None and hasattr(scaler, "scale_") and scaler.scale_ is not None
+        scaler_attrs = {"mean_": scaler.mean_.tolist(), "scale_": scaler.scale_.tolist()}
+    elif isinstance(scaler, RobustScaler):
+        assert hasattr(scaler, "center_") and hasattr(scaler, "scale_")
+        scaler_attrs = {"center_": scaler.center_.tolist(), "scale_": scaler.scale_.tolist()}
+    elif isinstance(scaler, MinMaxScaler):
+        assert hasattr(scaler, "min_") and hasattr(scaler, "scale_")
+        scaler_attrs = {"min_": scaler.min_.tolist(), "scale_": scaler.scale_.tolist()}
+    return scaler_attrs
+
+def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> None:
+    # Define indices for fitting scalers
+    if isinstance(dataset_config.fit_scalers_samples, int) and dataset_config.fit_scalers_samples > len(train_indices):
+        warnings.warn(f"The number of samples for fitting scalers ({dataset_config.fit_scalers_samples}) is larger than the number of train samples ({len(train_indices)}), using the number of train samples instead")
+        dataset_config.fit_scalers_samples = len(train_indices)
+    fit_scalers_rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.FIT_SCALERS_SAMPLE)
+    if isinstance(dataset_config.fit_scalers_samples, float):
+        num_samples = int(dataset_config.fit_scalers_samples * len(train_indices))
+    else:
+        num_samples = dataset_config.fit_scalers_samples
+    fit_scalers_indices = train_indices[fit_scalers_rng.choice(len(train_indices), size=num_samples, replace=False)]
+    # Load data
+    start_time = time.time()
+    database, tables = load_database(dataset_config.database_path, tables_paths=dataset_config._get_train_tables_paths())
+    data = load_data_from_tables(tables=tables, indices=fit_scalers_indices, data_dtype=tables[0].dtype)
+    database.close()
+
+    clip_and_scale_ppi_transform = dataset_config.ppi_transform # TODO Fix after transforms composing is implemented
+    clip_and_scale_flowstats_transform = dataset_config.flowstats_transform
+
+    # Fit the ClipAndScalePPI transform
+    if clip_and_scale_ppi_transform is not None and clip_and_scale_ppi_transform.needs_fitting:
+        assert isinstance(clip_and_scale_ppi_transform, ClipAndScalePPI)
+        data_ppi = data[PPI_COLUMN].astype("float32")
+        ppi_channels = data_ppi.shape[1]
+        data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
+        padding_mask = data_ppi[:, DIR_POS] == 0 # Mask of padded packets
+        # Fit IPT scaler
+        train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
+        train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
+        if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
+            # Let zero be the minimum for minmax scaling
+            train_ipt = np.concatenate((train_ipt, [0]))
+        clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
+        # Fit packet sizes scaler
+        train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
+        train_psizes[padding_mask] = np.nan
+        if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
+            train_psizes = np.concatenate((train_psizes, [0]))
+        clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
+        clip_and_scale_ppi_transform.needs_fitting = False
+
+    # Fit the ClipAndScaleFlowstats transform
+    if clip_and_scale_flowstats_transform is not None and clip_and_scale_flowstats_transform.needs_fitting:
+        assert isinstance(clip_and_scale_flowstats_transform, ClipAndScaleFlowstats)
+        train_flowstats = structured_to_unstructured(data[dataset_config.flowstats_features])
+        flowstats_quantiles = np.quantile(train_flowstats, q=clip_and_scale_flowstats_transform.quantile_clip, axis=0)
+        idx_no_clip = [dataset_config.flowstats_features.index(f) for f in FLOWSTATS_NO_CLIP]
+        flowstats_quantiles[idx_no_clip] = np.inf # Disable clipping for features with "fixed" range
+        train_flowstats = train_flowstats.clip(max=flowstats_quantiles)
+        clip_and_scale_flowstats_transform.flowstats_scaler.fit(train_flowstats)
+        clip_and_scale_flowstats_transform.flowstats_quantiles = flowstats_quantiles.tolist()
+        clip_and_scale_flowstats_transform.needs_fitting = False
+
+    log.info(f"Reading data and fitting scalers took {time.time() - start_time:.2f} seconds")
+    train_data_path = dataset_config._get_train_data_path()
+    if clip_and_scale_ppi_transform is not None:
+        ppi_transform_path = os.path.join(train_data_path, "transforms", "ppi-transform.json")
+        ppi_transform_dict = {
+            "psizes_scaler_enum": str(clip_and_scale_ppi_transform._psizes_scaler_enum),
+            "psizes_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.psizes_scaler),
+            "pszies_min": clip_and_scale_ppi_transform.pszies_min,
+            "psizes_max": clip_and_scale_ppi_transform.psizes_max,
+            "ipt_scaler_enum": str(clip_and_scale_ppi_transform._ipt_scaler_enum),
+            "ipt_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.ipt_scaler),
+            "ipt_min": clip_and_scale_ppi_transform.ipt_min,
+            "ipt_max": clip_and_scale_ppi_transform.ipt_max,
+        }
+        json.dump(ppi_transform_dict, open(ppi_transform_path, "w"), indent=4)
+    if clip_and_scale_flowstats_transform is not None:
+        assert clip_and_scale_flowstats_transform.flowstats_quantiles is not None
+        flowstats_transform_path = os.path.join(train_data_path, "transforms", "flowstats-transform.json")
+        flowstats_transform_dict = {
+            "flowstats_scaler_enum": str(clip_and_scale_flowstats_transform._flowstats_scaler_enum),
+            "flowstats_scaler_attrs": get_scaler_attrs(clip_and_scale_flowstats_transform.flowstats_scaler),
+            "flowstats_quantiles": clip_and_scale_flowstats_transform.flowstats_quantiles,
+            "quantile_clip": clip_and_scale_flowstats_transform.quantile_clip,
+        }
+        json.dump(flowstats_transform_dict, open(flowstats_transform_path, "w"), indent=4)
```
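`get_scaler_attrs` serializes just the fitted statistics of each scaler to JSON. A minimal sketch of the reverse direction for the `StandardScaler` case, with toy numbers; depending on the sklearn version, a full restore may also need attributes such as `var_` or `n_features_in_`, so the standardization is computed manually here instead of calling `transform()`:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

attrs = {"mean_": [120.5, 0.8], "scale_": [340.2, 0.4]}  # toy values

scaler = StandardScaler()
scaler.mean_ = np.asarray(attrs["mean_"])    # restore fitted statistics
scaler.scale_ = np.asarray(attrs["scale_"])

x = np.array([[100.0, 1.0]])
print((x - scaler.mean_) / scaler.scale_)    # what transform() would compute
```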
cesnet_datazoo/pytables_data/indices_setup.py:

```diff
@@ -7,13 +7,11 @@ from enum import Enum
 
 import numpy as np
 import pandas as pd
-from sklearn.preprocessing import LabelEncoder
 
 from cesnet_datazoo.config import DatasetConfig
-from cesnet_datazoo.constants import …
-                                     UNKNOWN_STR_LABEL)
+from cesnet_datazoo.constants import INDICES_INDEX_POS, INDICES_LABEL_POS, INDICES_TABLE_POS
 from cesnet_datazoo.pytables_data.pytables_dataset import init_test_indices, init_train_indices
-from cesnet_datazoo.utils.fileutils import …
+from cesnet_datazoo.utils.fileutils import yaml_dump, yaml_load
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
 
 log = logging.getLogger(__name__)
```
```diff
@@ -78,59 +76,56 @@ def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indice
     sampled_train_indices = np.concatenate(sampled_indicies_per_date)
     return sampled_train_indices
 
-def indices_to_app_counts(indices: np.ndarray,…
+def indices_to_app_counts(indices: np.ndarray, tables_app_enum: dict[int, str]) -> pd.Series:
     app_counts = pd.Series(indices[:, INDICES_LABEL_POS]).value_counts()
-    app_counts.index = app_counts.index.map(lambda x:…
+    app_counts.index = app_counts.index.map(lambda x: tables_app_enum[x])
     return app_counts
 
-def compute_known_app_counts(dataset_indices: IndicesTuple,…
-    train_app_counts = indices_to_app_counts(dataset_indices.train_indices,…
-    val_known_app_counts = indices_to_app_counts(dataset_indices.val_known_indices,…
-    test_known_app_counts = indices_to_app_counts(dataset_indices.test_known_indices,…
+def compute_known_app_counts(dataset_indices: IndicesTuple, tables_app_enum: dict[int, str]) -> pd.DataFrame:
+    train_app_counts = indices_to_app_counts(dataset_indices.train_indices, tables_app_enum)
+    val_known_app_counts = indices_to_app_counts(dataset_indices.val_known_indices, tables_app_enum)
+    test_known_app_counts = indices_to_app_counts(dataset_indices.test_known_indices, tables_app_enum)
     df = pd.DataFrame(data={"Train": train_app_counts, "Validation": val_known_app_counts, "Test": test_known_app_counts}).fillna(0).astype("int64")
     return df
 
-def compute_unknown_app_counts(dataset_indices: IndicesTuple,…
-    val_unknown_app_counts = indices_to_app_counts(dataset_indices.val_unknown_indices,…
-    test_unknown_app_counts = indices_to_app_counts(dataset_indices.test_unknown_indices,…
+def compute_unknown_app_counts(dataset_indices: IndicesTuple, tables_app_enum: dict[int, str]) -> pd.DataFrame:
+    val_unknown_app_counts = indices_to_app_counts(dataset_indices.val_unknown_indices, tables_app_enum)
+    test_unknown_app_counts = indices_to_app_counts(dataset_indices.test_unknown_indices, tables_app_enum)
     df = pd.DataFrame(data={"Validation": val_unknown_app_counts, "Test": test_unknown_app_counts}).fillna(0).astype("int64")
     return df
 
-def init_or_load_train_indices(dataset_config: DatasetConfig, servicemap: pd.DataFrame, disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray,…
+def init_or_load_train_indices(dataset_config: DatasetConfig, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
     train_data_path = dataset_config._get_train_data_path()
     init_train_data(train_data_path)
     if not os.path.isfile(os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE)):
         log.info("Processing train indices")
         train_data_params = dataset_config._get_train_data_params()
-        train_known_indices, train_unknown_indices,…
-        …
-        …
-        …
-        …
-        encoder = LabelEncoder().fit(list(known_apps_database_enum.values()))
-        encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
+        train_known_indices, train_unknown_indices, known_apps, unknown_apps = init_train_indices(train_data_params=train_data_params,
+                                                                                                  database_path=dataset_config.database_path,
+                                                                                                  tables_app_enum=tables_app_enum,
+                                                                                                  servicemap=servicemap,
+                                                                                                  rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TRAIN_INDICES))
         if not disable_indices_cache:
             yaml_dump({k: str(v) if isinstance(v, Enum) else list(v) if isinstance(v, tuple) else v for k, v in dataclasses.asdict(train_data_params).items()}, os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE))
-            yaml_dump(…
-            yaml_dump(…
-            pickle_dump(encoder, os.path.join(train_data_path, "encoder.pickle"))
+            yaml_dump(known_apps, os.path.join(train_data_path, "known_apps.yaml"))
+            yaml_dump(unknown_apps, os.path.join(train_data_path, "unknown_apps.yaml"))
             np.save(os.path.join(train_data_path, "train_known_indices.npy"), train_known_indices)
             np.save(os.path.join(train_data_path, "train_unknown_indices.npy"), train_unknown_indices)
     else:
-        …
-        …
-        encoder = pickle_load(os.path.join(train_data_path, "encoder.pickle"))
+        known_apps = yaml_load(os.path.join(train_data_path, "known_apps.yaml"))
+        unknown_apps = yaml_load(os.path.join(train_data_path, "unknown_apps.yaml"))
         train_known_indices = np.load(os.path.join(train_data_path, "train_known_indices.npy"))
         train_unknown_indices = np.load(os.path.join(train_data_path, "train_unknown_indices.npy"))
-    return train_known_indices, train_unknown_indices,…
+    return train_known_indices, train_unknown_indices, known_apps, unknown_apps
 
-def init_or_load_val_indices(dataset_config: DatasetConfig,…
-    val_data_params, val_data_path = dataset_config._get_val_data_params_and_path(…
+def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
+    val_data_params, val_data_path = dataset_config._get_val_data_params_and_path(known_apps=known_apps, unknown_apps=unknown_apps)
     init_test_data(val_data_path)
     if not os.path.isfile(os.path.join(val_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing validation indices")
         val_known_indices, val_unknown_indices = init_test_indices(test_data_params=val_data_params,
                                                                    database_path=dataset_config.database_path,
+                                                                   tables_app_enum=tables_app_enum,
                                                                    rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_VAL_INIDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(val_data_params), os.path.join(val_data_path, TEST_DATA_PARAMS_FILE))
```
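A toy illustration of the indices layout consumed by `indices_to_app_counts`: each row carries a table reference, a row index, and an encoded application label, with the positions given by the `INDICES_TABLE_POS`, `INDICES_INDEX_POS`, and `INDICES_LABEL_POS` constants imported above (the exact column order here is an assumption):

```python
import numpy as np
import pandas as pd

INDICES_LABEL_POS = 2  # assumed column order: (table, row, label)
indices = np.array([[0, 10, 1],
                    [0, 11, 0],
                    [1,  5, 1]])
tables_app_enum = {0: "app-a", 1: "app-b"}  # toy label enum

# Same two lines as in the hunk above: count encoded labels, then map them
# to application names via tables_app_enum.
app_counts = pd.Series(indices[:, INDICES_LABEL_POS]).value_counts()
app_counts.index = app_counts.index.map(lambda x: tables_app_enum[x])
print(app_counts)  # app-b 2, app-a 1
```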
```diff
@@ -141,13 +136,14 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps_database_
         val_unknown_indices = np.load(os.path.join(val_data_path, "val_unknown_indices.npy"))
     return val_known_indices, val_unknown_indices, val_data_path
 
-def init_or_load_test_indices(dataset_config: DatasetConfig,…
-    test_data_params, test_data_path = dataset_config._get_test_data_params_and_path(…
+def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
+    test_data_params, test_data_path = dataset_config._get_test_data_params_and_path(known_apps=known_apps, unknown_apps=unknown_apps)
     init_test_data(test_data_path)
     if not os.path.isfile(os.path.join(test_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing test indices")
         test_known_indices, test_unknown_indices = init_test_indices(test_data_params=test_data_params,
                                                                      database_path=dataset_config.database_path,
+                                                                     tables_app_enum=tables_app_enum,
                                                                      rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TEST_INDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(test_data_params), os.path.join(test_data_path, TEST_DATA_PARAMS_FILE))
```
```diff
@@ -160,7 +156,7 @@ def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps_database
 
 def init_train_data(train_data_path: str):
     os.makedirs(train_data_path, exist_ok=True)
-    os.makedirs(os.path.join(train_data_path, "…
+    os.makedirs(os.path.join(train_data_path, "transforms"), exist_ok=True)
     os.makedirs(os.path.join(train_data_path, "preload"), exist_ok=True)
 
 def init_test_data(test_data_path: str):
```