cesnet-datazoo 0.0.17__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,9 @@ from tqdm import tqdm
 from cesnet_datazoo.constants import APP_COLUMN


+def collate_fn_simple(batch):
+    return batch
+
 def load_from_dataloader(dataloader: DataLoader, silent: bool = False) -> tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
     other_fields = []
     data_ppi = []
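The new `collate_fn_simple` is an identity collate function: it hands each item through unchanged, which matters when the dataset already yields fully formed numpy batches and PyTorch's default collation (stacking into tensors) would get in the way. A minimal sketch of the wiring, with a hypothetical stand-in dataset since only the `DataLoader` arguments come from this diff:

```python
import numpy as np
from torch.utils.data import DataLoader, Dataset

# Hypothetical stand-in for PyTablesDataset: each index yields a whole numpy batch.
class BatchedDataset(Dataset):
    def __len__(self):
        return 10

    def __getitem__(self, idx):
        return np.random.rand(4096, 30)  # one pre-built batch

def collate_fn_simple(batch):
    return batch  # identity: keep the batch exactly as the dataset produced it

# batch_size=None disables automatic batching, mirroring the stats loader in this diff;
# the identity collate then prevents the default numpy-to-tensor conversion.
loader = DataLoader(BatchedDataset(), batch_size=None, collate_fn=collate_fn_simple)
for batch in loader:
    print(batch.shape)  # (4096, 30), still a numpy array
```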
@@ -18,18 +18,19 @@ class DatasetMetadata():
     collection_period: str
     missing_dates_in_collection_period: list[str]
     application_count: int
-    background_traffic: list[str]
-    features_in_packet_sequences: list[str]
-    packet_histogram_features: list[str]
+    background_traffic_classes: list[str]
+    ppi_features: list[str]
     flowstats_features: list[str]
+    flowstats_features_boolean: list[str]
+    packet_histograms: list[str]
     tcp_features: list[str]
     other_fields: list[str]
     cite: str
     zenodo_url: str
     related_papers: list[str]

-    @field_validator("available_dataset_sizes", "missing_dates_in_collection_period", "background_traffic", "features_in_packet_sequences",
-                     "packet_histogram_features", "flowstats_features", "tcp_features", "other_fields", "related_papers", mode="before")
+    @field_validator("available_dataset_sizes", "missing_dates_in_collection_period", "background_traffic_classes", "ppi_features",
+                     "flowstats_features", "flowstats_features_boolean", "packet_histograms", "tcp_features", "other_fields", "related_papers", mode="before")
     @classmethod
     def parse_string_to_list(cls, v: str, info: ValidationInfo) -> list[str]:
         l = list(map(str.strip, v.split(","))) if v else []
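`parse_string_to_list` is a pydantic `mode="before"` validator, so it runs on the raw CSV cell before the `list[str]` annotation is checked: a comma-separated string like `"IPT, DIR, SIZE"` becomes `["IPT", "DIR", "SIZE"]` and an empty cell becomes `[]`. A self-contained sketch of the pattern (a toy model, not the real `DatasetMetadata`):

```python
from pydantic import BaseModel, ValidationInfo, field_validator

class ToyMetadata(BaseModel):
    ppi_features: list[str]
    packet_histograms: list[str]

    @field_validator("ppi_features", "packet_histograms", mode="before")
    @classmethod
    def parse_string_to_list(cls, v: str, info: ValidationInfo) -> list[str]:
        # Split on commas and strip whitespace; empty cells map to empty lists
        return list(map(str.strip, v.split(","))) if v else []

m = ToyMetadata(ppi_features="IPT, DIR, SIZE", packet_histograms="")
print(m.ppi_features)       # ['IPT', 'DIR', 'SIZE']
print(m.packet_histograms)  # []
```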
@@ -1,4 +1,4 @@
-Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic,Features in packet sequences,Packet histogram features,Flowstats features,TCP features,Other fields,Cite,Zenodo URL,Related papers
-CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE",,"BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
-CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
-CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
+Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic classes,PPI features,Flowstats features,Flowstats features boolean,Packet histograms,TCP features,Other fields,Cite,Zenodo URL,Related papers
+CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION",,,"FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
+CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
+CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
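The renamed columns split the old "Flowstats features" into plain, boolean, and histogram groups. A hedged snippet showing how such a row could be read back (the CSV path is hypothetical; the package loads this file internally as package data):

```python
import pandas as pd

# Hypothetical path to the metadata CSV shown above
df = pd.read_csv("datasets_metadata.csv")
row = df.loc[df["Name"] == "CESNET-QUIC22"].squeeze()

# Each list-valued cell is a comma-separated string, as parsed by the validator above
split = lambda cell: [s.strip() for s in cell.split(",")] if isinstance(cell, str) else []
print(split(row["PPI features"]))                # ['IPT', 'DIR', 'SIZE']
print(split(row["Flowstats features boolean"]))  # ['FLOW_ENDREASON_IDLE', ...]
```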
@@ -1,4 +1,3 @@
-import logging
 import os
 from collections import Counter
 from typing import Any, Literal
@@ -12,8 +11,9 @@ from torch.utils.data import BatchSampler, DataLoader, SequentialSampler
 from tqdm import tqdm

 from cesnet_datazoo.config import Protocol
-from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, FLOWEND_REASON_FEATURES, IPT_POS,
-                                      PHISTS_FEATURES, PPI_COLUMN, SIZE_POS)
+from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, IPT_POS, PPI_COLUMN, SIZE_POS,
+                                      UDP_PPI_CHANNELS)
+from cesnet_datazoo.datasets.loaders import collate_fn_simple
 from cesnet_datazoo.pytables_data.indices_setup import sort_indices
 from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, list_all_tables,
                                                            load_database, worker_init_fn)
@@ -36,17 +36,25 @@ def pick_stats_fields(batch):
         batch[CATEGORY_COLUMN],
     )

-def pick_extra_fields(batch, flowstats_features: list[str]):
+def pick_extra_fields(batch, packet_histograms: list[str], flow_endreason_features: list[str]):
     return (
         batch["DST_ASN"],
-        batch[PHISTS_FEATURES],
-        batch[[f for f in FLOWEND_REASON_FEATURES if f in flowstats_features]],
+        batch[packet_histograms],
+        batch[flow_endreason_features],
     )

-def simple_collate_fn(batch):
-    return batch
-
-def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, extra_fields: bool, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
+def compute_dataset_statistics(database_path: str,
+                               tables_app_enum: dict[int, str],
+                               tables_cat_enum: dict[int, str],
+                               output_dir: str,
+                               packet_histograms: list[str],
+                               flowstats_features_boolean: list[str],
+                               protocol: Protocol, extra_fields: bool,
+                               disabled_apps: list[str],
+                               num_samples: int | Literal["all"] = 10_000_000,
+                               num_workers: int = 4,
+                               batch_size: int = 4096,
+                               silent: bool = False):
     stats_pdf_path = os.path.join(output_dir, "dataset-statistics.pdf")
     stats_csv_path = os.path.join(output_dir, "dataset-statistics.csv")
     categories_csv_path = os.path.join(output_dir, "categories.csv")
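An illustrative call with the new signature; the paths and enum mappings are placeholders, and the import location of `compute_dataset_statistics` is not shown in this diff:

```python
from cesnet_datazoo.config import Protocol

tables_app_enum = {0: "background", 1: "google-www"}  # hypothetical label mapping
tables_cat_enum = {0: "default", 1: "search"}         # hypothetical category mapping

compute_dataset_statistics(
    database_path="/data/CESNET-QUIC22/database.h5",  # hypothetical path
    tables_app_enum=tables_app_enum,
    tables_cat_enum=tables_cat_enum,
    output_dir="stats-out",
    packet_histograms=["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"],
    flowstats_features_boolean=["FLOW_ENDREASON_IDLE", "FLOW_ENDREASON_ACTIVE", "FLOW_ENDREASON_OTHER"],
    protocol=Protocol.QUIC,  # assumes a QUIC member on the Protocol enum
    extra_fields=True,
    disabled_apps=[],
    num_samples=1_000_000,
)
```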
@@ -74,10 +82,22 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
     feature_bytes_total = []
     packet_sizes_counter = Counter()
     ipt_counter = Counter()
+    flow_endreason_features = [f for f in flowstats_features_boolean if f.startswith("FLOW_ENDREASON")]
     if not silent:
         print(f"Reading data from {database_path} for statistics")
     table_paths = list_all_tables(database_path)
-    stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=table_paths, flowstats_features=flowstats_features, disabled_apps=disabled_apps, indices=None, return_all_fields=True)
+    stats_dataset = PyTablesDataset(database_path=database_path,
+                                    tables_app_enum=tables_app_enum,
+                                    tables_cat_enum=tables_cat_enum,
+                                    tables_paths=table_paths,
+                                    indices=None,
+                                    disabled_apps=disabled_apps,
+                                    return_all_fields=True,
+                                    flowstats_features=[],
+                                    flowstats_features_boolean=[],
+                                    flowstats_features_phist=[],
+                                    other_fields=[],
+                                    ppi_channels=UDP_PPI_CHANNELS,)
     if num_samples != "all":
         subset_indices = np.random.randint(low=0, high=len(stats_dataset.indices), size=num_samples)
         stats_dataset.indices = sort_indices(stats_dataset.indices[subset_indices])
@@ -87,7 +107,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
                               pin_memory=False,
                               num_workers=num_workers,
                               worker_init_fn=worker_init_fn,
-                              collate_fn=simple_collate_fn,
+                              collate_fn=collate_fn_simple,
                               persistent_workers=False,
                               batch_size=None,
                               sampler=stats_batch_sampler)
@@ -116,10 +136,10 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
             quic_ua_series = quic_ua_series.add(pd.Series(user_agent).str.decode("utf-8").value_counts(), fill_value=0)
             quic_version_series = quic_version_series.add(pd.Series(quic_version).value_counts(), fill_value=0)
         if extra_fields:
-            asn, phist, flowend_reason = pick_extra_fields(batch, flowstats_features=flowstats_features)
+            asn, phist, flowend_reason = pick_extra_fields(batch, packet_histograms=packet_histograms, flow_endreason_features=flow_endreason_features)
             asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
             flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
-            df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=PHISTS_FEATURES)
+            df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=packet_histograms)
             df_phist = df_phist.add(df2, fill_value=0)
     feature_duration = np.concatenate(feature_duration)
     feature_packets_total = np.concatenate(feature_packets_total)
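The `np.split(..., 4)` line works because the structured PHIST columns concatenate four histograms with the same number of bins; summing over the batch and splitting into four recovers one column per histogram. A small numeric sketch under that assumption (the bin count here is made up):

```python
import numpy as np
import pandas as pd

packet_histograms = ["PHIST_SRC_SIZES", "PHIST_DST_SIZES", "PHIST_SRC_IPT", "PHIST_DST_IPT"]
bins = 8  # assumed bin count per histogram

# Toy batch: each row holds the 4 histograms of `bins` counts, already flattened
batch_phist = np.random.randint(0, 5, size=(100, 4 * bins))

# Sum counts over the batch, split back into the four histograms,
# and zip them into the columns of a bins x 4 table
summed = batch_phist.sum(axis=0)
df2 = pd.DataFrame(data=zip(*np.split(summed, 4)), columns=packet_histograms)
print(df2.shape)  # (8, 4)
```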
@@ -183,11 +203,11 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
     flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
     flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
     flow_endreason_df.index.name = "FLOW ENDREASON"
-    flow_endreason_df.index = pd.Index([f for f in FLOWEND_REASON_FEATURES if f in flowstats_features])
+    flow_endreason_df.index = pd.Index(flow_endreason_features)
     flow_endreason_df.to_csv(flow_endreason_path)
     # PHIST output
     df_phist.index.name = "BINS"
-    df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), PHISTS_FEATURES))
+    df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), packet_histograms))
     df_phist = df_phist.astype("int64")
     for i, column in zip((1, 3, 5, 7), df_phist.columns):
         df_phist.insert(i, column + " PERC", (df_phist[column] / df_phist[column].sum() * 100).round(3))
@@ -1,30 +1,23 @@
-from __future__ import annotations
-
+import json
 import logging
 import os
 import time
 import warnings
-from typing import TYPE_CHECKING

 import numpy as np
+from cesnet_models.transforms import ClipAndScaleFlowstats, ClipAndScalePPI
 from numpy.lib.recfunctions import structured_to_unstructured
 from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
-from typing_extensions import assert_never

-from cesnet_datazoo.config import DatasetConfig, Scaler, ScalerEnum
-from cesnet_datazoo.constants import (DIR_POS, FLOWSTATS_NO_CLIP, FLOWSTATS_TO_SCALE, IPT_POS,
-                                      PPI_COLUMN, SIZE_POS)
+from cesnet_datazoo.config import DatasetConfig
+from cesnet_datazoo.constants import DIR_POS, FLOWSTATS_NO_CLIP, IPT_POS, PPI_COLUMN, SIZE_POS
 from cesnet_datazoo.pytables_data.pytables_dataset import load_data_from_tables, load_database
-from cesnet_datazoo.utils.fileutils import pickle_dump, pickle_load
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator

-if TYPE_CHECKING:
-    from cesnet_datazoo.datasets.cesnet_dataset import CesnetDataset
-
 log = logging.getLogger(__name__)

-def get_scaler_attrs(scaler: Scaler) -> dict[str, list[float]]:
-    assert Scaler is not None
+
+def get_scaler_attrs(scaler: StandardScaler | RobustScaler | MinMaxScaler) -> dict[str, list[float]]:
     if isinstance(scaler, StandardScaler):
         assert hasattr(scaler, "mean_") and scaler.mean_ is not None and hasattr(scaler, "scale_") and scaler.scale_ is not None
         scaler_attrs = {"mean_": scaler.mean_.tolist(), "scale_": scaler.scale_.tolist()}
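`get_scaler_attrs` serializes only the fitted attributes of a scikit-learn scaler so they can be written to JSON. A quick sketch of the `StandardScaler` branch shown above:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(np.array([[1.0], [2.0], [3.0]]))
# Mirrors the StandardScaler branch: mean_ and scale_ become plain lists
attrs = {"mean_": scaler.mean_.tolist(), "scale_": scaler.scale_.tolist()}
print(attrs)  # {'mean_': [2.0], 'scale_': [0.816...]}
```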
@@ -36,97 +29,8 @@ def get_scaler_attrs(scaler: Scaler) -> dict[str, list[float]]:
         scaler_attrs = {"min_": scaler.min_.tolist(), "scale_": scaler.scale_.tolist()}
     return scaler_attrs

-def set_scaler_attrs(scaler: Scaler, scaler_attrs: dict[str, list[float]]):
-    assert Scaler is not None
-    if isinstance(scaler, StandardScaler):
-        assert "mean_" in scaler_attrs and "scale_" in scaler_attrs
-        scaler.mean_ = np.array(scaler_attrs["mean_"])
-        scaler.scale_ = np.array(scaler_attrs["scale_"])
-    elif isinstance(scaler, RobustScaler):
-        assert "center_" in scaler_attrs and "scale_" in scaler_attrs
-        scaler.center_ = np.array(scaler_attrs["center_"])
-        scaler.scale_ = np.array(scaler_attrs["scale_"])
-    elif isinstance(scaler, MinMaxScaler):
-        assert "min_" in scaler_attrs and "scale_" in scaler_attrs
-        scaler.min_ = np.array(scaler_attrs["min_"])
-        scaler.scale_ = np.array(scaler_attrs["scale_"])
-
-def save_scalers_attrs_as_dict(dataset: CesnetDataset) -> dict:
-    assert dataset.flowstats_scaler is not None or dataset.psizes_scaler is not None or dataset.ipt_scaler is not None
-    scalers_dict = {}
-    if dataset.flowstats_scaler is not None:
-        scalers_dict["flowstats_scaler_attrs"] = get_scaler_attrs(dataset.flowstats_scaler)
-    if dataset.psizes_scaler is not None:
-        scalers_dict["psizes_scaler_attrs"] = get_scaler_attrs(dataset.psizes_scaler)
-    if dataset.ipt_scaler is not None:
-        scalers_dict["ipt_scaler_attrs"] = get_scaler_attrs(dataset.ipt_scaler)
-    assert dataset.flowstats_quantiles is not None
-    scalers_dict["flowstats_quantiles"] = dataset.flowstats_quantiles.tolist()
-    return scalers_dict
-
-def fit_or_load_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> tuple[Scaler, Scaler, Scaler, np.ndarray]:
-    # Load the scalers from pickled files if scalers_attrs are not provided
-    if dataset_config.scalers_attrs is None:
-        train_data_path = dataset_config._get_train_data_path()
-        flowstats_scaler_path = os.path.join(train_data_path, "stand", f"flowstats_scaler-{dataset_config.flowstats_scaler}-q{dataset_config.flowstats_clip}.pickle")
-        psizes_sizes_scaler_path = os.path.join(train_data_path, "stand", f"psizes_scaler-{dataset_config.psizes_scaler}-psizes_max{dataset_config.psizes_max}.pickle")
-        ipt_scaler_path = os.path.join(train_data_path, "stand", f"ipt_scaler-{dataset_config.ipt_scaler}-ipt_min{dataset_config.ipt_min}-ipt_max{dataset_config.ipt_max}.pickle")
-        flowstats_quantiles_path = os.path.join(train_data_path, "stand", f"flowstats_quantiles-q{dataset_config.flowstats_clip}.pickle")
-        if os.path.isfile(flowstats_scaler_path) and os.path.isfile(flowstats_quantiles_path) and os.path.isfile(ipt_scaler_path) and os.path.isfile(psizes_sizes_scaler_path):
-            flowstats_scaler = pickle_load(flowstats_scaler_path)
-            psizes_scaler = pickle_load(psizes_sizes_scaler_path)
-            ipt_scaler = pickle_load(ipt_scaler_path)
-            flowstats_quantiles = pickle_load(flowstats_quantiles_path)
-            return flowstats_scaler, psizes_scaler, ipt_scaler, flowstats_quantiles
-    # Initialize the scalers classes based on the config
-    if dataset_config.flowstats_scaler == ScalerEnum.ROBUST:
-        flowstats_scaler = RobustScaler()
-    elif dataset_config.flowstats_scaler == ScalerEnum.STANDARD:
-        flowstats_scaler = StandardScaler()
-    elif dataset_config.flowstats_scaler == ScalerEnum.MINMAX:
-        flowstats_scaler = MinMaxScaler()
-    elif dataset_config.flowstats_scaler == ScalerEnum.NO_SCALER:
-        flowstats_scaler = None
-    else: assert_never(dataset_config.flowstats_scaler)
-    if dataset_config.ipt_scaler == ScalerEnum.ROBUST:
-        ipt_scaler = RobustScaler()
-    elif dataset_config.ipt_scaler == ScalerEnum.STANDARD:
-        ipt_scaler = StandardScaler()
-    elif dataset_config.ipt_scaler == ScalerEnum.MINMAX:
-        ipt_scaler = MinMaxScaler()
-    elif dataset_config.ipt_scaler == ScalerEnum.NO_SCALER:
-        ipt_scaler = None
-    else: assert_never(dataset_config.ipt_scaler)
-    if dataset_config.psizes_scaler == ScalerEnum.ROBUST:
-        psizes_scaler = RobustScaler()
-    elif dataset_config.psizes_scaler == ScalerEnum.STANDARD:
-        psizes_scaler = StandardScaler()
-    elif dataset_config.psizes_scaler == ScalerEnum.MINMAX:
-        psizes_scaler = MinMaxScaler()
-    elif dataset_config.psizes_scaler == ScalerEnum.NO_SCALER:
-        psizes_scaler = None
-    else: assert_never(dataset_config.psizes_scaler)
-    # Load scalers learned attributes from config if provided
-    if dataset_config.scalers_attrs is not None:
-        if "flowstats_scaler_attrs" in dataset_config.scalers_attrs:
-            if flowstats_scaler is not None:
-                set_scaler_attrs(flowstats_scaler, dataset_config.scalers_attrs["flowstats_scaler_attrs"])
-            else:
-                warnings.warn("Ignoring flowstats_scaler_attrs because flowstats_scaler is None")
-        if "psizes_scaler_attrs" in dataset_config.scalers_attrs:
-            if psizes_scaler is not None:
-                set_scaler_attrs(psizes_scaler, dataset_config.scalers_attrs["psizes_scaler_attrs"])
-            else:
-                warnings.warn("Ignoring psizes_scaler_attrs because psizes_scaler is None")
-        if "ipt_scaler_attrs" in dataset_config.scalers_attrs:
-            if ipt_scaler is not None:
-                set_scaler_attrs(ipt_scaler, dataset_config.scalers_attrs["ipt_scaler_attrs"])
-            else:
-                warnings.warn("Ignoring ipt_scaler_attrs because ipt_scaler is None")
-        assert "flowstats_quantiles" in dataset_config.scalers_attrs
-        flowstats_quantiles = np.array(dataset_config.scalers_attrs["flowstats_quantiles"])
-        return flowstats_scaler, psizes_scaler, ipt_scaler, flowstats_quantiles
-    # If the scalers are not loaded at this point, fit them
+def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> None:
+    # Define indices for fitting scalers
     if isinstance(dataset_config.fit_scalers_samples, int) and dataset_config.fit_scalers_samples > len(train_indices):
         warnings.warn(f"The number of samples for fitting scalers ({dataset_config.fit_scalers_samples}) is larger than the number of train samples ({len(train_indices)}), using the number of train samples instead")
         dataset_config.fit_scalers_samples = len(train_indices)
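The PPI fitting in the next hunk relies on two scikit-learn behaviors: NaN entries are disregarded when fitting scalers (so padded packets can be masked with `np.nan`), and appending a literal `0` pins zero as the minimum seen by a `MinMaxScaler`. A minimal demonstration:

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

values = np.array([10.0, 20.0, np.nan, 30.0])  # NaN marks a padded packet

std = StandardScaler().fit(values.reshape(-1, 1))
print(std.mean_)  # [20.] -- the NaN is ignored during fit

# Appending 0 forces zero to be the minimum even if no real value is 0
mm = MinMaxScaler().fit(np.concatenate((values, [0])).reshape(-1, 1))
print(mm.data_min_, mm.data_max_)  # [0.] [30.]
```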
@@ -136,61 +40,71 @@ def fit_or_load_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray
     else:
         num_samples = dataset_config.fit_scalers_samples
     fit_scalers_indices = train_indices[fit_scalers_rng.choice(len(train_indices), size=num_samples, replace=False)]
-    flowstats_quantiles = fit_scalers(
-        database_path=dataset_config.database_path,
-        train_tables_paths=dataset_config._get_train_tables_paths(),
-        fit_scalers_indices=fit_scalers_indices,
-        flowstats_scaler=flowstats_scaler,
-        psizes_scaler=psizes_scaler,
-        ipt_scaler=ipt_scaler,
-        flowstats_quantile_clip=dataset_config.flowstats_clip,
-        ipt_min=dataset_config.ipt_min,
-        ipt_max=dataset_config.ipt_max,
-        psizes_max=dataset_config.psizes_max)
-    pickle_dump(flowstats_scaler, flowstats_scaler_path)
-    pickle_dump(psizes_scaler, psizes_sizes_scaler_path)
-    pickle_dump(ipt_scaler, ipt_scaler_path)
-    pickle_dump(flowstats_quantiles, flowstats_quantiles_path)
-    return flowstats_scaler, psizes_scaler, ipt_scaler, flowstats_quantiles
-
-def fit_scalers(database_path: str,
-                train_tables_paths: list[str],
-                fit_scalers_indices: np.ndarray,
-                flowstats_scaler: Scaler,
-                psizes_scaler: Scaler,
-                ipt_scaler: Scaler,
-                flowstats_quantile_clip: float,
-                ipt_min: int,
-                ipt_max: int,
-                psizes_max: int) -> np.ndarray:
+    # Load data
     start_time = time.time()
-    database, tables = load_database(database_path, tables_paths=train_tables_paths)
+    database, tables = load_database(dataset_config.database_path, tables_paths=dataset_config._get_train_tables_paths())
     data = load_data_from_tables(tables=tables, indices=fit_scalers_indices, data_dtype=tables[0].dtype)
     database.close()
-    # PPI
-    data_ppi = data[PPI_COLUMN].astype("float32")
-    ppi_channels = data_ppi.shape[1]
-    data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
-    padding_mask = data_ppi[:, DIR_POS] == 0 # mask of padded packets
-    if ipt_scaler:
-        train_ipt = data_ppi[:, IPT_POS].clip(max=ipt_max, min=ipt_min)
-        train_ipt[padding_mask] = np.nan # nans are ignored in sklearn scalers
-        if isinstance(ipt_scaler, MinMaxScaler):
-            # let zero be the minimum for minmax scaling
+
+    clip_and_scale_ppi_transform = dataset_config.ppi_transform # TODO Fix after transforms composing is implemented
+    clip_and_scale_flowstats_transform = dataset_config.flowstats_transform
+
+    # Fit the ClipAndScalePPI transform
+    if clip_and_scale_ppi_transform is not None and clip_and_scale_ppi_transform.needs_fitting:
+        assert isinstance(clip_and_scale_ppi_transform, ClipAndScalePPI)
+        data_ppi = data[PPI_COLUMN].astype("float32")
+        ppi_channels = data_ppi.shape[1]
+        data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
+        padding_mask = data_ppi[:, DIR_POS] == 0 # Mask of padded packets
+        # Fit IPT scaler
+        train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
+        train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
+        if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
+            # Let zero be the minimum for minmax scaling
             train_ipt = np.concatenate((train_ipt, [0]))
-        ipt_scaler.fit(train_ipt.reshape(-1, 1))
-    if psizes_scaler:
-        train_psizes = data_ppi[:, SIZE_POS].clip(max=psizes_max, min=1)
+        clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
+        # Fit packet sizes scaler
+        train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
         train_psizes[padding_mask] = np.nan
-        if isinstance(psizes_scaler, MinMaxScaler):
+        if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
            train_psizes = np.concatenate((train_psizes, [0]))
-        psizes_scaler.fit(train_psizes.reshape(-1, 1))
-    # Flow statistics
-    train_flowstats = structured_to_unstructured(data[FLOWSTATS_TO_SCALE])
-    flowstats_quantiles = np.quantile(train_flowstats, q=flowstats_quantile_clip, axis=0)
-    flowstats_quantiles[-len(FLOWSTATS_NO_CLIP):] = np.inf # disable clipping for features with "fixed" range
-    if flowstats_scaler:
+        clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
+        clip_and_scale_ppi_transform.needs_fitting = False
+
+    # Fit the ClipAndScaleFlowstats transform
+    if clip_and_scale_flowstats_transform is not None and clip_and_scale_flowstats_transform.needs_fitting:
+        assert isinstance(clip_and_scale_flowstats_transform, ClipAndScaleFlowstats)
+        train_flowstats = structured_to_unstructured(data[dataset_config.flowstats_features])
+        flowstats_quantiles = np.quantile(train_flowstats, q=clip_and_scale_flowstats_transform.quantile_clip, axis=0)
+        idx_no_clip = [dataset_config.flowstats_features.index(f) for f in FLOWSTATS_NO_CLIP]
+        flowstats_quantiles[idx_no_clip] = np.inf # Disable clipping for features with "fixed" range
         train_flowstats = train_flowstats.clip(max=flowstats_quantiles)
-        flowstats_scaler.fit(train_flowstats)
+        clip_and_scale_flowstats_transform.flowstats_scaler.fit(train_flowstats)
+        clip_and_scale_flowstats_transform.flowstats_quantiles = flowstats_quantiles.tolist()
+        clip_and_scale_flowstats_transform.needs_fitting = False
+
     log.info(f"Reading data and fitting scalers took {time.time() - start_time:.2f} seconds")
-    return flowstats_quantiles
+    train_data_path = dataset_config._get_train_data_path()
+    if clip_and_scale_ppi_transform is not None:
+        ppi_transform_path = os.path.join(train_data_path, "transforms", "ppi-transform.json")
+        ppi_transform_dict = {
+            "psizes_scaler_enum": str(clip_and_scale_ppi_transform._psizes_scaler_enum),
+            "psizes_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.psizes_scaler),
+            "pszies_min": clip_and_scale_ppi_transform.pszies_min,
+            "psizes_max": clip_and_scale_ppi_transform.psizes_max,
+            "ipt_scaler_enum": str(clip_and_scale_ppi_transform._ipt_scaler_enum),
+            "ipt_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.ipt_scaler),
+            "ipt_min": clip_and_scale_ppi_transform.ipt_min,
+            "ipt_max": clip_and_scale_ppi_transform.ipt_max,
+        }
+        json.dump(ppi_transform_dict, open(ppi_transform_path, "w"), indent=4)
+    if clip_and_scale_flowstats_transform is not None:
+        assert clip_and_scale_flowstats_transform.flowstats_quantiles is not None
+        flowstats_transform_path = os.path.join(train_data_path, "transforms", "flowstats-transform.json")
+        flowstats_transform_dict = {
+            "flowstats_scaler_enum": str(clip_and_scale_flowstats_transform._flowstats_scaler_enum),
+            "flowstats_scaler_attrs": get_scaler_attrs(clip_and_scale_flowstats_transform.flowstats_scaler),
+            "flowstats_quantiles": clip_and_scale_flowstats_transform.flowstats_quantiles,
+            "quantile_clip": clip_and_scale_flowstats_transform.quantile_clip,
+        }
+        json.dump(flowstats_transform_dict, open(flowstats_transform_path, "w"), indent=4)
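Scaler state now lands in human-readable JSON under `transforms/` instead of pickles under `stand/`. Reading it back is plain `json.load`; a hedged sketch, with the file layout taken from the diff and the directory path hypothetical (note the `"pszies_min"` key preserves the spelling used in the source):

```python
import json
import os

train_data_path = "train-data"  # hypothetical cache directory

# Keys mirror ppi_transform_dict from the diff above
with open(os.path.join(train_data_path, "transforms", "ppi-transform.json")) as f:
    ppi = json.load(f)

print(ppi["ipt_min"], ppi["ipt_max"])
print(ppi["psizes_scaler_attrs"])  # e.g. {"mean_": [...], "scale_": [...]}
```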
@@ -7,13 +7,11 @@ from enum import Enum

 import numpy as np
 import pandas as pd
-from sklearn.preprocessing import LabelEncoder

 from cesnet_datazoo.config import DatasetConfig
-from cesnet_datazoo.constants import (INDICES_INDEX_POS, INDICES_LABEL_POS, INDICES_TABLE_POS,
-                                      UNKNOWN_STR_LABEL)
+from cesnet_datazoo.constants import INDICES_INDEX_POS, INDICES_LABEL_POS, INDICES_TABLE_POS
 from cesnet_datazoo.pytables_data.pytables_dataset import init_test_indices, init_train_indices
-from cesnet_datazoo.utils.fileutils import pickle_dump, pickle_load, yaml_dump, yaml_load
+from cesnet_datazoo.utils.fileutils import yaml_dump, yaml_load
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator

 log = logging.getLogger(__name__)
@@ -78,59 +76,56 @@ def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indice
     sampled_train_indices = np.concatenate(sampled_indicies_per_date)
     return sampled_train_indices

-def indices_to_app_counts(indices: np.ndarray, database_enum: dict[int, str]) -> pd.Series:
+def indices_to_app_counts(indices: np.ndarray, tables_app_enum: dict[int, str]) -> pd.Series:
     app_counts = pd.Series(indices[:, INDICES_LABEL_POS]).value_counts()
-    app_counts.index = app_counts.index.map(lambda x: database_enum[x])
+    app_counts.index = app_counts.index.map(lambda x: tables_app_enum[x])
     return app_counts

-def compute_known_app_counts(dataset_indices: IndicesTuple, database_enum: dict[int, str]) -> pd.DataFrame:
-    train_app_counts = indices_to_app_counts(dataset_indices.train_indices, database_enum)
-    val_known_app_counts = indices_to_app_counts(dataset_indices.val_known_indices, database_enum)
-    test_known_app_counts = indices_to_app_counts(dataset_indices.test_known_indices, database_enum)
+def compute_known_app_counts(dataset_indices: IndicesTuple, tables_app_enum: dict[int, str]) -> pd.DataFrame:
+    train_app_counts = indices_to_app_counts(dataset_indices.train_indices, tables_app_enum)
+    val_known_app_counts = indices_to_app_counts(dataset_indices.val_known_indices, tables_app_enum)
+    test_known_app_counts = indices_to_app_counts(dataset_indices.test_known_indices, tables_app_enum)
     df = pd.DataFrame(data={"Train": train_app_counts, "Validation": val_known_app_counts, "Test": test_known_app_counts}).fillna(0).astype("int64")
     return df

-def compute_unknown_app_counts(dataset_indices: IndicesTuple, database_enum: dict[int, str]) -> pd.DataFrame:
-    val_unknown_app_counts = indices_to_app_counts(dataset_indices.val_unknown_indices, database_enum)
-    test_unknown_app_counts = indices_to_app_counts(dataset_indices.test_unknown_indices, database_enum)
+def compute_unknown_app_counts(dataset_indices: IndicesTuple, tables_app_enum: dict[int, str]) -> pd.DataFrame:
+    val_unknown_app_counts = indices_to_app_counts(dataset_indices.val_unknown_indices, tables_app_enum)
+    test_unknown_app_counts = indices_to_app_counts(dataset_indices.test_unknown_indices, tables_app_enum)
     df = pd.DataFrame(data={"Validation": val_unknown_app_counts, "Test": test_unknown_app_counts}).fillna(0).astype("int64")
     return df

-def init_or_load_train_indices(dataset_config: DatasetConfig, servicemap: pd.DataFrame, disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, LabelEncoder, dict[int, str], dict[int, str]]:
+def init_or_load_train_indices(dataset_config: DatasetConfig, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
     train_data_path = dataset_config._get_train_data_path()
     init_train_data(train_data_path)
     if not os.path.isfile(os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE)):
         log.info("Processing train indices")
         train_data_params = dataset_config._get_train_data_params()
-        train_known_indices, train_unknown_indices, known_apps_database_enum, unknown_apps_database_enum = init_train_indices(
-            train_data_params=train_data_params,
-            servicemap=servicemap,
-            database_path=dataset_config.database_path,
-            rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TRAIN_INDICES))
-        encoder = LabelEncoder().fit(list(known_apps_database_enum.values()))
-        encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
+        train_known_indices, train_unknown_indices, known_apps, unknown_apps = init_train_indices(train_data_params=train_data_params,
+                                                                                                  database_path=dataset_config.database_path,
+                                                                                                  tables_app_enum=tables_app_enum,
+                                                                                                  servicemap=servicemap,
+                                                                                                  rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TRAIN_INDICES))
         if not disable_indices_cache:
             yaml_dump({k: str(v) if isinstance(v, Enum) else list(v) if isinstance(v, tuple) else v for k, v in dataclasses.asdict(train_data_params).items()}, os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE))
-            yaml_dump(known_apps_database_enum, os.path.join(train_data_path, "known_apps_database_enum.yaml"))
-            yaml_dump(unknown_apps_database_enum, os.path.join(train_data_path, "unknown_apps_database_enum.yaml"))
-            pickle_dump(encoder, os.path.join(train_data_path, "encoder.pickle"))
+            yaml_dump(known_apps, os.path.join(train_data_path, "known_apps.yaml"))
+            yaml_dump(unknown_apps, os.path.join(train_data_path, "unknown_apps.yaml"))
             np.save(os.path.join(train_data_path, "train_known_indices.npy"), train_known_indices)
             np.save(os.path.join(train_data_path, "train_unknown_indices.npy"), train_unknown_indices)
     else:
-        known_apps_database_enum = yaml_load(os.path.join(train_data_path, "known_apps_database_enum.yaml"))
-        unknown_apps_database_enum = yaml_load(os.path.join(train_data_path, "unknown_apps_database_enum.yaml"))
-        encoder = pickle_load(os.path.join(train_data_path, "encoder.pickle"))
+        known_apps = yaml_load(os.path.join(train_data_path, "known_apps.yaml"))
+        unknown_apps = yaml_load(os.path.join(train_data_path, "unknown_apps.yaml"))
        train_known_indices = np.load(os.path.join(train_data_path, "train_known_indices.npy"))
        train_unknown_indices = np.load(os.path.join(train_data_path, "train_unknown_indices.npy"))
-    return train_known_indices, train_unknown_indices, encoder, known_apps_database_enum, unknown_apps_database_enum
+    return train_known_indices, train_unknown_indices, known_apps, unknown_apps

-def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
-    val_data_params, val_data_path = dataset_config._get_val_data_params_and_path(known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
+def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
+    val_data_params, val_data_path = dataset_config._get_val_data_params_and_path(known_apps=known_apps, unknown_apps=unknown_apps)
     init_test_data(val_data_path)
     if not os.path.isfile(os.path.join(val_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing validation indices")
         val_known_indices, val_unknown_indices = init_test_indices(test_data_params=val_data_params,
                                                                    database_path=dataset_config.database_path,
+                                                                   tables_app_enum=tables_app_enum,
                                                                    rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_VAL_INIDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(val_data_params), os.path.join(val_data_path, TEST_DATA_PARAMS_FILE))
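`indices_to_app_counts` maps the integer labels stored in the index arrays back to application names via `tables_app_enum`, which replaces the former `database_enum` argument and the pickled `LabelEncoder`. A toy run under an assumed index layout, where the label sits in column `INDICES_LABEL_POS`:

```python
import numpy as np
import pandas as pd

INDICES_LABEL_POS = 2  # assumed column position of the label
tables_app_enum = {0: "google-www", 1: "facebook-graph"}  # hypothetical mapping

# Toy indices: (table, row, label) triples
indices = np.array([[0, 10, 0], [0, 11, 0], [1, 5, 1]])

app_counts = pd.Series(indices[:, INDICES_LABEL_POS]).value_counts()
app_counts.index = app_counts.index.map(lambda x: tables_app_enum[x])
print(app_counts)  # google-www 2, facebook-graph 1
```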
@@ -141,13 +136,14 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps_database_
         val_unknown_indices = np.load(os.path.join(val_data_path, "val_unknown_indices.npy"))
     return val_known_indices, val_unknown_indices, val_data_path

-def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
-    test_data_params, test_data_path = dataset_config._get_test_data_params_and_path(known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
+def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
+    test_data_params, test_data_path = dataset_config._get_test_data_params_and_path(known_apps=known_apps, unknown_apps=unknown_apps)
     init_test_data(test_data_path)
     if not os.path.isfile(os.path.join(test_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing test indices")
         test_known_indices, test_unknown_indices = init_test_indices(test_data_params=test_data_params,
                                                                      database_path=dataset_config.database_path,
+                                                                     tables_app_enum=tables_app_enum,
                                                                      rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TEST_INDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(test_data_params), os.path.join(test_data_path, TEST_DATA_PARAMS_FILE))
@@ -160,7 +156,7 @@ def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps_database

 def init_train_data(train_data_path: str):
     os.makedirs(train_data_path, exist_ok=True)
-    os.makedirs(os.path.join(train_data_path, "stand"), exist_ok=True)
+    os.makedirs(os.path.join(train_data_path, "transforms"), exist_ok=True)
     os.makedirs(os.path.join(train_data_path, "preload"), exist_ok=True)

 def init_test_data(test_data_path: str):