cesnet-datazoo 0.0.16__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
@@ -6,6 +6,9 @@ from tqdm import tqdm
 from cesnet_datazoo.constants import APP_COLUMN
 
 
+def collate_fn_simple(batch):
+    return batch
+
 def load_from_dataloader(dataloader: DataLoader, silent: bool = False) -> tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
     other_fields = []
     data_ppi = []
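Note (not part of the diff): collate_fn_simple is an identity collate function. The minimal sketch below, using toy data, shows why that is useful when a dataset already returns complete numpy batches — the default collation would convert them to torch tensors, while the passthrough keeps them as numpy arrays.

```python
# Sketch with toy data: identity collate_fn vs. torch's default collation.
import numpy as np
from torch.utils.data import default_collate

def collate_fn_simple(batch):
    return batch

batch = np.arange(6, dtype=np.int64).reshape(2, 3)
print(type(default_collate([batch])))  # <class 'torch.Tensor'> - default collation converts
print(type(collate_fn_simple(batch)))  # <class 'numpy.ndarray'> - passthrough keeps the array
```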
@@ -18,18 +18,19 @@ class DatasetMetadata():
     collection_period: str
     missing_dates_in_collection_period: list[str]
     application_count: int
-    background_traffic: list[str]
-    features_in_packet_sequences: list[str]
-    packet_histogram_features: list[str]
+    background_traffic_classes: list[str]
+    ppi_features: list[str]
     flowstats_features: list[str]
+    flowstats_features_boolean: list[str]
+    packet_histograms: list[str]
     tcp_features: list[str]
     other_fields: list[str]
     cite: str
     zenodo_url: str
     related_papers: list[str]
 
-    @field_validator("available_dataset_sizes", "missing_dates_in_collection_period", "background_traffic", "features_in_packet_sequences",
-                     "packet_histogram_features", "flowstats_features", "tcp_features", "other_fields", "related_papers", mode="before")
+    @field_validator("available_dataset_sizes", "missing_dates_in_collection_period", "background_traffic_classes", "ppi_features",
+                     "flowstats_features", "flowstats_features_boolean", "packet_histograms", "tcp_features", "other_fields", "related_papers", mode="before")
     @classmethod
     def parse_string_to_list(cls, v: str, info: ValidationInfo) -> list[str]:
         l = list(map(str.strip, v.split(","))) if v else []
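Note (not part of the diff): the field_validator above parses comma-separated metadata cells into lists before validation, and an empty cell becomes an empty list. A standalone sketch of the same pattern, with illustrative field names rather than the full DatasetMetadata model:

```python
from pydantic import BaseModel, ValidationInfo, field_validator

class ToyMetadata(BaseModel):
    ppi_features: list[str]
    packet_histograms: list[str]

    @field_validator("ppi_features", "packet_histograms", mode="before")
    @classmethod
    def parse_string_to_list(cls, v: str, info: ValidationInfo) -> list[str]:
        # An empty cell becomes [] instead of [""]
        return list(map(str.strip, v.split(","))) if v else []

m = ToyMetadata(ppi_features="IPT, DIR, SIZE", packet_histograms="")
print(m.ppi_features)       # ['IPT', 'DIR', 'SIZE']
print(m.packet_histograms)  # []
```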
@@ -1,4 +1,4 @@
-Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic,Features in packet sequences,Packet histogram features,Flowstats features,TCP features,Other fields,Cite,Zenodo URL,Related papers
-CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE",,"BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
-CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
-CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
+Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic classes,PPI features,Flowstats features,Flowstats features boolean,Packet histograms,TCP features,Other fields,Cite,Zenodo URL,Related papers
+CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION",,,"FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
+CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
+CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",180,,"IPT, DIR, SIZE, PUSH_FLAG","BYTES, BYTES_REV, PACKETS, PACKETS_REV, DURATION, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION","FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
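Note (not part of the diff): a quick pandas check of the renamed metadata columns; the CSV path is illustrative and the expected output follows the rows above.

```python
import pandas as pd

# Path is illustrative; the header follows the renamed columns above.
df = pd.read_csv("datasets_metadata.csv", index_col="Name")
row = df.loc["CESNET-QUIC22"]
ppi_features = [s.strip() for s in row["PPI features"].split(",")]
flow_end_flags = [s.strip() for s in row["Flowstats features boolean"].split(",")]
print(ppi_features)    # ['IPT', 'DIR', 'SIZE']
print(flow_end_flags)  # ['FLOW_ENDREASON_IDLE', 'FLOW_ENDREASON_ACTIVE', 'FLOW_ENDREASON_OTHER']
```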
@@ -1,4 +1,3 @@
-import logging
 import os
 from collections import Counter
 from typing import Any, Literal
@@ -12,8 +11,9 @@ from torch.utils.data import BatchSampler, DataLoader, SequentialSampler
 from tqdm import tqdm
 
 from cesnet_datazoo.config import Protocol
-from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, FLOWEND_REASON_FEATURES, IPT_POS,
-                                      PHISTS_FEATURES, PPI_COLUMN, SIZE_POS)
+from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, IPT_POS, PPI_COLUMN, SIZE_POS,
+                                      UDP_PPI_CHANNELS)
+from cesnet_datazoo.datasets.loaders import collate_fn_simple
 from cesnet_datazoo.pytables_data.indices_setup import sort_indices
 from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, list_all_tables,
                                                            load_database, worker_init_fn)
@@ -36,17 +36,25 @@ def pick_stats_fields(batch):
         batch[CATEGORY_COLUMN],
     )
 
-def pick_extra_fields(batch, flowstats_features: list[str]):
+def pick_extra_fields(batch, packet_histograms: list[str], flow_endreason_features: list[str]):
     return (
         batch["DST_ASN"],
-        batch[PHISTS_FEATURES],
-        batch[[f for f in FLOWEND_REASON_FEATURES if f in flowstats_features]],
+        batch[packet_histograms],
+        batch[flow_endreason_features],
     )
 
-def simple_collate_fn(batch):
-    return batch
-
-def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, extra_fields: bool, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
+def compute_dataset_statistics(database_path: str,
+                               tables_app_enum: dict[int, str],
+                               tables_cat_enum: dict[int, str],
+                               output_dir: str,
+                               packet_histograms: list[str],
+                               flowstats_features_boolean: list[str],
+                               protocol: Protocol, extra_fields: bool,
+                               disabled_apps: list[str],
+                               num_samples: int | Literal["all"] = 10_000_000,
+                               num_workers: int = 4,
+                               batch_size: int = 4096,
+                               silent: bool = False):
     stats_pdf_path = os.path.join(output_dir, "dataset-statistics.pdf")
     stats_csv_path = os.path.join(output_dir, "dataset-statistics.csv")
     categories_csv_path = os.path.join(output_dir, "categories.csv")
@@ -74,10 +82,22 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
     feature_bytes_total = []
     packet_sizes_counter = Counter()
     ipt_counter = Counter()
+    flow_endreason_features = [f for f in flowstats_features_boolean if f.startswith("FLOW_ENDREASON")]
     if not silent:
         print(f"Reading data from {database_path} for statistics")
     table_paths = list_all_tables(database_path)
-    stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=table_paths, flowstats_features=flowstats_features, disabled_apps=disabled_apps, indices=None, return_all_fields=True)
+    stats_dataset = PyTablesDataset(database_path=database_path,
+                                    tables_app_enum=tables_app_enum,
+                                    tables_cat_enum=tables_cat_enum,
+                                    tables_paths=table_paths,
+                                    indices=None,
+                                    disabled_apps=disabled_apps,
+                                    return_all_fields=True,
+                                    flowstats_features=[],
+                                    flowstats_features_boolean=[],
+                                    flowstats_features_phist=[],
+                                    other_fields=[],
+                                    ppi_channels=UDP_PPI_CHANNELS,)
     if num_samples != "all":
         subset_indices = np.random.randint(low=0, high=len(stats_dataset.indices), size=num_samples)
         stats_dataset.indices = sort_indices(stats_dataset.indices[subset_indices])
@@ -87,7 +107,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
                                pin_memory=False,
                                num_workers=num_workers,
                                worker_init_fn=worker_init_fn,
-                               collate_fn=simple_collate_fn,
+                               collate_fn=collate_fn_simple,
                                persistent_workers=False,
                                batch_size=None,
                                sampler=stats_batch_sampler)
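Note (not part of the diff): the loader above disables automatic batching (batch_size=None) and drives the dataset with a batch sampler, so each __getitem__ call receives a whole list of indices and the identity collate_fn returns the batch untouched. A toy sketch of that pattern, standing in for PyTablesDataset:

```python
import numpy as np
from torch.utils.data import BatchSampler, DataLoader, Dataset, SequentialSampler

class ToyBatchedDataset(Dataset):
    """Stand-in for a dataset that reads whole batches at once."""
    def __init__(self, n: int):
        self.data = np.arange(n)
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        # idx is a list of indices when the DataLoader is driven by a BatchSampler
        return self.data[idx]

dataset = ToyBatchedDataset(10)
batch_sampler = BatchSampler(SequentialSampler(dataset), batch_size=4, drop_last=False)
loader = DataLoader(dataset, batch_size=None, sampler=batch_sampler, collate_fn=lambda b: b)
for batch in loader:
    print(batch)  # [0 1 2 3], [4 5 6 7], [8 9] as numpy arrays
```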
@@ -116,10 +136,10 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
             quic_ua_series = quic_ua_series.add(pd.Series(user_agent).str.decode("utf-8").value_counts(), fill_value=0)
             quic_version_series = quic_version_series.add(pd.Series(quic_version).value_counts(), fill_value=0)
         if extra_fields:
-            asn, phist, flowend_reason = pick_extra_fields(batch, flowstats_features=flowstats_features)
+            asn, phist, flowend_reason = pick_extra_fields(batch, packet_histograms=packet_histograms, flow_endreason_features=flow_endreason_features)
             asn_series = asn_series.add(pd.Series(asn).value_counts(), fill_value=0)
             flow_endreason_series = flow_endreason_series.add(pd.Series(structured_to_unstructured(flowend_reason).sum(axis=0)), fill_value=0)
-            df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=PHISTS_FEATURES)
+            df2 = pd.DataFrame(data=zip(*np.split(structured_to_unstructured(phist).sum(axis=0), 4)), columns=packet_histograms)
             df_phist = df_phist.add(df2, fill_value=0)
     feature_duration = np.concatenate(feature_duration)
     feature_packets_total = np.concatenate(feature_packets_total)
@@ -183,11 +203,11 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
         flow_endreason_df = pd.DataFrame({"COUNT": flow_endreason_series.astype("int64")})
         flow_endreason_df["PERC"] = (flow_endreason_df["COUNT"] / flow_endreason_df["COUNT"].sum() * 100).round(3)
         flow_endreason_df.index.name = "FLOW ENDREASON"
-        flow_endreason_df.index = pd.Index([f for f in FLOWEND_REASON_FEATURES if f in flowstats_features])
+        flow_endreason_df.index = pd.Index(flow_endreason_features)
         flow_endreason_df.to_csv(flow_endreason_path)
         # PHIST output
         df_phist.index.name = "BINS"
-        df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), PHISTS_FEATURES))
+        df_phist.columns = list(map(lambda x: x.upper().replace("_", " "), packet_histograms))
         df_phist = df_phist.astype("int64")
         for i, column in zip((1, 3, 5, 7), df_phist.columns):
             df_phist.insert(i, column + " PERC", (df_phist[column] / df_phist[column].sum() * 100).round(3))
@@ -0,0 +1,110 @@
+import json
+import logging
+import os
+import time
+import warnings
+
+import numpy as np
+from cesnet_models.transforms import ClipAndScaleFlowstats, ClipAndScalePPI
+from numpy.lib.recfunctions import structured_to_unstructured
+from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
+
+from cesnet_datazoo.config import DatasetConfig
+from cesnet_datazoo.constants import DIR_POS, FLOWSTATS_NO_CLIP, IPT_POS, PPI_COLUMN, SIZE_POS
+from cesnet_datazoo.pytables_data.pytables_dataset import load_data_from_tables, load_database
+from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
+
+log = logging.getLogger(__name__)
+
+
+def get_scaler_attrs(scaler: StandardScaler | RobustScaler | MinMaxScaler) -> dict[str, list[float]]:
+    if isinstance(scaler, StandardScaler):
+        assert hasattr(scaler, "mean_") and scaler.mean_ is not None and hasattr(scaler, "scale_") and scaler.scale_ is not None
+        scaler_attrs = {"mean_": scaler.mean_.tolist(), "scale_": scaler.scale_.tolist()}
+    elif isinstance(scaler, RobustScaler):
+        assert hasattr(scaler, "center_") and hasattr(scaler, "scale_")
+        scaler_attrs = {"center_": scaler.center_.tolist(), "scale_": scaler.scale_.tolist()}
+    elif isinstance(scaler, MinMaxScaler):
+        assert hasattr(scaler, "min_") and hasattr(scaler, "scale_")
+        scaler_attrs = {"min_": scaler.min_.tolist(), "scale_": scaler.scale_.tolist()}
+    return scaler_attrs
+
+def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> None:
+    # Define indices for fitting scalers
+    if isinstance(dataset_config.fit_scalers_samples, int) and dataset_config.fit_scalers_samples > len(train_indices):
+        warnings.warn(f"The number of samples for fitting scalers ({dataset_config.fit_scalers_samples}) is larger than the number of train samples ({len(train_indices)}), using the number of train samples instead")
+        dataset_config.fit_scalers_samples = len(train_indices)
+    fit_scalers_rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.FIT_SCALERS_SAMPLE)
+    if isinstance(dataset_config.fit_scalers_samples, float):
+        num_samples = int(dataset_config.fit_scalers_samples * len(train_indices))
+    else:
+        num_samples = dataset_config.fit_scalers_samples
+    fit_scalers_indices = train_indices[fit_scalers_rng.choice(len(train_indices), size=num_samples, replace=False)]
+    # Load data
+    start_time = time.time()
+    database, tables = load_database(dataset_config.database_path, tables_paths=dataset_config._get_train_tables_paths())
+    data = load_data_from_tables(tables=tables, indices=fit_scalers_indices, data_dtype=tables[0].dtype)
+    database.close()
+
+    clip_and_scale_ppi_transform = dataset_config.ppi_transform # TODO Fix after transforms composing is implemented
+    clip_and_scale_flowstats_transform = dataset_config.flowstats_transform
+
+    # Fit the ClipAndScalePPI transform
+    if clip_and_scale_ppi_transform is not None and clip_and_scale_ppi_transform.needs_fitting:
+        assert isinstance(clip_and_scale_ppi_transform, ClipAndScalePPI)
+        data_ppi = data[PPI_COLUMN].astype("float32")
+        ppi_channels = data_ppi.shape[1]
+        data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
+        padding_mask = data_ppi[:, DIR_POS] == 0 # Mask of padded packets
+        # Fit IPT scaler
+        train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
+        train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
+        if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
+            # Let zero be the minimum for minmax scaling
+            train_ipt = np.concatenate((train_ipt, [0]))
+        clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
+        # Fit packet sizes scaler
+        train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
+        train_psizes[padding_mask] = np.nan
+        if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
+            train_psizes = np.concatenate((train_psizes, [0]))
+        clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
+        clip_and_scale_ppi_transform.needs_fitting = False
+
+    # Fit the ClipAndScaleFlowstats transform
+    if clip_and_scale_flowstats_transform is not None and clip_and_scale_flowstats_transform.needs_fitting:
+        assert isinstance(clip_and_scale_flowstats_transform, ClipAndScaleFlowstats)
+        train_flowstats = structured_to_unstructured(data[dataset_config.flowstats_features])
+        flowstats_quantiles = np.quantile(train_flowstats, q=clip_and_scale_flowstats_transform.quantile_clip, axis=0)
+        idx_no_clip = [dataset_config.flowstats_features.index(f) for f in FLOWSTATS_NO_CLIP]
+        flowstats_quantiles[idx_no_clip] = np.inf # Disable clipping for features with "fixed" range
+        train_flowstats = train_flowstats.clip(max=flowstats_quantiles)
+        clip_and_scale_flowstats_transform.flowstats_scaler.fit(train_flowstats)
+        clip_and_scale_flowstats_transform.flowstats_quantiles = flowstats_quantiles.tolist()
+        clip_and_scale_flowstats_transform.needs_fitting = False
+
+    log.info(f"Reading data and fitting scalers took {time.time() - start_time:.2f} seconds")
+    train_data_path = dataset_config._get_train_data_path()
+    if clip_and_scale_ppi_transform is not None:
+        ppi_transform_path = os.path.join(train_data_path, "transforms", "ppi-transform.json")
+        ppi_transform_dict = {
+            "psizes_scaler_enum": str(clip_and_scale_ppi_transform._psizes_scaler_enum),
+            "psizes_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.psizes_scaler),
+            "pszies_min": clip_and_scale_ppi_transform.pszies_min,
+            "psizes_max": clip_and_scale_ppi_transform.psizes_max,
+            "ipt_scaler_enum": str(clip_and_scale_ppi_transform._ipt_scaler_enum),
+            "ipt_scaler_attrs": get_scaler_attrs(clip_and_scale_ppi_transform.ipt_scaler),
+            "ipt_min": clip_and_scale_ppi_transform.ipt_min,
+            "ipt_max": clip_and_scale_ppi_transform.ipt_max,
+        }
+        json.dump(ppi_transform_dict, open(ppi_transform_path, "w"), indent=4)
+    if clip_and_scale_flowstats_transform is not None:
+        assert clip_and_scale_flowstats_transform.flowstats_quantiles is not None
+        flowstats_transform_path = os.path.join(train_data_path, "transforms", "flowstats-transform.json")
+        flowstats_transform_dict = {
+            "flowstats_scaler_enum": str(clip_and_scale_flowstats_transform._flowstats_scaler_enum),
+            "flowstats_scaler_attrs": get_scaler_attrs(clip_and_scale_flowstats_transform.flowstats_scaler),
+            "flowstats_quantiles": clip_and_scale_flowstats_transform.flowstats_quantiles,
+            "quantile_clip": clip_and_scale_flowstats_transform.quantile_clip,
+        }
+        json.dump(flowstats_transform_dict, open(flowstats_transform_path, "w"), indent=4)
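Note (not part of the diff): get_scaler_attrs serializes only the fitted attributes of the sklearn scalers so they can be written to JSON. A standalone sketch with toy data showing the attributes it reads:

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

X = np.array([[1.0], [2.0], [3.0], [10.0]])

std = StandardScaler().fit(X)
print({"mean_": std.mean_.tolist(), "scale_": std.scale_.tolist()})

rob = RobustScaler().fit(X)
print({"center_": rob.center_.tolist(), "scale_": rob.scale_.tolist()})

mm = MinMaxScaler().fit(X)
print({"min_": mm.min_.tolist(), "scale_": mm.scale_.tolist()})
```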
@@ -7,13 +7,11 @@ from enum import Enum
 
 import numpy as np
 import pandas as pd
-from sklearn.preprocessing import LabelEncoder
 
 from cesnet_datazoo.config import DatasetConfig
-from cesnet_datazoo.constants import (INDICES_INDEX_POS, INDICES_LABEL_POS, INDICES_TABLE_POS,
-                                      UNKNOWN_STR_LABEL)
+from cesnet_datazoo.constants import INDICES_INDEX_POS, INDICES_LABEL_POS, INDICES_TABLE_POS
 from cesnet_datazoo.pytables_data.pytables_dataset import init_test_indices, init_train_indices
-from cesnet_datazoo.utils.fileutils import pickle_dump, pickle_load, yaml_dump, yaml_load
+from cesnet_datazoo.utils.fileutils import yaml_dump, yaml_load
 from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator
 
 log = logging.getLogger(__name__)
@@ -78,59 +76,56 @@ def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indice
     sampled_train_indices = np.concatenate(sampled_indicies_per_date)
     return sampled_train_indices
 
-def indices_to_app_counts(indices: np.ndarray, database_enum: dict[int, str]) -> pd.Series:
+def indices_to_app_counts(indices: np.ndarray, tables_app_enum: dict[int, str]) -> pd.Series:
     app_counts = pd.Series(indices[:, INDICES_LABEL_POS]).value_counts()
-    app_counts.index = app_counts.index.map(lambda x: database_enum[x])
+    app_counts.index = app_counts.index.map(lambda x: tables_app_enum[x])
     return app_counts
 
-def compute_known_app_counts(dataset_indices: IndicesTuple, database_enum: dict[int, str]) -> pd.DataFrame:
-    train_app_counts = indices_to_app_counts(dataset_indices.train_indices, database_enum)
-    val_known_app_counts = indices_to_app_counts(dataset_indices.val_known_indices, database_enum)
-    test_known_app_counts = indices_to_app_counts(dataset_indices.test_known_indices, database_enum)
+def compute_known_app_counts(dataset_indices: IndicesTuple, tables_app_enum: dict[int, str]) -> pd.DataFrame:
+    train_app_counts = indices_to_app_counts(dataset_indices.train_indices, tables_app_enum)
+    val_known_app_counts = indices_to_app_counts(dataset_indices.val_known_indices, tables_app_enum)
+    test_known_app_counts = indices_to_app_counts(dataset_indices.test_known_indices, tables_app_enum)
     df = pd.DataFrame(data={"Train": train_app_counts, "Validation": val_known_app_counts, "Test": test_known_app_counts}).fillna(0).astype("int64")
     return df
 
-def compute_unknown_app_counts(dataset_indices: IndicesTuple, database_enum: dict[int, str]) -> pd.DataFrame:
-    val_unknown_app_counts = indices_to_app_counts(dataset_indices.val_unknown_indices, database_enum)
-    test_unknown_app_counts = indices_to_app_counts(dataset_indices.test_unknown_indices, database_enum)
+def compute_unknown_app_counts(dataset_indices: IndicesTuple, tables_app_enum: dict[int, str]) -> pd.DataFrame:
+    val_unknown_app_counts = indices_to_app_counts(dataset_indices.val_unknown_indices, tables_app_enum)
+    test_unknown_app_counts = indices_to_app_counts(dataset_indices.test_unknown_indices, tables_app_enum)
     df = pd.DataFrame(data={"Validation": val_unknown_app_counts, "Test": test_unknown_app_counts}).fillna(0).astype("int64")
     return df
 
-def init_or_load_train_indices(dataset_config: DatasetConfig, servicemap: pd.DataFrame, disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, LabelEncoder, dict[int, str], dict[int, str]]:
+def init_or_load_train_indices(dataset_config: DatasetConfig, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
     train_data_path = dataset_config._get_train_data_path()
     init_train_data(train_data_path)
     if not os.path.isfile(os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE)):
         log.info("Processing train indices")
         train_data_params = dataset_config._get_train_data_params()
-        train_known_indices, train_unknown_indices, known_apps_database_enum, unknown_apps_database_enum = init_train_indices(
-            train_data_params=train_data_params,
-            servicemap=servicemap,
-            database_path=dataset_config.database_path,
-            rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TRAIN_INDICES))
-        encoder = LabelEncoder().fit(list(known_apps_database_enum.values()))
-        encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
+        train_known_indices, train_unknown_indices, known_apps, unknown_apps = init_train_indices(train_data_params=train_data_params,
+                                                                                                  database_path=dataset_config.database_path,
+                                                                                                  tables_app_enum=tables_app_enum,
+                                                                                                  servicemap=servicemap,
+                                                                                                  rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TRAIN_INDICES))
         if not disable_indices_cache:
             yaml_dump({k: str(v) if isinstance(v, Enum) else list(v) if isinstance(v, tuple) else v for k, v in dataclasses.asdict(train_data_params).items()}, os.path.join(train_data_path, TRAIN_DATA_PARAMS_FILE))
-            yaml_dump(known_apps_database_enum, os.path.join(train_data_path, "known_apps_database_enum.yaml"))
-            yaml_dump(unknown_apps_database_enum, os.path.join(train_data_path, "unknown_apps_database_enum.yaml"))
-            pickle_dump(encoder, os.path.join(train_data_path, "encoder.pickle"))
+            yaml_dump(known_apps, os.path.join(train_data_path, "known_apps.yaml"))
+            yaml_dump(unknown_apps, os.path.join(train_data_path, "unknown_apps.yaml"))
             np.save(os.path.join(train_data_path, "train_known_indices.npy"), train_known_indices)
            np.save(os.path.join(train_data_path, "train_unknown_indices.npy"), train_unknown_indices)
     else:
-        known_apps_database_enum = yaml_load(os.path.join(train_data_path, "known_apps_database_enum.yaml"))
-        unknown_apps_database_enum = yaml_load(os.path.join(train_data_path, "unknown_apps_database_enum.yaml"))
-        encoder = pickle_load(os.path.join(train_data_path, "encoder.pickle"))
+        known_apps = yaml_load(os.path.join(train_data_path, "known_apps.yaml"))
+        unknown_apps = yaml_load(os.path.join(train_data_path, "unknown_apps.yaml"))
         train_known_indices = np.load(os.path.join(train_data_path, "train_known_indices.npy"))
         train_unknown_indices = np.load(os.path.join(train_data_path, "train_unknown_indices.npy"))
-    return train_known_indices, train_unknown_indices, encoder, known_apps_database_enum, unknown_apps_database_enum
+    return train_known_indices, train_unknown_indices, known_apps, unknown_apps
 
-def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
-    val_data_params, val_data_path = dataset_config._get_val_data_params_and_path(known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
+def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
+    val_data_params, val_data_path = dataset_config._get_val_data_params_and_path(known_apps=known_apps, unknown_apps=unknown_apps)
     init_test_data(val_data_path)
     if not os.path.isfile(os.path.join(val_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing validation indices")
         val_known_indices, val_unknown_indices = init_test_indices(test_data_params=val_data_params,
                                                                    database_path=dataset_config.database_path,
+                                                                   tables_app_enum=tables_app_enum,
                                                                    rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_VAL_INIDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(val_data_params), os.path.join(val_data_path, TEST_DATA_PARAMS_FILE))
@@ -141,13 +136,14 @@ def init_or_load_val_indices(dataset_config: DatasetConfig, known_apps_database_
         val_unknown_indices = np.load(os.path.join(val_data_path, "val_unknown_indices.npy"))
     return val_known_indices, val_unknown_indices, val_data_path
 
-def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
-    test_data_params, test_data_path = dataset_config._get_test_data_params_and_path(known_apps_database_enum=known_apps_database_enum, unknown_apps_database_enum=unknown_apps_database_enum)
+def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps: list[str], unknown_apps: list[str], tables_app_enum: dict[int, str], disable_indices_cache: bool) -> tuple[np.ndarray, np.ndarray, str]:
+    test_data_params, test_data_path = dataset_config._get_test_data_params_and_path(known_apps=known_apps, unknown_apps=unknown_apps)
     init_test_data(test_data_path)
     if not os.path.isfile(os.path.join(test_data_path, TEST_DATA_PARAMS_FILE)):
         log.info("Processing test indices")
         test_known_indices, test_unknown_indices = init_test_indices(test_data_params=test_data_params,
                                                                      database_path=dataset_config.database_path,
+                                                                     tables_app_enum=tables_app_enum,
                                                                      rng=get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.INIT_TEST_INDICES))
         if not disable_indices_cache:
             yaml_dump(dataclasses.asdict(test_data_params), os.path.join(test_data_path, TEST_DATA_PARAMS_FILE))
@@ -160,7 +156,7 @@ def init_or_load_test_indices(dataset_config: DatasetConfig, known_apps_database
 
 def init_train_data(train_data_path: str):
     os.makedirs(train_data_path, exist_ok=True)
-    os.makedirs(os.path.join(train_data_path, "stand"), exist_ok=True)
+    os.makedirs(os.path.join(train_data_path, "transforms"), exist_ok=True)
     os.makedirs(os.path.join(train_data_path, "preload"), exist_ok=True)
 
 def init_test_data(test_data_path: str):
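Note (not part of the diff): with this release the pickled LabelEncoder is no longer cached; only the known/unknown application name lists are written as yaml. If an encoder is needed, it can be rebuilt from the cached list, as in this hedged sketch — the file path and the literal unknown label are assumptions (the previous code appended the package constant UNKNOWN_STR_LABEL):

```python
import numpy as np
import yaml
from sklearn.preprocessing import LabelEncoder

# Illustrative path; known_apps.yaml is written by init_or_load_train_indices above.
with open("train_data/known_apps.yaml") as f:
    known_apps = yaml.safe_load(f)

encoder = LabelEncoder().fit(known_apps)
# The old implementation appended UNKNOWN_STR_LABEL; its literal value is assumed here.
encoder.classes_ = np.append(encoder.classes_, "unknown")
print(encoder.classes_)
```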