cesnet-datazoo 0.0.17__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cesnet_datazoo/config.py +173 -168
- cesnet_datazoo/constants.py +4 -6
- cesnet_datazoo/datasets/cesnet_dataset.py +200 -177
- cesnet_datazoo/datasets/datasets.py +22 -2
- cesnet_datazoo/datasets/datasets_constants.py +670 -0
- cesnet_datazoo/datasets/loaders.py +3 -0
- cesnet_datazoo/datasets/metadata/dataset_metadata.py +6 -5
- cesnet_datazoo/datasets/metadata/metadata.csv +4 -4
- cesnet_datazoo/datasets/statistics.py +36 -16
- cesnet_datazoo/pytables_data/data_scalers.py +68 -154
- cesnet_datazoo/pytables_data/indices_setup.py +29 -33
- cesnet_datazoo/pytables_data/pytables_dataset.py +99 -122
- cesnet_datazoo/utils/class_info.py +7 -5
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/METADATA +2 -1
- cesnet_datazoo-0.1.0.dist-info/RECORD +30 -0
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/WHEEL +1 -1
- cesnet_datazoo-0.0.17.dist-info/RECORD +0 -29
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/LICENCE +0 -0
- {cesnet_datazoo-0.0.17.dist-info → cesnet_datazoo-0.1.0.dist-info}/top_level.txt +0 -0
@@ -4,23 +4,19 @@ import os
|
|
4
4
|
import time
|
5
5
|
import warnings
|
6
6
|
from datetime import datetime
|
7
|
-
from typing import Any, Optional
|
7
|
+
from typing import Any, Callable, Optional
|
8
8
|
|
9
9
|
import numpy as np
|
10
10
|
import pandas as pd
|
11
11
|
import tables as tb
|
12
12
|
import torch
|
13
|
-
from numpy.lib.recfunctions import
|
14
|
-
from sklearn.preprocessing import LabelEncoder
|
13
|
+
from numpy.lib.recfunctions import structured_to_unstructured
|
15
14
|
from torch.utils.data import Dataset
|
16
15
|
from typing_extensions import assert_never
|
17
16
|
|
18
|
-
from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck,
|
17
|
+
from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
|
19
18
|
TrainDataParams)
|
20
|
-
from cesnet_datazoo.constants import
|
21
|
-
INDICES_INDEX_POS, INDICES_TABLE_POS, IPT_POS,
|
22
|
-
PHIST_BIN_COUNT, PHISTS_FEATURES, PPI_COLUMN, SIZE_POS,
|
23
|
-
UNKNOWN_STR_LABEL)
|
19
|
+
from cesnet_datazoo.constants import APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN
|
24
20
|
from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
|
25
21
|
split_apps_topx_with_provider_groups)
|
26
22
|
|
@@ -28,27 +24,52 @@ log = logging.getLogger(__name__)
|
|
28
24
|
|
29
25
|
|
30
26
|
class PyTablesDataset(Dataset):
|
31
|
-
def __init__(self,
|
27
|
+
def __init__(self,
|
28
|
+
database_path: str,
|
32
29
|
tables_paths: list[str],
|
33
30
|
indices: Optional[np.ndarray],
|
31
|
+
tables_app_enum: dict[int, str],
|
32
|
+
tables_cat_enum: dict[int, str],
|
34
33
|
flowstats_features: list[str],
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
34
|
+
flowstats_features_boolean: list[str],
|
35
|
+
flowstats_features_phist: list[str],
|
36
|
+
other_fields: list[str],
|
37
|
+
ppi_channels: list[int],
|
38
|
+
ppi_transform: Optional[Callable] = None,
|
39
|
+
flowstats_transform: Optional[Callable] = None,
|
40
|
+
flowstats_phist_transform: Optional[Callable] = None,
|
41
|
+
target_transform: Optional[Callable] = None,
|
42
|
+
return_tensors: bool = False,
|
43
|
+
return_all_fields: bool = False,
|
44
|
+
preload: bool = False,
|
45
|
+
preload_blob: Optional[str] = None,
|
46
|
+
disabled_apps: Optional[list[str]] = None,):
|
39
47
|
self.database_path = database_path
|
40
48
|
self.tables_paths = tables_paths
|
41
49
|
self.tables = {}
|
42
|
-
self.
|
43
|
-
self.
|
44
|
-
self.
|
45
|
-
self.preload_blob = preload_blob
|
46
|
-
self.return_all_fields = return_all_fields
|
50
|
+
self.tables_app_enum = tables_app_enum
|
51
|
+
self.tables_app_arr = np.array(list(tables_app_enum.values()))
|
52
|
+
self.tables_cat_enum = tables_cat_enum
|
47
53
|
if indices is None:
|
48
54
|
self.set_all_indices(disabled_apps=disabled_apps)
|
49
55
|
else:
|
50
56
|
self.indices = indices
|
51
57
|
|
58
|
+
self.flowstats_features = flowstats_features
|
59
|
+
self.flowstats_features_boolean = flowstats_features_boolean
|
60
|
+
self.flowstats_features_phist = flowstats_features_phist
|
61
|
+
self.other_fields = other_fields
|
62
|
+
self.ppi_channels = ppi_channels
|
63
|
+
self.ppi_transform = ppi_transform
|
64
|
+
self.flowstats_transform = flowstats_transform
|
65
|
+
self.flowstats_phist_transform = flowstats_phist_transform
|
66
|
+
self.target_transform = target_transform
|
67
|
+
self.return_tensors = return_tensors
|
68
|
+
self.return_all_fields = return_all_fields
|
69
|
+
|
70
|
+
self.preload = preload
|
71
|
+
self.preload_blob = preload_blob
|
72
|
+
|
52
73
|
def __getitem__(self, batch_idx):
|
53
74
|
# log.debug(f"worker {self.worker_id}: __getitem__")
|
54
75
|
if self.preload:
|
@@ -57,7 +78,44 @@ class PyTablesDataset(Dataset):
|
|
57
78
|
batch_data = load_data_from_tables(tables=self.tables, indices=self.indices[batch_idx], data_dtype=self.data_dtype)
|
58
79
|
if self.return_all_fields:
|
59
80
|
return (batch_data, batch_idx)
|
60
|
-
|
81
|
+
|
82
|
+
# Prepare data
|
83
|
+
x_ppi = batch_data[PPI_COLUMN].astype("float32")
|
84
|
+
x_ppi = x_ppi[:, self.ppi_channels, :]
|
85
|
+
x_flowstats = structured_to_unstructured(batch_data[self.flowstats_features], dtype="float32")
|
86
|
+
if self.flowstats_features_boolean:
|
87
|
+
x_flowstats_boolean = structured_to_unstructured(batch_data[self.flowstats_features_boolean], dtype="float32")
|
88
|
+
else:
|
89
|
+
x_flowstats_boolean = np.zeros(shape=(x_flowstats.shape[0], 0), dtype="float32")
|
90
|
+
if self.flowstats_features_phist:
|
91
|
+
x_flowstats_phist = structured_to_unstructured(batch_data[self.flowstats_features_phist], dtype="float32")
|
92
|
+
else:
|
93
|
+
x_flowstats_phist = np.zeros(shape=(x_flowstats.shape[0], 0), dtype="float32")
|
94
|
+
# Feature transformations
|
95
|
+
if self.ppi_transform:
|
96
|
+
x_ppi = self.ppi_transform(x_ppi)
|
97
|
+
if self.flowstats_transform:
|
98
|
+
x_flowstats = self.flowstats_transform(x_flowstats)
|
99
|
+
if self.flowstats_phist_transform:
|
100
|
+
x_flowstats_phist = self.flowstats_phist_transform(x_flowstats_phist)
|
101
|
+
x_flowstats = np.concatenate([x_flowstats, x_flowstats_boolean, x_flowstats_phist], axis=1).astype("float32")
|
102
|
+
# Labels transformation
|
103
|
+
labels = self.tables_app_arr[batch_data[APP_COLUMN]]
|
104
|
+
if self.target_transform:
|
105
|
+
labels = self.target_transform(labels)
|
106
|
+
# Prepare dataframe with other fields
|
107
|
+
other_fields_df = pd.DataFrame(batch_data[self.other_fields]) if len(self.other_fields) > 0 else pd.DataFrame()
|
108
|
+
for column in other_fields_df.columns:
|
109
|
+
if other_fields_df[column].dtype.kind == "O":
|
110
|
+
other_fields_df[column] = other_fields_df[column].astype(str)
|
111
|
+
elif column.startswith("TIME_"):
|
112
|
+
other_fields_df[column] = other_fields_df[column].map(lambda x: datetime.fromtimestamp(x))
|
113
|
+
|
114
|
+
if self.return_tensors:
|
115
|
+
x_ppi = torch.from_numpy(x_ppi)
|
116
|
+
x_flowstats = torch.from_numpy(x_flowstats)
|
117
|
+
labels = torch.from_numpy(labels).long() # PyTorch loss functions require long type for labels
|
118
|
+
return_data = (other_fields_df, x_ppi, x_flowstats, labels)
|
61
119
|
return return_data
|
62
120
|
|
63
121
|
def __len__(self):
|
@@ -68,8 +126,6 @@ class PyTablesDataset(Dataset):
|
|
68
126
|
log.debug(f"Initializing dataloader worker id {self.worker_id}")
|
69
127
|
self.database, self.tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
|
70
128
|
atexit.register(self.cleanup)
|
71
|
-
self.app_enum = self.tables[0].get_enum(APP_COLUMN)
|
72
|
-
self.cat_enum = self.tables[0].get_enum(CATEGORY_COLUMN)
|
73
129
|
self.data_dtype = self.tables[0].dtype
|
74
130
|
if self.preload:
|
75
131
|
data = None
|
@@ -86,34 +142,14 @@ class PyTablesDataset(Dataset):
|
|
86
142
|
np.savez_compressed(self.preload_blob, data=self.data)
|
87
143
|
log.debug(f"Finish initialization worker id {self.worker_id}")
|
88
144
|
|
89
|
-
def get_app_enum(self) -> tb.Enum:
|
90
|
-
if self.app_enum:
|
91
|
-
return self.app_enum
|
92
|
-
database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
|
93
|
-
app_enum = tables[0].get_enum(APP_COLUMN)
|
94
|
-
cat_enum = tables[0].get_enum(CATEGORY_COLUMN)
|
95
|
-
self.app_enum, self.cat_enum = app_enum, cat_enum
|
96
|
-
database.close()
|
97
|
-
return app_enum
|
98
|
-
|
99
|
-
def get_cat_enum(self) -> tb.Enum:
|
100
|
-
if self.cat_enum:
|
101
|
-
return self.cat_enum
|
102
|
-
database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
|
103
|
-
app_enum = tables[0].get_enum(APP_COLUMN)
|
104
|
-
cat_enum = tables[0].get_enum(CATEGORY_COLUMN)
|
105
|
-
self.app_enum, self.cat_enum = app_enum, cat_enum
|
106
|
-
database.close()
|
107
|
-
return cat_enum
|
108
|
-
|
109
145
|
def set_all_indices(self, disabled_apps: Optional[list[str]] = None):
|
110
146
|
"""
|
111
147
|
This should be called from the main process, before dataloader workers split the work.
|
112
148
|
Does no filter apps with not enough samples.
|
113
149
|
"""
|
114
150
|
database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
|
115
|
-
|
116
|
-
disabled_apps_ids =
|
151
|
+
inverted_tables_app_enum = {v: k for k, v in self.tables_app_enum.items()}
|
152
|
+
disabled_apps_ids = [inverted_tables_app_enum[app] for app in disabled_apps] if disabled_apps is not None else []
|
117
153
|
base_labels = {}
|
118
154
|
base_indices = {}
|
119
155
|
for i in range(len(tables)):
|
@@ -135,64 +171,9 @@ def worker_init_fn(worker_id):
|
|
135
171
|
dataset = worker_info.dataset
|
136
172
|
dataset.pytables_worker_init(worker_id)
|
137
173
|
|
138
|
-
def
|
139
|
-
flowstats_scaler: Scaler, flowstats_quantiles: np.ndarray,
|
140
|
-
psizes_scaler: Scaler, psizes_max: int,
|
141
|
-
ipt_scaler: Scaler, ipt_min: int, ipt_max: int,
|
142
|
-
use_push_flags: bool, use_packet_histograms: bool, normalize_packet_histograms: bool, zero_ppi_start: int,
|
143
|
-
encoder: LabelEncoder, known_apps: list[str], return_torch: bool = False):
|
144
|
-
other_fields, x_ppi, x_flowstats, labels = batch
|
145
|
-
x_ppi = x_ppi.transpose(0, 2, 1)
|
146
|
-
orig_shape = x_ppi.shape
|
147
|
-
ppi_channels = x_ppi.shape[-1]
|
148
|
-
x_ppi = x_ppi.reshape(-1, ppi_channels)
|
149
|
-
x_ppi[:, IPT_POS] = x_ppi[:, IPT_POS].clip(max=ipt_max, min=ipt_min)
|
150
|
-
x_ppi[:, SIZE_POS] = x_ppi[:, SIZE_POS].clip(max=psizes_max, min=1)
|
151
|
-
padding_mask = x_ppi[:, DIR_POS] == 0 # mask of zero padding
|
152
|
-
if ipt_scaler:
|
153
|
-
x_ppi[:, IPT_POS] = ipt_scaler.transform(x_ppi[:, IPT_POS].reshape(-1, 1)).reshape(-1) # type: ignore
|
154
|
-
if psizes_scaler:
|
155
|
-
x_ppi[:, SIZE_POS] = psizes_scaler.transform(x_ppi[:, SIZE_POS].reshape(-1, 1)).reshape(-1) # type: ignore
|
156
|
-
x_ppi[padding_mask, IPT_POS] = 0
|
157
|
-
x_ppi[padding_mask, SIZE_POS] = 0
|
158
|
-
x_ppi = x_ppi.reshape(orig_shape).transpose(0, 2, 1)
|
159
|
-
if not use_push_flags:
|
160
|
-
x_ppi = x_ppi[:, (IPT_POS, DIR_POS, SIZE_POS), :]
|
161
|
-
if zero_ppi_start > 0:
|
162
|
-
x_ppi[:,:,:zero_ppi_start] = 0
|
163
|
-
|
164
|
-
if use_packet_histograms:
|
165
|
-
x_phist = structured_to_unstructured(x_flowstats[PHISTS_FEATURES], dtype="float32")
|
166
|
-
if normalize_packet_histograms:
|
167
|
-
src_sizes_pkt_count = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]
|
168
|
-
dst_sizes_pkt_count = x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)].sum(axis=1)[:, np.newaxis]
|
169
|
-
np.divide(x_phist[:, :PHIST_BIN_COUNT], src_sizes_pkt_count, out=x_phist[:, :PHIST_BIN_COUNT], where=src_sizes_pkt_count != 0)
|
170
|
-
np.divide(x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], dst_sizes_pkt_count, out=x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count != 0)
|
171
|
-
np.divide(x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], src_sizes_pkt_count - 1, out=x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], where=src_sizes_pkt_count > 1)
|
172
|
-
np.divide(x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], dst_sizes_pkt_count - 1, out=x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count > 1)
|
173
|
-
x_flowstats = structured_to_unstructured(drop_fields(x_flowstats, PHISTS_FEATURES), dtype="float32")
|
174
|
-
x_flowstats = np.concatenate([x_flowstats, x_phist], axis=1)
|
175
|
-
else:
|
176
|
-
x_flowstats = structured_to_unstructured(x_flowstats, dtype="float32")
|
177
|
-
np.clip(x_flowstats[:, :len(FLOWSTATS_TO_SCALE)], a_max=flowstats_quantiles, a_min=0, out=x_flowstats[:, :len(FLOWSTATS_TO_SCALE)])
|
178
|
-
if flowstats_scaler:
|
179
|
-
x_flowstats[:, :len(FLOWSTATS_TO_SCALE)] = flowstats_scaler.transform(x_flowstats[:, :len(FLOWSTATS_TO_SCALE)])
|
180
|
-
|
181
|
-
other_fields_df = pd.DataFrame(other_fields) if len(other_fields) > 0 else pd.DataFrame()
|
182
|
-
for column in other_fields_df.columns:
|
183
|
-
if other_fields_df[column].dtype.kind == "O":
|
184
|
-
other_fields_df[column] = other_fields_df[column].astype(str)
|
185
|
-
elif column.startswith("TIME_"):
|
186
|
-
other_fields_df[column] = other_fields_df[column].map(lambda x: datetime.fromtimestamp(x))
|
187
|
-
|
188
|
-
labels = encoder.transform(np.where(np.isin(labels, known_apps), labels, UNKNOWN_STR_LABEL)).astype("int64") # type: ignore
|
189
|
-
if return_torch:
|
190
|
-
return other_fields_df, torch.from_numpy(x_ppi), torch.from_numpy(x_flowstats), torch.from_numpy(labels)
|
191
|
-
return other_fields_df, x_ppi, x_flowstats, labels
|
192
|
-
|
193
|
-
def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFrame, database_path: str, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, dict[int, str], dict[int, str]]:
|
174
|
+
def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
|
194
175
|
database, train_tables = load_database(database_path, tables_paths=train_data_params.train_tables_paths)
|
195
|
-
|
176
|
+
inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
|
196
177
|
all_app_labels = {}
|
197
178
|
app_counts = pd.Series(dtype="int64")
|
198
179
|
start_time = time.time()
|
@@ -204,15 +185,16 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
|
|
204
185
|
# Handle disabled apps and apps with less than min_samples_per_app samples
|
205
186
|
if len(train_data_params.disabled_apps) > 0:
|
206
187
|
log.info(f"Disabled applications in dataset config: {sorted(train_data_params.disabled_apps)}")
|
207
|
-
disabled_apps_ids =
|
188
|
+
disabled_apps_ids = [inverted_tables_app_enum[app] for app in train_data_params.disabled_apps]
|
208
189
|
min_samples_apps_ids = set(app_counts[app_counts<train_data_params.min_train_samples_per_app].index.tolist())
|
209
190
|
if len(min_samples_apps_ids) > 0:
|
191
|
+
min_samples_apps_names = sorted([tables_app_enum[app_id] for app_id in min_samples_apps_ids])
|
210
192
|
if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
|
211
|
-
warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {
|
193
|
+
warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
|
212
194
|
"To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
|
213
195
|
exit()
|
214
196
|
elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
|
215
|
-
log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {
|
197
|
+
log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
|
216
198
|
"Disabling these applications")
|
217
199
|
disabled_apps_ids.extend(min_samples_apps_ids)
|
218
200
|
# Base indices are indices of samples that are not disabled and have enough samples
|
@@ -221,9 +203,9 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
|
|
221
203
|
base_indices[i] = np.nonzero(np.isin(all_app_labels[i], disabled_apps_ids, invert=True))[0]
|
222
204
|
base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in all_app_labels.items()}
|
223
205
|
# Apps selection
|
224
|
-
if train_data_params.apps_selection != AppSelection.
|
206
|
+
if train_data_params.apps_selection != AppSelection.FIXED:
|
225
207
|
app_counts = app_counts[[app for app in app_counts.index.tolist() if app not in disabled_apps_ids]]
|
226
|
-
app_counts.index = app_counts.index.map(
|
208
|
+
app_counts.index = app_counts.index.map(tables_app_enum)
|
227
209
|
app_counts = app_counts.sort_values(ascending=False).astype("int64")
|
228
210
|
sorted_apps = app_counts.index.to_list()
|
229
211
|
if train_data_params.apps_selection == AppSelection.ALL_KNOWN:
|
@@ -233,31 +215,26 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
|
|
233
215
|
known_apps, unknown_apps = split_apps_topx_with_provider_groups(sorted_apps=sorted_apps, known_count=train_data_params.apps_selection_topx, servicemap=servicemap)
|
234
216
|
if len(known_apps) < train_data_params.apps_selection_topx:
|
235
217
|
warnings.warn(f"The number of known applications ({len(known_apps)}) is lower than requested in config.apps_selection_topx ({train_data_params.apps_selection_topx}).")
|
236
|
-
elif train_data_params.apps_selection == AppSelection.
|
237
|
-
unknown_apps = train_data_params.
|
238
|
-
missing_unknown_apps = [app for app in unknown_apps if app not in sorted_apps]
|
239
|
-
if len(missing_unknown_apps) > 0:
|
240
|
-
raise ValueError(f"Applications configured in config.apps_selection_explicit_unknown are not present in the dataset (or might be disabled): {sorted(missing_unknown_apps)}")
|
218
|
+
elif train_data_params.apps_selection == AppSelection.BACKGROUND_UNKNOWN:
|
219
|
+
unknown_apps = train_data_params.apps_selection_background_unknown
|
241
220
|
known_apps = [app for app in sorted_apps if not (is_background_app(app) or app in unknown_apps)]
|
242
221
|
else: assert_never(train_data_params.apps_selection)
|
243
|
-
|
244
222
|
log.info(f"Selected {len(known_apps)} known applications and {len(unknown_apps)} unknown applications")
|
245
|
-
known_apps_database_enum: dict[int, str] = {int(app_enum[app]): app for app in known_apps}
|
246
|
-
unknown_apps_database_enum: dict[int, str] = {int(app_enum[app]): app for app in unknown_apps}
|
247
223
|
else:
|
248
|
-
|
249
|
-
|
250
|
-
known_apps_ids =
|
251
|
-
unknown_apps_ids =
|
224
|
+
known_apps = train_data_params.apps_selection_fixed_known
|
225
|
+
unknown_apps = train_data_params.apps_selection_fixed_unknown
|
226
|
+
known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
|
227
|
+
unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]
|
252
228
|
|
253
229
|
train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
|
254
230
|
rng.shuffle(train_known_indices)
|
255
231
|
rng.shuffle(train_unknown_indices)
|
256
232
|
log.info(f"Processing train indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
|
257
|
-
return train_known_indices, train_unknown_indices,
|
233
|
+
return train_known_indices, train_unknown_indices, known_apps, unknown_apps
|
258
234
|
|
259
|
-
def init_test_indices(test_data_params: TestDataParams, database_path: str, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
|
235
|
+
def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
|
260
236
|
database, test_tables = load_database(database_path, tables_paths=test_data_params.test_tables_paths)
|
237
|
+
inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
|
261
238
|
base_labels = {}
|
262
239
|
base_indices = {}
|
263
240
|
start_time = time.time()
|
@@ -266,8 +243,8 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, rng:
|
|
266
243
|
log.info(f"Reading app column for test table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
|
267
244
|
base_indices[i] = np.arange(len(test_tables[i]))
|
268
245
|
database.close()
|
269
|
-
known_apps_ids =
|
270
|
-
unknown_apps_ids =
|
246
|
+
known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
|
247
|
+
unknown_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.unknown_apps]
|
271
248
|
test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
|
272
249
|
rng.shuffle(test_known_indices)
|
273
250
|
rng.shuffle(test_unknown_indices)
|
@@ -311,7 +288,7 @@ def load_data_from_tables(tables, indices: np.ndarray, data_dtype: np.dtype) ->
|
|
311
288
|
sorted_indices = indices[indices[:, INDICES_TABLE_POS].argsort(kind="stable")]
|
312
289
|
unique_tables, split_bounderies = np.unique(sorted_indices[:, INDICES_TABLE_POS], return_index=True)
|
313
290
|
indices_per_table = np.split(sorted_indices, split_bounderies[1:])
|
314
|
-
data = np.
|
291
|
+
data = np.zeros(len(indices), dtype=data_dtype)
|
315
292
|
for table_id, table_indices in zip(unique_tables, indices_per_table):
|
316
293
|
data[np.where(indices[:, INDICES_TABLE_POS] == table_id)[0]] = tables[table_id].read_coordinates(table_indices[:, INDICES_INDEX_POS])
|
317
294
|
return data
|
@@ -10,10 +10,11 @@ from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROV
|
|
10
10
|
|
11
11
|
@dataclass()
|
12
12
|
class ClassInfo:
|
13
|
-
target_names: list[str]
|
14
13
|
num_classes: int
|
15
14
|
known_apps: list[str]
|
16
15
|
unknown_apps: list[str]
|
16
|
+
encoder: LabelEncoder
|
17
|
+
target_names: list[str]
|
17
18
|
unknown_class_label: int
|
18
19
|
group_matrix: np.ndarray
|
19
20
|
has_provider: dict[str, bool]
|
@@ -21,9 +22,9 @@ class ClassInfo:
|
|
21
22
|
provider_members: dict[str, list[str]]
|
22
23
|
categories_mapping: dict[str, Optional[str]]
|
23
24
|
|
24
|
-
def create_class_info(servicemap: Any, encoder: LabelEncoder,
|
25
|
-
known_apps = sorted(
|
26
|
-
unknown_apps = sorted(
|
25
|
+
def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps: list[str], unknown_apps: list[str]) -> ClassInfo:
|
26
|
+
known_apps = sorted(known_apps)
|
27
|
+
unknown_apps = sorted(unknown_apps)
|
27
28
|
target_names_arr = encoder.classes_
|
28
29
|
assert known_apps == list(target_names_arr[:-1])
|
29
30
|
group_matrix = np.array([[a == b or
|
@@ -37,10 +38,11 @@ def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps_databas
|
|
37
38
|
provider_members = {p: [app for app in target_names_arr if provider_mapping[app] == p] for p in providers}
|
38
39
|
categories_mapping = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr}
|
39
40
|
return ClassInfo(
|
40
|
-
target_names=list(target_names_arr),
|
41
41
|
num_classes=len(known_apps),
|
42
42
|
known_apps=known_apps,
|
43
43
|
unknown_apps=unknown_apps,
|
44
|
+
encoder=encoder,
|
45
|
+
target_names=list(target_names_arr),
|
44
46
|
unknown_class_label=len(known_apps),
|
45
47
|
group_matrix=group_matrix,
|
46
48
|
has_provider=has_provider,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cesnet-datazoo
|
3
|
-
Version: 0.0
|
3
|
+
Version: 0.1.0
|
4
4
|
Summary: A toolkit for large network traffic datasets
|
5
5
|
Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
6
6
|
Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
@@ -16,6 +16,7 @@ Classifier: Operating System :: OS Independent
|
|
16
16
|
Requires-Python: >=3.10
|
17
17
|
Description-Content-Type: text/markdown
|
18
18
|
License-File: LICENCE
|
19
|
+
Requires-Dist: cesnet-models
|
19
20
|
Requires-Dist: matplotlib
|
20
21
|
Requires-Dist: numpy
|
21
22
|
Requires-Dist: pandas
|
@@ -0,0 +1,30 @@
|
|
1
|
+
cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
cesnet_datazoo/config.py,sha256=vvNyM7TCMolH-uLj3ant7rGkYb_2FPyCWlRQ3mllKWs,37427
|
3
|
+
cesnet_datazoo/constants.py,sha256=EDeeo0xrBt_pnWf3m-ZTiC5HMvyVwcikgCZ9LwZIcAE,1276
|
4
|
+
cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
|
5
|
+
cesnet_datazoo/datasets/cesnet_dataset.py,sha256=zoLFduBg6ZK96zoec0kEMB1hFCGn3QOtBtYFTcCbIU0,46546
|
6
|
+
cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
|
7
|
+
cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
|
8
|
+
cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
|
9
|
+
cesnet_datazoo/datasets/statistics.py,sha256=wR8QISIh-KC7CQ5SjN7WoTMFaoRuq0G7pgTFGhC8ek0,15137
|
10
|
+
cesnet_datazoo/datasets/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
+
cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=Ntlp8mHUSr7g-ZTvtBVh238TswZHwGAudMuE52-OA-c,1608
|
12
|
+
cesnet_datazoo/datasets/metadata/metadata.csv,sha256=or0CB7t06G_V1OzClqtpx7sRt_ZoQWE_f7F5SDLlPC8,2175
|
13
|
+
cesnet_datazoo/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
cesnet_datazoo/metrics/classification_report.py,sha256=0JgKWyB281m3EHxI8miMKTjKg3mzyV1WTQndXg_B7i0,4040
|
15
|
+
cesnet_datazoo/metrics/provider_metrics.py,sha256=sRg2bdRTzLLTmiVjacBtGez4LEIfr35hSvMBwW-W73U,1303
|
16
|
+
cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
|
18
|
+
cesnet_datazoo/pytables_data/data_scalers.py,sha256=IfTymhVubjLNetjOIxDhtzkETp_1xmFXbC0rSjQHVUQ,7254
|
19
|
+
cesnet_datazoo/pytables_data/indices_setup.py,sha256=rBW1HwebPXkwLRuKg9ILO_LfUrfnJfqQYsrIAYfXtZo,12932
|
20
|
+
cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=kCvbOgIseBdUUGz3nRr7oGsuN2JicXGlsp8-Z9n4JyM,17599
|
21
|
+
cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
+
cesnet_datazoo/utils/class_info.py,sha256=zMt2ndfwvtnE5QOKS1OPbw8VUlsCCbB_SVjzyFn1Wdw,2540
|
23
|
+
cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
|
24
|
+
cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
|
25
|
+
cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
|
26
|
+
cesnet_datazoo-0.1.0.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
|
27
|
+
cesnet_datazoo-0.1.0.dist-info/METADATA,sha256=c6GMIPE5rkiZtsbGNv28405o-G02J4wyvP-DJL8BfJM,12679
|
28
|
+
cesnet_datazoo-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
29
|
+
cesnet_datazoo-0.1.0.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
|
30
|
+
cesnet_datazoo-0.1.0.dist-info/RECORD,,
|
@@ -1,29 +0,0 @@
|
|
1
|
-
cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
cesnet_datazoo/config.py,sha256=ZNjCM85XFl3jTMj6UsnsEBvmxNxJbiNUhvBx4dEiyw8,37711
|
3
|
-
cesnet_datazoo/constants.py,sha256=EliK-KvW3GXeKw00W_Pd-ypJMwvFQVqMQS9A9ULyTj4,1420
|
4
|
-
cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
|
5
|
-
cesnet_datazoo/datasets/cesnet_dataset.py,sha256=h7OfmxrAAwMouSQFbKcOhHWJMaZznePxuOw1h8g2Oa0,43399
|
6
|
-
cesnet_datazoo/datasets/datasets.py,sha256=gj7jflxqDgEfHXSFUz6JOW2x8wEUSCqVe6KapaK4IKg,2279
|
7
|
-
cesnet_datazoo/datasets/loaders.py,sha256=HU2Au0P87BCAvdgpiwO5T0xgeQgs_gL4E1d12OP1JoQ,1803
|
8
|
-
cesnet_datazoo/datasets/statistics.py,sha256=GoM7-vFTvqx9ym239VCZd1os2TdoxLOW7WNpNtOU7Fc,14030
|
9
|
-
cesnet_datazoo/datasets/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
-
cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=OZf-NMai2XuSg57y2IdV-804ZpPcmI9sWoDu8IO7e4Y,1567
|
11
|
-
cesnet_datazoo/datasets/metadata/metadata.csv,sha256=Zr2hf9qpJpPE_Js9XmyaHffdho912ikdQfFVQx6q8NE,2161
|
12
|
-
cesnet_datazoo/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
-
cesnet_datazoo/metrics/classification_report.py,sha256=0JgKWyB281m3EHxI8miMKTjKg3mzyV1WTQndXg_B7i0,4040
|
14
|
-
cesnet_datazoo/metrics/provider_metrics.py,sha256=sRg2bdRTzLLTmiVjacBtGez4LEIfr35hSvMBwW-W73U,1303
|
15
|
-
cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
-
cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
|
17
|
-
cesnet_datazoo/pytables_data/data_scalers.py,sha256=dRWHiOxe0VhfhYaOviAO5o8uitpehjHeaRwjWZhDEQA,11468
|
18
|
-
cesnet_datazoo/pytables_data/indices_setup.py,sha256=IraCOFys0p7ZojR-0E99bKN9dTjwCfQO4L6lMqcTEFg,13070
|
19
|
-
cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=dE_9DOgkwKgw2WZHLZAmTvrqc22f5D3drFE9c2bDpxo,19744
|
20
|
-
cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
cesnet_datazoo/utils/class_info.py,sha256=ync9U3PWo0DloRwX3uMgKW798kC6echioEbEgrPqY4E,2567
|
22
|
-
cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
|
23
|
-
cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
|
24
|
-
cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
|
25
|
-
cesnet_datazoo-0.0.17.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
|
26
|
-
cesnet_datazoo-0.0.17.dist-info/METADATA,sha256=iRj-jZEsmH6YZEfR-KFDP9yJAaC7gI3V24jkNzGUWsU,12650
|
27
|
-
cesnet_datazoo-0.0.17.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
28
|
-
cesnet_datazoo-0.0.17.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
|
29
|
-
cesnet_datazoo-0.0.17.dist-info/RECORD,,
|
File without changes
|
File without changes
|