cesnet-datazoo 0.0.16__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

cesnet_datazoo/pytables_data/pytables_dataset.py

@@ -4,62 +4,118 @@ import os
  import time
  import warnings
  from datetime import datetime
- from typing import Any, Optional
+ from typing import Any, Callable, Optional

  import numpy as np
  import pandas as pd
  import tables as tb
  import torch
- from numpy.lib.recfunctions import drop_fields, structured_to_unstructured
- from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
+ from numpy.lib.recfunctions import structured_to_unstructured
  from torch.utils.data import Dataset
  from typing_extensions import assert_never

- from cesnet_datazoo.config import (AppSelection, DatasetConfig, MinTrainSamplesCheck, Scaler,
-                                    ScalerEnum, TestDataParams, TrainDataParams)
- from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DIR_POS, FLOWSTATS_NO_CLIP,
-                                       FLOWSTATS_TO_SCALE, INDICES_INDEX_POS, INDICES_TABLE_POS,
-                                       IPT_POS, PHIST_BIN_COUNT, PHISTS_FEATURES, PPI_COLUMN,
-                                       SIZE_POS, UNKNOWN_STR_LABEL)
+ from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
+                                    TrainDataParams)
+ from cesnet_datazoo.constants import APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN
  from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
                                                       split_apps_topx_with_provider_groups)
- from cesnet_datazoo.utils.fileutils import pickle_dump, pickle_load
- from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator

  log = logging.getLogger(__name__)


  class PyTablesDataset(Dataset):
-     def __init__(self, database_path: str,
+     def __init__(self,
+                  database_path: str,
                   tables_paths: list[str],
                   indices: Optional[np.ndarray],
+                  tables_app_enum: dict[int, str],
+                  tables_cat_enum: dict[int, str],
                   flowstats_features: list[str],
-                  other_fields: Optional[list[str]] = None,
-                  preload: bool = False, preload_blob: Optional[str] = None,
-                  disabled_apps: Optional[list[str]] = None,
-                  return_all_fields: bool = False,):
+                  flowstats_features_boolean: list[str],
+                  flowstats_features_phist: list[str],
+                  other_fields: list[str],
+                  ppi_channels: list[int],
+                  ppi_transform: Optional[Callable] = None,
+                  flowstats_transform: Optional[Callable] = None,
+                  flowstats_phist_transform: Optional[Callable] = None,
+                  target_transform: Optional[Callable] = None,
+                  return_tensors: bool = False,
+                  return_all_fields: bool = False,
+                  preload: bool = False,
+                  preload_blob: Optional[str] = None,
+                  disabled_apps: Optional[list[str]] = None,):
          self.database_path = database_path
          self.tables_paths = tables_paths
          self.tables = {}
-         self.flowstats_features = flowstats_features
-         self.other_fields = other_fields if other_fields is not None else []
-         self.preload = preload
-         self.preload_blob = preload_blob
-         self.return_all_fields = return_all_fields
+         self.tables_app_enum = tables_app_enum
+         self.tables_app_arr = np.array(list(tables_app_enum.values()))
+         self.tables_cat_enum = tables_cat_enum
          if indices is None:
              self.set_all_indices(disabled_apps=disabled_apps)
          else:
              self.indices = indices

+         self.flowstats_features = flowstats_features
+         self.flowstats_features_boolean = flowstats_features_boolean
+         self.flowstats_features_phist = flowstats_features_phist
+         self.other_fields = other_fields
+         self.ppi_channels = ppi_channels
+         self.ppi_transform = ppi_transform
+         self.flowstats_transform = flowstats_transform
+         self.flowstats_phist_transform = flowstats_phist_transform
+         self.target_transform = target_transform
+         self.return_tensors = return_tensors
+         self.return_all_fields = return_all_fields
+
+         self.preload = preload
+         self.preload_blob = preload_blob
+
      def __getitem__(self, batch_idx):
          # log.debug(f"worker {self.worker_id}: __getitem__")
          if self.preload:
              batch_data = self.data[batch_idx]
          else:
-             batch_data = load_data_from_pytables(tables=self.tables, indices=self.indices[batch_idx], data_dtype=self.data_dtype)
+             batch_data = load_data_from_tables(tables=self.tables, indices=self.indices[batch_idx], data_dtype=self.data_dtype)
          if self.return_all_fields:
              return (batch_data, batch_idx)
-         return_data = (batch_data[self.other_fields], batch_data[PPI_COLUMN].astype("float32"), batch_data[self.flowstats_features], list(map(self.app_enum, batch_data[APP_COLUMN])))
+
+         # Prepare data
+         x_ppi = batch_data[PPI_COLUMN].astype("float32")
+         x_ppi = x_ppi[:, self.ppi_channels, :]
+         x_flowstats = structured_to_unstructured(batch_data[self.flowstats_features], dtype="float32")
+         if self.flowstats_features_boolean:
+             x_flowstats_boolean = structured_to_unstructured(batch_data[self.flowstats_features_boolean], dtype="float32")
+         else:
+             x_flowstats_boolean = np.zeros(shape=(x_flowstats.shape[0], 0), dtype="float32")
+         if self.flowstats_features_phist:
+             x_flowstats_phist = structured_to_unstructured(batch_data[self.flowstats_features_phist], dtype="float32")
+         else:
+             x_flowstats_phist = np.zeros(shape=(x_flowstats.shape[0], 0), dtype="float32")
+         # Feature transformations
+         if self.ppi_transform:
+             x_ppi = self.ppi_transform(x_ppi)
+         if self.flowstats_transform:
+             x_flowstats = self.flowstats_transform(x_flowstats)
+         if self.flowstats_phist_transform:
+             x_flowstats_phist = self.flowstats_phist_transform(x_flowstats_phist)
+         x_flowstats = np.concatenate([x_flowstats, x_flowstats_boolean, x_flowstats_phist], axis=1).astype("float32")
+         # Labels transformation
+         labels = self.tables_app_arr[batch_data[APP_COLUMN]]
+         if self.target_transform:
+             labels = self.target_transform(labels)
+         # Prepare dataframe with other fields
+         other_fields_df = pd.DataFrame(batch_data[self.other_fields]) if len(self.other_fields) > 0 else pd.DataFrame()
+         for column in other_fields_df.columns:
+             if other_fields_df[column].dtype.kind == "O":
+                 other_fields_df[column] = other_fields_df[column].astype(str)
+             elif column.startswith("TIME_"):
+                 other_fields_df[column] = other_fields_df[column].map(lambda x: datetime.fromtimestamp(x))
+
+         if self.return_tensors:
+             x_ppi = torch.from_numpy(x_ppi)
+             x_flowstats = torch.from_numpy(x_flowstats)
+             labels = torch.from_numpy(labels).long() # PyTorch loss functions require long type for labels
+         return_data = (other_fields_df, x_ppi, x_flowstats, labels)
          return return_data

      def __len__(self):
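
The refactored constructor above moves feature scaling out of the dataset: instead of fitted scalers, the caller now supplies optional ppi_transform, flowstats_transform, flowstats_phist_transform, and target_transform callables, and labels are decoded through the pre-built tables_app_enum mapping rather than per-worker PyTables enum lookups. A minimal construction sketch follows; only the parameter names are taken from the diff, while all concrete values (paths, table nodes, feature and application names) are hypothetical placeholders — in practice they come from a DatasetConfig, and transforms presumably from the new cesnet-models dependency:

from cesnet_datazoo.pytables_data.pytables_dataset import PyTablesDataset

dataset = PyTablesDataset(
    database_path="data/database.h5",          # hypothetical path
    tables_paths=["/flows/table_0"],           # hypothetical table node
    indices=None,                              # None -> set_all_indices() selects all rows
    tables_app_enum={0: "app-a", 1: "app-b"},  # table id -> application name
    tables_cat_enum={0: "category-a"},
    flowstats_features=["DURATION", "BYTES"],  # hypothetical flowstats columns
    flowstats_features_boolean=[],
    flowstats_features_phist=[],
    other_fields=[],
    ppi_channels=[0, 1, 2],                    # PPI channels to keep
    ppi_transform=None,                        # optional callables applied in __getitem__
    flowstats_transform=None,
    target_transform=None,
    return_tensors=True,                       # return torch tensors instead of numpy arrays
)

Note that __getitem__ receives a whole batch of row positions (self.indices[batch_idx] must stay two-dimensional for load_data_from_tables), so the dataset is meant to be driven by a batch sampler rather than indexed one row at a time.
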
@@ -70,8 +126,6 @@ class PyTablesDataset(Dataset):
          log.debug(f"Initializing dataloader worker id {self.worker_id}")
          self.database, self.tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
          atexit.register(self.cleanup)
-         self.app_enum = self.tables[0].get_enum(APP_COLUMN)
-         self.cat_enum = self.tables[0].get_enum(CATEGORY_COLUMN)
          self.data_dtype = self.tables[0].dtype
          if self.preload:
              data = None
@@ -82,40 +136,20 @@
              except:
                  pass # ignore if the file is corrupted (or being written at the moment)
              if data is None:
-                 data = load_data_from_pytables(tables=self.tables, indices=self.indices, data_dtype=self.data_dtype)
+                 data = load_data_from_tables(tables=self.tables, indices=self.indices, data_dtype=self.data_dtype)
              self.data = data
              if self.preload_blob and not os.path.isfile(self.preload_blob):
                  np.savez_compressed(self.preload_blob, data=self.data)
          log.debug(f"Finish initialization worker id {self.worker_id}")

-     def get_app_enum(self) -> tb.Enum:
-         if self.app_enum:
-             return self.app_enum
-         database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
-         app_enum = tables[0].get_enum(APP_COLUMN)
-         cat_enum = tables[0].get_enum(CATEGORY_COLUMN)
-         self.app_enum, self.cat_enum = app_enum, cat_enum
-         database.close()
-         return app_enum
-
-     def get_cat_enum(self) -> tb.Enum:
-         if self.cat_enum:
-             return self.cat_enum
-         database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
-         app_enum = tables[0].get_enum(APP_COLUMN)
-         cat_enum = tables[0].get_enum(CATEGORY_COLUMN)
-         self.app_enum, self.cat_enum = app_enum, cat_enum
-         database.close()
-         return cat_enum
-
      def set_all_indices(self, disabled_apps: Optional[list[str]] = None):
          """
          This should be called from the main process, before dataloader workers split the work.
          Does not filter apps with not enough samples.
          """
          database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
-         app_enum = tables[0].get_enum(APP_COLUMN)
-         disabled_apps_ids = list(map(lambda x: app_enum[x], disabled_apps)) if disabled_apps is not None else []
+         inverted_tables_app_enum = {v: k for k, v in self.tables_app_enum.items()}
+         disabled_apps_ids = [inverted_tables_app_enum[app] for app in disabled_apps] if disabled_apps is not None else []
          base_labels = {}
          base_indices = {}
          for i in range(len(tables)):
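
The inverted-enum pattern introduced here replaces the old tables[0].get_enum(APP_COLUMN) lookups and recurs in init_train_indices and init_test_indices below. In isolation, with a hypothetical mapping:

tables_app_enum = {0: "app-a", 1: "app-b"}  # hypothetical id -> name mapping
inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
assert inverted_tables_app_enum["app-b"] == 1  # name -> id, used to resolve disabled apps
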
@@ -137,64 +171,9 @@ def worker_init_fn(worker_id):
      dataset = worker_info.dataset
      dataset.pytables_worker_init(worker_id)

- def pytables_collate_fn(batch: tuple,
-                         flowstats_scaler: Scaler, flowstats_quantiles: pd.Series,
-                         psizes_scaler: Scaler, psizes_max: int,
-                         ipt_scaler: Scaler, ipt_min: int, ipt_max: int,
-                         use_push_flags: bool, use_packet_histograms: bool, normalize_packet_histograms: bool, zero_ppi_start: int,
-                         encoder: LabelEncoder, known_apps: list[str], return_torch: bool = False):
-     other_fields, x_ppi, x_flowstats, labels = batch
-     x_ppi = x_ppi.transpose(0, 2, 1)
-     orig_shape = x_ppi.shape
-     ppi_channels = x_ppi.shape[-1]
-     x_ppi = x_ppi.reshape(-1, ppi_channels)
-     x_ppi[:, IPT_POS] = x_ppi[:, IPT_POS].clip(max=ipt_max, min=ipt_min)
-     x_ppi[:, SIZE_POS] = x_ppi[:, SIZE_POS].clip(max=psizes_max, min=1)
-     padding_mask = x_ppi[:, DIR_POS] == 0 # mask of zero padding
-     if ipt_scaler:
-         x_ppi[:, IPT_POS] = ipt_scaler.transform(x_ppi[:, IPT_POS].reshape(-1, 1)).reshape(-1) # type: ignore
-     if psizes_scaler:
-         x_ppi[:, SIZE_POS] = psizes_scaler.transform(x_ppi[:, SIZE_POS].reshape(-1, 1)).reshape(-1) # type: ignore
-     x_ppi[padding_mask, IPT_POS] = 0
-     x_ppi[padding_mask, SIZE_POS] = 0
-     x_ppi = x_ppi.reshape(orig_shape).transpose(0, 2, 1)
-     if not use_push_flags:
-         x_ppi = x_ppi[:, (IPT_POS, DIR_POS, SIZE_POS), :]
-     if zero_ppi_start > 0:
-         x_ppi[:,:,:zero_ppi_start] = 0
-
-     if use_packet_histograms:
-         x_phist = structured_to_unstructured(x_flowstats[PHISTS_FEATURES], dtype="float32")
-         if normalize_packet_histograms:
-             src_sizes_pkt_count = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]
-             dst_sizes_pkt_count = x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)].sum(axis=1)[:, np.newaxis]
-             np.divide(x_phist[:, :PHIST_BIN_COUNT], src_sizes_pkt_count, out=x_phist[:, :PHIST_BIN_COUNT], where=src_sizes_pkt_count != 0)
-             np.divide(x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], dst_sizes_pkt_count, out=x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count != 0)
-             np.divide(x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], src_sizes_pkt_count - 1, out=x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], where=src_sizes_pkt_count > 1)
-             np.divide(x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], dst_sizes_pkt_count - 1, out=x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count > 1)
-         x_flowstats = structured_to_unstructured(drop_fields(x_flowstats, PHISTS_FEATURES), dtype="float32")
-         x_flowstats = np.concatenate([x_flowstats, x_phist], axis=1)
-     else:
-         x_flowstats = structured_to_unstructured(x_flowstats, dtype="float32")
-     np.clip(x_flowstats[:, :len(FLOWSTATS_TO_SCALE)], a_max=flowstats_quantiles, a_min=0, out=x_flowstats[:, :len(FLOWSTATS_TO_SCALE)])
-     if flowstats_scaler:
-         x_flowstats[:, :len(FLOWSTATS_TO_SCALE)] = flowstats_scaler.transform(x_flowstats[:, :len(FLOWSTATS_TO_SCALE)])
-
-     other_fields_df = pd.DataFrame(other_fields) if len(other_fields) > 0 else pd.DataFrame()
-     for column in other_fields_df.columns:
-         if other_fields_df[column].dtype.kind == "O":
-             other_fields_df[column] = other_fields_df[column].astype(str)
-         elif column.startswith("TIME_"):
-             other_fields_df[column] = other_fields_df[column].map(lambda x: datetime.fromtimestamp(x))
-
-     labels = encoder.transform(np.where(np.isin(labels, known_apps), labels, UNKNOWN_STR_LABEL)).astype("int64") # type: ignore
-     if return_torch:
-         return other_fields_df, torch.from_numpy(x_ppi), torch.from_numpy(x_flowstats), torch.from_numpy(labels)
-     return other_fields_df, x_ppi, x_flowstats, labels
-
- def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFrame, database_path: str, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, dict[int, str], dict[int, str]]:
+ def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
      database, train_tables = load_database(database_path, tables_paths=train_data_params.train_tables_paths)
-     app_enum = train_tables[0].get_enum(APP_COLUMN)
+     inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
      all_app_labels = {}
      app_counts = pd.Series(dtype="int64")
      start_time = time.time()
@@ -206,15 +185,16 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
206
185
  # Handle disabled apps and apps with less than min_samples_per_app samples
207
186
  if len(train_data_params.disabled_apps) > 0:
208
187
  log.info(f"Disabled applications in dataset config: {sorted(train_data_params.disabled_apps)}")
209
- disabled_apps_ids = list(map(lambda x: app_enum[x], train_data_params.disabled_apps))
188
+ disabled_apps_ids = [inverted_tables_app_enum[app] for app in train_data_params.disabled_apps]
210
189
  min_samples_apps_ids = set(app_counts[app_counts<train_data_params.min_train_samples_per_app].index.tolist())
211
190
  if len(min_samples_apps_ids) > 0:
191
+ min_samples_apps_names = sorted([tables_app_enum[app_id] for app_id in min_samples_apps_ids])
212
192
  if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
213
- warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {sorted(map(app_enum, min_samples_apps_ids))}. " +
193
+ warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
214
194
  "To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
215
195
  exit()
216
196
  elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
217
- log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {sorted(map(app_enum, min_samples_apps_ids))}. " +
197
+ log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
218
198
  "Disabling these applications")
219
199
  disabled_apps_ids.extend(min_samples_apps_ids)
220
200
  # Base indices are indices of samples that are not disabled and have enough samples
@@ -223,9 +203,9 @@
          base_indices[i] = np.nonzero(np.isin(all_app_labels[i], disabled_apps_ids, invert=True))[0]
      base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in all_app_labels.items()}
      # Apps selection
-     if train_data_params.apps_selection != AppSelection.LONGTERM_FIXED:
+     if train_data_params.apps_selection != AppSelection.FIXED:
          app_counts = app_counts[[app for app in app_counts.index.tolist() if app not in disabled_apps_ids]]
-         app_counts.index = app_counts.index.map(app_enum)
+         app_counts.index = app_counts.index.map(tables_app_enum)
          app_counts = app_counts.sort_values(ascending=False).astype("int64")
          sorted_apps = app_counts.index.to_list()
          if train_data_params.apps_selection == AppSelection.ALL_KNOWN:
@@ -235,31 +215,26 @@
              known_apps, unknown_apps = split_apps_topx_with_provider_groups(sorted_apps=sorted_apps, known_count=train_data_params.apps_selection_topx, servicemap=servicemap)
              if len(known_apps) < train_data_params.apps_selection_topx:
                  warnings.warn(f"The number of known applications ({len(known_apps)}) is lower than requested in config.apps_selection_topx ({train_data_params.apps_selection_topx}).")
-         elif train_data_params.apps_selection == AppSelection.EXPLICIT_UNKNOWN:
-             unknown_apps = train_data_params.apps_selection_explicit_unknown
-             missing_unknown_apps = [app for app in unknown_apps if app not in sorted_apps]
-             if len(missing_unknown_apps) > 0:
-                 raise ValueError(f"Applications configured in config.apps_selection_explicit_unknown are not present in the dataset (or might be disabled): {sorted(missing_unknown_apps)}")
+         elif train_data_params.apps_selection == AppSelection.BACKGROUND_UNKNOWN:
+             unknown_apps = train_data_params.apps_selection_background_unknown
              known_apps = [app for app in sorted_apps if not (is_background_app(app) or app in unknown_apps)]
          else: assert_never(train_data_params.apps_selection)
-
          log.info(f"Selected {len(known_apps)} known applications and {len(unknown_apps)} unknown applications")
-         known_apps_database_enum: dict[int, str] = {int(app_enum[app]): app for app in known_apps}
-         unknown_apps_database_enum: dict[int, str] = {int(app_enum[app]): app for app in unknown_apps}
      else:
-         assert train_data_params.apps_selection_fixed_longterm is not None
-         known_apps_database_enum, unknown_apps_database_enum = train_data_params.apps_selection_fixed_longterm
-     known_apps_ids = list(known_apps_database_enum)
-     unknown_apps_ids = list(unknown_apps_database_enum)
+         known_apps = train_data_params.apps_selection_fixed_known
+         unknown_apps = train_data_params.apps_selection_fixed_unknown
+     known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
+     unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]

      train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
      rng.shuffle(train_known_indices)
      rng.shuffle(train_unknown_indices)
      log.info(f"Processing train indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
-     return train_known_indices, train_unknown_indices, known_apps_database_enum, unknown_apps_database_enum
+     return train_known_indices, train_unknown_indices, known_apps, unknown_apps

- def init_test_indices(test_data_params: TestDataParams, database_path: str, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
+ def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
      database, test_tables = load_database(database_path, tables_paths=test_data_params.test_tables_paths)
+     inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
      base_labels = {}
      base_indices = {}
      start_time = time.time()
@@ -268,115 +243,14 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, rng:
          log.info(f"Reading app column for test table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
          base_indices[i] = np.arange(len(test_tables[i]))
      database.close()
-     known_apps_ids = list(test_data_params.known_apps_database_enum)
-     unknown_apps_ids = list(test_data_params.unknown_apps_database_enum)
+     known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
+     unknown_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.unknown_apps]
      test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
      rng.shuffle(test_known_indices)
      rng.shuffle(test_unknown_indices)
      log.info(f"Processing test indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
      return test_known_indices, test_unknown_indices

- def fit_or_load_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> tuple[Scaler, pd.Series, Scaler, Scaler]:
-     train_data_path = dataset_config._get_train_data_path()
-     flowstats_scaler_path = os.path.join(train_data_path, "stand", f"flowstats_scaler-{dataset_config.flowstats_scaler}-q{dataset_config.flowstats_clip}.pickle")
-     flowstats_quantiles_path = os.path.join(train_data_path, "stand", f"flowstats_quantiles-q{dataset_config.flowstats_clip}.pickle")
-     ipt_scaler_path = os.path.join(train_data_path, "stand", f"ipt_scaler-{dataset_config.ipt_scaler}-ipt_min{dataset_config.ipt_min}-ipt_max{dataset_config.ipt_max}.pickle")
-     psizes_sizes_scaler_path = os.path.join(train_data_path, "stand", f"psizes_scaler-{dataset_config.psizes_scaler}-psizes_max{dataset_config.psizes_max}.pickle")
-     if os.path.isfile(flowstats_scaler_path) and os.path.isfile(flowstats_quantiles_path) and os.path.isfile(ipt_scaler_path) and os.path.isfile(psizes_sizes_scaler_path):
-         flowstats_scaler = pickle_load(flowstats_scaler_path)
-         flowstats_quantiles = pickle_load(flowstats_quantiles_path)
-         ipt_scaler = pickle_load(ipt_scaler_path)
-         psizes_scaler = pickle_load(psizes_sizes_scaler_path)
-     else:
-         if dataset_config.flowstats_scaler == ScalerEnum.ROBUST:
-             flowstats_scaler = RobustScaler()
-         elif dataset_config.flowstats_scaler == ScalerEnum.STANDARD:
-             flowstats_scaler = StandardScaler()
-         elif dataset_config.flowstats_scaler == ScalerEnum.MINMAX:
-             flowstats_scaler = MinMaxScaler()
-         elif dataset_config.flowstats_scaler == ScalerEnum.NO_SCALER:
-             flowstats_scaler = None
-         else: assert_never(dataset_config.flowstats_scaler)
-
-         if dataset_config.ipt_scaler == ScalerEnum.ROBUST:
-             ipt_scaler = RobustScaler()
-         elif dataset_config.ipt_scaler == ScalerEnum.STANDARD:
-             ipt_scaler = StandardScaler()
-         elif dataset_config.ipt_scaler == ScalerEnum.MINMAX:
-             ipt_scaler = MinMaxScaler()
-         elif dataset_config.ipt_scaler == ScalerEnum.NO_SCALER:
-             ipt_scaler = None
-         else: assert_never(dataset_config.ipt_scaler)
-
-         if dataset_config.psizes_scaler == ScalerEnum.ROBUST:
-             psizes_scaler = RobustScaler()
-         elif dataset_config.psizes_scaler == ScalerEnum.STANDARD:
-             psizes_scaler = StandardScaler()
-         elif dataset_config.psizes_scaler == ScalerEnum.MINMAX:
-             psizes_scaler = MinMaxScaler()
-         elif dataset_config.psizes_scaler == ScalerEnum.NO_SCALER:
-             psizes_scaler = None
-         else: assert_never(dataset_config.psizes_scaler)
-
-         if isinstance(dataset_config.fit_scalers_samples, int) and dataset_config.fit_scalers_samples > len(train_indices):
-             warnings.warn(f"The number of samples for fitting scalers ({dataset_config.fit_scalers_samples}) is larger than the number of train samples ({len(train_indices)}), using the number of train samples instead")
-             dataset_config.fit_scalers_samples = len(train_indices)
-         fit_scalers_rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.FIT_SCALERS_SAMPLE)
-         if isinstance(dataset_config.fit_scalers_samples, float):
-             num_samples = int(dataset_config.fit_scalers_samples * len(train_indices))
-         else:
-             num_samples = dataset_config.fit_scalers_samples
-         fit_scalers_indices = train_indices[fit_scalers_rng.choice(len(train_indices), size=num_samples, replace=False)]
-         flowstats_quantiles = fit_scalers(
-             database_path=dataset_config.database_path,
-             train_tables_paths=dataset_config._get_train_tables_paths(),
-             fit_scalers_indices=fit_scalers_indices,
-             flowstats_scaler=flowstats_scaler,
-             flowstats_quantile_clip=dataset_config.flowstats_clip,
-             ipt_scaler=ipt_scaler,
-             psizes_scaler=psizes_scaler,
-             ipt_min=dataset_config.ipt_min,
-             ipt_max=dataset_config.ipt_max,
-             psizes_max=dataset_config.psizes_max)
-         pickle_dump(flowstats_scaler, flowstats_scaler_path)
-         pickle_dump(flowstats_quantiles, flowstats_quantiles_path)
-         pickle_dump(ipt_scaler, ipt_scaler_path)
-         pickle_dump(psizes_scaler, psizes_sizes_scaler_path)
-     return flowstats_scaler, flowstats_quantiles, ipt_scaler, psizes_scaler
-
- def fit_scalers(database_path: str, train_tables_paths: list[str], fit_scalers_indices: np.ndarray, flowstats_scaler, flowstats_quantile_clip: float, ipt_scaler, psizes_scaler, ipt_min: int, ipt_max: int, psizes_max: int) -> pd.Series:
-     start_time = time.time()
-     database, tables = load_database(database_path, tables_paths=train_tables_paths)
-     data = load_data_from_pytables(tables=tables, indices=fit_scalers_indices, data_dtype=tables[0].dtype)
-     database.close()
-     # PPI
-     data_ppi = data[PPI_COLUMN].astype("float32")
-     ppi_channels = data_ppi.shape[1]
-     data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
-     padding_mask = data_ppi[:, DIR_POS] == 0 # mask of padded packets
-     if ipt_scaler:
-         train_ipt = data_ppi[:, IPT_POS].clip(max=ipt_max, min=ipt_min)
-         train_ipt[padding_mask] = np.nan # nans are ignored in sklearn scalers
-         if isinstance(ipt_scaler, MinMaxScaler):
-             # let zero be the minimum for minmax scaling
-             train_ipt = np.concatenate((train_ipt, [0]))
-         ipt_scaler.fit(train_ipt.reshape(-1, 1))
-     if psizes_scaler:
-         train_psizes = data_ppi[:, SIZE_POS].clip(max=psizes_max, min=1)
-         train_psizes[padding_mask] = np.nan
-         if isinstance(psizes_scaler, MinMaxScaler):
-             train_psizes = np.concatenate((train_psizes, [0]))
-         psizes_scaler.fit(train_psizes.reshape(-1, 1))
-     # FLOWSTATS
-     train_flowstats = pd.DataFrame(data, columns=FLOWSTATS_TO_SCALE)
-     upper_quantiles = train_flowstats.quantile(flowstats_quantile_clip)
-     upper_quantiles[FLOWSTATS_NO_CLIP] = np.Inf # disable clipping for features with "fixed" range
-     if flowstats_scaler:
-         train_flowstats = train_flowstats.clip(upper=upper_quantiles, lower=0, axis=1).to_numpy() # type: ignore
-         flowstats_scaler.fit(train_flowstats)
-     log.info(f"Reading data and fitting scalers took {time.time() - start_time:.2f} seconds")
-     return upper_quantiles
-
  def load_database(database_path: str, tables_paths: Optional[list[str]] = None, mode: str = "r") -> tuple[tb.File, dict[int, Any]]: # dict[int, tb.Table]
      database = tb.open_file(database_path, mode=mode)
      if tables_paths is None:
@@ -410,11 +284,11 @@ def convert_dict_indices(base_indices: dict[int, np.ndarray], base_labels: dict[
                                           np.concatenate(list(unknown_labels_dict.values()))))
      return known_indices, unknown_indices

- def load_data_from_pytables(tables, indices: np.ndarray, data_dtype: np.dtype) -> np.ndarray:
+ def load_data_from_tables(tables, indices: np.ndarray, data_dtype: np.dtype) -> np.ndarray:
      sorted_indices = indices[indices[:, INDICES_TABLE_POS].argsort(kind="stable")]
      unique_tables, split_bounderies = np.unique(sorted_indices[:, INDICES_TABLE_POS], return_index=True)
      indices_per_table = np.split(sorted_indices, split_bounderies[1:])
-     data = np.empty(len(indices), dtype=data_dtype)
+     data = np.zeros(len(indices), dtype=data_dtype)
      for table_id, table_indices in zip(unique_tables, indices_per_table):
          data[np.where(indices[:, INDICES_TABLE_POS] == table_id)[0]] = tables[table_id].read_coordinates(table_indices[:, INDICES_INDEX_POS])
      return data
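
The renamed load_data_from_tables gathers one batch that may span several PyTables tables: each index row pairs a table position with a row position, the rows are stably grouped by table, and each group is fetched with a single read_coordinates call. A standalone sketch of the grouping trick, assuming the two positional constants resolve to columns 0 (table) and 1 (row) — the actual values live in cesnet_datazoo.constants:

import numpy as np

indices = np.array([[1, 5], [0, 2], [1, 3], [0, 7]])            # (table, row) pairs
sorted_indices = indices[indices[:, 0].argsort(kind="stable")]  # stable sort keeps per-table row order
unique_tables, bounds = np.unique(sorted_indices[:, 0], return_index=True)
per_table = np.split(sorted_indices, bounds[1:])
for table_id, rows in zip(unique_tables, per_table):
    print(table_id, rows[:, 1])  # rows to pass to read_coordinates() for this table
# prints: 0 [2 7] and then 1 [5 3]
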

cesnet_datazoo/utils/class_info.py

@@ -10,10 +10,11 @@ from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROV

  @dataclass()
  class ClassInfo:
-     target_names: list[str]
      num_classes: int
      known_apps: list[str]
      unknown_apps: list[str]
+     encoder: LabelEncoder
+     target_names: list[str]
      unknown_class_label: int
      group_matrix: np.ndarray
      has_provider: dict[str, bool]
@@ -21,9 +22,9 @@ class ClassInfo:
      provider_members: dict[str, list[str]]
      categories_mapping: dict[str, Optional[str]]

- def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> ClassInfo:
-     known_apps = sorted(known_apps_database_enum.values())
-     unknown_apps = sorted(unknown_apps_database_enum.values())
+ def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps: list[str], unknown_apps: list[str]) -> ClassInfo:
+     known_apps = sorted(known_apps)
+     unknown_apps = sorted(unknown_apps)
      target_names_arr = encoder.classes_
      assert known_apps == list(target_names_arr[:-1])
      group_matrix = np.array([[a == b or
@@ -37,10 +38,11 @@ def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps_databas
      provider_members = {p: [app for app in target_names_arr if provider_mapping[app] == p] for p in providers}
      categories_mapping = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr}
      return ClassInfo(
-         target_names=list(target_names_arr),
          num_classes=len(known_apps),
          known_apps=known_apps,
          unknown_apps=unknown_apps,
+         encoder=encoder,
+         target_names=list(target_names_arr),
          unknown_class_label=len(known_apps),
          group_matrix=group_matrix,
          has_provider=has_provider,
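
Storing the fitted LabelEncoder on ClassInfo makes the label mapping reusable after dataset initialization. The assert in create_class_info also pins down the expected encoder layout: the known applications occupy classes_[:-1] and the aggregated unknown class comes last, so unknown_class_label == len(known_apps). A sketch of that layout, assuming hypothetical application names and an unknown label that sorts after all of them:

import numpy as np
from sklearn.preprocessing import LabelEncoder

known_apps = sorted(["app-a", "app-b"])                # hypothetical application names
encoder = LabelEncoder()
encoder.classes_ = np.array(known_apps + ["unknown"])  # the unknown class must come last
print(encoder.transform(["app-b", "unknown"]))         # -> [1 2]; 2 == len(known_apps)
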

cesnet_datazoo/utils/download.py

@@ -11,9 +11,14 @@ def simple_download(url: str, file_path: str):

  def resumable_download(url: str, file_path: str, chunk_size: int = 1024**2, silent: bool = False):
      r1 = requests.get(url, stream=True)
+     try:
+         r1.raise_for_status()
+     except requests.exceptions.HTTPError as e:
+         print("The dataset hosting server is unreachable. Please contact us at https://github.com/CESNET/cesnet-datazoo/issues.")
+         raise e
+
      redirected_url = r1.url
      content_size = int(r1.headers["Content-Length"])
-
      if os.path.exists(file_path):
          temp_size = os.path.getsize(file_path)
      else:
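
One nuance of the new check: raise_for_status() only fires when the server responds with a 4xx/5xx status; a truly unreachable host raises requests.exceptions.ConnectionError from the get() call itself, before this check runs. A hedged usage sketch with a placeholder URL:

import requests
from cesnet_datazoo.utils.download import resumable_download

try:
    resumable_download("https://example.org/datasets/sample.h5", "sample.h5")  # placeholder URL
except requests.exceptions.HTTPError:
    ...  # the server responded with an error status; the hint above was printed
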

cesnet_datazoo-0.1.0.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cesnet-datazoo
- Version: 0.0.16
+ Version: 0.1.0
  Summary: A toolkit for large network traffic datasets
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -16,6 +16,7 @@ Classifier: Operating System :: OS Independent
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENCE
+ Requires-Dist: cesnet-models
  Requires-Dist: matplotlib
  Requires-Dist: numpy
  Requires-Dist: pandas

cesnet_datazoo-0.1.0.dist-info/RECORD

@@ -0,0 +1,30 @@
+ cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cesnet_datazoo/config.py,sha256=vvNyM7TCMolH-uLj3ant7rGkYb_2FPyCWlRQ3mllKWs,37427
+ cesnet_datazoo/constants.py,sha256=EDeeo0xrBt_pnWf3m-ZTiC5HMvyVwcikgCZ9LwZIcAE,1276
+ cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
+ cesnet_datazoo/datasets/cesnet_dataset.py,sha256=zoLFduBg6ZK96zoec0kEMB1hFCGn3QOtBtYFTcCbIU0,46546
+ cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
+ cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
+ cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
+ cesnet_datazoo/datasets/statistics.py,sha256=wR8QISIh-KC7CQ5SjN7WoTMFaoRuq0G7pgTFGhC8ek0,15137
+ cesnet_datazoo/datasets/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=Ntlp8mHUSr7g-ZTvtBVh238TswZHwGAudMuE52-OA-c,1608
+ cesnet_datazoo/datasets/metadata/metadata.csv,sha256=or0CB7t06G_V1OzClqtpx7sRt_ZoQWE_f7F5SDLlPC8,2175
+ cesnet_datazoo/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cesnet_datazoo/metrics/classification_report.py,sha256=0JgKWyB281m3EHxI8miMKTjKg3mzyV1WTQndXg_B7i0,4040
+ cesnet_datazoo/metrics/provider_metrics.py,sha256=sRg2bdRTzLLTmiVjacBtGez4LEIfr35hSvMBwW-W73U,1303
+ cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
+ cesnet_datazoo/pytables_data/data_scalers.py,sha256=IfTymhVubjLNetjOIxDhtzkETp_1xmFXbC0rSjQHVUQ,7254
+ cesnet_datazoo/pytables_data/indices_setup.py,sha256=rBW1HwebPXkwLRuKg9ILO_LfUrfnJfqQYsrIAYfXtZo,12932
+ cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=kCvbOgIseBdUUGz3nRr7oGsuN2JicXGlsp8-Z9n4JyM,17599
+ cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cesnet_datazoo/utils/class_info.py,sha256=zMt2ndfwvtnE5QOKS1OPbw8VUlsCCbB_SVjzyFn1Wdw,2540
+ cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
+ cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
+ cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
+ cesnet_datazoo-0.1.0.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
+ cesnet_datazoo-0.1.0.dist-info/METADATA,sha256=c6GMIPE5rkiZtsbGNv28405o-G02J4wyvP-DJL8BfJM,12679
+ cesnet_datazoo-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ cesnet_datazoo-0.1.0.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
+ cesnet_datazoo-0.1.0.dist-info/RECORD,,

cesnet_datazoo-0.1.0.dist-info/WHEEL

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.42.0)
+ Generator: bdist_wheel (0.43.0)
  Root-Is-Purelib: true
  Tag: py3-none-any


cesnet_datazoo-0.0.16.dist-info/RECORD

@@ -1,28 +0,0 @@
- cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- cesnet_datazoo/config.py,sha256=LawxdianPcNcuzxf01FTyED7PoAvQj8RMSE8QG4FZNo,37531
- cesnet_datazoo/constants.py,sha256=EliK-KvW3GXeKw00W_Pd-ypJMwvFQVqMQS9A9ULyTj4,1420
- cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
- cesnet_datazoo/datasets/cesnet_dataset.py,sha256=S0FsatG3fq21fVJctWOaLbF1ZzSvxUnzg9Hbe3TFNmo,43116
- cesnet_datazoo/datasets/datasets.py,sha256=gj7jflxqDgEfHXSFUz6JOW2x8wEUSCqVe6KapaK4IKg,2279
- cesnet_datazoo/datasets/loaders.py,sha256=HU2Au0P87BCAvdgpiwO5T0xgeQgs_gL4E1d12OP1JoQ,1803
- cesnet_datazoo/datasets/statistics.py,sha256=GoM7-vFTvqx9ym239VCZd1os2TdoxLOW7WNpNtOU7Fc,14030
- cesnet_datazoo/datasets/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=OZf-NMai2XuSg57y2IdV-804ZpPcmI9sWoDu8IO7e4Y,1567
- cesnet_datazoo/datasets/metadata/metadata.csv,sha256=Zr2hf9qpJpPE_Js9XmyaHffdho912ikdQfFVQx6q8NE,2161
- cesnet_datazoo/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- cesnet_datazoo/metrics/classification_report.py,sha256=0JgKWyB281m3EHxI8miMKTjKg3mzyV1WTQndXg_B7i0,4040
- cesnet_datazoo/metrics/provider_metrics.py,sha256=sRg2bdRTzLLTmiVjacBtGez4LEIfr35hSvMBwW-W73U,1303
- cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
- cesnet_datazoo/pytables_data/indices_setup.py,sha256=IraCOFys0p7ZojR-0E99bKN9dTjwCfQO4L6lMqcTEFg,13070
- cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=y2BXuuA73w58XITKFsPm-FS8LB76TH6prNUMsKkXNBM,26511
- cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- cesnet_datazoo/utils/class_info.py,sha256=ync9U3PWo0DloRwX3uMgKW798kC6echioEbEgrPqY4E,2567
- cesnet_datazoo/utils/download.py,sha256=QVbYKuWUO9j6VUJISPTVBXscjuTuuX-XRez7MJzG3dk,1204
- cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
- cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
- cesnet_datazoo-0.0.16.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
- cesnet_datazoo-0.0.16.dist-info/METADATA,sha256=YQBQeLwNIoHcCR3W4Dn46iWLlRMWC4c8B7U8r81gRd8,12650
- cesnet_datazoo-0.0.16.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- cesnet_datazoo-0.0.16.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
- cesnet_datazoo-0.0.16.dist-info/RECORD,,