cesnet-datazoo 0.0.17__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

cesnet_datazoo/pytables_data/pytables_dataset.py
@@ -4,23 +4,19 @@ import os
 import time
 import warnings
 from datetime import datetime
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 
 import numpy as np
 import pandas as pd
 import tables as tb
 import torch
-from numpy.lib.recfunctions import drop_fields, structured_to_unstructured
-from sklearn.preprocessing import LabelEncoder
+from numpy.lib.recfunctions import structured_to_unstructured
 from torch.utils.data import Dataset
 from typing_extensions import assert_never
 
-from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, Scaler, TestDataParams,
+from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
                                    TrainDataParams)
-from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DIR_POS, FLOWSTATS_TO_SCALE,
-                                      INDICES_INDEX_POS, INDICES_TABLE_POS, IPT_POS,
-                                      PHIST_BIN_COUNT, PHISTS_FEATURES, PPI_COLUMN, SIZE_POS,
-                                      UNKNOWN_STR_LABEL)
+from cesnet_datazoo.constants import APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN
 from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
                                                      split_apps_topx_with_provider_groups)
 
@@ -28,27 +24,52 @@ log = logging.getLogger(__name__)
 
 
 class PyTablesDataset(Dataset):
-    def __init__(self, database_path: str,
+    def __init__(self,
+                 database_path: str,
                  tables_paths: list[str],
                  indices: Optional[np.ndarray],
+                 tables_app_enum: dict[int, str],
+                 tables_cat_enum: dict[int, str],
                  flowstats_features: list[str],
-                 other_fields: Optional[list[str]] = None,
-                 preload: bool = False, preload_blob: Optional[str] = None,
-                 disabled_apps: Optional[list[str]] = None,
-                 return_all_fields: bool = False,):
+                 flowstats_features_boolean: list[str],
+                 flowstats_features_phist: list[str],
+                 other_fields: list[str],
+                 ppi_channels: list[int],
+                 ppi_transform: Optional[Callable] = None,
+                 flowstats_transform: Optional[Callable] = None,
+                 flowstats_phist_transform: Optional[Callable] = None,
+                 target_transform: Optional[Callable] = None,
+                 return_tensors: bool = False,
+                 return_all_fields: bool = False,
+                 preload: bool = False,
+                 preload_blob: Optional[str] = None,
+                 disabled_apps: Optional[list[str]] = None,):
         self.database_path = database_path
         self.tables_paths = tables_paths
         self.tables = {}
-        self.flowstats_features = flowstats_features
-        self.other_fields = other_fields if other_fields is not None else []
-        self.preload = preload
-        self.preload_blob = preload_blob
-        self.return_all_fields = return_all_fields
+        self.tables_app_enum = tables_app_enum
+        self.tables_app_arr = np.array(list(tables_app_enum.values()))
+        self.tables_cat_enum = tables_cat_enum
         if indices is None:
             self.set_all_indices(disabled_apps=disabled_apps)
         else:
             self.indices = indices
 
+        self.flowstats_features = flowstats_features
+        self.flowstats_features_boolean = flowstats_features_boolean
+        self.flowstats_features_phist = flowstats_features_phist
+        self.other_fields = other_fields
+        self.ppi_channels = ppi_channels
+        self.ppi_transform = ppi_transform
+        self.flowstats_transform = flowstats_transform
+        self.flowstats_phist_transform = flowstats_phist_transform
+        self.target_transform = target_transform
+        self.return_tensors = return_tensors
+        self.return_all_fields = return_all_fields
+
+        self.preload = preload
+        self.preload_blob = preload_blob
+
     def __getitem__(self, batch_idx):
         # log.debug(f"worker {self.worker_id}: __getitem__")
         if self.preload:
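
For illustration, a minimal sketch of how the reworked constructor might be called. Every concrete value below (paths, enum contents, feature names) is a made-up placeholder, not something shipped with the package:

```python
from cesnet_datazoo.pytables_data.pytables_dataset import PyTablesDataset

# Hypothetical id -> name enums, normally derived from the PyTables database
tables_app_enum = {0: "app-a", 1: "app-b"}
tables_cat_enum = {0: "category-a"}

dataset = PyTablesDataset(
    database_path="/data/database.h5",         # placeholder path
    tables_paths=["/flows/table0"],            # placeholder table node
    indices=None,                              # None -> select all indices
    tables_app_enum=tables_app_enum,
    tables_cat_enum=tables_cat_enum,
    flowstats_features=["DURATION", "BYTES"],  # placeholder feature names
    flowstats_features_boolean=[],
    flowstats_features_phist=[],
    other_fields=[],
    ppi_channels=[0, 1, 2],                    # keep the first three PPI channels
    return_tensors=True,
)
```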
@@ -57,7 +78,44 @@ class PyTablesDataset(Dataset):
             batch_data = load_data_from_tables(tables=self.tables, indices=self.indices[batch_idx], data_dtype=self.data_dtype)
         if self.return_all_fields:
             return (batch_data, batch_idx)
-        return_data = (batch_data[self.other_fields], batch_data[PPI_COLUMN].astype("float32"), batch_data[self.flowstats_features], list(map(self.app_enum, batch_data[APP_COLUMN])))
+
+        # Prepare data
+        x_ppi = batch_data[PPI_COLUMN].astype("float32")
+        x_ppi = x_ppi[:, self.ppi_channels, :]
+        x_flowstats = structured_to_unstructured(batch_data[self.flowstats_features], dtype="float32")
+        if self.flowstats_features_boolean:
+            x_flowstats_boolean = structured_to_unstructured(batch_data[self.flowstats_features_boolean], dtype="float32")
+        else:
+            x_flowstats_boolean = np.zeros(shape=(x_flowstats.shape[0], 0), dtype="float32")
+        if self.flowstats_features_phist:
+            x_flowstats_phist = structured_to_unstructured(batch_data[self.flowstats_features_phist], dtype="float32")
+        else:
+            x_flowstats_phist = np.zeros(shape=(x_flowstats.shape[0], 0), dtype="float32")
+        # Feature transformations
+        if self.ppi_transform:
+            x_ppi = self.ppi_transform(x_ppi)
+        if self.flowstats_transform:
+            x_flowstats = self.flowstats_transform(x_flowstats)
+        if self.flowstats_phist_transform:
+            x_flowstats_phist = self.flowstats_phist_transform(x_flowstats_phist)
+        x_flowstats = np.concatenate([x_flowstats, x_flowstats_boolean, x_flowstats_phist], axis=1).astype("float32")
+        # Labels transformation
+        labels = self.tables_app_arr[batch_data[APP_COLUMN]]
+        if self.target_transform:
+            labels = self.target_transform(labels)
+        # Prepare dataframe with other fields
+        other_fields_df = pd.DataFrame(batch_data[self.other_fields]) if len(self.other_fields) > 0 else pd.DataFrame()
+        for column in other_fields_df.columns:
+            if other_fields_df[column].dtype.kind == "O":
+                other_fields_df[column] = other_fields_df[column].astype(str)
+            elif column.startswith("TIME_"):
+                other_fields_df[column] = other_fields_df[column].map(lambda x: datetime.fromtimestamp(x))
+
+        if self.return_tensors:
+            x_ppi = torch.from_numpy(x_ppi)
+            x_flowstats = torch.from_numpy(x_flowstats)
+            labels = torch.from_numpy(labels).long() # PyTorch loss functions require long type for labels
+        return_data = (other_fields_df, x_ppi, x_flowstats, labels)
         return return_data
 
     def __len__(self):
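
The scaling and label-encoding logic that used to live in `pytables_collate_fn` (removed in a later hunk) now arrives through these transform callables. A sketch of what two of them could look like; the channel index, the 1500-byte cap, and the app names are arbitrary illustration values:

```python
import numpy as np

def ppi_transform(x_ppi: np.ndarray) -> np.ndarray:
    # Clip and scale packet sizes; channel 2 as the size channel and the
    # 1500-byte cap are assumptions for this sketch.
    x_ppi = x_ppi.copy()
    x_ppi[:, 2, :] = np.clip(x_ppi[:, 2, :], 0, 1500) / 1500.0
    return x_ppi

def target_transform(labels: np.ndarray) -> np.ndarray:
    # Map app-name labels to integer classes, sending anything unknown to
    # a trailing "unknown" class; the app list is a placeholder.
    known_apps = ["app-a", "app-b"]
    mapping = {app: i for i, app in enumerate(known_apps)}
    return np.array([mapping.get(label, len(known_apps)) for label in labels])
```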
@@ -68,8 +126,6 @@ class PyTablesDataset(Dataset):
         log.debug(f"Initializing dataloader worker id {self.worker_id}")
         self.database, self.tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
         atexit.register(self.cleanup)
-        self.app_enum = self.tables[0].get_enum(APP_COLUMN)
-        self.cat_enum = self.tables[0].get_enum(CATEGORY_COLUMN)
         self.data_dtype = self.tables[0].dtype
         if self.preload:
             data = None
@@ -86,34 +142,14 @@ class PyTablesDataset(Dataset):
                 np.savez_compressed(self.preload_blob, data=self.data)
         log.debug(f"Finish initialization worker id {self.worker_id}")
 
-    def get_app_enum(self) -> tb.Enum:
-        if self.app_enum:
-            return self.app_enum
-        database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
-        app_enum = tables[0].get_enum(APP_COLUMN)
-        cat_enum = tables[0].get_enum(CATEGORY_COLUMN)
-        self.app_enum, self.cat_enum = app_enum, cat_enum
-        database.close()
-        return app_enum
-
-    def get_cat_enum(self) -> tb.Enum:
-        if self.cat_enum:
-            return self.cat_enum
-        database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
-        app_enum = tables[0].get_enum(APP_COLUMN)
-        cat_enum = tables[0].get_enum(CATEGORY_COLUMN)
-        self.app_enum, self.cat_enum = app_enum, cat_enum
-        database.close()
-        return cat_enum
-
     def set_all_indices(self, disabled_apps: Optional[list[str]] = None):
         """
         This should be called from the main process, before dataloader workers split the work.
         Does no filter apps with not enough samples.
         """
         database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
-        app_enum = tables[0].get_enum(APP_COLUMN)
-        disabled_apps_ids = list(map(lambda x: app_enum[x], disabled_apps)) if disabled_apps is not None else []
+        inverted_tables_app_enum = {v: k for k, v in self.tables_app_enum.items()}
+        disabled_apps_ids = [inverted_tables_app_enum[app] for app in disabled_apps] if disabled_apps is not None else []
         base_labels = {}
         base_indices = {}
         for i in range(len(tables)):
@@ -135,64 +171,9 @@ def worker_init_fn(worker_id):
     dataset = worker_info.dataset
     dataset.pytables_worker_init(worker_id)
 
-def pytables_collate_fn(batch: tuple,
-                        flowstats_scaler: Scaler, flowstats_quantiles: np.ndarray,
-                        psizes_scaler: Scaler, psizes_max: int,
-                        ipt_scaler: Scaler, ipt_min: int, ipt_max: int,
-                        use_push_flags: bool, use_packet_histograms: bool, normalize_packet_histograms: bool, zero_ppi_start: int,
-                        encoder: LabelEncoder, known_apps: list[str], return_torch: bool = False):
-    other_fields, x_ppi, x_flowstats, labels = batch
-    x_ppi = x_ppi.transpose(0, 2, 1)
-    orig_shape = x_ppi.shape
-    ppi_channels = x_ppi.shape[-1]
-    x_ppi = x_ppi.reshape(-1, ppi_channels)
-    x_ppi[:, IPT_POS] = x_ppi[:, IPT_POS].clip(max=ipt_max, min=ipt_min)
-    x_ppi[:, SIZE_POS] = x_ppi[:, SIZE_POS].clip(max=psizes_max, min=1)
-    padding_mask = x_ppi[:, DIR_POS] == 0 # mask of zero padding
-    if ipt_scaler:
-        x_ppi[:, IPT_POS] = ipt_scaler.transform(x_ppi[:, IPT_POS].reshape(-1, 1)).reshape(-1) # type: ignore
-    if psizes_scaler:
-        x_ppi[:, SIZE_POS] = psizes_scaler.transform(x_ppi[:, SIZE_POS].reshape(-1, 1)).reshape(-1) # type: ignore
-    x_ppi[padding_mask, IPT_POS] = 0
-    x_ppi[padding_mask, SIZE_POS] = 0
-    x_ppi = x_ppi.reshape(orig_shape).transpose(0, 2, 1)
-    if not use_push_flags:
-        x_ppi = x_ppi[:, (IPT_POS, DIR_POS, SIZE_POS), :]
-    if zero_ppi_start > 0:
-        x_ppi[:,:,:zero_ppi_start] = 0
-
-    if use_packet_histograms:
-        x_phist = structured_to_unstructured(x_flowstats[PHISTS_FEATURES], dtype="float32")
-        if normalize_packet_histograms:
-            src_sizes_pkt_count = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]
-            dst_sizes_pkt_count = x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)].sum(axis=1)[:, np.newaxis]
-            np.divide(x_phist[:, :PHIST_BIN_COUNT], src_sizes_pkt_count, out=x_phist[:, :PHIST_BIN_COUNT], where=src_sizes_pkt_count != 0)
-            np.divide(x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], dst_sizes_pkt_count, out=x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count != 0)
-            np.divide(x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], src_sizes_pkt_count - 1, out=x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], where=src_sizes_pkt_count > 1)
-            np.divide(x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], dst_sizes_pkt_count - 1, out=x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count > 1)
-        x_flowstats = structured_to_unstructured(drop_fields(x_flowstats, PHISTS_FEATURES), dtype="float32")
-        x_flowstats = np.concatenate([x_flowstats, x_phist], axis=1)
-    else:
-        x_flowstats = structured_to_unstructured(x_flowstats, dtype="float32")
-    np.clip(x_flowstats[:, :len(FLOWSTATS_TO_SCALE)], a_max=flowstats_quantiles, a_min=0, out=x_flowstats[:, :len(FLOWSTATS_TO_SCALE)])
-    if flowstats_scaler:
-        x_flowstats[:, :len(FLOWSTATS_TO_SCALE)] = flowstats_scaler.transform(x_flowstats[:, :len(FLOWSTATS_TO_SCALE)])
-
-    other_fields_df = pd.DataFrame(other_fields) if len(other_fields) > 0 else pd.DataFrame()
-    for column in other_fields_df.columns:
-        if other_fields_df[column].dtype.kind == "O":
-            other_fields_df[column] = other_fields_df[column].astype(str)
-        elif column.startswith("TIME_"):
-            other_fields_df[column] = other_fields_df[column].map(lambda x: datetime.fromtimestamp(x))
-
-    labels = encoder.transform(np.where(np.isin(labels, known_apps), labels, UNKNOWN_STR_LABEL)).astype("int64") # type: ignore
-    if return_torch:
-        return other_fields_df, torch.from_numpy(x_ppi), torch.from_numpy(x_flowstats), torch.from_numpy(labels)
-    return other_fields_df, x_ppi, x_flowstats, labels
-
-def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFrame, database_path: str, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, dict[int, str], dict[int, str]]:
+def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
     database, train_tables = load_database(database_path, tables_paths=train_data_params.train_tables_paths)
-    app_enum = train_tables[0].get_enum(APP_COLUMN)
+    inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
     all_app_labels = {}
     app_counts = pd.Series(dtype="int64")
     start_time = time.time()
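
Both `init_train_indices` and `set_all_indices` now map app names back to their integer table ids by inverting `tables_app_enum` instead of querying the PyTables enum. The idiom in isolation, with a made-up enum:

```python
tables_app_enum = {0: "app-a", 1: "app-b", 2: "app-c"}  # placeholder enum
inverted_tables_app_enum = {name: app_id for app_id, name in tables_app_enum.items()}
disabled_apps_ids = [inverted_tables_app_enum[app] for app in ["app-c"]]  # -> [2]
```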
@@ -204,15 +185,16 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
     # Handle disabled apps and apps with less than min_samples_per_app samples
     if len(train_data_params.disabled_apps) > 0:
         log.info(f"Disabled applications in dataset config: {sorted(train_data_params.disabled_apps)}")
-    disabled_apps_ids = list(map(lambda x: app_enum[x], train_data_params.disabled_apps))
+    disabled_apps_ids = [inverted_tables_app_enum[app] for app in train_data_params.disabled_apps]
     min_samples_apps_ids = set(app_counts[app_counts<train_data_params.min_train_samples_per_app].index.tolist())
     if len(min_samples_apps_ids) > 0:
+        min_samples_apps_names = sorted([tables_app_enum[app_id] for app_id in min_samples_apps_ids])
         if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
-            warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {sorted(map(app_enum, min_samples_apps_ids))}. " +
+            warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                           "To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
             exit()
         elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
-            log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {sorted(map(app_enum, min_samples_apps_ids))}. " +
+            log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                      "Disabling these applications")
             disabled_apps_ids.extend(min_samples_apps_ids)
     # Base indices are indices of samples that are not disabled and have enough samples
@@ -221,9 +203,9 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
         base_indices[i] = np.nonzero(np.isin(all_app_labels[i], disabled_apps_ids, invert=True))[0]
     base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in all_app_labels.items()}
     # Apps selection
-    if train_data_params.apps_selection != AppSelection.LONGTERM_FIXED:
+    if train_data_params.apps_selection != AppSelection.FIXED:
         app_counts = app_counts[[app for app in app_counts.index.tolist() if app not in disabled_apps_ids]]
-        app_counts.index = app_counts.index.map(app_enum)
+        app_counts.index = app_counts.index.map(tables_app_enum)
         app_counts = app_counts.sort_values(ascending=False).astype("int64")
         sorted_apps = app_counts.index.to_list()
         if train_data_params.apps_selection == AppSelection.ALL_KNOWN:
@@ -233,31 +215,26 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
             known_apps, unknown_apps = split_apps_topx_with_provider_groups(sorted_apps=sorted_apps, known_count=train_data_params.apps_selection_topx, servicemap=servicemap)
             if len(known_apps) < train_data_params.apps_selection_topx:
                 warnings.warn(f"The number of known applications ({len(known_apps)}) is lower than requested in config.apps_selection_topx ({train_data_params.apps_selection_topx}).")
-        elif train_data_params.apps_selection == AppSelection.EXPLICIT_UNKNOWN:
-            unknown_apps = train_data_params.apps_selection_explicit_unknown
-            missing_unknown_apps = [app for app in unknown_apps if app not in sorted_apps]
-            if len(missing_unknown_apps) > 0:
-                raise ValueError(f"Applications configured in config.apps_selection_explicit_unknown are not present in the dataset (or might be disabled): {sorted(missing_unknown_apps)}")
+        elif train_data_params.apps_selection == AppSelection.BACKGROUND_UNKNOWN:
+            unknown_apps = train_data_params.apps_selection_background_unknown
             known_apps = [app for app in sorted_apps if not (is_background_app(app) or app in unknown_apps)]
         else: assert_never(train_data_params.apps_selection)
-
         log.info(f"Selected {len(known_apps)} known applications and {len(unknown_apps)} unknown applications")
-        known_apps_database_enum: dict[int, str] = {int(app_enum[app]): app for app in known_apps}
-        unknown_apps_database_enum: dict[int, str] = {int(app_enum[app]): app for app in unknown_apps}
     else:
-        assert train_data_params.apps_selection_fixed_longterm is not None
-        known_apps_database_enum, unknown_apps_database_enum = train_data_params.apps_selection_fixed_longterm
-        known_apps_ids = list(known_apps_database_enum)
-        unknown_apps_ids = list(unknown_apps_database_enum)
+        known_apps = train_data_params.apps_selection_fixed_known
+        unknown_apps = train_data_params.apps_selection_fixed_unknown
+    known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
+    unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]
 
     train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(train_known_indices)
     rng.shuffle(train_unknown_indices)
     log.info(f"Processing train indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
-    return train_known_indices, train_unknown_indices, known_apps_database_enum, unknown_apps_database_enum
+    return train_known_indices, train_unknown_indices, known_apps, unknown_apps
 
-def init_test_indices(test_data_params: TestDataParams, database_path: str, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
+def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
     database, test_tables = load_database(database_path, tables_paths=test_data_params.test_tables_paths)
+    inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
     base_labels = {}
     base_indices = {}
     start_time = time.time()
@@ -266,8 +243,8 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, rng:
         log.info(f"Reading app column for test table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
         base_indices[i] = np.arange(len(test_tables[i]))
     database.close()
-    known_apps_ids = list(test_data_params.known_apps_database_enum)
-    unknown_apps_ids = list(test_data_params.unknown_apps_database_enum)
+    known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
+    unknown_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.unknown_apps]
     test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(test_known_indices)
    rng.shuffle(test_unknown_indices)
@@ -311,7 +288,7 @@ def load_data_from_tables(tables, indices: np.ndarray, data_dtype: np.dtype) ->
     sorted_indices = indices[indices[:, INDICES_TABLE_POS].argsort(kind="stable")]
     unique_tables, split_bounderies = np.unique(sorted_indices[:, INDICES_TABLE_POS], return_index=True)
     indices_per_table = np.split(sorted_indices, split_bounderies[1:])
-    data = np.empty(len(indices), dtype=data_dtype)
+    data = np.zeros(len(indices), dtype=data_dtype)
     for table_id, table_indices in zip(unique_tables, indices_per_table):
         data[np.where(indices[:, INDICES_TABLE_POS] == table_id)[0]] = tables[table_id].read_coordinates(table_indices[:, INDICES_INDEX_POS])
     return data
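
`load_data_from_tables` groups the requested (table, row) pairs by table so each PyTables table is read with a single `read_coordinates` call. The same grouping on toy NumPy data, assuming column 0 holds the table id (`INDICES_TABLE_POS`) and column 1 the row index (`INDICES_INDEX_POS`); the scatter back into the original request order is omitted:

```python
import numpy as np

indices = np.array([[1, 5], [0, 2], [1, 3], [0, 7]])  # toy (table, row) pairs

sorted_indices = indices[indices[:, 0].argsort(kind="stable")]
unique_tables, boundaries = np.unique(sorted_indices[:, 0], return_index=True)
per_table = np.split(sorted_indices, boundaries[1:])
for table_id, table_indices in zip(unique_tables, per_table):
    print(table_id, table_indices[:, 1])  # table 0 -> rows [2 7], table 1 -> rows [5 3]
```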

cesnet_datazoo/utils/class_info.py
@@ -10,10 +10,11 @@ from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROV
 
 @dataclass()
 class ClassInfo:
-    target_names: list[str]
     num_classes: int
     known_apps: list[str]
     unknown_apps: list[str]
+    encoder: LabelEncoder
+    target_names: list[str]
     unknown_class_label: int
     group_matrix: np.ndarray
     has_provider: dict[str, bool]
@@ -21,9 +22,9 @@ class ClassInfo:
     provider_members: dict[str, list[str]]
     categories_mapping: dict[str, Optional[str]]
 
-def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps_database_enum: dict[int, str], unknown_apps_database_enum: dict[int, str]) -> ClassInfo:
-    known_apps = sorted(known_apps_database_enum.values())
-    unknown_apps = sorted(unknown_apps_database_enum.values())
+def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps: list[str], unknown_apps: list[str]) -> ClassInfo:
+    known_apps = sorted(known_apps)
+    unknown_apps = sorted(unknown_apps)
     target_names_arr = encoder.classes_
     assert known_apps == list(target_names_arr[:-1])
     group_matrix = np.array([[a == b or
@@ -37,10 +38,11 @@ def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps_databas
     provider_members = {p: [app for app in target_names_arr if provider_mapping[app] == p] for p in providers}
     categories_mapping = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr}
     return ClassInfo(
-        target_names=list(target_names_arr),
         num_classes=len(known_apps),
         known_apps=known_apps,
         unknown_apps=unknown_apps,
+        encoder=encoder,
+        target_names=list(target_names_arr),
         unknown_class_label=len(known_apps),
         group_matrix=group_matrix,
         has_provider=has_provider,
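
`ClassInfo` now carries the fitted `LabelEncoder` instead of the id-based enum dicts. A sketch of an encoder consistent with the assert above (known apps sorted first, one trailing unknown class); the app names and the "unknown" label string are assumptions of this sketch, not the package's constants:

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

known_apps = ["app-a", "app-b"]  # placeholder, already sorted
encoder = LabelEncoder()
encoder.classes_ = np.array(known_apps + ["unknown"])  # classes_ must stay sorted

assert known_apps == list(encoder.classes_[:-1])
print(encoder.transform(np.array(["app-b", "unknown"])))  # -> [1 2]
```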

cesnet_datazoo-0.1.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.0.17
+Version: 0.1.0
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -16,6 +16,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENCE
+Requires-Dist: cesnet-models
 Requires-Dist: matplotlib
 Requires-Dist: numpy
 Requires-Dist: pandas

cesnet_datazoo-0.1.0.dist-info/RECORD
@@ -0,0 +1,30 @@
+cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/config.py,sha256=vvNyM7TCMolH-uLj3ant7rGkYb_2FPyCWlRQ3mllKWs,37427
+cesnet_datazoo/constants.py,sha256=EDeeo0xrBt_pnWf3m-ZTiC5HMvyVwcikgCZ9LwZIcAE,1276
+cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
+cesnet_datazoo/datasets/cesnet_dataset.py,sha256=zoLFduBg6ZK96zoec0kEMB1hFCGn3QOtBtYFTcCbIU0,46546
+cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
+cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
+cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
+cesnet_datazoo/datasets/statistics.py,sha256=wR8QISIh-KC7CQ5SjN7WoTMFaoRuq0G7pgTFGhC8ek0,15137
+cesnet_datazoo/datasets/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=Ntlp8mHUSr7g-ZTvtBVh238TswZHwGAudMuE52-OA-c,1608
+cesnet_datazoo/datasets/metadata/metadata.csv,sha256=or0CB7t06G_V1OzClqtpx7sRt_ZoQWE_f7F5SDLlPC8,2175
+cesnet_datazoo/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/metrics/classification_report.py,sha256=0JgKWyB281m3EHxI8miMKTjKg3mzyV1WTQndXg_B7i0,4040
+cesnet_datazoo/metrics/provider_metrics.py,sha256=sRg2bdRTzLLTmiVjacBtGez4LEIfr35hSvMBwW-W73U,1303
+cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
+cesnet_datazoo/pytables_data/data_scalers.py,sha256=IfTymhVubjLNetjOIxDhtzkETp_1xmFXbC0rSjQHVUQ,7254
+cesnet_datazoo/pytables_data/indices_setup.py,sha256=rBW1HwebPXkwLRuKg9ILO_LfUrfnJfqQYsrIAYfXtZo,12932
+cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=kCvbOgIseBdUUGz3nRr7oGsuN2JicXGlsp8-Z9n4JyM,17599
+cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/utils/class_info.py,sha256=zMt2ndfwvtnE5QOKS1OPbw8VUlsCCbB_SVjzyFn1Wdw,2540
+cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
+cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
+cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
+cesnet_datazoo-0.1.0.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
+cesnet_datazoo-0.1.0.dist-info/METADATA,sha256=c6GMIPE5rkiZtsbGNv28405o-G02J4wyvP-DJL8BfJM,12679
+cesnet_datazoo-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+cesnet_datazoo-0.1.0.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
+cesnet_datazoo-0.1.0.dist-info/RECORD,,

cesnet_datazoo-0.1.0.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 

cesnet_datazoo-0.0.17.dist-info/RECORD
@@ -1,29 +0,0 @@
-cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/config.py,sha256=ZNjCM85XFl3jTMj6UsnsEBvmxNxJbiNUhvBx4dEiyw8,37711
-cesnet_datazoo/constants.py,sha256=EliK-KvW3GXeKw00W_Pd-ypJMwvFQVqMQS9A9ULyTj4,1420
-cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
-cesnet_datazoo/datasets/cesnet_dataset.py,sha256=h7OfmxrAAwMouSQFbKcOhHWJMaZznePxuOw1h8g2Oa0,43399
-cesnet_datazoo/datasets/datasets.py,sha256=gj7jflxqDgEfHXSFUz6JOW2x8wEUSCqVe6KapaK4IKg,2279
-cesnet_datazoo/datasets/loaders.py,sha256=HU2Au0P87BCAvdgpiwO5T0xgeQgs_gL4E1d12OP1JoQ,1803
-cesnet_datazoo/datasets/statistics.py,sha256=GoM7-vFTvqx9ym239VCZd1os2TdoxLOW7WNpNtOU7Fc,14030
-cesnet_datazoo/datasets/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=OZf-NMai2XuSg57y2IdV-804ZpPcmI9sWoDu8IO7e4Y,1567
-cesnet_datazoo/datasets/metadata/metadata.csv,sha256=Zr2hf9qpJpPE_Js9XmyaHffdho912ikdQfFVQx6q8NE,2161
-cesnet_datazoo/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/metrics/classification_report.py,sha256=0JgKWyB281m3EHxI8miMKTjKg3mzyV1WTQndXg_B7i0,4040
-cesnet_datazoo/metrics/provider_metrics.py,sha256=sRg2bdRTzLLTmiVjacBtGez4LEIfr35hSvMBwW-W73U,1303
-cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
-cesnet_datazoo/pytables_data/data_scalers.py,sha256=dRWHiOxe0VhfhYaOviAO5o8uitpehjHeaRwjWZhDEQA,11468
-cesnet_datazoo/pytables_data/indices_setup.py,sha256=IraCOFys0p7ZojR-0E99bKN9dTjwCfQO4L6lMqcTEFg,13070
-cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=dE_9DOgkwKgw2WZHLZAmTvrqc22f5D3drFE9c2bDpxo,19744
-cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/utils/class_info.py,sha256=ync9U3PWo0DloRwX3uMgKW798kC6echioEbEgrPqY4E,2567
-cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
-cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
-cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
-cesnet_datazoo-0.0.17.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
-cesnet_datazoo-0.0.17.dist-info/METADATA,sha256=iRj-jZEsmH6YZEfR-KFDP9yJAaC7gI3V24jkNzGUWsU,12650
-cesnet_datazoo-0.0.17.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-cesnet_datazoo-0.0.17.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
-cesnet_datazoo-0.0.17.dist-info/RECORD,,