cesnet-datazoo 0.0.16__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cesnet_datazoo/config.py +174 -167
- cesnet_datazoo/constants.py +4 -6
- cesnet_datazoo/datasets/cesnet_dataset.py +200 -172
- cesnet_datazoo/datasets/datasets.py +22 -2
- cesnet_datazoo/datasets/datasets_constants.py +670 -0
- cesnet_datazoo/datasets/loaders.py +3 -0
- cesnet_datazoo/datasets/metadata/dataset_metadata.py +6 -5
- cesnet_datazoo/datasets/metadata/metadata.csv +4 -4
- cesnet_datazoo/datasets/statistics.py +36 -16
- cesnet_datazoo/pytables_data/data_scalers.py +110 -0
- cesnet_datazoo/pytables_data/indices_setup.py +29 -33
- cesnet_datazoo/pytables_data/pytables_dataset.py +103 -229
- cesnet_datazoo/utils/class_info.py +7 -5
- cesnet_datazoo/utils/download.py +6 -1
- {cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/METADATA +2 -1
- cesnet_datazoo-0.1.0.dist-info/RECORD +30 -0
- {cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/WHEEL +1 -1
- cesnet_datazoo-0.0.16.dist-info/RECORD +0 -28
- {cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/LICENCE +0 -0
- {cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/top_level.txt +0 -0
cesnet_datazoo/pytables_data/pytables_dataset.py
CHANGED
@@ -4,62 +4,118 @@ import os
 import time
 import warnings
 from datetime import datetime
-from typing import Any, Optional
+from typing import Any, Callable, Optional

 import numpy as np
 import pandas as pd
 import tables as tb
 import torch
-from numpy.lib.recfunctions import …
-from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
+from numpy.lib.recfunctions import structured_to_unstructured
 from torch.utils.data import Dataset
 from typing_extensions import assert_never

-from cesnet_datazoo.config import (AppSelection, …
-                                   …
-from cesnet_datazoo.constants import …
-                                     FLOWSTATS_TO_SCALE, INDICES_INDEX_POS, INDICES_TABLE_POS,
-                                     IPT_POS, PHIST_BIN_COUNT, PHISTS_FEATURES, PPI_COLUMN,
-                                     SIZE_POS, UNKNOWN_STR_LABEL)
+from cesnet_datazoo.config import (AppSelection, MinTrainSamplesCheck, TestDataParams,
+                                   TrainDataParams)
+from cesnet_datazoo.constants import APP_COLUMN, INDICES_INDEX_POS, INDICES_TABLE_POS, PPI_COLUMN
 from cesnet_datazoo.pytables_data.apps_split import (is_background_app,
                                                      split_apps_topx_with_provider_groups)
-from cesnet_datazoo.utils.fileutils import pickle_dump, pickle_load
-from cesnet_datazoo.utils.random import RandomizedSection, get_fresh_random_generator

 log = logging.getLogger(__name__)


 class PyTablesDataset(Dataset):
-    def __init__(self,
+    def __init__(self,
+                 database_path: str,
                  tables_paths: list[str],
                  indices: Optional[np.ndarray],
+                 tables_app_enum: dict[int, str],
+                 tables_cat_enum: dict[int, str],
                  flowstats_features: list[str],
-                 …
-                 …
-                 …
-                 …
+                 flowstats_features_boolean: list[str],
+                 flowstats_features_phist: list[str],
+                 other_fields: list[str],
+                 ppi_channels: list[int],
+                 ppi_transform: Optional[Callable] = None,
+                 flowstats_transform: Optional[Callable] = None,
+                 flowstats_phist_transform: Optional[Callable] = None,
+                 target_transform: Optional[Callable] = None,
+                 return_tensors: bool = False,
+                 return_all_fields: bool = False,
+                 preload: bool = False,
+                 preload_blob: Optional[str] = None,
+                 disabled_apps: Optional[list[str]] = None,):
         self.database_path = database_path
         self.tables_paths = tables_paths
         self.tables = {}
-        self.…
-        self.…
-        self.…
-        self.preload_blob = preload_blob
-        self.return_all_fields = return_all_fields
+        self.tables_app_enum = tables_app_enum
+        self.tables_app_arr = np.array(list(tables_app_enum.values()))
+        self.tables_cat_enum = tables_cat_enum
         if indices is None:
             self.set_all_indices(disabled_apps=disabled_apps)
         else:
             self.indices = indices

+        self.flowstats_features = flowstats_features
+        self.flowstats_features_boolean = flowstats_features_boolean
+        self.flowstats_features_phist = flowstats_features_phist
+        self.other_fields = other_fields
+        self.ppi_channels = ppi_channels
+        self.ppi_transform = ppi_transform
+        self.flowstats_transform = flowstats_transform
+        self.flowstats_phist_transform = flowstats_phist_transform
+        self.target_transform = target_transform
+        self.return_tensors = return_tensors
+        self.return_all_fields = return_all_fields
+
+        self.preload = preload
+        self.preload_blob = preload_blob
+
     def __getitem__(self, batch_idx):
         # log.debug(f"worker {self.worker_id}: __getitem__")
         if self.preload:
             batch_data = self.data[batch_idx]
         else:
-            batch_data = …
+            batch_data = load_data_from_tables(tables=self.tables, indices=self.indices[batch_idx], data_dtype=self.data_dtype)
         if self.return_all_fields:
             return (batch_data, batch_idx)
-        …
+
+        # Prepare data
+        x_ppi = batch_data[PPI_COLUMN].astype("float32")
+        x_ppi = x_ppi[:, self.ppi_channels, :]
+        x_flowstats = structured_to_unstructured(batch_data[self.flowstats_features], dtype="float32")
+        if self.flowstats_features_boolean:
+            x_flowstats_boolean = structured_to_unstructured(batch_data[self.flowstats_features_boolean], dtype="float32")
+        else:
+            x_flowstats_boolean = np.zeros(shape=(x_flowstats.shape[0], 0), dtype="float32")
+        if self.flowstats_features_phist:
+            x_flowstats_phist = structured_to_unstructured(batch_data[self.flowstats_features_phist], dtype="float32")
+        else:
+            x_flowstats_phist = np.zeros(shape=(x_flowstats.shape[0], 0), dtype="float32")
+        # Feature transformations
+        if self.ppi_transform:
+            x_ppi = self.ppi_transform(x_ppi)
+        if self.flowstats_transform:
+            x_flowstats = self.flowstats_transform(x_flowstats)
+        if self.flowstats_phist_transform:
+            x_flowstats_phist = self.flowstats_phist_transform(x_flowstats_phist)
+        x_flowstats = np.concatenate([x_flowstats, x_flowstats_boolean, x_flowstats_phist], axis=1).astype("float32")
+        # Labels transformation
+        labels = self.tables_app_arr[batch_data[APP_COLUMN]]
+        if self.target_transform:
+            labels = self.target_transform(labels)
+        # Prepare dataframe with other fields
+        other_fields_df = pd.DataFrame(batch_data[self.other_fields]) if len(self.other_fields) > 0 else pd.DataFrame()
+        for column in other_fields_df.columns:
+            if other_fields_df[column].dtype.kind == "O":
+                other_fields_df[column] = other_fields_df[column].astype(str)
+            elif column.startswith("TIME_"):
+                other_fields_df[column] = other_fields_df[column].map(lambda x: datetime.fromtimestamp(x))
+
+        if self.return_tensors:
+            x_ppi = torch.from_numpy(x_ppi)
+            x_flowstats = torch.from_numpy(x_flowstats)
+            labels = torch.from_numpy(labels).long()  # PyTorch loss functions require long type for labels
+        return_data = (other_fields_df, x_ppi, x_flowstats, labels)
         return return_data

     def __len__(self):
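Note: the rewrite replaces the old monolithic preprocessing with per-feature transform callables passed to the constructor. A minimal sketch of driving the new API; the database path, table node, enums, and flowstats field names are hypothetical placeholders (a real setup would come from the library's DatasetConfig):

    import numpy as np
    from cesnet_datazoo.pytables_data.pytables_dataset import PyTablesDataset

    def log_scale_ppi(x_ppi: np.ndarray) -> np.ndarray:
        # toy transform: compress the last PPI channel with log1p (channel layout assumed)
        x_ppi[:, -1, :] = np.log1p(np.abs(x_ppi[:, -1, :]))
        return x_ppi

    dataset = PyTablesDataset(
        database_path="/data/database.h5",                # hypothetical path
        tables_paths=["/flows/table0"],                   # hypothetical table node
        indices=None,                                     # let the dataset enumerate all rows
        tables_app_enum={0: "google-www", 1: "spotify"},  # toy enums
        tables_cat_enum={0: "web", 1: "music"},
        flowstats_features=["DURATION", "BYTES"],         # hypothetical field names
        flowstats_features_boolean=[],
        flowstats_features_phist=[],
        other_fields=[],
        ppi_channels=[0, 1, 2],
        ppi_transform=log_scale_ppi,
        return_tensors=True,
    )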
@@ -70,8 +126,6 @@ class PyTablesDataset(Dataset):
         log.debug(f"Initializing dataloader worker id {self.worker_id}")
         self.database, self.tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
         atexit.register(self.cleanup)
-        self.app_enum = self.tables[0].get_enum(APP_COLUMN)
-        self.cat_enum = self.tables[0].get_enum(CATEGORY_COLUMN)
         self.data_dtype = self.tables[0].dtype
         if self.preload:
             data = None
@@ -82,40 +136,20 @@ class PyTablesDataset(Dataset):
            except:
                pass  # ignore if the file is corrupted (or being written at the moment)
            if data is None:
-               data = …
+               data = load_data_from_tables(tables=self.tables, indices=self.indices, data_dtype=self.data_dtype)
            self.data = data
            if self.preload_blob and not os.path.isfile(self.preload_blob):
                np.savez_compressed(self.preload_blob, data=self.data)
         log.debug(f"Finish initialization worker id {self.worker_id}")

-    def get_app_enum(self) -> tb.Enum:
-        if self.app_enum:
-            return self.app_enum
-        database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
-        app_enum = tables[0].get_enum(APP_COLUMN)
-        cat_enum = tables[0].get_enum(CATEGORY_COLUMN)
-        self.app_enum, self.cat_enum = app_enum, cat_enum
-        database.close()
-        return app_enum
-
-    def get_cat_enum(self) -> tb.Enum:
-        if self.cat_enum:
-            return self.cat_enum
-        database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
-        app_enum = tables[0].get_enum(APP_COLUMN)
-        cat_enum = tables[0].get_enum(CATEGORY_COLUMN)
-        self.app_enum, self.cat_enum = app_enum, cat_enum
-        database.close()
-        return cat_enum
-
     def set_all_indices(self, disabled_apps: Optional[list[str]] = None):
         """
         This should be called from the main process, before dataloader workers split the work.
         Does no filter apps with not enough samples.
         """
         database, tables = load_database(database_path=self.database_path, tables_paths=self.tables_paths)
-
-        disabled_apps_ids = …
+        inverted_tables_app_enum = {v: k for k, v in self.tables_app_enum.items()}
+        disabled_apps_ids = [inverted_tables_app_enum[app] for app in disabled_apps] if disabled_apps is not None else []
         base_labels = {}
         base_indices = {}
         for i in range(len(tables)):
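Note: the PyTables Enum getters are gone; the app enum is now passed in as a plain dict and flipped where ids are needed. A toy illustration of the inversion pattern used above (values made up):

    tables_app_enum = {0: "google-www", 1: "spotify", 2: "background-traffic"}
    inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
    disabled_apps_ids = [inverted_tables_app_enum[app] for app in ["background-traffic"]]
    assert disabled_apps_ids == [2]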
@@ -137,64 +171,9 @@ def worker_init_fn(worker_id):
     dataset = worker_info.dataset
     dataset.pytables_worker_init(worker_id)

-def …
-                    flowstats_scaler: Scaler, flowstats_quantiles: pd.Series,
-                    psizes_scaler: Scaler, psizes_max: int,
-                    ipt_scaler: Scaler, ipt_min: int, ipt_max: int,
-                    use_push_flags: bool, use_packet_histograms: bool, normalize_packet_histograms: bool, zero_ppi_start: int,
-                    encoder: LabelEncoder, known_apps: list[str], return_torch: bool = False):
-    other_fields, x_ppi, x_flowstats, labels = batch
-    x_ppi = x_ppi.transpose(0, 2, 1)
-    orig_shape = x_ppi.shape
-    ppi_channels = x_ppi.shape[-1]
-    x_ppi = x_ppi.reshape(-1, ppi_channels)
-    x_ppi[:, IPT_POS] = x_ppi[:, IPT_POS].clip(max=ipt_max, min=ipt_min)
-    x_ppi[:, SIZE_POS] = x_ppi[:, SIZE_POS].clip(max=psizes_max, min=1)
-    padding_mask = x_ppi[:, DIR_POS] == 0  # mask of zero padding
-    if ipt_scaler:
-        x_ppi[:, IPT_POS] = ipt_scaler.transform(x_ppi[:, IPT_POS].reshape(-1, 1)).reshape(-1)  # type: ignore
-    if psizes_scaler:
-        x_ppi[:, SIZE_POS] = psizes_scaler.transform(x_ppi[:, SIZE_POS].reshape(-1, 1)).reshape(-1)  # type: ignore
-    x_ppi[padding_mask, IPT_POS] = 0
-    x_ppi[padding_mask, SIZE_POS] = 0
-    x_ppi = x_ppi.reshape(orig_shape).transpose(0, 2, 1)
-    if not use_push_flags:
-        x_ppi = x_ppi[:, (IPT_POS, DIR_POS, SIZE_POS), :]
-    if zero_ppi_start > 0:
-        x_ppi[:,:,:zero_ppi_start] = 0
-
-    if use_packet_histograms:
-        x_phist = structured_to_unstructured(x_flowstats[PHISTS_FEATURES], dtype="float32")
-        if normalize_packet_histograms:
-            src_sizes_pkt_count = x_phist[:, :PHIST_BIN_COUNT].sum(axis=1)[:, np.newaxis]
-            dst_sizes_pkt_count = x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)].sum(axis=1)[:, np.newaxis]
-            np.divide(x_phist[:, :PHIST_BIN_COUNT], src_sizes_pkt_count, out=x_phist[:, :PHIST_BIN_COUNT], where=src_sizes_pkt_count != 0)
-            np.divide(x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], dst_sizes_pkt_count, out=x_phist[:, PHIST_BIN_COUNT:(2*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count != 0)
-            np.divide(x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], src_sizes_pkt_count - 1, out=x_phist[:, (2*PHIST_BIN_COUNT):(3*PHIST_BIN_COUNT)], where=src_sizes_pkt_count > 1)
-            np.divide(x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], dst_sizes_pkt_count - 1, out=x_phist[:, (3*PHIST_BIN_COUNT):(4*PHIST_BIN_COUNT)], where=dst_sizes_pkt_count > 1)
-        x_flowstats = structured_to_unstructured(drop_fields(x_flowstats, PHISTS_FEATURES), dtype="float32")
-        x_flowstats = np.concatenate([x_flowstats, x_phist], axis=1)
-    else:
-        x_flowstats = structured_to_unstructured(x_flowstats, dtype="float32")
-    np.clip(x_flowstats[:, :len(FLOWSTATS_TO_SCALE)], a_max=flowstats_quantiles, a_min=0, out=x_flowstats[:, :len(FLOWSTATS_TO_SCALE)])
-    if flowstats_scaler:
-        x_flowstats[:, :len(FLOWSTATS_TO_SCALE)] = flowstats_scaler.transform(x_flowstats[:, :len(FLOWSTATS_TO_SCALE)])
-
-    other_fields_df = pd.DataFrame(other_fields) if len(other_fields) > 0 else pd.DataFrame()
-    for column in other_fields_df.columns:
-        if other_fields_df[column].dtype.kind == "O":
-            other_fields_df[column] = other_fields_df[column].astype(str)
-        elif column.startswith("TIME_"):
-            other_fields_df[column] = other_fields_df[column].map(lambda x: datetime.fromtimestamp(x))
-
-    labels = encoder.transform(np.where(np.isin(labels, known_apps), labels, UNKNOWN_STR_LABEL)).astype("int64")  # type: ignore
-    if return_torch:
-        return other_fields_df, torch.from_numpy(x_ppi), torch.from_numpy(x_flowstats), torch.from_numpy(labels)
-    return other_fields_df, x_ppi, x_flowstats, labels
-
-def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFrame, database_path: str, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, dict[int, str], dict[int, str]]:
+def init_train_indices(train_data_params: TrainDataParams, database_path: str, tables_app_enum: dict[int, str], servicemap: pd.DataFrame, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray, list[str], list[str]]:
     database, train_tables = load_database(database_path, tables_paths=train_data_params.train_tables_paths)
-
+    inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
     all_app_labels = {}
     app_counts = pd.Series(dtype="int64")
     start_time = time.time()
@@ -206,15 +185,16 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
     # Handle disabled apps and apps with less than min_samples_per_app samples
     if len(train_data_params.disabled_apps) > 0:
         log.info(f"Disabled applications in dataset config: {sorted(train_data_params.disabled_apps)}")
-        disabled_apps_ids = …
+        disabled_apps_ids = [inverted_tables_app_enum[app] for app in train_data_params.disabled_apps]
     min_samples_apps_ids = set(app_counts[app_counts<train_data_params.min_train_samples_per_app].index.tolist())
     if len(min_samples_apps_ids) > 0:
+        min_samples_apps_names = sorted([tables_app_enum[app_id] for app_id in min_samples_apps_ids])
         if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
-            warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {…
+            warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                           "To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
             exit()
         elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
-            log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {…
+            log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                      "Disabling these applications")
             disabled_apps_ids.extend(min_samples_apps_ids)
     # Base indices are indices of samples that are not disabled and have enough samples
@@ -223,9 +203,9 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
         base_indices[i] = np.nonzero(np.isin(all_app_labels[i], disabled_apps_ids, invert=True))[0]
     base_labels = {table_id: arr[base_indices[table_id]] for table_id, arr in all_app_labels.items()}
     # Apps selection
-    if train_data_params.apps_selection != AppSelection.…
+    if train_data_params.apps_selection != AppSelection.FIXED:
         app_counts = app_counts[[app for app in app_counts.index.tolist() if app not in disabled_apps_ids]]
-        app_counts.index = app_counts.index.map(…
+        app_counts.index = app_counts.index.map(tables_app_enum)
         app_counts = app_counts.sort_values(ascending=False).astype("int64")
         sorted_apps = app_counts.index.to_list()
         if train_data_params.apps_selection == AppSelection.ALL_KNOWN:
@@ -235,31 +215,26 @@ def init_train_indices(train_data_params: TrainDataParams, servicemap: pd.DataFr
             known_apps, unknown_apps = split_apps_topx_with_provider_groups(sorted_apps=sorted_apps, known_count=train_data_params.apps_selection_topx, servicemap=servicemap)
             if len(known_apps) < train_data_params.apps_selection_topx:
                 warnings.warn(f"The number of known applications ({len(known_apps)}) is lower than requested in config.apps_selection_topx ({train_data_params.apps_selection_topx}).")
-        elif train_data_params.apps_selection == AppSelection.…
-            unknown_apps = train_data_params.…
-            missing_unknown_apps = [app for app in unknown_apps if app not in sorted_apps]
-            if len(missing_unknown_apps) > 0:
-                raise ValueError(f"Applications configured in config.apps_selection_explicit_unknown are not present in the dataset (or might be disabled): {sorted(missing_unknown_apps)}")
+        elif train_data_params.apps_selection == AppSelection.BACKGROUND_UNKNOWN:
+            unknown_apps = train_data_params.apps_selection_background_unknown
             known_apps = [app for app in sorted_apps if not (is_background_app(app) or app in unknown_apps)]
         else: assert_never(train_data_params.apps_selection)
-
         log.info(f"Selected {len(known_apps)} known applications and {len(unknown_apps)} unknown applications")
-        known_apps_database_enum: dict[int, str] = {int(app_enum[app]): app for app in known_apps}
-        unknown_apps_database_enum: dict[int, str] = {int(app_enum[app]): app for app in unknown_apps}
     else:
-        …
-        …
-        known_apps_ids = …
-        unknown_apps_ids = …
+        known_apps = train_data_params.apps_selection_fixed_known
+        unknown_apps = train_data_params.apps_selection_fixed_unknown
+    known_apps_ids = [inverted_tables_app_enum[app] for app in known_apps]
+    unknown_apps_ids = [inverted_tables_app_enum[app] for app in unknown_apps]

     train_known_indices, train_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(train_known_indices)
     rng.shuffle(train_unknown_indices)
     log.info(f"Processing train indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
-    return train_known_indices, train_unknown_indices, …
+    return train_known_indices, train_unknown_indices, known_apps, unknown_apps

-def init_test_indices(test_data_params: TestDataParams, database_path: str, rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
+def init_test_indices(test_data_params: TestDataParams, database_path: str, tables_app_enum: dict[int, str], rng: np.random.RandomState) -> tuple[np.ndarray, np.ndarray]:
     database, test_tables = load_database(database_path, tables_paths=test_data_params.test_tables_paths)
+    inverted_tables_app_enum = {v: k for k, v in tables_app_enum.items()}
     base_labels = {}
     base_indices = {}
     start_time = time.time()
@@ -268,115 +243,14 @@ def init_test_indices(test_data_params: TestDataParams, database_path: str, rng:
         log.info(f"Reading app column for test table {table_path} took {time.time() - start_time:.2f} seconds"); start_time = time.time()
         base_indices[i] = np.arange(len(test_tables[i]))
     database.close()
-    known_apps_ids = …
-    unknown_apps_ids = …
+    known_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.known_apps]
+    unknown_apps_ids = [inverted_tables_app_enum[app] for app in test_data_params.unknown_apps]
     test_known_indices, test_unknown_indices = convert_dict_indices(base_indices=base_indices, base_labels=base_labels, known_apps_ids=known_apps_ids, unknown_apps_ids=unknown_apps_ids)
     rng.shuffle(test_known_indices)
     rng.shuffle(test_unknown_indices)
     log.info(f"Processing test indices took {time.time() - start_time:.2f} seconds"); start_time = time.time()
     return test_known_indices, test_unknown_indices
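Note: the app selection above supports several modes (ALL_KNOWN, a top-X mode, BACKGROUND_UNKNOWN, FIXED). For intuition, a simplified top-X split that ignores the provider grouping done by split_apps_topx_with_provider_groups; the app names and counts are toy values:

    import pandas as pd

    app_counts = pd.Series({"google-www": 900, "spotify": 500, "rare-app": 7})
    sorted_apps = app_counts.sort_values(ascending=False).index.to_list()
    topx = 2
    known_apps = sorted_apps[:topx]    # the most frequent apps become known classes
    unknown_apps = sorted_apps[topx:]  # the long tail is treated as unknown traffic
    assert known_apps == ["google-www", "spotify"] and unknown_apps == ["rare-app"]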
|
-def fit_or_load_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> tuple[Scaler, pd.Series, Scaler, Scaler]:
-    train_data_path = dataset_config._get_train_data_path()
-    flowstats_scaler_path = os.path.join(train_data_path, "stand", f"flowstats_scaler-{dataset_config.flowstats_scaler}-q{dataset_config.flowstats_clip}.pickle")
-    flowstats_quantiles_path = os.path.join(train_data_path, "stand", f"flowstats_quantiles-q{dataset_config.flowstats_clip}.pickle")
-    ipt_scaler_path = os.path.join(train_data_path, "stand", f"ipt_scaler-{dataset_config.ipt_scaler}-ipt_min{dataset_config.ipt_min}-ipt_max{dataset_config.ipt_max}.pickle")
-    psizes_sizes_scaler_path = os.path.join(train_data_path, "stand", f"psizes_scaler-{dataset_config.psizes_scaler}-psizes_max{dataset_config.psizes_max}.pickle")
-    if os.path.isfile(flowstats_scaler_path) and os.path.isfile(flowstats_quantiles_path) and os.path.isfile(ipt_scaler_path) and os.path.isfile(psizes_sizes_scaler_path):
-        flowstats_scaler = pickle_load(flowstats_scaler_path)
-        flowstats_quantiles = pickle_load(flowstats_quantiles_path)
-        ipt_scaler = pickle_load(ipt_scaler_path)
-        psizes_scaler = pickle_load(psizes_sizes_scaler_path)
-    else:
-        if dataset_config.flowstats_scaler == ScalerEnum.ROBUST:
-            flowstats_scaler = RobustScaler()
-        elif dataset_config.flowstats_scaler == ScalerEnum.STANDARD:
-            flowstats_scaler = StandardScaler()
-        elif dataset_config.flowstats_scaler == ScalerEnum.MINMAX:
-            flowstats_scaler = MinMaxScaler()
-        elif dataset_config.flowstats_scaler == ScalerEnum.NO_SCALER:
-            flowstats_scaler = None
-        else: assert_never(dataset_config.flowstats_scaler)
-
-        if dataset_config.ipt_scaler == ScalerEnum.ROBUST:
-            ipt_scaler = RobustScaler()
-        elif dataset_config.ipt_scaler == ScalerEnum.STANDARD:
-            ipt_scaler = StandardScaler()
-        elif dataset_config.ipt_scaler == ScalerEnum.MINMAX:
-            ipt_scaler = MinMaxScaler()
-        elif dataset_config.ipt_scaler == ScalerEnum.NO_SCALER:
-            ipt_scaler = None
-        else: assert_never(dataset_config.ipt_scaler)
-
-        if dataset_config.psizes_scaler == ScalerEnum.ROBUST:
-            psizes_scaler = RobustScaler()
-        elif dataset_config.psizes_scaler == ScalerEnum.STANDARD:
-            psizes_scaler = StandardScaler()
-        elif dataset_config.psizes_scaler == ScalerEnum.MINMAX:
-            psizes_scaler = MinMaxScaler()
-        elif dataset_config.psizes_scaler == ScalerEnum.NO_SCALER:
-            psizes_scaler = None
-        else: assert_never(dataset_config.psizes_scaler)
-
-        if isinstance(dataset_config.fit_scalers_samples, int) and dataset_config.fit_scalers_samples > len(train_indices):
-            warnings.warn(f"The number of samples for fitting scalers ({dataset_config.fit_scalers_samples}) is larger than the number of train samples ({len(train_indices)}), using the number of train samples instead")
-            dataset_config.fit_scalers_samples = len(train_indices)
-        fit_scalers_rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.FIT_SCALERS_SAMPLE)
-        if isinstance(dataset_config.fit_scalers_samples, float):
-            num_samples = int(dataset_config.fit_scalers_samples * len(train_indices))
-        else:
-            num_samples = dataset_config.fit_scalers_samples
-        fit_scalers_indices = train_indices[fit_scalers_rng.choice(len(train_indices), size=num_samples, replace=False)]
-        flowstats_quantiles = fit_scalers(
-            database_path=dataset_config.database_path,
-            train_tables_paths=dataset_config._get_train_tables_paths(),
-            fit_scalers_indices=fit_scalers_indices,
-            flowstats_scaler=flowstats_scaler,
-            flowstats_quantile_clip=dataset_config.flowstats_clip,
-            ipt_scaler=ipt_scaler,
-            psizes_scaler=psizes_scaler,
-            ipt_min=dataset_config.ipt_min,
-            ipt_max=dataset_config.ipt_max,
-            psizes_max=dataset_config.psizes_max)
-        pickle_dump(flowstats_scaler, flowstats_scaler_path)
-        pickle_dump(flowstats_quantiles, flowstats_quantiles_path)
-        pickle_dump(ipt_scaler, ipt_scaler_path)
-        pickle_dump(psizes_scaler, psizes_sizes_scaler_path)
-    return flowstats_scaler, flowstats_quantiles, ipt_scaler, psizes_scaler
-
-def fit_scalers(database_path: str, train_tables_paths: list[str], fit_scalers_indices: np.ndarray, flowstats_scaler, flowstats_quantile_clip: float, ipt_scaler, psizes_scaler, ipt_min: int, ipt_max: int, psizes_max: int) -> pd.Series:
-    start_time = time.time()
-    database, tables = load_database(database_path, tables_paths=train_tables_paths)
-    data = load_data_from_pytables(tables=tables, indices=fit_scalers_indices, data_dtype=tables[0].dtype)
-    database.close()
-    # PPI
-    data_ppi = data[PPI_COLUMN].astype("float32")
-    ppi_channels = data_ppi.shape[1]
-    data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
-    padding_mask = data_ppi[:, DIR_POS] == 0  # mask of padded packets
-    if ipt_scaler:
-        train_ipt = data_ppi[:, IPT_POS].clip(max=ipt_max, min=ipt_min)
-        train_ipt[padding_mask] = np.nan  # nans are ignored in sklearn scalers
-        if isinstance(ipt_scaler, MinMaxScaler):
-            # let zero be the minimum for minmax scaling
-            train_ipt = np.concatenate((train_ipt, [0]))
-        ipt_scaler.fit(train_ipt.reshape(-1, 1))
-    if psizes_scaler:
-        train_psizes = data_ppi[:, SIZE_POS].clip(max=psizes_max, min=1)
-        train_psizes[padding_mask] = np.nan
-        if isinstance(psizes_scaler, MinMaxScaler):
-            train_psizes = np.concatenate((train_psizes, [0]))
-        psizes_scaler.fit(train_psizes.reshape(-1, 1))
-    # FLOWSTATS
-    train_flowstats = pd.DataFrame(data, columns=FLOWSTATS_TO_SCALE)
-    upper_quantiles = train_flowstats.quantile(flowstats_quantile_clip)
-    upper_quantiles[FLOWSTATS_NO_CLIP] = np.Inf  # disable clipping for features with "fixed" range
-    if flowstats_scaler:
-        train_flowstats = train_flowstats.clip(upper=upper_quantiles, lower=0, axis=1).to_numpy()  # type: ignore
-        flowstats_scaler.fit(train_flowstats)
-    log.info(f"Reading data and fitting scalers took {time.time() - start_time:.2f} seconds")
-    return upper_quantiles
-
 def load_database(database_path: str, tables_paths: Optional[list[str]] = None, mode: str = "r") -> tuple[tb.File, dict[int, Any]]:  # dict[int, tb.Table]
     database = tb.open_file(database_path, mode=mode)
     if tables_paths is None:
@@ -410,11 +284,11 @@ def convert_dict_indices(base_indices: dict[int, np.ndarray], base_labels: dict[
                                          np.concatenate(list(unknown_labels_dict.values()))))
     return known_indices, unknown_indices

-def …
+def load_data_from_tables(tables, indices: np.ndarray, data_dtype: np.dtype) -> np.ndarray:
     sorted_indices = indices[indices[:, INDICES_TABLE_POS].argsort(kind="stable")]
     unique_tables, split_bounderies = np.unique(sorted_indices[:, INDICES_TABLE_POS], return_index=True)
     indices_per_table = np.split(sorted_indices, split_bounderies[1:])
-    data = np.…
+    data = np.zeros(len(indices), dtype=data_dtype)
     for table_id, table_indices in zip(unique_tables, indices_per_table):
         data[np.where(indices[:, INDICES_TABLE_POS] == table_id)[0]] = tables[table_id].read_coordinates(table_indices[:, INDICES_INDEX_POS])
     return data
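Note: load_data_from_tables groups the (table, row) index pairs per table so that each PyTables table is hit with a single read_coordinates call. The grouping trick can be seen on a toy index array; the column layout (table id in column 0, row index in column 1) is an assumption for illustration:

    import numpy as np

    TABLE_POS, INDEX_POS = 0, 1  # assumed column layout of the (table, row) pairs
    indices = np.array([[1, 5], [0, 3], [1, 2], [0, 7]])
    sorted_indices = indices[indices[:, TABLE_POS].argsort(kind="stable")]
    unique_tables, bounds = np.unique(sorted_indices[:, TABLE_POS], return_index=True)
    per_table = np.split(sorted_indices, bounds[1:])
    # each group now holds the rows to fetch from one table in a single read
    assert [g[:, INDEX_POS].tolist() for g in per_table] == [[3, 7], [5, 2]]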
cesnet_datazoo/utils/class_info.py
CHANGED
@@ -10,10 +10,11 @@ from cesnet_datazoo.constants import SERVICEMAP_CATEGORY_COLUMN, SERVICEMAP_PROV

 @dataclass()
 class ClassInfo:
-    target_names: list[str]
     num_classes: int
     known_apps: list[str]
     unknown_apps: list[str]
+    encoder: LabelEncoder
+    target_names: list[str]
     unknown_class_label: int
     group_matrix: np.ndarray
     has_provider: dict[str, bool]
@@ -21,9 +22,9 @@ class ClassInfo:
     provider_members: dict[str, list[str]]
     categories_mapping: dict[str, Optional[str]]

-def create_class_info(servicemap: Any, encoder: LabelEncoder, …
-    known_apps = sorted(…
-    unknown_apps = sorted(…
+def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps: list[str], unknown_apps: list[str]) -> ClassInfo:
+    known_apps = sorted(known_apps)
+    unknown_apps = sorted(unknown_apps)
     target_names_arr = encoder.classes_
     assert known_apps == list(target_names_arr[:-1])
     group_matrix = np.array([[a == b or …
@@ -37,10 +38,11 @@ def create_class_info(servicemap: Any, encoder: LabelEncoder, known_apps_databas
     provider_members = {p: [app for app in target_names_arr if provider_mapping[app] == p] for p in providers}
     categories_mapping = {app: servicemap.loc[app, SERVICEMAP_CATEGORY_COLUMN] if app in servicemap.index else None for app in target_names_arr}
     return ClassInfo(
-        target_names=list(target_names_arr),
         num_classes=len(known_apps),
         known_apps=known_apps,
         unknown_apps=unknown_apps,
+        encoder=encoder,
+        target_names=list(target_names_arr),
         unknown_class_label=len(known_apps),
         group_matrix=group_matrix,
         has_provider=has_provider,
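Note: with the fitted LabelEncoder now stored on ClassInfo, predicted class indices can be mapped back to application names without re-fitting anything. A sketch with a toy encoder (the class names are made up; ClassInfo construction is elided):

    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder().fit(["google-www", "spotify", "unknown"])  # toy classes
    predictions = np.array([1, 0, 2])              # e.g. argmax of model outputs
    predicted_apps = encoder.inverse_transform(predictions)
    # -> array(['spotify', 'google-www', 'unknown'], dtype='<U10')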
cesnet_datazoo/utils/download.py
CHANGED
@@ -11,9 +11,14 @@ def simple_download(url: str, file_path: str):

 def resumable_download(url: str, file_path: str, chunk_size: int = 1024**2, silent: bool = False):
     r1 = requests.get(url, stream=True)
+    try:
+        r1.raise_for_status()
+    except requests.exceptions.HTTPError as e:
+        print("The dataset hosting server is unreachable. Please contact us at https://github.com/CESNET/cesnet-datazoo/issues.")
+        raise e
+
     redirected_url = r1.url
     content_size = int(r1.headers["Content-Length"])
-
     if os.path.exists(file_path):
         temp_size = os.path.getsize(file_path)
     else:
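Note: the rest of resumable_download is not shown in this diff; judging from temp_size and Content-Length above, it appears to resume partially downloaded files. A generic sketch of that pattern with requests, not the library's exact code:

    import os
    import requests

    def resume_download(url: str, file_path: str, chunk_size: int = 1024**2):
        # ask the server to skip the bytes already present on disk
        done = os.path.getsize(file_path) if os.path.exists(file_path) else 0
        headers = {"Range": f"bytes={done}-"}
        with requests.get(url, headers=headers, stream=True, timeout=30) as r:
            r.raise_for_status()  # same fail-fast guard as the change above
            with open(file_path, "ab") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)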
{cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.0.16
+Version: 0.1.0
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -16,6 +16,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENCE
+Requires-Dist: cesnet-models
 Requires-Dist: matplotlib
 Requires-Dist: numpy
 Requires-Dist: pandas
cesnet_datazoo-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,30 @@
+cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/config.py,sha256=vvNyM7TCMolH-uLj3ant7rGkYb_2FPyCWlRQ3mllKWs,37427
+cesnet_datazoo/constants.py,sha256=EDeeo0xrBt_pnWf3m-ZTiC5HMvyVwcikgCZ9LwZIcAE,1276
+cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
+cesnet_datazoo/datasets/cesnet_dataset.py,sha256=zoLFduBg6ZK96zoec0kEMB1hFCGn3QOtBtYFTcCbIU0,46546
+cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
+cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
+cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
+cesnet_datazoo/datasets/statistics.py,sha256=wR8QISIh-KC7CQ5SjN7WoTMFaoRuq0G7pgTFGhC8ek0,15137
+cesnet_datazoo/datasets/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=Ntlp8mHUSr7g-ZTvtBVh238TswZHwGAudMuE52-OA-c,1608
+cesnet_datazoo/datasets/metadata/metadata.csv,sha256=or0CB7t06G_V1OzClqtpx7sRt_ZoQWE_f7F5SDLlPC8,2175
+cesnet_datazoo/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/metrics/classification_report.py,sha256=0JgKWyB281m3EHxI8miMKTjKg3mzyV1WTQndXg_B7i0,4040
+cesnet_datazoo/metrics/provider_metrics.py,sha256=sRg2bdRTzLLTmiVjacBtGez4LEIfr35hSvMBwW-W73U,1303
+cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
+cesnet_datazoo/pytables_data/data_scalers.py,sha256=IfTymhVubjLNetjOIxDhtzkETp_1xmFXbC0rSjQHVUQ,7254
+cesnet_datazoo/pytables_data/indices_setup.py,sha256=rBW1HwebPXkwLRuKg9ILO_LfUrfnJfqQYsrIAYfXtZo,12932
+cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=kCvbOgIseBdUUGz3nRr7oGsuN2JicXGlsp8-Z9n4JyM,17599
+cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cesnet_datazoo/utils/class_info.py,sha256=zMt2ndfwvtnE5QOKS1OPbw8VUlsCCbB_SVjzyFn1Wdw,2540
+cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
+cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
+cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
+cesnet_datazoo-0.1.0.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
+cesnet_datazoo-0.1.0.dist-info/METADATA,sha256=c6GMIPE5rkiZtsbGNv28405o-G02J4wyvP-DJL8BfJM,12679
+cesnet_datazoo-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+cesnet_datazoo-0.1.0.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
+cesnet_datazoo-0.1.0.dist-info/RECORD,,
cesnet_datazoo-0.0.16.dist-info/RECORD
DELETED
@@ -1,28 +0,0 @@
-cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/config.py,sha256=LawxdianPcNcuzxf01FTyED7PoAvQj8RMSE8QG4FZNo,37531
-cesnet_datazoo/constants.py,sha256=EliK-KvW3GXeKw00W_Pd-ypJMwvFQVqMQS9A9ULyTj4,1420
-cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
-cesnet_datazoo/datasets/cesnet_dataset.py,sha256=S0FsatG3fq21fVJctWOaLbF1ZzSvxUnzg9Hbe3TFNmo,43116
-cesnet_datazoo/datasets/datasets.py,sha256=gj7jflxqDgEfHXSFUz6JOW2x8wEUSCqVe6KapaK4IKg,2279
-cesnet_datazoo/datasets/loaders.py,sha256=HU2Au0P87BCAvdgpiwO5T0xgeQgs_gL4E1d12OP1JoQ,1803
-cesnet_datazoo/datasets/statistics.py,sha256=GoM7-vFTvqx9ym239VCZd1os2TdoxLOW7WNpNtOU7Fc,14030
-cesnet_datazoo/datasets/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/datasets/metadata/dataset_metadata.py,sha256=OZf-NMai2XuSg57y2IdV-804ZpPcmI9sWoDu8IO7e4Y,1567
-cesnet_datazoo/datasets/metadata/metadata.csv,sha256=Zr2hf9qpJpPE_Js9XmyaHffdho912ikdQfFVQx6q8NE,2161
-cesnet_datazoo/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/metrics/classification_report.py,sha256=0JgKWyB281m3EHxI8miMKTjKg3mzyV1WTQndXg_B7i0,4040
-cesnet_datazoo/metrics/provider_metrics.py,sha256=sRg2bdRTzLLTmiVjacBtGez4LEIfr35hSvMBwW-W73U,1303
-cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
-cesnet_datazoo/pytables_data/indices_setup.py,sha256=IraCOFys0p7ZojR-0E99bKN9dTjwCfQO4L6lMqcTEFg,13070
-cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=y2BXuuA73w58XITKFsPm-FS8LB76TH6prNUMsKkXNBM,26511
-cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/utils/class_info.py,sha256=ync9U3PWo0DloRwX3uMgKW798kC6echioEbEgrPqY4E,2567
-cesnet_datazoo/utils/download.py,sha256=QVbYKuWUO9j6VUJISPTVBXscjuTuuX-XRez7MJzG3dk,1204
-cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
-cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
-cesnet_datazoo-0.0.16.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
-cesnet_datazoo-0.0.16.dist-info/METADATA,sha256=YQBQeLwNIoHcCR3W4Dn46iWLlRMWC4c8B7U8r81gRd8,12650
-cesnet_datazoo-0.0.16.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-cesnet_datazoo-0.0.16.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
-cesnet_datazoo-0.0.16.dist-info/RECORD,,
{cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/LICENCE
File without changes
{cesnet_datazoo-0.0.16.dist-info → cesnet_datazoo-0.1.0.dist-info}/top_level.txt
File without changes