cesnet-datazoo 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cesnet_datazoo/config.py +8 -5
- cesnet_datazoo/datasets/cesnet_dataset.py +6 -4
- cesnet_datazoo/pytables_data/data_scalers.py +13 -11
- cesnet_datazoo/pytables_data/pytables_dataset.py +2 -4
- {cesnet_datazoo-0.1.4.dist-info → cesnet_datazoo-0.1.6.dist-info}/METADATA +1 -1
- {cesnet_datazoo-0.1.4.dist-info → cesnet_datazoo-0.1.6.dist-info}/RECORD +9 -9
- {cesnet_datazoo-0.1.4.dist-info → cesnet_datazoo-0.1.6.dist-info}/LICENCE +0 -0
- {cesnet_datazoo-0.1.4.dist-info → cesnet_datazoo-0.1.6.dist-info}/WHEEL +0 -0
- {cesnet_datazoo-0.1.4.dist-info → cesnet_datazoo-0.1.6.dist-info}/top_level.txt +0 -0
cesnet_datazoo/config.py
CHANGED
@@ -8,6 +8,7 @@ import warnings
|
|
8
8
|
from dataclasses import InitVar, field
|
9
9
|
from datetime import datetime
|
10
10
|
from enum import Enum
|
11
|
+
from importlib.metadata import version
|
11
12
|
from typing import TYPE_CHECKING, Callable, Literal, Optional
|
12
13
|
|
13
14
|
import yaml
|
@@ -83,6 +84,7 @@ class DataLoaderOrder(Enum):
|
|
83
84
|
|
84
85
|
@dataclass(frozen=True)
|
85
86
|
class TrainDataParams():
|
87
|
+
datazoo_version: str
|
86
88
|
database_filename: str
|
87
89
|
train_period_name: str
|
88
90
|
train_tables_paths: list[str]
|
@@ -97,6 +99,7 @@ class TrainDataParams():
|
|
97
99
|
|
98
100
|
@dataclass(frozen=True)
|
99
101
|
class TestDataParams():
|
102
|
+
datazoo_version: str
|
100
103
|
database_filename: str
|
101
104
|
test_period_name: str
|
102
105
|
test_tables_paths: list[str]
|
@@ -497,14 +500,12 @@ class DatasetConfig():
|
|
497
500
|
return params_hash
|
498
501
|
|
499
502
|
def _get_train_data_path(self) -> str:
|
500
|
-
|
501
|
-
|
502
|
-
return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
|
503
|
-
else:
|
504
|
-
return os.path.join(self.data_root, "train-data", "default")
|
503
|
+
params_hash = self._get_train_data_hash()
|
504
|
+
return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
|
505
505
|
|
506
506
|
def _get_train_data_params(self) -> TrainDataParams:
|
507
507
|
return TrainDataParams(
|
508
|
+
datazoo_version=version("cesnet_datazoo"),
|
508
509
|
database_filename=self.database_filename,
|
509
510
|
train_period_name=self.train_period_name,
|
510
511
|
train_tables_paths=self._get_train_tables_paths(),
|
@@ -520,6 +521,7 @@ class DatasetConfig():
|
|
520
521
|
def _get_val_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
|
521
522
|
assert self.val_approach == ValidationApproach.VALIDATION_DATES
|
522
523
|
val_data_params = TestDataParams(
|
524
|
+
datazoo_version=version("cesnet_datazoo"),
|
523
525
|
database_filename=self.database_filename,
|
524
526
|
test_period_name=self.val_period_name,
|
525
527
|
test_tables_paths=self._get_val_tables_paths(),
|
@@ -532,6 +534,7 @@ class DatasetConfig():
|
|
532
534
|
|
533
535
|
def _get_test_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
|
534
536
|
test_data_params = TestDataParams(
|
537
|
+
datazoo_version=version("cesnet_datazoo"),
|
535
538
|
database_filename=self.database_filename,
|
536
539
|
test_period_name=self.test_period_name,
|
537
540
|
test_tables_paths=self._get_test_tables_paths(),
|
@@ -10,6 +10,7 @@ import numpy as np
|
|
10
10
|
import pandas as pd
|
11
11
|
import tables as tb
|
12
12
|
import torch
|
13
|
+
from numpy.lib.recfunctions import repack_fields
|
13
14
|
from sklearn.model_selection import train_test_split
|
14
15
|
from sklearn.preprocessing import LabelEncoder
|
15
16
|
from torch.utils.data import BatchSampler, DataLoader, RandomSampler, Sampler, SequentialSampler
|
@@ -17,7 +18,8 @@ from typing_extensions import assert_never
|
|
17
18
|
|
18
19
|
from cesnet_datazoo.config import AppSelection, DataLoaderOrder, DatasetConfig, ValidationApproach
|
19
20
|
from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES, INDICES_APP_FIELD,
|
20
|
-
SERVICEMAP_FILE,
|
21
|
+
INDICES_INDEX_FIELD, INDICES_TABLE_FIELD, SERVICEMAP_FILE,
|
22
|
+
UNKNOWN_STR_LABEL)
|
21
23
|
from cesnet_datazoo.datasets.loaders import collate_fn_simple, create_df_from_dataloader
|
22
24
|
from cesnet_datazoo.datasets.metadata.dataset_metadata import DatasetMetadata, load_metadata
|
23
25
|
from cesnet_datazoo.datasets.statistics import compute_dataset_statistics
|
@@ -619,7 +621,7 @@ class CesnetDataset():
|
|
619
621
|
train_dataset = PyTablesDataset(
|
620
622
|
database_path=dataset_config.database_path,
|
621
623
|
tables_paths=dataset_config._get_train_tables_paths(),
|
622
|
-
indices=dataset_indices.train_indices,
|
624
|
+
indices=repack_fields(dataset_indices.train_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
|
623
625
|
tables_app_enum=self._tables_app_enum,
|
624
626
|
tables_cat_enum=self._tables_cat_enum,
|
625
627
|
flowstats_features=dataset_config.flowstats_features,
|
@@ -638,7 +640,7 @@ class CesnetDataset():
|
|
638
640
|
val_dataset = PyTablesDataset(
|
639
641
|
database_path=dataset_config.database_path,
|
640
642
|
tables_paths=dataset_config._get_val_tables_paths(),
|
641
|
-
indices=dataset_indices.val_known_indices,
|
643
|
+
indices=repack_fields(dataset_indices.val_known_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
|
642
644
|
tables_app_enum=self._tables_app_enum,
|
643
645
|
tables_cat_enum=self._tables_cat_enum,
|
644
646
|
flowstats_features=dataset_config.flowstats_features,
|
@@ -659,7 +661,7 @@ class CesnetDataset():
|
|
659
661
|
test_dataset = PyTablesDataset(
|
660
662
|
database_path=dataset_config.database_path,
|
661
663
|
tables_paths=dataset_config._get_test_tables_paths(),
|
662
|
-
indices=test_combined_indices,
|
664
|
+
indices=repack_fields(test_combined_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
|
663
665
|
tables_app_enum=self._tables_app_enum,
|
664
666
|
tables_cat_enum=self._tables_cat_enum,
|
665
667
|
flowstats_features=dataset_config.flowstats_features,
|
@@ -46,18 +46,20 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non
|
|
46
46
|
data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
|
47
47
|
padding_mask = data_ppi[:, DIR_POS] == 0 # Mask of padded packets
|
48
48
|
# Fit IPT scaler
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
49
|
+
if clip_and_scale_ppi_transform.ipt_scaler:
|
50
|
+
train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
|
51
|
+
train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
|
52
|
+
if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
|
53
|
+
# Let zero be the minimum for minmax scaling
|
54
|
+
train_ipt = np.concatenate((train_ipt, [0]))
|
55
|
+
clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
|
55
56
|
# Fit packet sizes scaler
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
57
|
+
if clip_and_scale_ppi_transform.psizes_scaler:
|
58
|
+
train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
|
59
|
+
train_psizes[padding_mask] = np.nan
|
60
|
+
if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
|
61
|
+
train_psizes = np.concatenate((train_psizes, [0]))
|
62
|
+
clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
|
61
63
|
clip_and_scale_ppi_transform.needs_fitting = False
|
62
64
|
json.dump(clip_and_scale_ppi_transform.to_dict(), open(os.path.join(train_data_path, "transforms", "ppi-transform.json"), "w"), indent=4)
|
63
65
|
|
@@ -158,11 +158,9 @@ class PyTablesDataset(Dataset):
|
|
158
158
|
for i in range(len(tables)):
|
159
159
|
base_labels[i] = tables[i].read(field=APP_COLUMN)
|
160
160
|
base_indices[i] = np.nonzero(np.isin(base_labels[i], disabled_apps_ids, invert=True))[0]
|
161
|
-
indices = np.
|
161
|
+
indices = np.array(list(zip(
|
162
162
|
np.concatenate([[table_id] * len(base_indices[table_id]) for table_id in tables]),
|
163
|
-
np.concatenate(list(base_indices.values())),
|
164
|
-
np.concatenate(list(base_labels.values()))
|
165
|
-
)).astype(np.int32)
|
163
|
+
np.concatenate(list(base_indices.values())))), dtype=[field for field in INDICES_DTYPE if field[0] in [INDICES_INDEX_FIELD, INDICES_TABLE_FIELD]])
|
166
164
|
self.indices = indices
|
167
165
|
database.close()
|
168
166
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cesnet-datazoo
|
3
|
-
Version: 0.1.4
|
3
|
+
Version: 0.1.6
|
4
4
|
Summary: A toolkit for large network traffic datasets
|
5
5
|
Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
6
6
|
Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
@@ -1,8 +1,8 @@
|
|
1
1
|
cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
cesnet_datazoo/config.py,sha256=
|
2
|
+
cesnet_datazoo/config.py,sha256=wkpD_OL3gRXX2t0WDfDMsBD2A3vEdAjSm4yXhzsR8T0,38536
|
3
3
|
cesnet_datazoo/constants.py,sha256=6GhcIyjVnWYrVnxRgTlGuiWRtvwZL1KqyzMJS26ge2E,1481
|
4
4
|
cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
|
5
|
-
cesnet_datazoo/datasets/cesnet_dataset.py,sha256=
|
5
|
+
cesnet_datazoo/datasets/cesnet_dataset.py,sha256=14uKWWSGIkH3GM_BDUSYyCIoOh1L-I4bH0zu0m3DkkQ,46988
|
6
6
|
cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
|
7
7
|
cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
|
8
8
|
cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
|
@@ -15,16 +15,16 @@ cesnet_datazoo/metrics/classification_report.py,sha256=kqVW35uEctTiWpMqxhWzOmmDk
|
|
15
15
|
cesnet_datazoo/metrics/provider_metrics.py,sha256=zoX0ps8BzEs3ml70g9dWWeLPflNAKUGYOEFYqdBbNY4,1374
|
16
16
|
cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
|
18
|
-
cesnet_datazoo/pytables_data/data_scalers.py,sha256=
|
18
|
+
cesnet_datazoo/pytables_data/data_scalers.py,sha256=gW75d-DGBokMKNUwM_5A3W3XCZ12WYXefGtpD8xYf1Y,5236
|
19
19
|
cesnet_datazoo/pytables_data/indices_setup.py,sha256=M5J2BevkQK8fuC22vUauKyKAEVwYg8xRz9JJK8E1VX8,13717
|
20
|
-
cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=
|
20
|
+
cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=YGbzYKrSklCu3J52Xbdcs3zZsYroBBtP8ulgS1c5Fnw,19431
|
21
21
|
cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
cesnet_datazoo/utils/class_info.py,sha256=H5UgyRqXIepBJmkLQ1gAIXV4owKSoIllguRiqFTu5XU,2462
|
23
23
|
cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
|
24
24
|
cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
|
25
25
|
cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
|
26
|
-
cesnet_datazoo-0.1.
|
27
|
-
cesnet_datazoo-0.1.
|
28
|
-
cesnet_datazoo-0.1.
|
29
|
-
cesnet_datazoo-0.1.
|
30
|
-
cesnet_datazoo-0.1.
|
26
|
+
cesnet_datazoo-0.1.6.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
|
27
|
+
cesnet_datazoo-0.1.6.dist-info/METADATA,sha256=UDz2g74cJrx_NLbCyPvvLEwE02x4HI8-qhDMgutKb8Q,12964
|
28
|
+
cesnet_datazoo-0.1.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
29
|
+
cesnet_datazoo-0.1.6.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
|
30
|
+
cesnet_datazoo-0.1.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|