cesnet-datazoo 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- cesnet_datazoo/config.py +11 -9
- cesnet_datazoo/datasets/cesnet_dataset.py +9 -6
- cesnet_datazoo/pytables_data/indices_setup.py +2 -2
- cesnet_datazoo/pytables_data/pytables_dataset.py +2 -1
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/METADATA +1 -1
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/RECORD +9 -9
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/WHEEL +0 -0
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/licenses/LICENCE +0 -0
- {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/top_level.txt +0 -0
cesnet_datazoo/config.py
CHANGED
@@ -142,7 +142,7 @@ class DatasetConfig():
         need_test_set: Use to disable the test set. `Default: True`
         train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
         train_dates: Dates used for creating a train set.
-
+        train_dates_weights: To use a non-uniform distribution of samples across train dates.
         val_approach: How a validation set should be created. Either split train data into train and validation or have a separate validation period. `Default: SPLIT_FROM_TRAIN`
         train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
         val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
@@ -178,6 +178,7 @@ class DatasetConfig():

         return_other_fields: Whether to return [auxiliary fields][other-fields], such as communicating hosts, flow times, and more fields extracted from the ClientHello message. `Default: False`
         return_tensors: Use for returning `torch.Tensor` from dataloaders. Dataframes are not available when this option is used. `Default: False`
+        disable_label_encoding: Whether to disable label encoding and return application names as strings. The original labels of configured unknown classes are preserved. `Default: False`
         use_packet_histograms: Whether to use packet histogram features, if available in the dataset. `Default: True`
         use_tcp_features: Whether to use TCP features, if available in the dataset. `Default: True`
         use_push_flags: Whether to use push flags in packet sequences, if available in the dataset. `Default: False`
@@ -219,7 +220,7 @@ class DatasetConfig():
     need_test_set: bool = True
     train_period_name: str = ""
     train_dates: list[str] = field(default_factory=list)
-
+    train_dates_weights: Optional[list[int]] = None
     val_approach: ValidationApproach = ValidationApproach.SPLIT_FROM_TRAIN
     train_val_split_fraction: float = 0.2
     val_period_name: str = ""
@@ -255,6 +256,7 @@ class DatasetConfig():

     return_other_fields: bool = False
     return_tensors: bool = False
+    disable_label_encoding: bool = False
     use_packet_histograms: bool = False
     use_tcp_features: bool = False
     use_push_flags: bool = False
@@ -369,16 +371,16 @@ class DatasetConfig():
             raise ValueError("QUIC datasets do not support use_tcp_features")
         if self.use_push_flags:
             raise ValueError("QUIC datasets do not support use_push_flags")
-        # When
-        if self.
+        # When train_dates_weights are used, train_size and val_known_size have to be specified
+        if self.train_dates_weights is not None:
            if not self.need_train_set:
-                raise ValueError("
-            if len(self.
-                raise ValueError("
+                raise ValueError("train_dates_weights cannot be specified when need_train_set is false")
+            if len(self.train_dates_weights) != len(self.train_dates):
+                raise ValueError("train_dates_weights has to have the same length as train_dates")
             if self.train_size == "all":
-                raise ValueError("train_size cannot be 'all' when
+                raise ValueError("train_size cannot be 'all' when train_dates_weights are speficied")
             if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN and self.val_known_size == "all":
-                raise ValueError("val_known_size cannot be 'all' when
+                raise ValueError("val_known_size cannot be 'all' when train_dates_weights are speficied and validation_approach is split-from-train")
         # App selection
         if self.apps_selection == AppSelection.ALL_KNOWN:
             self.val_unknown_size = 0
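The new `train_dates_weights` option pairs one integer weight with each entry of `train_dates`, and the validation added above requires explicit sizes when it is used: `train_size` cannot be `"all"`, and `val_known_size` cannot be `"all"` under the split-from-train validation approach. A minimal sketch of a configuration using the new options follows; the field values are illustrative, any `DatasetConfig` arguments not touched by this diff (such as the dataset reference itself) are omitted, and it assumes `ValidationApproach` is importable alongside `DatasetConfig` from `cesnet_datazoo.config`.

```python
from cesnet_datazoo.config import DatasetConfig, ValidationApproach

config = DatasetConfig(
    # Required arguments not shown in this diff are omitted here.
    train_period_name="example-train-period",    # hypothetical period name
    train_dates=["date-1", "date-2", "date-3"],  # hypothetical date strings
    train_dates_weights=[1, 1, 2],               # one integer weight per train date
    train_size=100_000,                          # must not be "all" when weights are used
    val_approach=ValidationApproach.SPLIT_FROM_TRAIN,
    val_known_size=20_000,                       # also has to be explicit with split-from-train
    disable_label_encoding=True,                 # return application names as strings
)
```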
cesnet_datazoo/datasets/cesnet_dataset.py
CHANGED
@@ -532,7 +532,7 @@ class CesnetDataset():
             servicemap=servicemap,
             disable_indices_cache=disable_indices_cache,)
         # Date weight sampling of train indices
-        if dataset_config.
+        if dataset_config.train_dates_weights is not None:
             assert dataset_config.train_size != "all"
             if dataset_config.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
                 # requested number of samples is train_size + val_known_size when using the split-from-train validation approach
@@ -563,7 +563,7 @@ class CesnetDataset():
             val_data_path = dataset_config._get_train_data_path()
             val_unknown_indices = train_unknown_indices
             train_labels = train_indices[INDICES_APP_FIELD]
-            if dataset_config.
+            if dataset_config.train_dates_weights is not None:
                 assert dataset_config.val_known_size != "all"
                 # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to no enough samples in some train dates
                 if dataset_config.val_known_size > len(train_indices):
@@ -619,7 +619,10 @@ class CesnetDataset():
         encoder = LabelEncoder().fit(known_apps)
         encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)
         class_info = create_class_info(servicemap=servicemap, encoder=encoder, known_apps=known_apps, unknown_apps=unknown_apps)
-
+        if dataset_config.disable_label_encoding:
+            label_encoder_fn = None
+        else:
+            label_encoder_fn = partial(_encode_labels_with_unknown, encoder=encoder, class_info=class_info)
         # Create train, validation, and test datasets
         train_dataset = val_dataset = test_dataset = None
         if dataset_config.need_train_set:
@@ -638,7 +641,7 @@ class CesnetDataset():
                 ppi_transform=dataset_config.ppi_transform,
                 flowstats_transform=dataset_config.flowstats_transform,
                 flowstats_phist_transform=dataset_config.flowstats_phist_transform,
-                target_transform=
+                target_transform=label_encoder_fn,
                 return_tensors=dataset_config.return_tensors,)
         if dataset_config.need_val_set:
             assert val_data_path is not None
@@ -657,7 +660,7 @@ class CesnetDataset():
                 ppi_transform=dataset_config.ppi_transform,
                 flowstats_transform=dataset_config.flowstats_transform,
                 flowstats_phist_transform=dataset_config.flowstats_phist_transform,
-                target_transform=
+                target_transform=label_encoder_fn,
                 return_tensors=dataset_config.return_tensors,
                 preload=dataset_config.preload_val,
                 preload_blob=os.path.join(val_data_path, "preload", f"val_dataset-{dataset_config.val_known_size}.npz"),)
@@ -678,7 +681,7 @@ class CesnetDataset():
                 ppi_transform=dataset_config.ppi_transform,
                 flowstats_transform=dataset_config.flowstats_transform,
                 flowstats_phist_transform=dataset_config.flowstats_phist_transform,
-                target_transform=
+                target_transform=label_encoder_fn,
                 return_tensors=dataset_config.return_tensors,
                 preload=dataset_config.preload_test,
                 preload_blob=os.path.join(test_data_path, "preload", f"test_dataset-{dataset_config.test_known_size}-{dataset_config.test_unknown_size}.npz"),)
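These changes route the new `disable_label_encoding` flag into the `target_transform` of the train, validation, and test datasets: with encoding enabled, labels pass through `_encode_labels_with_unknown` using an encoder fitted on the known applications plus an appended `UNKNOWN_STR_LABEL` class; with it disabled, `target_transform` is `None` and the original application names pass through as strings. A small sketch of the encoder construction shown above, with hypothetical application names and a placeholder unknown label (`_encode_labels_with_unknown` itself is not reproduced here):

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

UNKNOWN_STR_LABEL = "unknown"        # placeholder; the package defines its own constant
known_apps = ["app-a", "app-b"]      # hypothetical known applications

# Same construction as in the diff: fit on known apps, then append the unknown class
encoder = LabelEncoder().fit(known_apps)
encoder.classes_ = np.append(encoder.classes_, UNKNOWN_STR_LABEL)

print(encoder.transform(["app-a", "unknown"]))  # [0 2] -> integer labels
# With disable_label_encoding=True, no target transform is applied, so the
# dataloaders yield the string names and unknown classes keep their original labels.
```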
cesnet_datazoo/pytables_data/indices_setup.py
CHANGED
@@ -64,11 +64,11 @@ def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: Indi
 def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indices: np.ndarray, num_samples: int) -> np.ndarray:
     rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.DATE_WEIGHT_SAMPLING)
     indices_per_date = [train_indices[train_indices[INDICES_TABLE_FIELD] == i] for i in np.unique(train_indices[INDICES_TABLE_FIELD])]
-    weights = np.array(dataset_config.
+    weights = np.array(dataset_config.train_dates_weights)
     weights = weights / weights.sum()
     samples_per_date = np.ceil((weights * (num_samples))).astype(int)
     samples_per_date_clipped = np.clip(samples_per_date, a_max=list(map(len, indices_per_date)), a_min=0)
-    df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.
+    df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.train_dates_weights, "Requested Samples": samples_per_date, "Available Samples": samples_per_date_clipped})
     log.info(f"Weight sampling per date with requsted total number of samples {num_samples} (train_size + val_known_size when using the split-from-train validation approach; train_size otherwise)")
     for l in df.to_string(index=False).splitlines():
         log.info(l)
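`date_weight_sample_train_indices` normalizes the configured weights, requests `ceil(weight * num_samples)` indices from each train date, and clips each request to what is actually available for that date, which is why the resulting train set can be smaller than requested. A standalone sketch of that arithmetic with made-up numbers:

```python
import numpy as np

# Hypothetical setup: three train dates, their train_dates_weights,
# and the number of train indices available per date.
weights = np.array([1, 1, 2])
available_per_date = np.array([50_000, 8_000, 50_000])
num_samples = 60_000  # train_size + val_known_size under split-from-train

# Same steps as in the diff: normalize, scale, ceil, then clip to availability.
weights = weights / weights.sum()
samples_per_date = np.ceil(weights * num_samples).astype(int)
samples_per_date_clipped = np.clip(samples_per_date, a_min=0, a_max=available_per_date)

print(samples_per_date)          # [15000 15000 30000] requested
print(samples_per_date_clipped)  # [15000  8000 30000] -> only 53000 samples in total
```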
cesnet_datazoo/pytables_data/pytables_dataset.py
CHANGED
@@ -1,6 +1,7 @@
 import atexit
 import logging
 import os
+import sys
 import time
 import warnings
 from datetime import datetime
@@ -198,7 +199,7 @@ def init_train_indices(train_data_params: TrainDataParams, database_path: str, t
     if train_data_params.min_train_samples_check == MinTrainSamplesCheck.WARN_AND_EXIT:
         warnings.warn(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                       "To disable these applications, add them to config.disabled_apps or set config.min_train_samples_check to disable-apps. To turn off this check, set config.min_train_samples_per_app to zero. Exiting")
-        exit()
+        sys.exit()
     elif train_data_params.min_train_samples_check == MinTrainSamplesCheck.DISABLE_APPS:
         log.info(f"Found applications with less than {train_data_params.min_train_samples_per_app} train samples: {min_samples_apps_names}. " +
                  "Disabling these applications")
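Switching from the built-in `exit()` to `sys.exit()` (with the matching `import sys` above) removes the dependency on the `exit` helper that only exists when the `site` module injects it; `sys.exit()` is always available and raises `SystemExit`, which the interpreter turns into a normal shutdown. A minimal illustration, with a message that is hypothetical rather than taken from the package:

```python
import sys

# Raises SystemExit; unlike the interactive exit() helper, sys.exit() also works
# when the site module is not loaded (e.g. python -S) or in embedded interpreters.
sys.exit("Found applications with too few train samples")
```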
{cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cesnet-datazoo
-Version: 0.1.12
+Version: 0.1.14
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
{cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
 cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cesnet_datazoo/config.py,sha256=
+cesnet_datazoo/config.py,sha256=AoYMX_drWqb6K6MhdTQRyvUPf_6kZeL3ie04hUgJzgE,38768
 cesnet_datazoo/constants.py,sha256=6GhcIyjVnWYrVnxRgTlGuiWRtvwZL1KqyzMJS26ge2E,1481
 cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
-cesnet_datazoo/datasets/cesnet_dataset.py,sha256=
+cesnet_datazoo/datasets/cesnet_dataset.py,sha256=AgxdiEwtJrOug9F8LJcMwttc7NMrligkq7cy4lRmLAs,47817
 cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
 cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
 cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
@@ -16,15 +16,15 @@ cesnet_datazoo/metrics/provider_metrics.py,sha256=zoX0ps8BzEs3ml70g9dWWeLPflNAKU
 cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
 cesnet_datazoo/pytables_data/data_scalers.py,sha256=xPL0SCLByDOgKv1Apqi5XQd501mIfsF8FdonmRQ0zzQ,5236
-cesnet_datazoo/pytables_data/indices_setup.py,sha256=
-cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=
+cesnet_datazoo/pytables_data/indices_setup.py,sha256=yCYWjkCPIj0en3btnC-C7cte0CqbqMZzOnaVR9jaNes,13717
+cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=Xk5BwKoQPewqL1gj5-EuiA9HfhJPUsYs7lOsG3CEzlQ,19447
 cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cesnet_datazoo/utils/class_info.py,sha256=H5UgyRqXIepBJmkLQ1gAIXV4owKSoIllguRiqFTu5XU,2462
 cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
 cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
 cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
-cesnet_datazoo-0.1.
+cesnet_datazoo-0.1.14.dist-info/licenses/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
+cesnet_datazoo-0.1.14.dist-info/METADATA,sha256=yp-ld51MKd-oTx-z86x0BwbJEvdY2zlc1fIbfYIDKWY,12574
+cesnet_datazoo-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cesnet_datazoo-0.1.14.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
+cesnet_datazoo-0.1.14.dist-info/RECORD,,
The remaining files ({cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/WHEEL, {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/licenses/LICENCE, and {cesnet_datazoo-0.1.12.dist-info → cesnet_datazoo-0.1.14.dist-info}/top_level.txt) are without changes.