cesnet-datazoo 0.1.11__tar.gz → 0.1.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/PKG-INFO +2 -2
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/config.py +9 -9
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/cesnet_dataset.py +2 -2
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/pytables_data/indices_setup.py +2 -2
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo.egg-info/PKG-INFO +2 -2
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo.egg-info/requires.txt +1 -1
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/pyproject.toml +2 -2
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/LICENCE +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/README.md +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/__init__.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/constants.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/__init__.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/datasets.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/datasets_constants.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/loaders.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/metadata/metadata.csv +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/statistics.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/metrics/__init__.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/metrics/classification_report.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/metrics/provider_metrics.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/pytables_data/__init__.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/pytables_data/data_scalers.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/pytables_data/pytables_dataset.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/utils/__init__.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/utils/class_info.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/utils/download.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/utils/fileutils.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/utils/random.py +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo.egg-info/SOURCES.txt +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo.egg-info/top_level.txt +0 -0
- {cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cesnet-datazoo
|
3
|
-
Version: 0.1.11
|
3
|
+
Version: 0.1.13
|
4
4
|
Summary: A toolkit for large network traffic datasets
|
5
5
|
Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
6
6
|
Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
@@ -19,7 +19,7 @@ Requires-Dist: cesnet_models
|
|
19
19
|
Requires-Dist: matplotlib
|
20
20
|
Requires-Dist: numpy
|
21
21
|
Requires-Dist: pandas
|
22
|
-
Requires-Dist: pydantic!=2.
|
22
|
+
Requires-Dist: pydantic!=2.9.*,<2.12.0,>=2.0
|
23
23
|
Requires-Dist: PyYAML
|
24
24
|
Requires-Dist: requests
|
25
25
|
Requires-Dist: scikit-learn
|
@@ -142,7 +142,7 @@ class DatasetConfig():
|
|
142
142
|
need_test_set: Use to disable the test set. `Default: True`
|
143
143
|
train_period_name: Name of the train period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
144
144
|
train_dates: Dates used for creating a train set.
|
145
|
-
|
145
|
+
train_dates_weights: To use a non-uniform distribution of samples across train dates.
|
146
146
|
val_approach: How a validation set should be created. Either split train data into train and validation or have a separate validation period. `Default: SPLIT_FROM_TRAIN`
|
147
147
|
train_val_split_fraction: The fraction of validation samples when splitting from the train set. `Default: 0.2`
|
148
148
|
val_period_name: Name of the validation period. See [instructions][config.DatasetConfig--how-to-configure-train-validation-and-test-sets].
|
@@ -219,7 +219,7 @@ class DatasetConfig():
|
|
219
219
|
need_test_set: bool = True
|
220
220
|
train_period_name: str = ""
|
221
221
|
train_dates: list[str] = field(default_factory=list)
|
222
|
-
|
222
|
+
train_dates_weights: Optional[list[int]] = None
|
223
223
|
val_approach: ValidationApproach = ValidationApproach.SPLIT_FROM_TRAIN
|
224
224
|
train_val_split_fraction: float = 0.2
|
225
225
|
val_period_name: str = ""
|
@@ -369,16 +369,16 @@ class DatasetConfig():
|
|
369
369
|
raise ValueError("QUIC datasets do not support use_tcp_features")
|
370
370
|
if self.use_push_flags:
|
371
371
|
raise ValueError("QUIC datasets do not support use_push_flags")
|
372
|
-
# When
|
373
|
-
if self.
|
372
|
+
# When train_dates_weights are used, train_size and val_known_size have to be specified
|
373
|
+
if self.train_dates_weights is not None:
|
374
374
|
if not self.need_train_set:
|
375
|
-
raise ValueError("
|
376
|
-
if len(self.
|
377
|
-
raise ValueError("
|
375
|
+
raise ValueError("train_dates_weights cannot be specified when need_train_set is false")
|
376
|
+
if len(self.train_dates_weights) != len(self.train_dates):
|
377
|
+
raise ValueError("train_dates_weights has to have the same length as train_dates")
|
378
378
|
if self.train_size == "all":
|
379
|
-
raise ValueError("train_size cannot be 'all' when
|
379
|
+
raise ValueError("train_size cannot be 'all' when train_dates_weights are speficied")
|
380
380
|
if self.val_approach == ValidationApproach.SPLIT_FROM_TRAIN and self.val_known_size == "all":
|
381
|
-
raise ValueError("val_known_size cannot be 'all' when
|
381
|
+
raise ValueError("val_known_size cannot be 'all' when train_dates_weights are speficied and validation_approach is split-from-train")
|
382
382
|
# App selection
|
383
383
|
if self.apps_selection == AppSelection.ALL_KNOWN:
|
384
384
|
self.val_unknown_size = 0
|
@@ -532,7 +532,7 @@ class CesnetDataset():
|
|
532
532
|
servicemap=servicemap,
|
533
533
|
disable_indices_cache=disable_indices_cache,)
|
534
534
|
# Date weight sampling of train indices
|
535
|
-
if dataset_config.
|
535
|
+
if dataset_config.train_dates_weights is not None:
|
536
536
|
assert dataset_config.train_size != "all"
|
537
537
|
if dataset_config.val_approach == ValidationApproach.SPLIT_FROM_TRAIN:
|
538
538
|
# requested number of samples is train_size + val_known_size when using the split-from-train validation approach
|
@@ -563,7 +563,7 @@ class CesnetDataset():
|
|
563
563
|
val_data_path = dataset_config._get_train_data_path()
|
564
564
|
val_unknown_indices = train_unknown_indices
|
565
565
|
train_labels = train_indices[INDICES_APP_FIELD]
|
566
|
-
if dataset_config.
|
566
|
+
if dataset_config.train_dates_weights is not None:
|
567
567
|
assert dataset_config.val_known_size != "all"
|
568
568
|
# When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to no enough samples in some train dates
|
569
569
|
if dataset_config.val_known_size > len(train_indices):
|
{cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/pytables_data/indices_setup.py
RENAMED
@@ -64,11 +64,11 @@ def subset_and_sort_indices(dataset_config: DatasetConfig, dataset_indices: Indi
|
|
64
64
|
def date_weight_sample_train_indices(dataset_config: DatasetConfig, train_indices: np.ndarray, num_samples: int) -> np.ndarray:
|
65
65
|
rng = get_fresh_random_generator(dataset_config=dataset_config, section=RandomizedSection.DATE_WEIGHT_SAMPLING)
|
66
66
|
indices_per_date = [train_indices[train_indices[INDICES_TABLE_FIELD] == i] for i in np.unique(train_indices[INDICES_TABLE_FIELD])]
|
67
|
-
weights = np.array(dataset_config.
|
67
|
+
weights = np.array(dataset_config.train_dates_weights)
|
68
68
|
weights = weights / weights.sum()
|
69
69
|
samples_per_date = np.ceil((weights * (num_samples))).astype(int)
|
70
70
|
samples_per_date_clipped = np.clip(samples_per_date, a_max=list(map(len, indices_per_date)), a_min=0)
|
71
|
-
df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.
|
71
|
+
df = pd.DataFrame(data={"Dates": dataset_config.train_dates, "Weights": dataset_config.train_dates_weights, "Requested Samples": samples_per_date, "Available Samples": samples_per_date_clipped})
|
72
72
|
log.info(f"Weight sampling per date with requsted total number of samples {num_samples} (train_size + val_known_size when using the split-from-train validation approach; train_size otherwise)")
|
73
73
|
for l in df.to_string(index=False).splitlines():
|
74
74
|
log.info(l)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cesnet-datazoo
|
3
|
-
Version: 0.1.11
|
3
|
+
Version: 0.1.13
|
4
4
|
Summary: A toolkit for large network traffic datasets
|
5
5
|
Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
6
6
|
Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
|
@@ -19,7 +19,7 @@ Requires-Dist: cesnet_models
|
|
19
19
|
Requires-Dist: matplotlib
|
20
20
|
Requires-Dist: numpy
|
21
21
|
Requires-Dist: pandas
|
22
|
-
Requires-Dist: pydantic!=2.
|
22
|
+
Requires-Dist: pydantic!=2.9.*,<2.12.0,>=2.0
|
23
23
|
Requires-Dist: PyYAML
|
24
24
|
Requires-Dist: requests
|
25
25
|
Requires-Dist: scikit-learn
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "cesnet-datazoo"
|
7
|
-
version = "0.1.11"
|
7
|
+
version = "0.1.13"
|
8
8
|
authors = [
|
9
9
|
{name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
|
10
10
|
{name = "Karel Hynek", email = "hynekkar@cesnet.cz"},
|
@@ -32,7 +32,7 @@ dependencies = [
|
|
32
32
|
"matplotlib",
|
33
33
|
"numpy",
|
34
34
|
"pandas",
|
35
|
-
"pydantic >=2.0, !=2.9.*,
|
35
|
+
"pydantic >=2.0, !=2.9.*, <2.12.0",
|
36
36
|
"PyYAML",
|
37
37
|
"requests",
|
38
38
|
"scikit-learn",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/datasets_constants.py
RENAMED
File without changes
|
File without changes
|
{cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/metadata/__init__.py
RENAMED
File without changes
|
{cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/metadata/dataset_metadata.py
RENAMED
File without changes
|
{cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/datasets/metadata/metadata.csv
RENAMED
File without changes
|
File without changes
|
File without changes
|
{cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/metrics/classification_report.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/pytables_data/data_scalers.py
RENAMED
File without changes
|
{cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo/pytables_data/pytables_dataset.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cesnet_datazoo-0.1.11 → cesnet_datazoo-0.1.13}/cesnet_datazoo.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|