cesnet-datazoo 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cesnet_datazoo/config.py CHANGED
@@ -8,6 +8,7 @@ import warnings
8
8
  from dataclasses import InitVar, field
9
9
  from datetime import datetime
10
10
  from enum import Enum
11
+ from importlib.metadata import version
11
12
  from typing import TYPE_CHECKING, Callable, Literal, Optional
12
13
 
13
14
  import yaml
@@ -83,6 +84,7 @@ class DataLoaderOrder(Enum):
83
84
 
84
85
  @dataclass(frozen=True)
85
86
  class TrainDataParams():
87
+ datazoo_version: str
86
88
  database_filename: str
87
89
  train_period_name: str
88
90
  train_tables_paths: list[str]
@@ -97,6 +99,7 @@ class TrainDataParams():
97
99
 
98
100
  @dataclass(frozen=True)
99
101
  class TestDataParams():
102
+ datazoo_version: str
100
103
  database_filename: str
101
104
  test_period_name: str
102
105
  test_tables_paths: list[str]
@@ -497,14 +500,12 @@ class DatasetConfig():
497
500
  return params_hash
498
501
 
499
502
  def _get_train_data_path(self) -> str:
500
- if self.need_train_set:
501
- params_hash = self._get_train_data_hash()
502
- return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
503
- else:
504
- return os.path.join(self.data_root, "train-data", "default")
503
+ params_hash = self._get_train_data_hash()
504
+ return os.path.join(self.data_root, "train-data", f"{params_hash}_{self.random_state}", f"fold_{self.fold_id}")
505
505
 
506
506
  def _get_train_data_params(self) -> TrainDataParams:
507
507
  return TrainDataParams(
508
+ datazoo_version=version("cesnet_datazoo"),
508
509
  database_filename=self.database_filename,
509
510
  train_period_name=self.train_period_name,
510
511
  train_tables_paths=self._get_train_tables_paths(),
@@ -520,6 +521,7 @@ class DatasetConfig():
520
521
  def _get_val_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
521
522
  assert self.val_approach == ValidationApproach.VALIDATION_DATES
522
523
  val_data_params = TestDataParams(
524
+ datazoo_version=version("cesnet_datazoo"),
523
525
  database_filename=self.database_filename,
524
526
  test_period_name=self.val_period_name,
525
527
  test_tables_paths=self._get_val_tables_paths(),
@@ -532,6 +534,7 @@ class DatasetConfig():
532
534
 
533
535
  def _get_test_data_params_and_path(self, known_apps: list[str], unknown_apps: list[str]) -> tuple[TestDataParams, str]:
534
536
  test_data_params = TestDataParams(
537
+ datazoo_version=version("cesnet_datazoo"),
535
538
  database_filename=self.database_filename,
536
539
  test_period_name=self.test_period_name,
537
540
  test_tables_paths=self._get_test_tables_paths(),
@@ -10,6 +10,7 @@ import numpy as np
10
10
  import pandas as pd
11
11
  import tables as tb
12
12
  import torch
13
+ from numpy.lib.recfunctions import repack_fields
13
14
  from sklearn.model_selection import train_test_split
14
15
  from sklearn.preprocessing import LabelEncoder
15
16
  from torch.utils.data import BatchSampler, DataLoader, RandomSampler, Sampler, SequentialSampler
@@ -17,7 +18,8 @@ from typing_extensions import assert_never
17
18
 
18
19
  from cesnet_datazoo.config import AppSelection, DataLoaderOrder, DatasetConfig, ValidationApproach
19
20
  from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES, INDICES_APP_FIELD,
20
- SERVICEMAP_FILE, UNKNOWN_STR_LABEL)
21
+ INDICES_INDEX_FIELD, INDICES_TABLE_FIELD, SERVICEMAP_FILE,
22
+ UNKNOWN_STR_LABEL)
21
23
  from cesnet_datazoo.datasets.loaders import collate_fn_simple, create_df_from_dataloader
22
24
  from cesnet_datazoo.datasets.metadata.dataset_metadata import DatasetMetadata, load_metadata
23
25
  from cesnet_datazoo.datasets.statistics import compute_dataset_statistics
@@ -619,7 +621,7 @@ class CesnetDataset():
619
621
  train_dataset = PyTablesDataset(
620
622
  database_path=dataset_config.database_path,
621
623
  tables_paths=dataset_config._get_train_tables_paths(),
622
- indices=dataset_indices.train_indices,
624
+ indices=repack_fields(dataset_indices.train_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
623
625
  tables_app_enum=self._tables_app_enum,
624
626
  tables_cat_enum=self._tables_cat_enum,
625
627
  flowstats_features=dataset_config.flowstats_features,
@@ -638,7 +640,7 @@ class CesnetDataset():
638
640
  val_dataset = PyTablesDataset(
639
641
  database_path=dataset_config.database_path,
640
642
  tables_paths=dataset_config._get_val_tables_paths(),
641
- indices=dataset_indices.val_known_indices,
643
+ indices=repack_fields(dataset_indices.val_known_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
642
644
  tables_app_enum=self._tables_app_enum,
643
645
  tables_cat_enum=self._tables_cat_enum,
644
646
  flowstats_features=dataset_config.flowstats_features,
@@ -659,7 +661,7 @@ class CesnetDataset():
659
661
  test_dataset = PyTablesDataset(
660
662
  database_path=dataset_config.database_path,
661
663
  tables_paths=dataset_config._get_test_tables_paths(),
662
- indices=test_combined_indices,
664
+ indices=repack_fields(test_combined_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
663
665
  tables_app_enum=self._tables_app_enum,
664
666
  tables_cat_enum=self._tables_cat_enum,
665
667
  flowstats_features=dataset_config.flowstats_features,
@@ -46,18 +46,20 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non
46
46
  data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
47
47
  padding_mask = data_ppi[:, DIR_POS] == 0 # Mask of padded packets
48
48
  # Fit IPT scaler
49
- train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
50
- train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
51
- if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
52
- # Let zero be the minimum for minmax scaling
53
- train_ipt = np.concatenate((train_ipt, [0]))
54
- clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
49
+ if clip_and_scale_ppi_transform.ipt_scaler:
50
+ train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
51
+ train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
52
+ if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
53
+ # Let zero be the minimum for minmax scaling
54
+ train_ipt = np.concatenate((train_ipt, [0]))
55
+ clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
55
56
  # Fit packet sizes scaler
56
- train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
57
- train_psizes[padding_mask] = np.nan
58
- if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
59
- train_psizes = np.concatenate((train_psizes, [0]))
60
- clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
57
+ if clip_and_scale_ppi_transform.psizes_scaler:
58
+ train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
59
+ train_psizes[padding_mask] = np.nan
60
+ if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
61
+ train_psizes = np.concatenate((train_psizes, [0]))
62
+ clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
61
63
  clip_and_scale_ppi_transform.needs_fitting = False
62
64
  json.dump(clip_and_scale_ppi_transform.to_dict(), open(os.path.join(train_data_path, "transforms", "ppi-transform.json"), "w"), indent=4)
63
65
 
@@ -158,11 +158,9 @@ class PyTablesDataset(Dataset):
158
158
  for i in range(len(tables)):
159
159
  base_labels[i] = tables[i].read(field=APP_COLUMN)
160
160
  base_indices[i] = np.nonzero(np.isin(base_labels[i], disabled_apps_ids, invert=True))[0]
161
- indices = np.column_stack((
161
+ indices = np.array(list(zip(
162
162
  np.concatenate([[table_id] * len(base_indices[table_id]) for table_id in tables]),
163
- np.concatenate(list(base_indices.values())),
164
- np.concatenate(list(base_labels.values()))
165
- )).astype(np.int32)
163
+ np.concatenate(list(base_indices.values())))), dtype=[field for field in INDICES_DTYPE if field[0] in [INDICES_INDEX_FIELD, INDICES_TABLE_FIELD]])
166
164
  self.indices = indices
167
165
  database.close()
168
166
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cesnet-datazoo
3
- Version: 0.1.4
3
+ Version: 0.1.6
4
4
  Summary: A toolkit for large network traffic datasets
5
5
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
6
6
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -1,8 +1,8 @@
1
1
  cesnet_datazoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- cesnet_datazoo/config.py,sha256=x8bugBZmBZ9PNd0D5TNHLPHbvx4ZTCQGwQzXPypenjc,38406
2
+ cesnet_datazoo/config.py,sha256=wkpD_OL3gRXX2t0WDfDMsBD2A3vEdAjSm4yXhzsR8T0,38536
3
3
  cesnet_datazoo/constants.py,sha256=6GhcIyjVnWYrVnxRgTlGuiWRtvwZL1KqyzMJS26ge2E,1481
4
4
  cesnet_datazoo/datasets/__init__.py,sha256=8ziQ3EUzUh5fMfWWXwk0cqYk0lOUNU7zbi0Gom3bLnI,443
5
- cesnet_datazoo/datasets/cesnet_dataset.py,sha256=DtC597aRm4n8rlbVknG25yd9rsCqRG94jn7xMxZoC1g,46635
5
+ cesnet_datazoo/datasets/cesnet_dataset.py,sha256=14uKWWSGIkH3GM_BDUSYyCIoOh1L-I4bH0zu0m3DkkQ,46988
6
6
  cesnet_datazoo/datasets/datasets.py,sha256=Bn4SU1k5og6AsUlnPapFPeu4uGlpRH-IaOSafz0ZT2k,3617
7
7
  cesnet_datazoo/datasets/datasets_constants.py,sha256=1P54Ns8wCQMemdKNe8OH7cVUfkxs3vL29ugSmOLXceI,29154
8
8
  cesnet_datazoo/datasets/loaders.py,sha256=9KgRY-Y8CcgtXbgqWpAaG7gyOAsSf278w7b1eHwTSyE,1854
@@ -15,16 +15,16 @@ cesnet_datazoo/metrics/classification_report.py,sha256=kqVW35uEctTiWpMqxhWzOmmDk
15
15
  cesnet_datazoo/metrics/provider_metrics.py,sha256=zoX0ps8BzEs3ml70g9dWWeLPflNAKUGYOEFYqdBbNY4,1374
16
16
  cesnet_datazoo/pytables_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  cesnet_datazoo/pytables_data/apps_split.py,sha256=RjLFomrlBCmnBn08FDw1IzL3PuQf4914yJQzwhiXH_E,1411
18
- cesnet_datazoo/pytables_data/data_scalers.py,sha256=ednTRVl-sjrFLX6vwzCuPLJDpFuwNWDlJz7msV3yM9M,5083
18
+ cesnet_datazoo/pytables_data/data_scalers.py,sha256=gW75d-DGBokMKNUwM_5A3W3XCZ12WYXefGtpD8xYf1Y,5236
19
19
  cesnet_datazoo/pytables_data/indices_setup.py,sha256=M5J2BevkQK8fuC22vUauKyKAEVwYg8xRz9JJK8E1VX8,13717
20
- cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=NkN0PKKUIiAhrGFM9OCR0s48TH66JLzZLiaIOE6d7AE,19413
20
+ cesnet_datazoo/pytables_data/pytables_dataset.py,sha256=YGbzYKrSklCu3J52Xbdcs3zZsYroBBtP8ulgS1c5Fnw,19431
21
21
  cesnet_datazoo/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  cesnet_datazoo/utils/class_info.py,sha256=H5UgyRqXIepBJmkLQ1gAIXV4owKSoIllguRiqFTu5XU,2462
23
23
  cesnet_datazoo/utils/download.py,sha256=hG5V1ZYZGtqCzlVV76NMgOZkSKOywdOFiq9Lagkgego,1441
24
24
  cesnet_datazoo/utils/fileutils.py,sha256=XA_VWDuTiCXnoOgHPUzsmbnLFgrlxOo5cvUY_OBJUR8,642
25
25
  cesnet_datazoo/utils/random.py,sha256=Dqgm_T25ljbew-OJozK90PsiXKnd4Kw6lcUexxF6vIc,575
26
- cesnet_datazoo-0.1.4.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
27
- cesnet_datazoo-0.1.4.dist-info/METADATA,sha256=8HeN2mch9VTCpeBr66ZgwrdJyrS53y4INsiU0Rhgcts,12964
28
- cesnet_datazoo-0.1.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
29
- cesnet_datazoo-0.1.4.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
30
- cesnet_datazoo-0.1.4.dist-info/RECORD,,
26
+ cesnet_datazoo-0.1.6.dist-info/LICENCE,sha256=69Wc69APiM1YKrFOIipG7jjU2lk89WQuO_U0AXKU8KE,1541
27
+ cesnet_datazoo-0.1.6.dist-info/METADATA,sha256=UDz2g74cJrx_NLbCyPvvLEwE02x4HI8-qhDMgutKb8Q,12964
28
+ cesnet_datazoo-0.1.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
29
+ cesnet_datazoo-0.1.6.dist-info/top_level.txt,sha256=bu1Z8zaI_1Id_ZaYyvJnxIBa87OSrdlZ8J2OBMggK5o,15
30
+ cesnet_datazoo-0.1.6.dist-info/RECORD,,