cesnet-datazoo 0.1.4__tar.gz → 0.1.5__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/PKG-INFO +1 -1
  2. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/cesnet_dataset.py +6 -4
  3. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/data_scalers.py +13 -11
  4. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/pytables_dataset.py +2 -4
  5. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/PKG-INFO +1 -1
  6. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/pyproject.toml +1 -1
  7. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/LICENCE +0 -0
  8. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/README.md +0 -0
  9. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/__init__.py +0 -0
  10. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/config.py +0 -0
  11. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/constants.py +0 -0
  12. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/__init__.py +0 -0
  13. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/datasets.py +0 -0
  14. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/datasets_constants.py +0 -0
  15. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/loaders.py +0 -0
  16. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
  17. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
  18. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/metadata/metadata.csv +0 -0
  19. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/datasets/statistics.py +0 -0
  20. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/metrics/__init__.py +0 -0
  21. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/metrics/classification_report.py +0 -0
  22. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/metrics/provider_metrics.py +0 -0
  23. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/__init__.py +0 -0
  24. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
  25. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/pytables_data/indices_setup.py +0 -0
  26. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/__init__.py +0 -0
  27. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/class_info.py +0 -0
  28. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/download.py +0 -0
  29. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/fileutils.py +0 -0
  30. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo/utils/random.py +0 -0
  31. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/SOURCES.txt +0 -0
  32. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
  33. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/requires.txt +0 -0
  34. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/cesnet_datazoo.egg-info/top_level.txt +0 -0
  35. {cesnet_datazoo-0.1.4 → cesnet_datazoo-0.1.5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cesnet-datazoo
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: A toolkit for large network traffic datasets
5
5
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
6
6
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -10,6 +10,7 @@ import numpy as np
10
10
  import pandas as pd
11
11
  import tables as tb
12
12
  import torch
13
+ from numpy.lib.recfunctions import repack_fields
13
14
  from sklearn.model_selection import train_test_split
14
15
  from sklearn.preprocessing import LabelEncoder
15
16
  from torch.utils.data import BatchSampler, DataLoader, RandomSampler, Sampler, SequentialSampler
@@ -17,7 +18,8 @@ from typing_extensions import assert_never
17
18
 
18
19
  from cesnet_datazoo.config import AppSelection, DataLoaderOrder, DatasetConfig, ValidationApproach
19
20
  from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, DATASET_SIZES, INDICES_APP_FIELD,
20
- SERVICEMAP_FILE, UNKNOWN_STR_LABEL)
21
+ INDICES_INDEX_FIELD, INDICES_TABLE_FIELD, SERVICEMAP_FILE,
22
+ UNKNOWN_STR_LABEL)
21
23
  from cesnet_datazoo.datasets.loaders import collate_fn_simple, create_df_from_dataloader
22
24
  from cesnet_datazoo.datasets.metadata.dataset_metadata import DatasetMetadata, load_metadata
23
25
  from cesnet_datazoo.datasets.statistics import compute_dataset_statistics
@@ -619,7 +621,7 @@ class CesnetDataset():
619
621
  train_dataset = PyTablesDataset(
620
622
  database_path=dataset_config.database_path,
621
623
  tables_paths=dataset_config._get_train_tables_paths(),
622
- indices=dataset_indices.train_indices,
624
+ indices=repack_fields(dataset_indices.train_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
623
625
  tables_app_enum=self._tables_app_enum,
624
626
  tables_cat_enum=self._tables_cat_enum,
625
627
  flowstats_features=dataset_config.flowstats_features,
@@ -638,7 +640,7 @@ class CesnetDataset():
638
640
  val_dataset = PyTablesDataset(
639
641
  database_path=dataset_config.database_path,
640
642
  tables_paths=dataset_config._get_val_tables_paths(),
641
- indices=dataset_indices.val_known_indices,
643
+ indices=repack_fields(dataset_indices.val_known_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
642
644
  tables_app_enum=self._tables_app_enum,
643
645
  tables_cat_enum=self._tables_cat_enum,
644
646
  flowstats_features=dataset_config.flowstats_features,
@@ -659,7 +661,7 @@ class CesnetDataset():
659
661
  test_dataset = PyTablesDataset(
660
662
  database_path=dataset_config.database_path,
661
663
  tables_paths=dataset_config._get_test_tables_paths(),
662
- indices=test_combined_indices,
664
+ indices=repack_fields(test_combined_indices[[INDICES_TABLE_FIELD, INDICES_INDEX_FIELD]]), # type: ignore
663
665
  tables_app_enum=self._tables_app_enum,
664
666
  tables_cat_enum=self._tables_cat_enum,
665
667
  flowstats_features=dataset_config.flowstats_features,
@@ -46,18 +46,20 @@ def fit_scalers(dataset_config: DatasetConfig, train_indices: np.ndarray) -> Non
46
46
  data_ppi = data_ppi.transpose(0, 2, 1).reshape(-1, ppi_channels)
47
47
  padding_mask = data_ppi[:, DIR_POS] == 0 # Mask of padded packets
48
48
  # Fit IPT scaler
49
- train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
50
- train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
51
- if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
52
- # Let zero be the minimum for minmax scaling
53
- train_ipt = np.concatenate((train_ipt, [0]))
54
- clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
49
+ if clip_and_scale_ppi_transform.ipt_scaler:
50
+ train_ipt = data_ppi[:, IPT_POS].clip(max=clip_and_scale_ppi_transform.ipt_max, min=clip_and_scale_ppi_transform.ipt_min)
51
+ train_ipt[padding_mask] = np.nan # NaNs are ignored in sklearn scalers
52
+ if isinstance(clip_and_scale_ppi_transform.ipt_scaler, MinMaxScaler):
53
+ # Let zero be the minimum for minmax scaling
54
+ train_ipt = np.concatenate((train_ipt, [0]))
55
+ clip_and_scale_ppi_transform.ipt_scaler.fit(train_ipt.reshape(-1, 1))
55
56
  # Fit packet sizes scaler
56
- train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
57
- train_psizes[padding_mask] = np.nan
58
- if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
59
- train_psizes = np.concatenate((train_psizes, [0]))
60
- clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
57
+ if clip_and_scale_ppi_transform.psizes_scaler:
58
+ train_psizes = data_ppi[:, SIZE_POS].clip(max=clip_and_scale_ppi_transform.psizes_max, min=clip_and_scale_ppi_transform.pszies_min)
59
+ train_psizes[padding_mask] = np.nan
60
+ if isinstance(clip_and_scale_ppi_transform.psizes_scaler, MinMaxScaler):
61
+ train_psizes = np.concatenate((train_psizes, [0]))
62
+ clip_and_scale_ppi_transform.psizes_scaler.fit(train_psizes.reshape(-1, 1))
61
63
  clip_and_scale_ppi_transform.needs_fitting = False
62
64
  json.dump(clip_and_scale_ppi_transform.to_dict(), open(os.path.join(train_data_path, "transforms", "ppi-transform.json"), "w"), indent=4)
63
65
 
@@ -158,11 +158,9 @@ class PyTablesDataset(Dataset):
158
158
  for i in range(len(tables)):
159
159
  base_labels[i] = tables[i].read(field=APP_COLUMN)
160
160
  base_indices[i] = np.nonzero(np.isin(base_labels[i], disabled_apps_ids, invert=True))[0]
161
- indices = np.column_stack((
161
+ indices = np.array(list(zip(
162
162
  np.concatenate([[table_id] * len(base_indices[table_id]) for table_id in tables]),
163
- np.concatenate(list(base_indices.values())),
164
- np.concatenate(list(base_labels.values()))
165
- )).astype(np.int32)
163
+ np.concatenate(list(base_indices.values())))), dtype=[field for field in INDICES_DTYPE if field[0] in [INDICES_INDEX_FIELD, INDICES_TABLE_FIELD]])
166
164
  self.indices = indices
167
165
  database.close()
168
166
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cesnet-datazoo
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: A toolkit for large network traffic datasets
5
5
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
6
6
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "cesnet-datazoo"
7
- version = "0.1.4"
7
+ version = "0.1.5"
8
8
  authors = [
9
9
  {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
10
10
  {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},
File without changes
File without changes
File without changes