cesnet-datazoo 0.0.7__tar.gz → 0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/PKG-INFO +1 -1
  2. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/cesnet_dataset.py +30 -12
  3. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/loaders.py +6 -5
  4. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/metadata/metadata.csv +1 -1
  5. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/statistics.py +8 -13
  6. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/utils/download.py +5 -4
  7. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo.egg-info/PKG-INFO +1 -1
  8. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/pyproject.toml +1 -1
  9. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/LICENCE +0 -0
  10. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/README.md +0 -0
  11. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/__init__.py +0 -0
  12. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/config.py +0 -0
  13. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/constants.py +0 -0
  14. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/__init__.py +0 -0
  15. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/datasets.py +0 -0
  16. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/metadata/__init__.py +0 -0
  17. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/metadata/dataset_metadata.py +0 -0
  18. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/metrics/__init__.py +0 -0
  19. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/metrics/classification_report.py +0 -0
  20. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/metrics/superclass_metrics.py +0 -0
  21. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/pytables_data/__init__.py +0 -0
  22. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/pytables_data/apps_split.py +0 -0
  23. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/pytables_data/indices_setup.py +0 -0
  24. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/pytables_data/pytables_dataset.py +0 -0
  25. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/utils/__init__.py +0 -0
  26. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/utils/class_info.py +0 -0
  27. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/utils/fileutils.py +0 -0
  28. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/utils/random.py +0 -0
  29. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo.egg-info/SOURCES.txt +0 -0
  30. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo.egg-info/dependency_links.txt +0 -0
  31. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo.egg-info/requires.txt +0 -0
  32. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo.egg-info/top_level.txt +0 -0
  33. {cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cesnet-datazoo
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: A toolkit for large network traffic datasets
5
5
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
6
6
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -62,6 +62,7 @@ class CesnetDataset():
62
62
  Parameters:
63
63
  data_root: Path to the folder where the dataset will be stored. Each dataset size has its own subfolder `data_root/size`
64
64
  size: Size of the dataset. Options are `XS`, `S`, `M`, `L`, `ORIG`.
65
+ silent: Whether to suppress print and tqdm output.
65
66
 
66
67
  Attributes:
67
68
  name: Name of the dataset.
@@ -110,9 +111,10 @@ class CesnetDataset():
110
111
  metadata: DatasetMetadata
111
112
  available_dates: list[str]
112
113
  time_periods: dict[str, list[str]]
113
- time_periods_gen: bool = False
114
114
  default_train_period: str
115
115
  default_test_period: str
116
+ time_periods_gen: bool = False
117
+ silent: bool = False
116
118
 
117
119
  dataset_config: Optional[DatasetConfig] = None
118
120
  class_info: Optional[ClassInfo] = None
@@ -137,7 +139,8 @@ class CesnetDataset():
137
139
  val_dataloader: Optional[DataLoader] = None
138
140
  test_dataloader: Optional[DataLoader] = None
139
141
 
140
- def __init__(self, data_root: str, size: str = "S", skip_dataset_read_at_init: bool = False) -> None:
142
+ def __init__(self, data_root: str, size: str = "S", skip_dataset_read_at_init: bool = False, silent: bool = False) -> None:
143
+ self.silent = silent
141
144
  self.metadata = load_metadata(self.name)
142
145
  self.size = size
143
146
  if self.size != "ORIG":
@@ -161,9 +164,9 @@ class CesnetDataset():
161
164
  for p in tables_paths:
162
165
  num_samples += len(database.get_node(p))
163
166
  if self.size == "ORIG":
164
- assert num_samples == self.metadata.available_samples; f"Expected {self.metadata.available_samples} samples, got {num_samples} in the database"
167
+ assert num_samples == self.metadata.available_samples, f"Expected {self.metadata.available_samples} samples, got {num_samples} in the database"
165
168
  else:
166
- assert num_samples == DATASET_SIZES[self.size]; f"Expected {DATASET_SIZES[self.size]} samples, got {num_samples} in the database"
169
+ assert num_samples == DATASET_SIZES[self.size], f"Expected {DATASET_SIZES[self.size]} samples, got {num_samples} in the database"
167
170
  self.available_dates = list(map(lambda x: x.removeprefix("/flows/D"), tables_paths))
168
171
  else:
169
172
  self.available_dates = []
@@ -335,7 +338,7 @@ class CesnetDataset():
335
338
  train_dataloader.sampler.sampler = SequentialSampler(self.train_dataset)
336
339
  train_dataloader.sampler.drop_last = False
337
340
  feature_names = self.dataset_config.get_feature_names(flatten_ppi=flatten_ppi)
338
- df = create_df_from_dataloader(dataloader=train_dataloader, feature_names=feature_names, flatten_ppi=flatten_ppi)
341
+ df = create_df_from_dataloader(dataloader=train_dataloader, feature_names=feature_names, flatten_ppi=flatten_ppi, silent=self.silent)
339
342
  # Restore the original dataloader sampler and drop_last
340
343
  train_dataloader.sampler.sampler = self.train_dataloader_sampler
341
344
  train_dataloader.sampler.drop_last = self.train_dataloader_drop_last
@@ -360,7 +363,7 @@ class CesnetDataset():
360
363
  if len(self.val_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
361
364
  warnings.warn(f"Validation set has ({len(self.val_dataset)} samples), consider using get_val_dataloader() instead")
362
365
  feature_names = self.dataset_config.get_feature_names(flatten_ppi=flatten_ppi)
363
- return create_df_from_dataloader(dataloader=self.get_val_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi)
366
+ return create_df_from_dataloader(dataloader=self.get_val_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi, silent=self.silent)
364
367
 
365
368
  def get_test_df(self, flatten_ppi: bool = False) -> pd.DataFrame:
366
369
  """
@@ -386,7 +389,7 @@ class CesnetDataset():
386
389
  if len(self.test_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
387
390
  warnings.warn(f"Test set has ({len(self.test_dataset)} samples), consider using get_test_dataloader() instead")
388
391
  feature_names = self.dataset_config.get_feature_names(flatten_ppi=flatten_ppi)
389
- return create_df_from_dataloader(dataloader=self.get_test_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi)
392
+ return create_df_from_dataloader(dataloader=self.get_test_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi, silent=self.silent)
390
393
 
391
394
  def compute_dataset_statistics(self, num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, disabled_apps: Optional[list[str]] = None)-> None:
392
395
  """
@@ -410,34 +413,39 @@ class CesnetDataset():
410
413
  disabled_apps=disabled_apps if disabled_apps is not None else [],
411
414
  num_samples=num_samples,
412
415
  num_workers=num_workers,
413
- batch_size=batch_size)
416
+ batch_size=batch_size,
417
+ silent=self.silent)
414
418
 
415
419
  def _generate_time_periods(self) -> None:
420
+ time_periods = {}
416
421
  for period in self.time_periods:
422
+ time_periods[period] = []
417
423
  if period.startswith("W"):
418
424
  split = period.split("-")
419
425
  collection_year, week = int(split[1]), int(split[2])
420
426
  for d in range(1, 8):
421
427
  s = datetime.date.fromisocalendar(collection_year, week, d).strftime("%Y%m%d")
422
428
  if s not in self.metadata.missing_dates_in_collection_period:
423
- self.time_periods[period].append(s)
429
+ time_periods[period].append(s)
424
430
  if period.startswith("M"):
425
431
  split = period.split("-")
426
432
  collection_year, month = int(split[1]), int(split[2])
427
433
  for d in range(1, calendar.monthrange(collection_year, month)[1]):
428
434
  s = datetime.date(collection_year, month, d).strftime("%Y%m%d")
429
435
  if s not in self.metadata.missing_dates_in_collection_period:
430
- self.time_periods[period].append(s)
436
+ time_periods[period].append(s)
437
+ self.time_periods = time_periods
431
438
 
432
439
  def _is_downloaded(self) -> bool:
433
440
  """Servicemap is downloaded after the database; thus if it exists, the database is also downloaded"""
434
441
  return os.path.exists(self.servicemap_path) and os.path.exists(self.database_path)
435
442
 
436
443
  def _download(self) -> None:
437
- print(f"Downloading {self.name} dataset")
444
+ if not self.silent:
445
+ print(f"Downloading {self.name} dataset")
438
446
  database_url = f"{self.bucket_url}&file={self.database_filename}"
439
447
  servicemap_url = f"{self.bucket_url}&file={SERVICEMAP_FILE}"
440
- resumable_download(url=database_url, file_path=self.database_path)
448
+ resumable_download(url=database_url, file_path=self.database_path, silent=self.silent)
441
449
  simple_download(url=servicemap_url, file_path=self.servicemap_path)
442
450
 
443
451
  def _clear(self) -> None:
@@ -487,6 +495,8 @@ class CesnetDataset():
487
495
  num_samples = dataset_config.train_size + dataset_config.val_known_size
488
496
  else:
489
497
  num_samples = dataset_config.train_size
498
+ if num_samples > len(train_indices):
499
+ raise ValueError(f"Requested number of samples for weight sampling ({num_samples}) is larger than the number of available train samples ({len(train_indices)})")
490
500
  train_indices = date_weight_sample_train_indices(dataset_config=dataset_config, train_indices=train_indices, num_samples=num_samples)
491
501
  # Obtain validation indices based on the selected approach
492
502
  if dataset_config.val_approach == ValidationApproach.VALIDATION_DATES:
@@ -499,11 +509,19 @@ class CesnetDataset():
499
509
  if dataset_config.train_dates_weigths is not None:
500
510
  assert dataset_config.val_known_size != "all"
501
511
  # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to no enough samples in some train dates
512
+ if dataset_config.val_known_size > len(train_indices):
513
+ raise ValueError(f"Requested validation size ({dataset_config.val_known_size}) is larger than the number of available train samples after weight sampling ({len(train_indices)})")
502
514
  train_indices, val_known_indices = train_test_split(train_indices, test_size=dataset_config.val_known_size, stratify=train_labels, shuffle=True, random_state=train_val_rng)
503
515
  dataset_config.train_size = len(train_indices)
504
516
  elif dataset_config.train_size == "all" and dataset_config.val_known_size == "all":
505
517
  train_indices, val_known_indices = train_test_split(train_indices, test_size=dataset_config.train_val_split_fraction, stratify=train_labels, shuffle=True, random_state=train_val_rng)
506
518
  else:
519
+ if dataset_config.val_known_size != "all" and dataset_config.train_size != "all" and dataset_config.train_size + dataset_config.val_known_size > len(train_indices):
520
+ raise ValueError(f"Requested train size + validation size ({dataset_config.train_size + dataset_config.val_known_size}) is larger than the number of available train samples ({len(train_indices)})")
521
+ if dataset_config.train_size != "all" and dataset_config.train_size > len(train_indices):
522
+ raise ValueError(f"Requested train size ({dataset_config.train_size}) is larger than the number of available train samples ({len(train_indices)})")
523
+ if dataset_config.val_known_size != "all" and dataset_config.val_known_size > len(train_indices):
524
+ raise ValueError(f"Requested validation size ({dataset_config.val_known_size}) is larger than the number of available train samples ({len(train_indices)})")
507
525
  train_indices, val_known_indices = train_test_split(train_indices,
508
526
  train_size=dataset_config.train_size if dataset_config.train_size != "all" else None,
509
527
  test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,
@@ -7,12 +7,13 @@ from tqdm import tqdm
7
7
  from cesnet_datazoo.constants import APP_COLUMN
8
8
 
9
9
 
10
- def load_from_dataloader(dataloader: DataLoader) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
10
+ def load_from_dataloader(dataloader: DataLoader, silent: bool = False) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
11
11
  data_ppi = []
12
12
  data_flowstats = []
13
13
  labels = []
14
- print("Loading data from dataloader")
15
- for batch_ppi, batch_flowstats, batch_labels in tqdm(dataloader, total=len(dataloader)):
14
+ if not silent:
15
+ print("Loading data from dataloader")
16
+ for batch_ppi, batch_flowstats, batch_labels in tqdm(dataloader, total=len(dataloader), disable=silent):
16
17
  data_ppi.append(batch_ppi)
17
18
  data_flowstats.append(batch_flowstats)
18
19
  labels.append(batch_labels)
@@ -21,8 +22,8 @@ def load_from_dataloader(dataloader: DataLoader) -> tuple[np.ndarray, np.ndarray
21
22
  labels = np.concatenate(labels)
22
23
  return data_ppi, data_flowstats, labels
23
24
 
24
- def create_df_from_dataloader(dataloader: DataLoader, feature_names: list[str], flatten_ppi: bool = False) -> pd.DataFrame:
25
- data_ppi, data_flowstats, labels = load_from_dataloader(dataloader)
25
+ def create_df_from_dataloader(dataloader: DataLoader, feature_names: list[str], flatten_ppi: bool = False, silent: bool = False) -> pd.DataFrame:
26
+ data_ppi, data_flowstats, labels = load_from_dataloader(dataloader, silent=silent)
26
27
  if flatten_ppi:
27
28
  data_ppi = data_ppi.reshape(data_ppi.shape[0], -1)
28
29
  data = np.column_stack((data_ppi, data_flowstats))
@@ -1,4 +1,4 @@
1
1
  Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic,Features in packet sequences,Packet histogram features,Flowstats features,TCP features,Other fields,Cite,Zenodo URL,Related papers
2
2
  CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE",,"BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
3
3
  CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
4
- CESNET-TLS-Year22,TLS,2023,2022,1 year,507739322,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",182,,"IPT, DIR, SIZE, PUSH_FLAG","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
4
+ CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",182,,"IPT, DIR, SIZE, PUSH_FLAG","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
@@ -12,16 +12,12 @@ from torch.utils.data import BatchSampler, DataLoader, SequentialSampler
12
12
  from tqdm import tqdm
13
13
 
14
14
  from cesnet_datazoo.config import Protocol
15
- from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN,
16
- FLOWEND_REASON_FEATURES, PHISTS_FEATURES,
17
- PPI_COLUMN, SIZE_POS)
15
+ from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, FLOWEND_REASON_FEATURES,
16
+ PHISTS_FEATURES, PPI_COLUMN, SIZE_POS)
18
17
  from cesnet_datazoo.pytables_data.indices_setup import sort_indices
19
- from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset,
20
- list_all_tables,
18
+ from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, list_all_tables,
21
19
  worker_init_fn)
22
20
 
23
- log = logging.getLogger(__name__)
24
-
25
21
 
26
22
  def pick_quic_fields(batch):
27
23
  return (
@@ -46,7 +42,7 @@ def pick_stats_fields(batch, flowstats_features: list[str]):
46
42
  def simple_collate_fn(batch):
47
43
  return batch
48
44
 
49
- def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096):
45
+ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
50
46
  stats_pdf_path = os.path.join(output_dir, "dataset-statistics.pdf")
51
47
  stats_csv_path = os.path.join(output_dir, "dataset-statistics.csv")
52
48
  categories_csv_path = os.path.join(output_dir, "categories.csv")
@@ -72,7 +68,8 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
72
68
  feature_packets_total = []
73
69
  feature_bytes_total = []
74
70
  packet_sizes_counter = Counter()
75
- log.info(f"Reading data from {database_path} for statistics")
71
+ if not silent:
72
+ print(f"Reading data from {database_path} for statistics")
76
73
  table_names = list_all_tables(database_path)
77
74
  stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=table_names, flowstats_features=flowstats_features, disabled_apps=disabled_apps, indices=None, return_all_fields=True)
78
75
  if num_samples != "all":
@@ -91,7 +88,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
91
88
  if num_workers == 0:
92
89
  stats_dataset.pytables_worker_init()
93
90
 
94
- for batch, batch_idx in tqdm(stats_dloader, total=len(stats_dloader)):
91
+ for batch, batch_idx in tqdm(stats_dloader, total=len(stats_dloader), disable=silent):
95
92
  ppi, duration, packets_total, bytes_total, asn, phist, flowend_reason, app, cat = pick_stats_fields(batch, flowstats_features=flowstats_features)
96
93
  # Saving feature values for distribution plots
97
94
  feature_duration.append(duration)
@@ -127,6 +124,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
127
124
  df_flowstats = pd.DataFrame(data={"FLOW DURATION": feature_duration, "FLOW BYTE VOLUME": feature_bytes_total, "FLOW LENGTH": feature_packets_total}).describe()
128
125
  df_flowstats.to_csv(stats_csv_path)
129
126
  # Categories tikzpicture and csv output
127
+ stats_dataset.pytables_worker_init() # to get access to cat enum; TODO implement better
130
128
  df_categories.index = df_categories.index.map(stats_dataset.get_cat_enum())
131
129
  df_categories = df_categories.drop("default", errors="ignore")
132
130
  df_categories["FLOW_PERC"] = df_categories["FLOW_COUNT"] / sum(df_categories["FLOW_COUNT"]) * 100
@@ -236,6 +234,3 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
236
234
  plt.tight_layout()
237
235
  fig.show()
238
236
  fig.savefig(stats_pdf_path, bbox_inches="tight")
239
-
240
- def compute_known_unknown_statistics():
241
- raise NotImplementedError
@@ -9,7 +9,7 @@ def simple_download(url: str, file_path: str):
9
9
  with open(file_path, "wb") as file:
10
10
  file.write(r.content)
11
11
 
12
- def resumable_download(url: str, file_path: str, chunk_size: int = 1024**2):
12
+ def resumable_download(url: str, file_path: str, chunk_size: int = 1024**2, silent: bool = False):
13
13
  r1 = requests.get(url, stream=True)
14
14
  redirected_url = r1.url
15
15
  content_size = int(r1.headers["Content-Length"])
@@ -23,10 +23,11 @@ def resumable_download(url: str, file_path: str, chunk_size: int = 1024**2):
23
23
 
24
24
  headers = {"Range": f"bytes={temp_size}-"}
25
25
  r2 = requests.get(redirected_url, stream=True, headers=headers)
26
- print(f"File size: {content_size / (1024**3):0.2f}GB")
27
- print(f"Remaining: {(content_size - temp_size) / (1024**3):0.2f}GB")
26
+ if not silent:
27
+ print(f"File size: {content_size / (1024**3):0.2f}GB")
28
+ print(f"Remaining: {(content_size - temp_size) / (1024**3):0.2f}GB")
28
29
 
29
- progress_bar = tqdm(total=content_size - temp_size, unit="B", unit_scale=True, unit_divisor=1024)
30
+ progress_bar = tqdm(total=content_size - temp_size, unit="B", unit_scale=True, unit_divisor=1024, disable=silent)
30
31
  with open(file_path, "ab") as file:
31
32
  for data in r2.iter_content(chunk_size=chunk_size):
32
33
  file.write(data)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cesnet-datazoo
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: A toolkit for large network traffic datasets
5
5
  Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
6
6
  Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "cesnet-datazoo"
7
- version = "0.0.7"
7
+ version = "0.0.9"
8
8
  authors = [
9
9
  {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
10
10
  {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},
File without changes
File without changes
File without changes