PyPI - cesnet-datazoo - Versions diffs - 0.0.7__tar.gz → 0.0.9__tar.gz - Mend

cesnet-datazoo 0.0.7.tar.gz → 0.0.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

{cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.0.7
+Version: 0.0.9
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>

{cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/cesnet_dataset.py RENAMED Viewed

@@ -62,6 +62,7 @@ class CesnetDataset():
     Parameters:
         data_root: Path to the folder where the dataset will be stored. Each dataset size has its own subfolder `data_root/size`
         size: Size of the dataset. Options are `XS`, `S`, `M`, `L`, `ORIG`.
+        silent: Whether to suppress print and tqdm output.
     Attributes:
         name: Name of the dataset.
@@ -110,9 +111,10 @@ class CesnetDataset():
     metadata: DatasetMetadata
     available_dates: list[str]
     time_periods: dict[str, list[str]]
-    time_periods_gen: bool = False
     default_train_period: str
     default_test_period: str
+    time_periods_gen: bool = False
+    silent: bool = False
     dataset_config: Optional[DatasetConfig] = None
     class_info: Optional[ClassInfo] = None
@@ -137,7 +139,8 @@ class CesnetDataset():
     val_dataloader: Optional[DataLoader] = None
     test_dataloader: Optional[DataLoader] = None
-    def __init__(self, data_root: str, size: str = "S", skip_dataset_read_at_init: bool = False) -> None:
+    def __init__(self, data_root: str, size: str = "S", skip_dataset_read_at_init: bool = False, silent: bool = False) -> None:
+        self.silent = silent
         self.metadata = load_metadata(self.name)
         self.size = size
         if self.size != "ORIG":
@@ -161,9 +164,9 @@ class CesnetDataset():
                 for p in tables_paths:
                     num_samples += len(database.get_node(p))
                 if self.size == "ORIG":
-                    assert num_samples == self.metadata.available_samples; f"Expected {self.metadata.available_samples} samples, got {num_samples} in the database"
+                    assert num_samples == self.metadata.available_samples, f"Expected {self.metadata.available_samples} samples, got {num_samples} in the database"
                 else:
-                    assert num_samples == DATASET_SIZES[self.size]; f"Expected {DATASET_SIZES[self.size]} samples, got {num_samples} in the database"
+                    assert num_samples == DATASET_SIZES[self.size], f"Expected {DATASET_SIZES[self.size]} samples, got {num_samples} in the database"
                 self.available_dates = list(map(lambda x: x.removeprefix("/flows/D"), tables_paths))
         else:
             self.available_dates = []
@@ -335,7 +338,7 @@ class CesnetDataset():
         train_dataloader.sampler.sampler = SequentialSampler(self.train_dataset)
         train_dataloader.sampler.drop_last = False
         feature_names = self.dataset_config.get_feature_names(flatten_ppi=flatten_ppi)
-        df = create_df_from_dataloader(dataloader=train_dataloader, feature_names=feature_names, flatten_ppi=flatten_ppi)
+        df = create_df_from_dataloader(dataloader=train_dataloader, feature_names=feature_names, flatten_ppi=flatten_ppi, silent=self.silent)
         # Restore the original dataloader sampler and drop_last
         train_dataloader.sampler.sampler = self.train_dataloader_sampler
         train_dataloader.sampler.drop_last = self.train_dataloader_drop_last
@@ -360,7 +363,7 @@ class CesnetDataset():
         if len(self.val_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
             warnings.warn(f"Validation set has ({len(self.val_dataset)} samples), consider using get_val_dataloader() instead")
         feature_names = self.dataset_config.get_feature_names(flatten_ppi=flatten_ppi)
-        return create_df_from_dataloader(dataloader=self.get_val_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi)
+        return create_df_from_dataloader(dataloader=self.get_val_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi, silent=self.silent)
     def get_test_df(self, flatten_ppi: bool = False) -> pd.DataFrame:
         """
@@ -386,7 +389,7 @@ class CesnetDataset():
         if len(self.test_dataset) > DATAFRAME_SAMPLES_WARNING_THRESHOLD:
             warnings.warn(f"Test set has ({len(self.test_dataset)} samples), consider using get_test_dataloader() instead")
         feature_names = self.dataset_config.get_feature_names(flatten_ppi=flatten_ppi)
-        return create_df_from_dataloader(dataloader=self.get_test_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi)
+        return create_df_from_dataloader(dataloader=self.get_test_dataloader(), feature_names=feature_names, flatten_ppi=flatten_ppi, silent=self.silent)
     def compute_dataset_statistics(self, num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, disabled_apps: Optional[list[str]] = None)-> None:
         """
@@ -410,34 +413,39 @@ class CesnetDataset():
                                    disabled_apps=disabled_apps if disabled_apps is not None else [],
                                    num_samples=num_samples,
                                    num_workers=num_workers,
-                                   batch_size=batch_size)
+                                   batch_size=batch_size,
+                                   silent=self.silent)
     def _generate_time_periods(self) -> None:
+        time_periods = {}
         for period in self.time_periods:
+            time_periods[period] = []
             if period.startswith("W"):
                 split = period.split("-")
                 collection_year, week = int(split[1]), int(split[2])
                 for d in range(1, 8):
                     s = datetime.date.fromisocalendar(collection_year, week, d).strftime("%Y%m%d")
                     if s not in self.metadata.missing_dates_in_collection_period:
-                        self.time_periods[period].append(s)
+                        time_periods[period].append(s)
             if period.startswith("M"):
                 split = period.split("-")
                 collection_year, month = int(split[1]), int(split[2])
                 for d in range(1, calendar.monthrange(collection_year, month)[1]):
                     s = datetime.date(collection_year, month, d).strftime("%Y%m%d")
                     if s not in self.metadata.missing_dates_in_collection_period:
-                        self.time_periods[period].append(s)
+                        time_periods[period].append(s)
+        self.time_periods = time_periods
     def _is_downloaded(self) -> bool:
         """Servicemap is downloaded after the database; thus if it exists, the database is also downloaded"""
         return os.path.exists(self.servicemap_path) and os.path.exists(self.database_path)
     def _download(self) -> None:
-        print(f"Downloading {self.name} dataset")
+        if not self.silent:
+            print(f"Downloading {self.name} dataset")
         database_url = f"{self.bucket_url}&file={self.database_filename}"
         servicemap_url = f"{self.bucket_url}&file={SERVICEMAP_FILE}"
-        resumable_download(url=database_url, file_path=self.database_path)
+        resumable_download(url=database_url, file_path=self.database_path, silent=self.silent)
         simple_download(url=servicemap_url, file_path=self.servicemap_path)
     def _clear(self) -> None:
@@ -487,6 +495,8 @@ class CesnetDataset():
                 num_samples = dataset_config.train_size + dataset_config.val_known_size
             else:
                 num_samples = dataset_config.train_size
+            if num_samples > len(train_indices):
+                raise ValueError(f"Requested number of samples for weight sampling ({num_samples}) is larger than the number of available train samples ({len(train_indices)})")
             train_indices = date_weight_sample_train_indices(dataset_config=dataset_config, train_indices=train_indices, num_samples=num_samples)
         # Obtain validation indices based on the selected approach
         if dataset_config.val_approach == ValidationApproach.VALIDATION_DATES:
@@ -499,11 +509,19 @@ class CesnetDataset():
             if dataset_config.train_dates_weigths is not None:
                 assert dataset_config.val_known_size != "all"
                 # When weight sampling is used, val_known_size is kept but the resulting train size can be smaller due to no enough samples in some train dates
+                if dataset_config.val_known_size > len(train_indices):
+                    raise ValueError(f"Requested validation size ({dataset_config.val_known_size}) is larger than the number of available train samples after weight sampling ({len(train_indices)})")
                 train_indices, val_known_indices = train_test_split(train_indices, test_size=dataset_config.val_known_size, stratify=train_labels, shuffle=True, random_state=train_val_rng)
                 dataset_config.train_size = len(train_indices)
             elif dataset_config.train_size == "all" and dataset_config.val_known_size == "all":
                 train_indices, val_known_indices = train_test_split(train_indices, test_size=dataset_config.train_val_split_fraction, stratify=train_labels, shuffle=True, random_state=train_val_rng)
             else:
+                if dataset_config.val_known_size != "all" and  dataset_config.train_size != "all" and dataset_config.train_size + dataset_config.val_known_size > len(train_indices):
+                    raise ValueError(f"Requested train size + validation size ({dataset_config.train_size + dataset_config.val_known_size}) is larger than the number of available train samples ({len(train_indices)})")
+                if dataset_config.train_size != "all" and dataset_config.train_size > len(train_indices):
+                    raise ValueError(f"Requested train size ({dataset_config.train_size}) is larger than the number of available train samples ({len(train_indices)})")
+                if dataset_config.val_known_size != "all" and dataset_config.val_known_size > len(train_indices):
+                    raise ValueError(f"Requested validation size ({dataset_config.val_known_size}) is larger than the number of available train samples ({len(train_indices)})")
                 train_indices, val_known_indices = train_test_split(train_indices,
                                                                           train_size=dataset_config.train_size if dataset_config.train_size != "all" else None,
                                                                           test_size=dataset_config.val_known_size if dataset_config.val_known_size != "all" else None,

{cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/loaders.py RENAMED Viewed

@@ -7,12 +7,13 @@ from tqdm import tqdm
 from cesnet_datazoo.constants import APP_COLUMN
-def load_from_dataloader(dataloader: DataLoader) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+def load_from_dataloader(dataloader: DataLoader, silent: bool = False) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
     data_ppi = []
     data_flowstats = []
     labels = []
-    print("Loading data from dataloader")
-    for batch_ppi, batch_flowstats, batch_labels in tqdm(dataloader, total=len(dataloader)):
+    if not silent:
+        print("Loading data from dataloader")
+    for batch_ppi, batch_flowstats, batch_labels in tqdm(dataloader, total=len(dataloader), disable=silent):
         data_ppi.append(batch_ppi)
         data_flowstats.append(batch_flowstats)
         labels.append(batch_labels)
@@ -21,8 +22,8 @@ def load_from_dataloader(dataloader: DataLoader) -> tuple[np.ndarray, np.ndarray
     labels = np.concatenate(labels)
     return data_ppi, data_flowstats, labels
-def create_df_from_dataloader(dataloader: DataLoader, feature_names: list[str], flatten_ppi: bool = False) -> pd.DataFrame:
-    data_ppi, data_flowstats, labels = load_from_dataloader(dataloader)
+def create_df_from_dataloader(dataloader: DataLoader, feature_names: list[str], flatten_ppi: bool = False, silent: bool = False) -> pd.DataFrame:
+    data_ppi, data_flowstats, labels = load_from_dataloader(dataloader, silent=silent)
     if flatten_ppi:
         data_ppi = data_ppi.reshape(data_ppi.shape[0], -1)
         data = np.column_stack((data_ppi, data_flowstats))

{cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/metadata/metadata.csv RENAMED Viewed

@@ -1,4 +1,4 @@
 Name,Protocol,Published in,Collected in,Collection duration,Available samples,Available dataset sizes,Collection period,Missing dates in collection period,Application count,Background traffic,Features in packet sequences,Packet histogram features,Flowstats features,TCP features,Other fields,Cite,Zenodo URL,Related papers
 CESNET-TLS22,TLS,2022,2021,2 weeks,141720670,"XS, S, M, L",4.10.2021 - 17.10.2021,,191,,"IPT, DIR, SIZE",,"BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV",ID,https://doi.org/10.1016/j.comnet.2022.109467,https://zenodo.org/record/7965515,
 CESNET-QUIC22,QUIC,2023,2022,4 weeks,153226273,"XS, S, M, L",31.10.2022 - 27.11.2022,,102,"default-background, google-background, facebook-background","IPT, DIR, SIZE","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_OTHER",,"ID, SRC_IP, DST_IP, DST_ASN, SRC_PORT, DST_PORT, PROTOCOL, QUIC_VERSION, QUIC_SNI, QUIC_USERAGENT, TIME_FIRST, TIME_LAST",https://doi.org/10.1016/j.dib.2023.108888,https://zenodo.org/record/7963302,https://doi.org/10.23919/TMA58422.2023.10199052
-CESNET-TLS-Year22,TLS,2023,2022,1 year,507739322,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",182,,"IPT, DIR, SIZE, PUSH_FLAG","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,
+CESNET-TLS-Year22,TLS,2023,2022,1 year,507739073,"XS, S, M, L",1.1.2022 - 31.12.2022,"20220128, 20220129, 20220130, 20221212, 20221213, 20221229, 20221230, 20221231",182,,"IPT, DIR, SIZE, PUSH_FLAG","PHIST_SRC_SIZES, PHIST_DST_SIZES, PHIST_SRC_IPT, PHIST_DST_IPT","BYTES, BYTES_REV, PACKETS, PACKETS_REV, PPI_LEN, PPI_ROUNDTRIPS, PPI_DURATION, DURATION, FLOW_ENDREASON_IDLE, FLOW_ENDREASON_ACTIVE, FLOW_ENDREASON_END, FLOW_ENDREASON_OTHER","FLAG_CWR, FLAG_CWR_REV, FLAG_ECE, FLAG_ECE_REV, FLAG_URG, FLAG_URG_REV, FLAG_ACK, FLAG_ACK_REV, FLAG_PSH, FLAG_PSH_REV, FLAG_RST, FLAG_RST_REV, FLAG_SYN, FLAG_SYN_REV, FLAG_FIN, FLAG_FIN_REV","ID, SRC_IP, DST_IP, DST_ASN, DST_PORT, PROTOCOL, TLS_SNI, TLS_JA3, TIME_FIRST, TIME_LAST",,,

{cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/datasets/statistics.py RENAMED Viewed

@@ -12,16 +12,12 @@ from torch.utils.data import BatchSampler, DataLoader, SequentialSampler
 from tqdm import tqdm
 from cesnet_datazoo.config import Protocol
-from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN,
-                                      FLOWEND_REASON_FEATURES, PHISTS_FEATURES,
-                                      PPI_COLUMN, SIZE_POS)
+from cesnet_datazoo.constants import (APP_COLUMN, CATEGORY_COLUMN, FLOWEND_REASON_FEATURES,
+                                      PHISTS_FEATURES, PPI_COLUMN, SIZE_POS)
 from cesnet_datazoo.pytables_data.indices_setup import sort_indices
-from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset,
-                                                           list_all_tables,
+from cesnet_datazoo.pytables_data.pytables_dataset import (PyTablesDataset, list_all_tables,
                                                            worker_init_fn)
-log = logging.getLogger(__name__)
 def pick_quic_fields(batch):
     return (
@@ -46,7 +42,7 @@ def pick_stats_fields(batch, flowstats_features: list[str]):
 def simple_collate_fn(batch):
     return batch
-def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096):
+def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_features: list[str], protocol: Protocol, disabled_apps: list[str], num_samples: int | Literal["all"] = 10_000_000, num_workers: int = 4, batch_size: int = 4096, silent: bool = False):
     stats_pdf_path = os.path.join(output_dir, "dataset-statistics.pdf")
     stats_csv_path = os.path.join(output_dir, "dataset-statistics.csv")
     categories_csv_path = os.path.join(output_dir, "categories.csv")
@@ -72,7 +68,8 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
     feature_packets_total = []
     feature_bytes_total = []
     packet_sizes_counter = Counter()
-    log.info(f"Reading data from {database_path} for statistics")
+    if not silent:
+        print(f"Reading data from {database_path} for statistics")
     table_names = list_all_tables(database_path)
     stats_dataset = PyTablesDataset(database_path=database_path, tables_paths=table_names, flowstats_features=flowstats_features, disabled_apps=disabled_apps, indices=None, return_all_fields=True)
     if num_samples != "all":
@@ -91,7 +88,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
     if num_workers == 0:
         stats_dataset.pytables_worker_init()
-    for batch, batch_idx in tqdm(stats_dloader, total=len(stats_dloader)):
+    for batch, batch_idx in tqdm(stats_dloader, total=len(stats_dloader), disable=silent):
         ppi, duration, packets_total, bytes_total, asn, phist, flowend_reason, app, cat = pick_stats_fields(batch, flowstats_features=flowstats_features)
         # Saving feature values for distribution plots
         feature_duration.append(duration)
@@ -127,6 +124,7 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
     df_flowstats = pd.DataFrame(data={"FLOW DURATION": feature_duration, "FLOW BYTE VOLUME": feature_bytes_total, "FLOW LENGTH": feature_packets_total}).describe()
     df_flowstats.to_csv(stats_csv_path)
     # Categories tikzpicture and csv output
+    stats_dataset.pytables_worker_init() # to get access to cat enum; TODO implement better
     df_categories.index = df_categories.index.map(stats_dataset.get_cat_enum())
     df_categories = df_categories.drop("default", errors="ignore")
     df_categories["FLOW_PERC"] = df_categories["FLOW_COUNT"] / sum(df_categories["FLOW_COUNT"]) * 100
@@ -236,6 +234,3 @@ def compute_dataset_statistics(database_path: str, output_dir: str, flowstats_fe
     plt.tight_layout()
     fig.show()
     fig.savefig(stats_pdf_path, bbox_inches="tight")
-def compute_known_unknown_statistics():
-    raise NotImplementedError

{cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo/utils/download.py RENAMED Viewed

@@ -9,7 +9,7 @@ def simple_download(url: str, file_path: str):
     with open(file_path, "wb") as file:
         file.write(r.content)
-def resumable_download(url: str, file_path: str, chunk_size: int = 1024**2):
+def resumable_download(url: str, file_path: str, chunk_size: int = 1024**2, silent: bool = False):
     r1 = requests.get(url, stream=True)
     redirected_url = r1.url
     content_size = int(r1.headers["Content-Length"])
@@ -23,10 +23,11 @@ def resumable_download(url: str, file_path: str, chunk_size: int = 1024**2):
     headers = {"Range": f"bytes={temp_size}-"}
     r2 = requests.get(redirected_url, stream=True, headers=headers)
-    print(f"File size: {content_size / (1024**3):0.2f}GB")
-    print(f"Remaining: {(content_size - temp_size) / (1024**3):0.2f}GB")
+    if not silent:
+        print(f"File size: {content_size / (1024**3):0.2f}GB")
+        print(f"Remaining: {(content_size - temp_size) / (1024**3):0.2f}GB")
-    progress_bar = tqdm(total=content_size - temp_size, unit="B", unit_scale=True, unit_divisor=1024)
+    progress_bar = tqdm(total=content_size - temp_size, unit="B", unit_scale=True, unit_divisor=1024, disable=silent)
     with open(file_path, "ab") as file:
         for data in r2.iter_content(chunk_size=chunk_size):
             file.write(data)

{cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/cesnet_datazoo.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cesnet-datazoo
-Version: 0.0.7
+Version: 0.0.9
 Summary: A toolkit for large network traffic datasets
 Author-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>
 Maintainer-email: Jan Luxemburk <luxemburk@cesnet.cz>, Karel Hynek <hynekkar@cesnet.cz>

{cesnet-datazoo-0.0.7 → cesnet-datazoo-0.0.9}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "cesnet-datazoo"
-version = "0.0.7"
+version = "0.0.9"
 authors = [
   {name = "Jan Luxemburk", email = "luxemburk@cesnet.cz"},
   {name = "Karel Hynek", email = "hynekkar@cesnet.cz"},