PyPI - plato-learn - Versions diffs - 1.1__py3-none-any.whl - Mend

plato-learn 1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179) hide show

plato/__init__.py +1 -0
plato/algorithms/__init__.py +0 -0
plato/algorithms/base.py +45 -0
plato/algorithms/fedavg.py +48 -0
plato/algorithms/fedavg_gan.py +79 -0
plato/algorithms/fedavg_personalized.py +48 -0
plato/algorithms/mistnet.py +52 -0
plato/algorithms/registry.py +39 -0
plato/algorithms/split_learning.py +89 -0
plato/callbacks/__init__.py +0 -0
plato/callbacks/client.py +56 -0
plato/callbacks/handler.py +78 -0
plato/callbacks/server.py +139 -0
plato/callbacks/trainer.py +124 -0
plato/client.py +67 -0
plato/clients/__init__.py +0 -0
plato/clients/base.py +467 -0
plato/clients/edge.py +103 -0
plato/clients/fedavg_personalized.py +40 -0
plato/clients/mistnet.py +49 -0
plato/clients/registry.py +43 -0
plato/clients/self_supervised_learning.py +51 -0
plato/clients/simple.py +218 -0
plato/clients/split_learning.py +150 -0
plato/config.py +339 -0
plato/datasources/__init__.py +0 -0
plato/datasources/base.py +123 -0
plato/datasources/celeba.py +150 -0
plato/datasources/cifar10.py +87 -0
plato/datasources/cifar100.py +61 -0
plato/datasources/cinic10.py +62 -0
plato/datasources/coco.py +119 -0
plato/datasources/datalib/__init__.py +0 -0
plato/datasources/datalib/audio_extraction_tools.py +137 -0
plato/datasources/datalib/data_utils.py +124 -0
plato/datasources/datalib/flickr30kE_utils.py +336 -0
plato/datasources/datalib/frames_extraction_tools.py +254 -0
plato/datasources/datalib/gym_utils/__init__.py +0 -0
plato/datasources/datalib/gym_utils/gym_trim.py +189 -0
plato/datasources/datalib/modality_data_anntation_tools.py +163 -0
plato/datasources/datalib/modality_extraction_base.py +59 -0
plato/datasources/datalib/parse_datasets.py +212 -0
plato/datasources/datalib/refer_utils/__init__.py +0 -0
plato/datasources/datalib/refer_utils/referitgame_utils.py +237 -0
plato/datasources/datalib/tiny_data_tools.py +81 -0
plato/datasources/datalib/video_transform.py +79 -0
plato/datasources/emnist.py +64 -0
plato/datasources/fashion_mnist.py +41 -0
plato/datasources/feature.py +24 -0
plato/datasources/feature_dataset.py +15 -0
plato/datasources/femnist.py +141 -0
plato/datasources/flickr30k_entities.py +362 -0
plato/datasources/gym.py +431 -0
plato/datasources/huggingface.py +165 -0
plato/datasources/kinetics.py +568 -0
plato/datasources/mnist.py +44 -0
plato/datasources/multimodal_base.py +328 -0
plato/datasources/pascal_voc.py +56 -0
plato/datasources/purchase.py +94 -0
plato/datasources/qoenflx.py +127 -0
plato/datasources/referitgame.py +330 -0
plato/datasources/registry.py +119 -0
plato/datasources/self_supervised_learning.py +98 -0
plato/datasources/stl10.py +103 -0
plato/datasources/texas.py +94 -0
plato/datasources/tiny_imagenet.py +64 -0
plato/datasources/yolov8.py +85 -0
plato/models/__init__.py +0 -0
plato/models/cnn_encoder.py +103 -0
plato/models/dcgan.py +116 -0
plato/models/general_multilayer.py +254 -0
plato/models/huggingface.py +27 -0
plato/models/lenet5.py +113 -0
plato/models/multilayer.py +90 -0
plato/models/multimodal/__init__.py +0 -0
plato/models/multimodal/base_net.py +91 -0
plato/models/multimodal/blending.py +142 -0
plato/models/multimodal/fc_net.py +77 -0
plato/models/multimodal/fusion_net.py +78 -0
plato/models/multimodal/multimodal_module.py +152 -0
plato/models/registry.py +99 -0
plato/models/resnet.py +190 -0
plato/models/torch_hub.py +19 -0
plato/models/vgg.py +113 -0
plato/models/vit.py +166 -0
plato/models/yolov8.py +22 -0
plato/processors/__init__.py +0 -0
plato/processors/base.py +35 -0
plato/processors/compress.py +46 -0
plato/processors/decompress.py +48 -0
plato/processors/feature.py +51 -0
plato/processors/feature_additive_noise.py +48 -0
plato/processors/feature_dequantize.py +34 -0
plato/processors/feature_gaussian.py +17 -0
plato/processors/feature_laplace.py +15 -0
plato/processors/feature_quantize.py +34 -0
plato/processors/feature_randomized_response.py +50 -0
plato/processors/feature_unbatch.py +39 -0
plato/processors/inbound_feature_tensors.py +39 -0
plato/processors/model.py +55 -0
plato/processors/model_compress.py +34 -0
plato/processors/model_decompress.py +37 -0
plato/processors/model_decrypt.py +41 -0
plato/processors/model_deepcopy.py +21 -0
plato/processors/model_dequantize.py +18 -0
plato/processors/model_dequantize_qsgd.py +61 -0
plato/processors/model_encrypt.py +43 -0
plato/processors/model_quantize.py +18 -0
plato/processors/model_quantize_qsgd.py +82 -0
plato/processors/model_randomized_response.py +34 -0
plato/processors/outbound_feature_ndarrays.py +38 -0
plato/processors/pipeline.py +26 -0
plato/processors/registry.py +124 -0
plato/processors/structured_pruning.py +57 -0
plato/processors/unstructured_pruning.py +73 -0
plato/samplers/__init__.py +0 -0
plato/samplers/all_inclusive.py +41 -0
plato/samplers/base.py +31 -0
plato/samplers/dirichlet.py +81 -0
plato/samplers/distribution_noniid.py +132 -0
plato/samplers/iid.py +53 -0
plato/samplers/label_quantity_noniid.py +119 -0
plato/samplers/mixed.py +44 -0
plato/samplers/mixed_label_quantity_noniid.py +128 -0
plato/samplers/modality_iid.py +42 -0
plato/samplers/modality_quantity_noniid.py +56 -0
plato/samplers/orthogonal.py +99 -0
plato/samplers/registry.py +66 -0
plato/samplers/sample_quantity_noniid.py +123 -0
plato/samplers/sampler_utils.py +190 -0
plato/servers/__init__.py +0 -0
plato/servers/base.py +1395 -0
plato/servers/fedavg.py +281 -0
plato/servers/fedavg_cs.py +335 -0
plato/servers/fedavg_gan.py +74 -0
plato/servers/fedavg_he.py +106 -0
plato/servers/fedavg_personalized.py +57 -0
plato/servers/mistnet.py +67 -0
plato/servers/registry.py +52 -0
plato/servers/split_learning.py +109 -0
plato/trainers/__init__.py +0 -0
plato/trainers/base.py +99 -0
plato/trainers/basic.py +649 -0
plato/trainers/diff_privacy.py +178 -0
plato/trainers/gan.py +330 -0
plato/trainers/huggingface.py +173 -0
plato/trainers/loss_criterion.py +70 -0
plato/trainers/lr_schedulers.py +252 -0
plato/trainers/optimizers.py +53 -0
plato/trainers/pascal_voc.py +80 -0
plato/trainers/registry.py +44 -0
plato/trainers/self_supervised_learning.py +302 -0
plato/trainers/split_learning.py +305 -0
plato/trainers/tracking.py +96 -0
plato/trainers/yolov8.py +41 -0
plato/utils/__init__.py +0 -0
plato/utils/count_parameters.py +30 -0
plato/utils/csv_processor.py +26 -0
plato/utils/data_loaders.py +148 -0
plato/utils/decorators.py +24 -0
plato/utils/fonts.py +23 -0
plato/utils/homo_enc.py +187 -0
plato/utils/reinforcement_learning/__init__.py +0 -0
plato/utils/reinforcement_learning/policies/__init__.py +0 -0
plato/utils/reinforcement_learning/policies/base.py +161 -0
plato/utils/reinforcement_learning/policies/ddpg.py +75 -0
plato/utils/reinforcement_learning/policies/registry.py +32 -0
plato/utils/reinforcement_learning/policies/sac.py +343 -0
plato/utils/reinforcement_learning/policies/td3.py +485 -0
plato/utils/reinforcement_learning/rl_agent.py +142 -0
plato/utils/reinforcement_learning/rl_server.py +113 -0
plato/utils/rl_env.py +154 -0
plato/utils/s3.py +141 -0
plato/utils/trainer_utils.py +21 -0
plato/utils/unary_encoding.py +47 -0
plato_learn-1.1.dist-info/METADATA +35 -0
plato_learn-1.1.dist-info/RECORD +179 -0
plato_learn-1.1.dist-info/WHEEL +4 -0
plato_learn-1.1.dist-info/licenses/LICENSE +201 -0

plato/datasources/gym.py ADDED Viewed

@@ -0,0 +1,431 @@
+"""
+The Gym dataset.
+Note that the settings for the data loader follow the GitHub repository provided
+by the authors of:
+    FineGym: A hierarchical video dataset for fine-grained action understanding
+The data structure should be:
+├── data
+│   ├── gym99
+|   |   ├── annotations
+|   |   |   ├── gym99_train_org.txt
+|   |   |   ├── gym99_val_org.txt
+|   |   |   ├── gym99_train.txt
+|   |   |   ├── gym99_val.txt
+|   |   |   ├── annotation.json
+|   |   |   └── event_annotation.json
+│   │   ├── videos
+|   |   |   ├── 0LtLS9wROrk.mp4
+|   |   |   ├── ...
+|   |   |   └── zfqS-wCJSsw.mp4
+│   │   ├── events
+|   |   |   ├── 0LtLS9wROrk_E_002407_002435.mp4
+|   |   |   ├── ...
+|   |   |   └── zfqS-wCJSsw_E_006732_006824.mp4
+│   │   ├── subactions
+|   |   |   ├── 0LtLS9wROrk_E_002407_002435_A_0003_0005.mp4
+|   |   |   ├── ...
+|   |   |   └── zfqS-wCJSsw_E_006244_006252_A_0000_0007.mp4
+│   │   ├── subaction_frames
+│   │   └── subaction_audios
+"""
+import logging
+import os
+import shutil
+import torch
+from mmaction.tools.data.gym import download as gym_downloader
+from mmaction.datasets import build_dataset
+from plato.config import Config
+from plato.datasources.datalib.gym_utils import gym_trim
+from plato.datasources import multimodal_base
+from plato.datasources.datalib import frames_extraction_tools
+from plato.datasources.datalib import audio_extraction_tools
+from plato.datasources.datalib import data_utils
+class GymDataset(multimodal_base.MultiModalDataset):
+    """A multimodal dataset wrapping the per-modality Gym datasets of one phase.
+    Each sample is a dict mapping a modality name to the item stored at the
+    same index of that modality's dataset.
+    """
+    def __init__(
+        self, multimodal_data_holder, phase, phase_info, modality_sampler=None
+    ):
+        """Store the per-modality datasets and their annotation information.
+        Arguments:
+            multimodal_data_holder: a dict mapping each modality name to its
+                dataset, e.g. {"rgb": rgb_dataset, "flow": flow_dataset}.
+            phase: the name of the phase this dataset belongs to.
+            phase_info: a dict of annotation information for this phase, e.g.
+                "rgb": <rgb_annotation_file_path>.
+            modality_sampler: the modalities to be used; the supported
+                modalities are used when it is None.
+        """
+        super().__init__()
+        self.phase = phase
+        self.phase_multimodal_data_record = multimodal_data_holder
+        self.phase_info = phase_info
+        self.modalities_name = list(multimodal_data_holder.keys())
+        self.supported_modalities = ["rgb", "flow", "audio_feature"]
+        # fall back to the full set of supported modalities by default
+        self.modality_sampler = (
+            self.supported_modalities if modality_sampler is None else modality_sampler
+        )
+        self.targets = self.get_targets()
+    def __len__(self):
+        """Return the number of samples that every modality can provide."""
+        # The holder is keyed by modality, so its own length would be the number
+        # of modalities rather than the number of samples. Each sample indexes
+        # all modality datasets, hence the shortest one bounds the length.
+        return min(
+            (len(dataset) for dataset in self.phase_multimodal_data_record.values()),
+            default=0,
+        )
+    def get_targets(self):
+        """Obtain the labels of samples in current phase dataset."""
+        # There is no label provided in the fine gym dataset currently
+        #  This part will be added afterward
+        return [0]
+    def get_one_multimodal_sample(self, sample_idx):
+        """Obtain one sample from the Gym dataset.
+        Returns a dict mapping every modality name to the item found at
+        `sample_idx` in that modality's dataset.
+        """
+        return {
+            modality_name: self.phase_multimodal_data_record[modality_name][sample_idx]
+            for modality_name in self.modalities_name
+        }
+class DataSource(multimodal_base.MultiModalDataSource):
+    """The Gym dataset."""
+    def __init__(self, **kwargs):
+        """Set up the Gym data source.
+        The constructor records every directory and annotation file path used
+        by the dataset, hands the FineGym annotation URLs to
+        `_download_arrange_data`, downloads the raw videos, trims them into
+        events and sub-actions when `_exists` reports the corresponding
+        directory as missing, and finally starts the modality extraction.
+        Arguments:
+            **kwargs: accepted by the signature but not used in this method.
+        """
+        super().__init__()
+        self.data_name = Config().data.datasource
+        # "rgb" and "flow" are both stored as raw frames in one shared directory
+        self.modality_names = ["video", "audio", "rgb", "flow", "audio_feature"]
+        self._data_path_process(
+            data_path=Config().params["data_path"], base_data_name=self.data_name
+        )
+        self._create_modalities_path(modality_names=self.modality_names)
+        dataset_root = self.mm_data_info["data_path"]
+        # annotation directory and the files kept inside it
+        self.data_annotation_path = os.path.join(dataset_root, "annotations")
+        annotation_root = self.data_annotation_path
+        self.data_anno_file_path = os.path.join(annotation_root, "annotation.json")
+        self.categoty_anno_file_path = os.path.join(
+            annotation_root, "gym99_categories.txt"
+        )
+        self.data_event_anno_file_path = os.path.join(
+            annotation_root, "event_annotation.json"
+        )
+        # directories holding the videos and the data derived from them
+        self.raw_videos_path = os.path.join(dataset_root, "videos")
+        self.event__path = os.path.join(dataset_root, "event")
+        self.event_subsection__path = os.path.join(dataset_root, "subactions")
+        self.event_subsection_frames__path = os.path.join(
+            dataset_root, "subaction_rawframes"
+        )
+        self.event_subsection_audios__path = os.path.join(
+            dataset_root, "subaction_audios"
+        )
+        self.event_subsection_audios_fea__path = os.path.join(
+            dataset_root, "subaction_audios_features"
+        )
+        # per-split list files for each modality
+        self.rawframes_splits_list_files_into = {
+            "train": os.path.join(annotation_root, "gym99_train_rawframes.txt"),
+            "val": os.path.join(annotation_root, "gym99_val_rawframes.txt"),
+        }
+        self.audios_splits_list_files_into = {
+            "train": os.path.join(annotation_root, "gym99_train_audios.txt"),
+            "val": os.path.join(annotation_root, "gym99_val_audios.txt"),
+        }
+        self.audio_features_splits_list_files_into = {
+            "train": os.path.join(annotation_root, "gym99_train_audio_features.txt"),
+            "val": os.path.join(annotation_root, "gym99_val_audio_features.txt"),
+        }
+        # (source url, file name stored in the annotation directory)
+        annotation_downloads = (
+            (
+                "https://sdolivia.github.io/FineGym/resources/dataset/set_categories.txt",
+                "set_categories.txt",
+            ),
+            (
+                "https://sdolivia.github.io/FineGym/resources/dataset/gym99_categories.txt",
+                "gym99_categories.txt",
+            ),
+            (
+                "https://sdolivia.github.io/FineGym/resources/dataset/finegym_annotation_info_v1.0.json",
+                "annotation.json",
+            ),
+            (
+                "https://sdolivia.github.io/FineGym/resources/dataset/gym99_train_element_v1.0.txt",
+                "gym99_train_org.txt",
+            ),
+            (
+                "https://sdolivia.github.io/FineGym/resources/dataset/gym99_val_element.txt",
+                "gym99_val_org.txt",
+            ),
+        )
+        for source_url, stored_name in annotation_downloads:
+            self._download_arrange_data(
+                download_url_address=source_url,
+                data_path=annotation_root,
+                obtained_file_name=stored_name,
+            )
+        raw_videos_ready = self._exists(self.raw_videos_path)
+        if not raw_videos_ready:
+            logging.info(
+                "Downloading the raw videos for the Gym dataset. This may take a long time."
+            )
+            gym_downloader.main(
+                input=self.data_anno_file_path,
+                output_dir=self.raw_videos_path,
+                num_jobs=Config().data.downloader.num_workers,
+            )
+            logging.info("Done.")
+        # trim the raw videos into events
+        events_ready = self._exists(self.event__path)
+        if not events_ready:
+            gym_trim.trim_event(
+                video_root=self.raw_videos_path,
+                anno_file=self.data_anno_file_path,
+                event_anno_file=self.data_event_anno_file_path,
+                event_root=self.event__path,
+            )
+        # trim the events into sub-actions
+        subactions_ready = self._exists(self.event_subsection__path)
+        if not subactions_ready:
+            gym_trim.trim_subsection(
+                event_anno_file=self.data_event_anno_file_path,
+                event_root=self.event__path,
+                subaction_root=self.event_subsection__path,
+            )
+        logging.info("The Gym dataset has been prepared")
+        self.extract_videos_rgb_flow_audio()
+    def extract_videos_rgb_flow_audio(self):
+        """Extract frames, audios and audio features from the sub-action videos.
+        Each extraction step is skipped when `_exists` reports that its output
+        directory is already present. Afterwards the per-split list files are
+        generated from the frames, and the rawframes list of every split is
+        copied to both the audio and the audio-feature list file of that split.
+        """
+        src_videos_dir = self.event_subsection__path
+        # rgb and flow frames are written to one shared rawframes directory
+        frames_out_path = self.event_subsection_frames__path
+        audio_out_path = self.event_subsection_audios__path
+        audio_feature_path = self.event_subsection_audios_fea__path
+        # define the modalities extractor
+        vdf_extractor = frames_extraction_tools.VideoFramesExtractor(
+            video_src_dir=src_videos_dir,
+            dir_level=1,
+            num_worker=8,
+            video_ext="mp4",
+            mixed_ext=False,
+        )
+        vda_extractor = audio_extraction_tools.VideoAudioExtractor(
+            video_src_dir=src_videos_dir,
+            dir_level=1,
+            num_worker=8,
+            video_ext="mp4",
+            mixed_ext=False,
+        )
+        if not self._exists(frames_out_path):
+            if torch.cuda.is_available():
+                logging.info(
+                    "Extracting frames by GPU from videos in %s to %s.",
+                    src_videos_dir,
+                    frames_out_path,
+                )
+                vdf_extractor.build_full_frames_gpu(
+                    to__path=frames_out_path, new_short=256, new_width=0, new_height=0
+                )
+            else:
+                logging.info(
+                    "Extracting frames by CPU from videos in %s to %s.",
+                    src_videos_dir,
+                    frames_out_path,
+                )
+                vdf_extractor.build_frames_cpu(to_dir=frames_out_path)
+        if not self._exists(audio_out_path):
+            logging.info(
+                "Extracting audios by CPU from videos in %s to %s.",
+                src_videos_dir,
+                audio_out_path,
+            )
+            vda_extractor.build_audios(to_dir=audio_out_path)
+        if not self._exists(audio_feature_path):
+            logging.info(
+                "Extracting audios feature by CPU from audios in %s to %s.",
+                audio_out_path,
+                audio_feature_path,
+            )
+            # # window_size:32ms hop_size:16ms
+            vda_extractor.build_audios_features(
+                audio_src_path=audio_out_path,
+                to_dir=audio_feature_path,
+                fft_size=512,  # fft_size / sample_rate is window size
+                hop_size=256,
+            )
+        # extract the splits data into list files based on the frames information
+        gym_trim.generate_splits_list(
+            data_root=self.event_subsection__path,
+            annotation_root=self.data_annotation_path,
+            frame_data_root=frames_out_path,
+        )
+        # The audio and audio-feature list files are plain copies of the
+        # rawframes list file. The audio-feature copy must go to its own list
+        # file; writing it to the audio list again would leave it missing.
+        for split, rawframes_list_file in self.rawframes_splits_list_files_into.items():
+            shutil.copy(
+                src=rawframes_list_file, dst=self.audios_splits_list_files_into[split]
+            )
+            shutil.copy(
+                src=rawframes_list_file,
+                dst=self.audio_features_splits_list_files_into[split],
+            )
+    def correct_current_config(self, loaded_plato_config, mode, modality_name):
+        """Correct the loaded configuration settings based on on-hand data.
+        The configuration is converted with `data_utils.config_to_dict` and
+        `data_utils.dict_list2tuple`; its "ann_file" and "data_prefix" entries
+        are then replaced with the full paths known to this data source.
+        Arguments:
+            loaded_plato_config: the modality configuration loaded by Plato.
+            mode: the split used to look up the list file, a key of the
+                per-split list-file dicts ("train" or "val").
+            modality_name: "rgb", "flow" or "audio_feature"; any other name is
+                given the sub-action video directory and no annotation file.
+        Returns:
+            The converted configuration with both entries replaced.
+        """
+        # 1. convert the plato config to a dict whose lists are tuples
+        loaded_config = data_utils.config_to_dict(loaded_plato_config)
+        loaded_config = data_utils.dict_list2tuple(loaded_config)
+        # 2. pick the annotation file and data directory of the modality;
+        #    the paths held by this data source are full paths
+        if modality_name in ("rgb", "flow"):
+            ann_file = self.rawframes_splits_list_files_into[mode]
+            data_prefix = self.event_subsection_frames__path
+        elif modality_name == "audio_feature":
+            # NOTE(review): the audio list file is used here; it is a copy of
+            # the rawframes list file - confirm whether the audio-feature list
+            # file is intended instead.
+            ann_file = self.audios_splits_list_files_into[mode]
+            # the features are extracted into the audio-features directory,
+            # not into the directory of the raw audios
+            data_prefix = self.event_subsection_audios_fea__path
+        else:
+            ann_file = None
+            data_prefix = self.event_subsection__path
+        # 3. replace the user set values in the configuration
+        loaded_config["ann_file"] = ann_file
+        loaded_config["data_prefix"] = data_prefix
+        return loaded_config
+    def get_phase_dataset(self, phase, modality_sampler):
+        """Get the dataset for the specific phase.
+        For each of the "rgb", "flow" and "audio_feature" modalities the phase
+        configuration is read from `Config().data.multi_modal_configs`,
+        corrected with `correct_current_config` and passed to `build_dataset`.
+        Arguments:
+            phase: the phase name; it selects the per-modality configuration
+                and is recorded on the returned dataset.
+            modality_sampler: forwarded unchanged to `GymDataset`.
+        Returns:
+            A `GymDataset` holding the three modality datasets.
+        """
+        modality_names = ("rgb", "flow", "audio_feature")
+        modal_configs = Config().data.multi_modal_configs
+        corrected_configs = {
+            name: self.correct_current_config(
+                loaded_plato_config=getattr(getattr(modal_configs, name), phase),
+                mode=phase,
+                modality_name=name,
+            )
+            for name in modality_names
+        }
+        multi_modal_mode_data = {
+            name: build_dataset(config) for name, config in corrected_configs.items()
+        }
+        multi_modal_mode_info = {
+            name: config["ann_file"] for name, config in corrected_configs.items()
+        }
+        multi_modal_mode_info["categories"] = self.categoty_anno_file_path
+        # record the requested phase instead of always labelling it "train"
+        return GymDataset(
+            multimodal_data_holder=multi_modal_mode_data,
+            phase=phase,
+            phase_info=multi_modal_mode_info,
+            modality_sampler=modality_sampler,
+        )
+    def get_train_set(self, modality_sampler=None):
+        """Return the multimodal dataset built for the "train" phase."""
+        return self.get_phase_dataset(phase="train", modality_sampler=modality_sampler)
+    def get_test_set(self, modality_sampler=None):
+        """Return the multimodal dataset used for testing.
+        The dataset of the "val" phase is used directly as the test set.
+        """
+        return self.get_phase_dataset(phase="val", modality_sampler=modality_sampler)
+    def get_modality_name(self):
+        """Return the modality names reported by this data source."""
+        # NOTE(review): the datasets built in this file are keyed by
+        # "audio_feature" while "audio" is reported here - confirm with callers.
+        modality_names = ["rgb", "flow", "audio"]
+        return modality_names

plato/datasources/huggingface.py ADDED Viewed

@@ -0,0 +1,165 @@
+"""
+A data source for the HuggingFace datasets.
+For more information about the HuggingFace datasets, refer to:
+https://huggingface.co/docs/datasets/quicktour.html
+"""
+import logging
+import os
+from datasets import load_dataset, load_from_disk
+from transformers import AutoConfig, AutoTokenizer, HfArgumentParser
+from transformers import TrainingArguments, testing_utils, utils
+from plato.config import Config
+from plato.datasources import base
+class DataSource(base.DataSource):
+    """A data source for the HuggingFace datasets."""
+    def __init__(self, **kwargs):
+        """Load the dataset and the tokenizer, then preprocess both splits.
+        The dataset is read from `<data_path>/<dataset_name>_<dataset_config>`
+        when that path exists; otherwise it is loaded with `load_dataset` and
+        saved there. The "train" and "validation" splits are then passed
+        through `preprocess_data`.
+        Arguments:
+            **kwargs: accepted by the signature but not used in this method.
+        """
+        super().__init__()
+        dataset_name = Config().data.dataset_name
+        logging.info("Dataset: %s", dataset_name)
+        # optional setting, read from Config() like every other setting here
+        dataset_config = getattr(Config().data, "dataset_config", None)
+        saved_data_path = (
+            f"{Config().params['data_path']}/{dataset_name}_{dataset_config}"
+        )
+        if os.path.exists(saved_data_path):
+            # If the dataset has already been downloaded and saved
+            self.dataset = load_from_disk(saved_data_path)
+        else:
+            # Download and save the dataset
+            self.dataset = load_dataset(dataset_name, dataset_config)
+            self.dataset.save_to_disk(saved_data_path)
+        parser = HfArgumentParser(TrainingArguments)
+        (self.training_args,) = parser.parse_args_into_dataclasses(
+            args=["--output_dir=/tmp", "--report_to=none"]
+        )
+        model_name = Config().trainer.model_name
+        # optional access token; None when it is not configured
+        use_auth_token = getattr(Config().parameters, "huggingface_token", None)
+        config_kwargs = {
+            "cache_dir": Config().params["model_path"],
+            "revision": "main",
+            "use_auth_token": use_auth_token,
+        }
+        tokenizer_kwargs = {
+            "cache_dir": Config().params["data_path"],
+            "use_fast": True,
+            "revision": "main",
+            "use_auth_token": use_auth_token,
+        }
+        self.config = AutoConfig.from_pretrained(model_name, **config_kwargs)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_name, config=self.config, **tokenizer_kwargs
+        )
+        self.tok_logger = utils.logging.get_logger(
+            "transformers.tokenization_utils_base"
+        )
+        # chunk length used by group_texts()
+        self.block_size = 128
+        self.column_names = ["text"]
+        self.text_column_name = "text"
+        self.trainset = self.preprocess_data(self.dataset["train"])
+        self.testset = self.preprocess_data(self.dataset["validation"])
+    def num_train_examples(self):
+        """Return the number of entries in the preprocessed training set."""
+        train_examples = self.trainset
+        return len(train_examples)
+    def num_test_examples(self):
+        """Return the number of entries in the preprocessed test set."""
+        test_examples = self.testset
+        return len(test_examples)
+    def get_train_set(self):
+        """Return the preprocessed training set."""
+        train_examples = self.trainset
+        return train_examples
+    def get_test_set(self):
+        """Return the preprocessed test set."""
+        test_examples = self.testset
+        return test_examples
+    @staticmethod
+    def input_shape():
+        """Return the input shape of the dataset, useful for building a TF model.
+        Raises:
+            ValueError: always, as this data source does not implement it.
+        """
+        message = "Not implemented."
+        raise ValueError(message)
+    def tokenize_function(self, examples):
+        """Tokenize the text column of `examples` with the AutoTokenizer.
+        The tokenizer logger is captured during the call; when the captured
+        output contains the long-sequence notice, a follow-up warning is logged.
+        """
+        too_long_notice = "Token indices sequence length is longer than the"
+        with testing_utils.CaptureLogger(self.tok_logger) as captured:
+            tokenized = self.tokenizer(examples[self.text_column_name])
+        # clm input could be much much longer than block_size
+        if too_long_notice not in captured.out:
+            return tokenized
+        self.tok_logger.warning(
+            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be "
+            "chunked into smaller bits before being passed to the model."
+        )
+        return tokenized
+    def group_texts(self, examples):
+        """Concatenate all texts and split them into blocks of `self.block_size`.
+        Arguments:
+            examples: a mapping from a column name to a list of lists; it must
+                contain the "input_ids" column.
+        Returns:
+            A dict with the same columns, each split into consecutive chunks of
+            `self.block_size` items, plus a "labels" column that is a shallow
+            copy of the chunked "input_ids". The remainder that does not fill a
+            whole block is dropped.
+        """
+        from itertools import chain
+        # chain.from_iterable flattens in linear time, whereas sum(lists, [])
+        # rebuilds the growing list for every row
+        concatenated_examples = {
+            k: list(chain.from_iterable(examples[k])) for k in examples.keys()
+        }
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, we could add padding if the model supported it
+        # instead of this drop, you can customize this part to your needs.
+        total_length = (total_length // self.block_size) * self.block_size
+        # Split by chunks of block_size.
+        result = {
+            k: [
+                t[i : i + self.block_size]
+                for i in range(0, total_length, self.block_size)
+            ]
+            for k, t in concatenated_examples.items()
+        }
+        result["labels"] = result["input_ids"].copy()
+        return result
+    def preprocess_data(self, datasets):
+        """Tokenize a raw dataset and group the result into fixed-size chunks.
+        Arguments:
+            datasets: the raw dataset split; its `map` method is called with
+                `tokenize_function`, and `map` of the result with `group_texts`.
+        Returns:
+            The dataset returned by the grouping `map` call.
+        """
+        with self.training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = datasets.map(
+                self.tokenize_function,
+                batched=True,
+                num_proc=4,
+                remove_columns=self.column_names,
+                load_from_cache_file=True,
+                desc="Running tokenizer on dataset",
+            )
+        # group_texts() chunks by self.block_size, so that is the size reported
+        # here; the tokenizer's model_max_length does not affect the grouping.
+        with self.training_args.main_process_first(desc="grouping texts together"):
+            lm_datasets = tokenized_datasets.map(
+                self.group_texts,
+                batched=True,
+                num_proc=4,
+                load_from_cache_file=True,
+                desc=f"Grouping texts in chunks of {self.block_size}",
+            )
+        return lm_datasets