genhpf 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. genhpf/__init__.py +9 -0
  2. genhpf/configs/__init__.py +23 -0
  3. genhpf/configs/config.yaml +8 -0
  4. genhpf/configs/configs.py +240 -0
  5. genhpf/configs/constants.py +29 -0
  6. genhpf/configs/initialize.py +58 -0
  7. genhpf/configs/utils.py +29 -0
  8. genhpf/criterions/__init__.py +74 -0
  9. genhpf/criterions/binary_cross_entropy.py +114 -0
  10. genhpf/criterions/binary_cross_entropy_with_logits.py +115 -0
  11. genhpf/criterions/criterion.py +87 -0
  12. genhpf/criterions/cross_entropy.py +202 -0
  13. genhpf/criterions/multi_task_criterion.py +177 -0
  14. genhpf/criterions/simclr_criterion.py +84 -0
  15. genhpf/criterions/wav2vec2_criterion.py +130 -0
  16. genhpf/datasets/__init__.py +84 -0
  17. genhpf/datasets/dataset.py +109 -0
  18. genhpf/datasets/genhpf_dataset.py +451 -0
  19. genhpf/datasets/meds_dataset.py +232 -0
  20. genhpf/loggings/__init__.py +0 -0
  21. genhpf/loggings/meters.py +374 -0
  22. genhpf/loggings/metrics.py +155 -0
  23. genhpf/loggings/progress_bar.py +445 -0
  24. genhpf/models/__init__.py +73 -0
  25. genhpf/models/genhpf.py +244 -0
  26. genhpf/models/genhpf_mlm.py +64 -0
  27. genhpf/models/genhpf_predictor.py +73 -0
  28. genhpf/models/genhpf_simclr.py +58 -0
  29. genhpf/models/genhpf_wav2vec2.py +304 -0
  30. genhpf/modules/__init__.py +15 -0
  31. genhpf/modules/gather_layer.py +23 -0
  32. genhpf/modules/grad_multiply.py +12 -0
  33. genhpf/modules/gumbel_vector_quantizer.py +204 -0
  34. genhpf/modules/identity_layer.py +8 -0
  35. genhpf/modules/layer_norm.py +27 -0
  36. genhpf/modules/positional_encoding.py +24 -0
  37. genhpf/scripts/__init__.py +0 -0
  38. genhpf/scripts/preprocess/__init__.py +0 -0
  39. genhpf/scripts/preprocess/genhpf/README.md +75 -0
  40. genhpf/scripts/preprocess/genhpf/__init__.py +0 -0
  41. genhpf/scripts/preprocess/genhpf/ehrs/__init__.py +36 -0
  42. genhpf/scripts/preprocess/genhpf/ehrs/ehr.py +919 -0
  43. genhpf/scripts/preprocess/genhpf/ehrs/eicu.py +550 -0
  44. genhpf/scripts/preprocess/genhpf/ehrs/mimiciii.py +839 -0
  45. genhpf/scripts/preprocess/genhpf/ehrs/mimiciv.py +619 -0
  46. genhpf/scripts/preprocess/genhpf/main.py +175 -0
  47. genhpf/scripts/preprocess/genhpf/manifest.py +79 -0
  48. genhpf/scripts/preprocess/genhpf/sample_dataset.py +177 -0
  49. genhpf/scripts/preprocess/genhpf/utils/__init__.py +3 -0
  50. genhpf/scripts/preprocess/genhpf/utils/utils.py +16 -0
  51. genhpf/scripts/preprocess/manifest.py +83 -0
  52. genhpf/scripts/preprocess/preprocess_meds.py +674 -0
  53. genhpf/scripts/test.py +264 -0
  54. genhpf/scripts/train.py +365 -0
  55. genhpf/trainer.py +370 -0
  56. genhpf/utils/checkpoint_utils.py +171 -0
  57. genhpf/utils/data_utils.py +130 -0
  58. genhpf/utils/distributed_utils.py +497 -0
  59. genhpf/utils/file_io.py +170 -0
  60. genhpf/utils/pdb.py +38 -0
  61. genhpf/utils/utils.py +204 -0
  62. genhpf-1.0.11.dist-info/LICENSE +21 -0
  63. genhpf-1.0.11.dist-info/METADATA +202 -0
  64. genhpf-1.0.11.dist-info/RECORD +67 -0
  65. genhpf-1.0.11.dist-info/WHEEL +5 -0
  66. genhpf-1.0.11.dist-info/entry_points.txt +6 -0
  67. genhpf-1.0.11.dist-info/top_level.txt +1 -0
@@ -0,0 +1,451 @@
+import logging
+from typing import List, Union
+
+import h5pickle
+import numpy as np
+import pandas as pd
+import torch
+
+from genhpf.datasets.dataset import BaseDataset
+
+logger = logging.getLogger(__name__)
+
+
+class GenHPFDataset(BaseDataset):
+    def __init__(
+        self,
+        manifest_paths: List[str],
+        structure: str,
+        vocab_size: int = 28996,
+        pad_token_id: int = 0,
+        sep_token_id: int = 102,
+        ignore_index: int = -100,
+        apply_mask: bool = False,
+        mask_token_id: int = 103,
+        mask_prob: float = 0,
+        mask_unit: str = "individual",
+        simclr: bool = False,
+        **kwargs,
+    ):
+        super().__init__()
+
+        if structure == "hierarchical":
+            structure = "hi"
+        elif structure == "flattened":
+            structure = "fl"
+        self.structure = structure
+
+        self.pad_token_id = pad_token_id
+
+        self.ignore_index = ignore_index
+
+        self.apply_mask = apply_mask
+        self.mask_prob = mask_prob
+        self.vocab_size = vocab_size
+        self.mask_token_id = mask_token_id
+        self.mask_unit = mask_unit
+        self.sep_token_id = sep_token_id
+
+        self.simclr = simclr
+
+        for k, v in kwargs.items():
+            self.__setattr__(k, v)
+
+        self.data = []
+        self.subjects = []
+        self.labels = {}
+        for i, manifest_path in enumerate(manifest_paths):
+            with open(manifest_path, "r") as f:
+                data_root = f.readline().strip()
+                label_root = f.readline().strip()
+                self.data.append(h5pickle.File(data_root, "r")["ehr"])
+                labels = pd.read_csv(label_root)
+                labels.index = [(i, str(x)) for x in labels["stay_id"]]
+                labels = labels.drop(columns=["stay_id"])
+                self.labels.update(labels.to_dict(orient="index"))
+                for line in f:
+                    items = line.strip().split("\t")
+                    assert len(items) == 1, line
+                    self.subjects.append((i, items[0]))
+        logger.info(f"loaded {len(self.subjects)} samples from {len(manifest_paths)} dataset(s)")
+
+    def __len__(self):
+        return len(self.subjects)
+
+    def __getitem__(self, index):
+        raise NotImplementedError
+
+
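For orientation, the loop above implies a simple manifest layout: the first line gives the HDF5 data path, the second the label CSV path, and each remaining line one stay_id. A minimal sketch, assuming hypothetical file names not shipped with the package:

    # a hypothetical manifest, as consumed by GenHPFDataset.__init__ above
    manifest = """\
    /data/mimiciii/ehr.h5
    /data/mimiciii/labels.csv
    200001
    200010
    """
    # line 1 -> h5pickle.File(...)["ehr"]; line 2 -> pd.read_csv(...);
    # remaining lines -> one stay_id each, stored as (dataset_index, stay_id)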
+class HierarchicalGenHPFDataset(GenHPFDataset):
+    def __init__(
+        self,
+        manifest_paths: List[str],
+        label: bool = False,
+        tasks: List[str] = None,
+        num_labels: List[int] = None,
+        dummy_token_id: int = 101,
+        **kwargs,
+    ):
+        kwargs.pop("structure", None)
+        super().__init__(manifest_paths=manifest_paths, structure="hierarchical", **kwargs)
+
+        self.label = label
+        self.tasks = tasks
+        self.num_labels = num_labels
+        self.dummy_token_id = dummy_token_id
+
+    def mask(self, tokens: Union[np.ndarray, torch.Tensor], **kwargs):
+        # mask each event row independently using the base-class masking routine
+        for i, event in enumerate(tokens):
+            tokens[i], _ = super().mask(event, **kwargs)
+        return tokens
+
+    def collator(self, samples):
+        samples = [s for s in samples if s["input_ids"] is not None]
+        if len(samples) == 0:
+            return {}
+
+        if self.simclr:
+            # for SimCLR, split each sample into two halves to form the two views
+            input_ids = sum(
+                [
+                    [s["input_ids"][: len(s["input_ids"]) // 2], s["input_ids"][len(s["input_ids"]) // 2 :]]
+                    for s in samples
+                ],
+                [],
+            )
+            type_ids = sum(
+                [
+                    [s["type_ids"][: len(s["input_ids"]) // 2], s["type_ids"][len(s["input_ids"]) // 2 :]]
+                    for s in samples
+                ],
+                [],
+            )
+            dpe_ids = sum(
+                [
+                    [s["dpe_ids"][: len(s["input_ids"]) // 2], s["dpe_ids"][len(s["input_ids"]) // 2 :]]
+                    for s in samples
+                ],
+                [],
+            )
+        else:
+            input_ids = [s["input_ids"] for s in samples]
+            type_ids = [s["type_ids"] for s in samples]
+            dpe_ids = [s["dpe_ids"] for s in samples]
+
+        sizes = [s.size(0) for s in input_ids]
+        target_size = max(sizes)
+
+        collated_input_ids = (
+            input_ids[0].new_zeros((len(input_ids), target_size, len(input_ids[0][0]))).long()
+        )
+        collated_type_ids = type_ids[0].new_zeros((len(type_ids), target_size, len(type_ids[0][0]))).long()
+        collated_dpe_ids = dpe_ids[0].new_zeros((len(dpe_ids), target_size, len(dpe_ids[0][0]))).long()
+        for i, size in enumerate(sizes):
+            diff = size - target_size
+            if diff == 0:
+                collated_input_ids[i] = input_ids[i]
+                collated_type_ids[i] = type_ids[i]
+                collated_dpe_ids[i] = dpe_ids[i]
+            elif diff < 0:
+                collated_input_ids[i] = torch.cat(
+                    [
+                        input_ids[i],
+                        input_ids[i].new_zeros(-diff, len(input_ids[i][0])),
+                    ],
+                    dim=0,
+                )
+                # add a dummy token at the start of each padded event, as the event
+                # encoder can break down when all the input tokens are pad tokens
+                collated_input_ids[i][diff:, 0] = self.dummy_token_id
+                collated_type_ids[i] = torch.cat(
+                    [
+                        type_ids[i],
+                        type_ids[i].new_zeros(-diff, len(type_ids[i][0])),
+                    ],
+                    dim=0,
+                )
+                collated_dpe_ids[i] = torch.cat(
+                    [
+                        dpe_ids[i],
+                        dpe_ids[i].new_zeros(-diff, len(dpe_ids[i][0])),
+                    ],
+                    dim=0,
+                )
+            else:
+                raise ValueError(f"size mismatch, expected <={target_size}, got {size}")
+
+        out = {"id": [s["id"] for s in samples]}
+        out["net_input"] = {
+            "input_ids": collated_input_ids,
+            "type_ids": collated_type_ids,
+            "dpe_ids": collated_dpe_ids,
+        }
+
+        if self.label:
+            label = {}
+            for task in self.tasks:
+                label[task] = torch.stack([s[task] for s in samples])
+            out["label"] = label
+
+        return out
+
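To make the padding concrete, a minimal runnable sketch of the collator's behavior above, assuming two samples with 3 and 5 events of 128 tokens each (all values illustrative):

    import torch

    dummy_token_id = 101  # default used by HierarchicalGenHPFDataset
    input_ids = [torch.ones(3, 128, dtype=torch.long), torch.ones(5, 128, dtype=torch.long)]
    target_size = max(s.size(0) for s in input_ids)  # 5
    batch = input_ids[0].new_zeros(len(input_ids), target_size, 128)
    for i, s in enumerate(input_ids):
        batch[i, : s.size(0)] = s
        # padded events receive a dummy token at position 0 so the event encoder
        # never sees an all-pad event
        batch[i, s.size(0):, 0] = dummy_token_id
    print(batch.shape)  # torch.Size([2, 5, 128])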
+    def __getitem__(self, index):
+        data_index, subject = self.subjects[index]
+        data = self.data[data_index][subject][self.structure][:]
+
+        if self.apply_mask:
+            data = self.mask(
+                data,
+                mask_prob=self.mask_prob,
+                vocab_size=self.vocab_size,
+                mask_token_id=self.mask_token_id,
+                mask_unit=self.mask_unit,
+                sep_token_id=self.sep_token_id,
+            )
+
+        ret = {
+            "id": subject,
+            "input_ids": torch.LongTensor(data[:, 0, :]),
+            "type_ids": torch.LongTensor(data[:, 1, :]),
+            "dpe_ids": torch.LongTensor(data[:, 2, :]),
+        }
+
+        if self.label:
+            for i, task in enumerate(self.tasks):
+                ret[task] = self.labels[self.subjects[index]][task]
+                # labels stored as strings (e.g., "[0, 2]") are parsed back into Python objects
+                if isinstance(ret[task], str):
+                    ret[task] = eval(ret[task])
+                # for multi-label classification, where the label is given by a list of
+                # class indices, convert the list into a multi-hot vector
+                if isinstance(ret[task], list):
+                    ret[task] = list(map(int, ret[task]))
+                    num_label = self.num_labels[i]
+                    label = np.zeros(num_label, dtype=np.int16)
+                    label[ret[task]] = 1
+                    ret[task] = torch.tensor(label)
+                else:
+                    if np.isnan(ret[task]) or ret[task] < 0:
+                        ret[task] = self.ignore_index
+                    ret[task] = torch.tensor(ret[task])
+        return ret
+
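The label branch above turns a stringified list of class indices into a multi-hot vector; a small sketch (num_label = 5 is illustrative):

    import numpy as np
    import torch

    raw = "[0, 2]"  # as stored in the label CSV for a multi-label task
    indices = list(map(int, eval(raw)))  # the dataset parses the string with eval
    label = np.zeros(5, dtype=np.int16)
    label[indices] = 1
    print(torch.tensor(label))  # tensor([1, 0, 1, 0, 0], dtype=torch.int16)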
+
+class FlattenedGenHPFDataset(GenHPFDataset):
+    def __init__(
+        self,
+        manifest_paths: List[str],
+        label: bool = False,
+        tasks: List[str] = None,
+        num_labels: List[int] = None,
+        **kwargs,
+    ):
+        kwargs.pop("structure", None)
+        super().__init__(manifest_paths=manifest_paths, structure="flattened", **kwargs)
+
+        self.label = label
+        self.tasks = tasks
+        self.num_labels = num_labels
+
+    def sample_crop_indices(self, size, diff):
+        # note: the released code tests `self.mask` here, which is a bound method and
+        # therefore always truthy; `self.apply_mask` appears to be the intended flag
+        if self.apply_mask:
+            start = np.random.randint(0, diff + 1)
+            end = size - diff + start
+        else:
+            start = 0
+            end = size - diff
+        return start, end
+
+    def pad_to_max_size(self, sample, max_len):
+        if len(sample) < max_len:
+            sample = np.concatenate([sample, np.zeros(max_len - len(sample), dtype=np.int16)])
+        else:
+            sample = sample[:max_len]
+        return sample
+
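A short sketch of the cropping rule above: the crop window has a fixed length of size - diff, and only its starting offset is randomized (assuming the masking path is taken):

    import numpy as np

    size, diff = 10, 4
    start = np.random.randint(0, diff + 1)  # random offset in [0, diff]
    end = size - diff + start
    assert end - start == size - diff  # window length is always 6 here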
+    def collator(self, samples):
+        samples = [s for s in samples if s["input_ids"] is not None]
+        if len(samples) == 0:
+            return {}
+
+        if self.simclr:
+            # for SimCLR, split each sample into two halves to form the two views
+            input_ids = sum(
+                [
+                    [s["input_ids"][: len(s["input_ids"]) // 2], s["input_ids"][len(s["input_ids"]) // 2 :]]
+                    for s in samples
+                ],
+                [],
+            )
+            type_ids = sum(
+                [
+                    [s["type_ids"][: len(s["input_ids"]) // 2], s["type_ids"][len(s["input_ids"]) // 2 :]]
+                    for s in samples
+                ],
+                [],
+            )
+            dpe_ids = sum(
+                [
+                    [s["dpe_ids"][: len(s["input_ids"]) // 2], s["dpe_ids"][len(s["input_ids"]) // 2 :]]
+                    for s in samples
+                ],
+                [],
+            )
+            if self.apply_mask:
+                # note: only input_label is split here; type_label and dpe_label are not,
+                # so combining simclr with apply_mask would raise a NameError below
+                input_label = sum(
+                    [
+                        [
+                            s["input_label"][: len(s["input_ids"]) // 2],
+                            s["input_label"][len(s["input_ids"]) // 2 :],
+                        ]
+                        for s in samples
+                    ],
+                    [],
+                )
+        else:
+            input_ids = [s["input_ids"] for s in samples]
+            type_ids = [s["type_ids"] for s in samples]
+            dpe_ids = [s["dpe_ids"] for s in samples]
+            if self.apply_mask:
+                input_label = [s["input_label"] for s in samples]
+                type_label = [s["type_label"] for s in samples]
+                dpe_label = [s["dpe_label"] for s in samples]
+
+        sizes = [s.size(0) for s in input_ids]
+        target_size = max(sizes)
+
+        collated_input_ids = input_ids[0].new_zeros((len(input_ids), target_size)).long()
+        collated_type_ids = type_ids[0].new_zeros((len(type_ids), target_size)).long()
+        collated_dpe_ids = dpe_ids[0].new_zeros((len(dpe_ids), target_size)).long()
+        if self.apply_mask:
+            collated_input_label = input_label[0].new_zeros((len(input_label), target_size)).long()
+            collated_type_label = type_label[0].new_zeros((len(type_label), target_size)).long()
+            collated_dpe_label = dpe_label[0].new_zeros((len(dpe_label), target_size)).long()
+        for i, size in enumerate(sizes):
+            diff = size - target_size
+            if diff == 0:
+                collated_input_ids[i] = input_ids[i]
+                collated_type_ids[i] = type_ids[i]
+                collated_dpe_ids[i] = dpe_ids[i]
+                if self.apply_mask:
+                    collated_input_label[i] = input_label[i]
+                    collated_type_label[i] = type_label[i]
+                    collated_dpe_label[i] = dpe_label[i]
+            elif diff < 0:
+                collated_input_ids[i] = torch.cat([input_ids[i], input_ids[i].new_zeros(-diff)], dim=0)
+                collated_type_ids[i] = torch.cat([type_ids[i], type_ids[i].new_zeros(-diff)], dim=0)
+                collated_dpe_ids[i] = torch.cat([dpe_ids[i], dpe_ids[i].new_zeros(-diff)], dim=0)
+                if self.apply_mask:
+                    collated_input_label[i] = torch.cat(
+                        [input_label[i], input_label[i].new_zeros(-diff)], dim=0
+                    )
+                    collated_type_label[i] = torch.cat(
+                        [type_label[i], type_label[i].new_zeros(-diff)], dim=0
+                    )
+                    collated_dpe_label[i] = torch.cat(
+                        [dpe_label[i], dpe_label[i].new_zeros(-diff)], dim=0
+                    )
+            else:
+                raise ValueError(f"size mismatch, expected <={target_size}, got {size}")
+
+        out = {"id": [s["id"] for s in samples]}
+        out["net_input"] = {
+            "input_ids": collated_input_ids,
+            "type_ids": collated_type_ids,
+            "dpe_ids": collated_dpe_ids,
+        }
+
+        if self.apply_mask:
+            out["input_label"] = collated_input_label
+            out["type_label"] = collated_type_label
+            out["dpe_label"] = collated_dpe_label
+
+        if self.label:
+            label = {}
+            for task in self.tasks:
+                label[task] = torch.stack([s[task] for s in samples])
+            out["label"] = label
+
+        return out
+
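The collator is intended to be handed to a PyTorch DataLoader as collate_fn; a hypothetical setup (constructor arguments, paths, and task names are illustrative):

    from torch.utils.data import DataLoader

    dataset = FlattenedGenHPFDataset(
        manifest_paths=["/data/mimiciii/train.tsv"],  # illustrative path
        label=True,
        tasks=["mortality"],
        num_labels=[2],
    )
    loader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collator)
    batch = next(iter(loader))  # {"id": ..., "net_input": {...}, "label": {...}}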
+    def __getitem__(self, index):
+        data_index, subject = self.subjects[index]
+        data = self.data[data_index][subject][self.structure][:]
+
+        if self.apply_mask:
+            data, mlm_labels = self.mask(
+                data,
+                mask_prob=self.mask_prob,
+                vocab_size=self.vocab_size,
+                mask_token_id=self.mask_token_id,
+                mask_unit=self.mask_unit,
+                sep_token_id=self.sep_token_id,
+            )
+
+        ret = {
+            "id": self.subjects[index],
+            "input_ids": torch.LongTensor(data[0, :]),
+            "type_ids": torch.LongTensor(data[1, :]),
+            "dpe_ids": torch.LongTensor(data[2, :]),
+        }
+        if self.apply_mask:
+            ret["input_label"] = torch.LongTensor(mlm_labels[0, :])
+            ret["type_label"] = torch.LongTensor(mlm_labels[1, :])
+            ret["dpe_label"] = torch.LongTensor(mlm_labels[2, :])
+
+        if self.label:
+            for i, task in enumerate(self.tasks):
+                ret[task] = self.labels[self.subjects[index]][task]
+                # labels stored as strings (e.g., "[0, 2]") are parsed back into Python objects
+                if isinstance(ret[task], str):
+                    ret[task] = eval(ret[task])
+                # for multi-label classification, where the label is given by a list of
+                # class indices, convert the list into a multi-hot vector
+                if isinstance(ret[task], list):
+                    ret[task] = list(map(int, ret[task]))
+                    num_label = self.num_labels[i]
+                    label = np.zeros(num_label, dtype=np.int16)
+                    label[ret[task]] = 1
+                    ret[task] = torch.tensor(label)
+                else:
+                    if np.isnan(ret[task]) or ret[task] < 0:
+                        ret[task] = self.ignore_index
+                    ret[task] = torch.tensor(ret[task])
+
+        return ret
@@ -0,0 +1,232 @@
+import logging
+import os
+from typing import List, Union
+
+import h5pickle
+import numpy as np
+import torch
+
+from genhpf.datasets.dataset import BaseDataset
+
+logger = logging.getLogger(__name__)
+
+
+class MEDSDataset(BaseDataset):
+    def __init__(
+        self,
+        manifest_paths: List[str],
+        structure: str = "hierarchical",
+        vocab_size: int = 28996,
+        pad_token_id: int = 0,
+        sep_token_id: int = 102,
+        ignore_index: int = -100,
+        apply_mask: bool = False,
+        mask_token_id: int = 103,
+        mask_prob: float = 0,
+        mask_unit: str = "individual",
+        simclr: bool = False,
+        debug: bool = False,
+        **kwargs,
+    ):
+        super().__init__()
+
+        if structure == "hierarchical":
+            structure = "hi"
+        elif structure == "flattened":
+            raise NotImplementedError("Flattened structure is not supported yet.")
+        self.structure = structure
+
+        self.pad_token_id = pad_token_id
+
+        self.ignore_index = ignore_index
+
+        self.apply_mask = apply_mask
+        self.mask_prob = mask_prob
+        self.vocab_size = vocab_size
+        self.mask_token_id = mask_token_id
+        self.mask_unit = mask_unit
+        self.sep_token_id = sep_token_id
+
+        self.simclr = simclr
+
+        for k, v in kwargs.items():
+            self.__setattr__(k, v)
+
+        self.data = []
+        self.subjects = []
+        self.shard_ids = []
+        self.sizes = []
+        for i, manifest_path in enumerate(manifest_paths):
+            with open(manifest_path, "r") as f:
+                data_i_root = f.readline().strip()
+                shard_ids = []
+                for j, line in enumerate(f):
+                    if debug and j >= 300:
+                        break
+                    items = line.strip().split("\t")
+                    assert len(items) == 3, line
+                    subject_id, num_events, shard_id = items
+                    self.subjects.append((i, subject_id))
+                    self.sizes.append(int(num_events))
+                    shard_ids.append(int(shard_id))
+
+            # open each referenced shard file once and keep a handle per shard id
+            data_i = {}
+            unique_shard_ids = np.unique(shard_ids)
+            for shard_id in unique_shard_ids:
+                data_i[shard_id] = h5pickle.File(os.path.join(data_i_root, f"{shard_id}.h5"))["ehr"]
+            self.data.append(data_i)
+            self.shard_ids.extend(shard_ids)
+        logger.info(f"loaded {len(self.subjects)} samples from {len(manifest_paths)} dataset(s)")
+
+    def __len__(self):
+        return len(self.subjects)
+
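The MEDS manifest differs from the GenHPF one: the first line is the directory holding the shard files, and each following line is tab-separated. A hypothetical example (names are illustrative):

    # a hypothetical MEDS manifest, as consumed by MEDSDataset.__init__ above;
    # fields per line: subject_id <TAB> num_events <TAB> shard_id
    manifest = """\
    /data/meds/train
    10001\t42\t0
    10002\t17\t0
    10003\t88\t1
    """
    # each shard id maps to a file {shard_id}.h5 under the directory on line 1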
+
+class HierarchicalMEDSDataset(MEDSDataset):
+    def __init__(
+        self,
+        manifest_paths: List[str],
+        max_events: int = 256,
+        label: bool = False,
+        tasks: List[str] = None,
+        num_labels: List[int] = None,
+        dummy_token_id: int = 101,
+        **kwargs,
+    ):
+        kwargs.pop("structure", None)
+        super().__init__(manifest_paths=manifest_paths, structure="hierarchical", **kwargs)
+
+        self.max_events = max_events
+        self.label = label
+        self.tasks = tasks
+        self.num_labels = num_labels
+        self.dummy_token_id = dummy_token_id
+
+    def mask(self, tokens: Union[np.ndarray, torch.Tensor], **kwargs):
+        # mask each event row independently using the base-class masking routine
+        for i, event in enumerate(tokens):
+            tokens[i], _ = super().mask(event, **kwargs)
+        return tokens
+
+    def collator(self, samples):
+        samples = [s for s in samples if s["input_ids"] is not None]
+        if len(samples) == 0:
+            return {}
+
+        if self.simclr:
+            # for SimCLR, split each sample into two halves to form the two views
+            input_ids = sum(
+                [
+                    [s["input_ids"][: len(s["input_ids"]) // 2], s["input_ids"][len(s["input_ids"]) // 2 :]]
+                    for s in samples
+                ],
+                [],
+            )
+            type_ids = sum(
+                [
+                    [s["type_ids"][: len(s["input_ids"]) // 2], s["type_ids"][len(s["input_ids"]) // 2 :]]
+                    for s in samples
+                ],
+                [],
+            )
+            dpe_ids = sum(
+                [
+                    [s["dpe_ids"][: len(s["input_ids"]) // 2], s["dpe_ids"][len(s["input_ids"]) // 2 :]]
+                    for s in samples
+                ],
+                [],
+            )
+        else:
+            input_ids = [s["input_ids"] for s in samples]
+            type_ids = [s["type_ids"] for s in samples]
+            dpe_ids = [s["dpe_ids"] for s in samples]
+
+        sizes = [s.size(0) for s in input_ids]
+        # unlike the GenHPF collator, batches are always padded or truncated to a
+        # fixed number of events
+        target_size = self.max_events
+
+        collated_input_ids = (
+            input_ids[0].new_zeros((len(input_ids), target_size, len(input_ids[0][0]))).long()
+        )
+        collated_type_ids = type_ids[0].new_zeros((len(type_ids), target_size, len(type_ids[0][0]))).long()
+        collated_dpe_ids = dpe_ids[0].new_zeros((len(dpe_ids), target_size, len(dpe_ids[0][0]))).long()
+        for i, size in enumerate(sizes):
+            diff = size - target_size
+            if diff == 0:
+                collated_input_ids[i] = input_ids[i]
+                collated_type_ids[i] = type_ids[i]
+                collated_dpe_ids[i] = dpe_ids[i]
+            elif diff < 0:
+                collated_input_ids[i] = torch.cat(
+                    [
+                        input_ids[i],
+                        input_ids[i].new_zeros(-diff, len(input_ids[i][0])),
+                    ],
+                    dim=0,
+                )
+                # add a dummy token at the start of each padded event, as the event
+                # encoder can break down when all the input tokens are pad tokens
+                collated_input_ids[i][diff:, 0] = self.dummy_token_id
+                collated_type_ids[i] = torch.cat(
+                    [
+                        type_ids[i],
+                        type_ids[i].new_zeros(-diff, len(type_ids[i][0])),
+                    ],
+                    dim=0,
+                )
+                collated_dpe_ids[i] = torch.cat(
+                    [
+                        dpe_ids[i],
+                        dpe_ids[i].new_zeros(-diff, len(dpe_ids[i][0])),
+                    ],
+                    dim=0,
+                )
+            else:
+                # sequences longer than max_events keep only the most recent events
+                collated_input_ids[i] = input_ids[i][-target_size:]
+                collated_type_ids[i] = type_ids[i][-target_size:]
+                collated_dpe_ids[i] = dpe_ids[i][-target_size:]
+
+        out = {"id": [s["id"] for s in samples]}
+        out["net_input"] = {
+            "input_ids": collated_input_ids,
+            "type_ids": collated_type_ids,
+            "dpe_ids": collated_dpe_ids,
+        }
+
+        if self.label:
+            label = {}
+            for task in self.tasks:
+                if len(samples[0][task]) == 1:
+                    # scalar labels are concatenated into a 1-D tensor
+                    label[task] = torch.cat([s[task] for s in samples])
+                else:
+                    # vector labels (e.g., multi-hot) are stacked into a 2-D tensor
+                    label[task] = torch.stack([s[task] for s in samples])
+            out["label"] = label
+
+        return out
+
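Unlike the GenHPF collator, which raises on oversize samples, the truncation branch above keeps the most recent events; a tiny sketch with max_events = 4 (values illustrative):

    import torch

    max_events = 4
    events = torch.arange(6).unsqueeze(-1).expand(6, 128)  # 6 events, 128 tokens each
    kept = events[-max_events:]
    print(kept[:, 0])  # tensor([2, 3, 4, 5]) -- the 4 most recent events survive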
+    def __getitem__(self, idx):
+        data_idx, subject = self.subjects[idx]
+        data = self.data[data_idx][self.shard_ids[idx]][subject]
+
+        tokens = data[self.structure][:]
+        if self.apply_mask:
+            tokens = self.mask(
+                tokens,
+                mask_prob=self.mask_prob,
+                vocab_size=self.vocab_size,
+                mask_token_id=self.mask_token_id,
+                mask_unit=self.mask_unit,
+                sep_token_id=self.sep_token_id,
+            )
+
+        ret = {
+            "id": subject,
+            "input_ids": torch.LongTensor(tokens[:, 0, :]),
+            "type_ids": torch.LongTensor(tokens[:, 1, :]),
+            "dpe_ids": torch.LongTensor(tokens[:, 2, :]),
+        }
+
+        if self.label:
+            for i, task in enumerate(self.tasks):
+                try:
+                    ret[task] = torch.LongTensor(data["label"][i])
+                except ValueError:
+                    # scalar (0-d) label datasets cannot be indexed, so read the value directly
+                    ret[task] = torch.LongTensor([data["label"][()]])
+        return ret
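Putting it together, a hypothetical end-to-end use of the MEDS dataset (paths and task names are illustrative, not prescribed by the package):

    from torch.utils.data import DataLoader

    dataset = HierarchicalMEDSDataset(
        manifest_paths=["/data/meds/train.tsv"],  # illustrative path
        max_events=256,
        label=True,
        tasks=["mortality"],
        num_labels=[2],
    )
    loader = DataLoader(dataset, batch_size=16, collate_fn=dataset.collator)
    batch = next(iter(loader))
    # batch["net_input"]["input_ids"]: (16, 256, tokens_per_event)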