genhpf-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of genhpf has been flagged as possibly problematic (see the registry page for details).

Files changed (67)
  1. genhpf/__init__.py +9 -0
  2. genhpf/configs/__init__.py +23 -0
  3. genhpf/configs/config.yaml +8 -0
  4. genhpf/configs/configs.py +240 -0
  5. genhpf/configs/constants.py +29 -0
  6. genhpf/configs/initialize.py +58 -0
  7. genhpf/configs/utils.py +29 -0
  8. genhpf/criterions/__init__.py +74 -0
  9. genhpf/criterions/binary_cross_entropy.py +114 -0
  10. genhpf/criterions/binary_cross_entropy_with_logits.py +115 -0
  11. genhpf/criterions/criterion.py +87 -0
  12. genhpf/criterions/cross_entropy.py +202 -0
  13. genhpf/criterions/multi_task_criterion.py +177 -0
  14. genhpf/criterions/simclr_criterion.py +84 -0
  15. genhpf/criterions/wav2vec2_criterion.py +130 -0
  16. genhpf/datasets/__init__.py +84 -0
  17. genhpf/datasets/dataset.py +109 -0
  18. genhpf/datasets/genhpf_dataset.py +451 -0
  19. genhpf/datasets/meds_dataset.py +232 -0
  20. genhpf/loggings/__init__.py +0 -0
  21. genhpf/loggings/meters.py +374 -0
  22. genhpf/loggings/metrics.py +155 -0
  23. genhpf/loggings/progress_bar.py +445 -0
  24. genhpf/models/__init__.py +73 -0
  25. genhpf/models/genhpf.py +233 -0
  26. genhpf/models/genhpf_mlm.py +64 -0
  27. genhpf/models/genhpf_predictor.py +73 -0
  28. genhpf/models/genhpf_simclr.py +58 -0
  29. genhpf/models/genhpf_wav2vec2.py +304 -0
  30. genhpf/modules/__init__.py +15 -0
  31. genhpf/modules/gather_layer.py +23 -0
  32. genhpf/modules/grad_multiply.py +12 -0
  33. genhpf/modules/gumbel_vector_quantizer.py +204 -0
  34. genhpf/modules/identity_layer.py +8 -0
  35. genhpf/modules/layer_norm.py +27 -0
  36. genhpf/modules/positional_encoding.py +24 -0
  37. genhpf/scripts/__init__.py +0 -0
  38. genhpf/scripts/preprocess/__init__.py +0 -0
  39. genhpf/scripts/preprocess/genhpf/README.md +75 -0
  40. genhpf/scripts/preprocess/genhpf/__init__.py +0 -0
  41. genhpf/scripts/preprocess/genhpf/ehrs/__init__.py +36 -0
  42. genhpf/scripts/preprocess/genhpf/ehrs/ehr.py +919 -0
  43. genhpf/scripts/preprocess/genhpf/ehrs/eicu.py +550 -0
  44. genhpf/scripts/preprocess/genhpf/ehrs/mimiciii.py +839 -0
  45. genhpf/scripts/preprocess/genhpf/ehrs/mimiciv.py +619 -0
  46. genhpf/scripts/preprocess/genhpf/main.py +174 -0
  47. genhpf/scripts/preprocess/genhpf/manifest.py +79 -0
  48. genhpf/scripts/preprocess/genhpf/sample_dataset.py +177 -0
  49. genhpf/scripts/preprocess/genhpf/utils/__init__.py +3 -0
  50. genhpf/scripts/preprocess/genhpf/utils/utils.py +16 -0
  51. genhpf/scripts/preprocess/manifest.py +83 -0
  52. genhpf/scripts/preprocess/preprocess_meds.py +584 -0
  53. genhpf/scripts/test.py +261 -0
  54. genhpf/scripts/train.py +350 -0
  55. genhpf/trainer.py +370 -0
  56. genhpf/utils/checkpoint_utils.py +171 -0
  57. genhpf/utils/data_utils.py +130 -0
  58. genhpf/utils/distributed_utils.py +497 -0
  59. genhpf/utils/file_io.py +170 -0
  60. genhpf/utils/pdb.py +38 -0
  61. genhpf/utils/utils.py +204 -0
  62. genhpf-1.0.0.dist-info/LICENSE +21 -0
  63. genhpf-1.0.0.dist-info/METADATA +197 -0
  64. genhpf-1.0.0.dist-info/RECORD +67 -0
  65. genhpf-1.0.0.dist-info/WHEEL +5 -0
  66. genhpf-1.0.0.dist-info/entry_points.txt +6 -0
  67. genhpf-1.0.0.dist-info/top_level.txt +1 -0
genhpf/criterions/binary_cross_entropy_with_logits.py
@@ -0,0 +1,115 @@
+import math
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+import genhpf.utils.utils as utils
+from genhpf.criterions import BaseCriterion, register_criterion
+from genhpf.criterions.criterion import CriterionConfig
+from genhpf.loggings import meters, metrics
+from genhpf.loggings.meters import safe_round
+
+
+@dataclass
+class BinaryCrossEntropyWithLogitsConfig(CriterionConfig):
+    threshold: float = field(default=0.5, metadata={"help": "threshold value for binary classification"})
+
+
+@register_criterion("binary_cross_entropy_with_logits", dataclass=BinaryCrossEntropyWithLogitsConfig)
+class BinaryCrossEntropyWithLogits(BaseCriterion):
+    def __init__(self, cfg: BinaryCrossEntropyWithLogitsConfig):
+        super().__init__(cfg)
+
+        if self.task_names is not None and len(self.task_names) > 1:
+            raise ValueError(
+                "binary_cross_entropy_with_logits only supports single task training."
+                " if you want to train multiple tasks, use multi_task_criterion instead."
+            )
+
+        self.threshold = cfg.threshold
+
+    def compute_loss(
+        self, logits: torch.Tensor, targets: torch.Tensor, sample=None, net_output=None, model=None
+    ) -> Tuple[torch.Tensor, List[float]]:
+        assert (
+            logits.size() == targets.size()
+        ), f"logits and targets must have the same size: {logits.size()} vs {targets.size()}"
+        targets = targets.float()
+        loss = F.binary_cross_entropy_with_logits(input=logits, target=targets, reduction="sum")
+        return loss, [loss.detach().item()]
+
+    def get_sample_size(self, sample, targets: torch.Tensor) -> int:
+        if "sample_size" in sample:
+            sample_size = sample["sample_size"]
+        else:
+            sample_size = targets.numel()
+        return sample_size
+
+    def get_logging_outputs(
+        self, logging_output, logits: torch.Tensor, targets: torch.Tensor, sample=None
+    ) -> List[Dict[str, Any]]:
+        with torch.no_grad():
+            probs = torch.sigmoid(logits)
+            outputs = probs > self.threshold
+
+            if probs.numel() == 0:
+                corr = 0
+                count = 0
+            else:
+                count = float(probs.numel())
+                corr = (outputs == targets).sum().item()
+
+        logging_output["correct"] = corr
+        logging_output["count"] = count
+
+        # report aucs only in eval mode
+        if not self.training:
+            logging_output["_y_true"] = targets.cpu().numpy()
+            logging_output["_y_score"] = probs.cpu().numpy()
+
+        return logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs: List[Dict[str, Any]], prefix: str = None) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        if prefix is None:
+            prefix = ""
+        elif prefix is not None and not prefix.endswith("_"):
+            prefix = prefix + "_"
+
+        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+
+        sample_size = utils.item(sum(log.get("sample_size", 0) for log in logging_outputs))
+
+        metrics.log_scalar(f"{prefix}loss", loss_sum / (sample_size or 1) / math.log(2), sample_size, round=3)
+
+        if "_y_true" in logging_outputs[0] and "_y_score" in logging_outputs[0]:
+            y_true = np.concatenate([log.get("_y_true", []) for log in logging_outputs])
+            y_score = np.concatenate([log.get("_y_score", []) for log in logging_outputs])
+
+            metrics.log_custom(meters.AUCMeter, f"_{prefix}auc", y_score, y_true)
+
+        correct = sum(log.get("correct", 0) for log in logging_outputs)
+        metrics.log_scalar(f"_{prefix}correct", correct)
+
+        total = sum(log.get("count", 0) for log in logging_outputs)
+        metrics.log_scalar(f"_{prefix}total", total)
+
+        if total > 0:
+            metrics.log_derived(
+                f"{prefix}accuracy",
+                lambda meters: safe_round(meters[f"_{prefix}correct"].sum / meters[f"_{prefix}total"].sum, 5)
+                if meters[f"_{prefix}total"].sum > 0
+                else float("nan"),
+            )
+
+    def post_validate(self, stats, agg, **kwargs):
+        for key in agg.keys():
+            if key.startswith("_") and key.endswith("auc"):
+                stats[key[1:-3] + "auroc"] = agg[key].auroc
+                stats[key[1:-3] + "auprc"] = agg[key].auprc
+
+        return stats
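For orientation, the criterion above reduces to a sum-reduced BCE-with-logits plus a thresholded accuracy count. The standalone sketch below reproduces that arithmetic on made-up tensors; it bypasses the genhpf config/registry plumbing, and the 0.5 threshold simply mirrors the config default.

# Illustrative only: toy reproduction of compute_loss / get_logging_outputs above.
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0], [-1.0], [0.5]])   # raw scores for 3 samples (made up)
targets = torch.tensor([[1.0], [0.0], [0.0]])   # binary labels, same shape as logits

# compute_loss: per-element BCE with logits, summed (not averaged) over the batch
loss = F.binary_cross_entropy_with_logits(input=logits, target=targets, reduction="sum")

# get_logging_outputs: sigmoid probabilities thresholded at cfg.threshold (default 0.5)
probs = torch.sigmoid(logits)
preds = probs > 0.5
correct = (preds == targets.bool()).sum().item()
print(f"loss={loss.item():.4f}, correct={correct}/{targets.numel()}")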
genhpf/criterions/criterion.py
@@ -0,0 +1,87 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from torch.nn.modules.loss import _Loss
+
+from genhpf.configs import BaseConfig
+from genhpf.models.genhpf import GenHPF
+
+
+@dataclass
+class CriterionConfig(BaseConfig):
+    task_names: Optional[List[str]] = field(
+        default=None, metadata={"help": "a list of task names for multi-task learning"}
+    )
+    num_labels: Optional[List[int]] = field(
+        default=None, metadata={"help": "a list of number of labels for each task"}
+    )
+
+
+class BaseCriterion(_Loss):
+    def __init__(self, cfg: CriterionConfig):
+        super().__init__()
+        self.cfg = cfg
+
+        self.task_names = cfg.task_names
+        self.num_labels = cfg.num_labels
+
+    @classmethod
+    def build_criterion(cls, cfg: CriterionConfig):
+        """Construct a new criterion instance."""
+        return cls(cfg)
+
+    def compute_loss(
+        self, logits: torch.Tensor, targets: torch.Tensor, sample=None, net_output=None, model=None
+    ) -> Tuple[torch.Tensor, List[float]]:
+        """Compute the loss given the logits and targets from the model."""
+        raise NotImplementedError("Criterion must implement the `compute_loss` method")
+
+    def get_sample_size(self, sample, targets: torch.Tensor) -> int:
+        """Get the sample size, which is used as the denominator for the gradient."""
+        raise NotImplementedError("Criterion must implement the `get_sample_size` method")
+
+    def get_logging_outputs(
+        self, logging_output, logits: torch.Tensor, targets: torch.Tensor, sample=None
+    ) -> List[Dict[str, Any]]:
+        """
+        Get the logging output to display while training
+        """
+        raise NotImplementedError("Criterion must implement the `get_logging_outputs` method")
+
+    def forward(self, model: GenHPF, sample, return_net_output=False):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1. the loss
+        2. the sample size, which is used as the denominator for the gradient
+        3. logging outputs to display while training
+        """
+        net_output = model(**sample["net_input"])
+        logits = model.get_logits(sample, net_output)
+        targets = model.get_targets(sample, net_output)
+
+        loss, losses_to_log = self.compute_loss(
+            logits, targets, sample=sample, net_output=net_output, model=model
+        )
+        sample_size = self.get_sample_size(sample, targets)
+
+        logging_output = {}
+        if len(losses_to_log) > 1:
+            logging_output["loss"] = loss.item()
+            for i, l in enumerate(losses_to_log):
+                logging_output[f"loss_{i}"] = l
+        else:
+            logging_output["loss"] = losses_to_log[0]
+        logging_output["sample_size"] = sample_size
+        logging_output = self.get_logging_outputs(logging_output, logits, targets, sample)
+
+        if return_net_output:
+            return loss, sample_size, logging_output, net_output
+        else:
+            return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(stats: Dict[str, Any], prefix: str = None) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        raise NotImplementedError
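BaseCriterion.forward above is the template method: it runs the model, pulls logits and targets, and defers to compute_loss, get_sample_size, and get_logging_outputs. As a minimal sketch of what a concrete subclass looks like (modeled on the sibling criterions in this diff), the example below registers a hypothetical "toy_mse" criterion; the name and the MSE loss are assumptions for illustration, not part of the package.

# Hypothetical criterion sketch; requires the genhpf package installed.
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple

import torch
import torch.nn.functional as F

from genhpf.criterions import BaseCriterion, register_criterion
from genhpf.criterions.criterion import CriterionConfig


@dataclass
class ToyMSEConfig(CriterionConfig):
    pass


@register_criterion("toy_mse", dataclass=ToyMSEConfig)  # "toy_mse" is a made-up name
class ToyMSE(BaseCriterion):
    # forward() is inherited from BaseCriterion; only the three hooks are defined.
    def compute_loss(
        self, logits: torch.Tensor, targets: torch.Tensor, sample=None, net_output=None, model=None
    ) -> Tuple[torch.Tensor, List[float]]:
        loss = F.mse_loss(logits, targets.float(), reduction="sum")
        return loss, [loss.detach().item()]

    def get_sample_size(self, sample, targets: torch.Tensor) -> int:
        return targets.numel()

    def get_logging_outputs(
        self, logging_output, logits: torch.Tensor, targets: torch.Tensor, sample=None
    ) -> Dict[str, Any]:
        return logging_output

Note that reduce_metrics is left unimplemented in this sketch, matching the base class contract, so aggregated logging would still need to be filled in as the concrete criterions below do.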
genhpf/criterions/cross_entropy.py
@@ -0,0 +1,202 @@
+import math
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from omegaconf import II
+
+import genhpf.utils.utils as utils
+from genhpf.criterions import BaseCriterion, register_criterion
+from genhpf.criterions.criterion import CriterionConfig
+from genhpf.loggings import meters, metrics
+from genhpf.loggings.meters import safe_round
+
+
+@dataclass
+class CrossEntropyConfig(CriterionConfig):
+    report_auc: bool = field(
+        default=False,
+        metadata={
+            "help": "whether to report auc. note that this is only available in eval mode and "
+            "can cause memory and performance issues if enabled."
+        },
+    )
+    ignore_index: int = II("dataset.ignore_index")
+
+
+@register_criterion("cross_entropy", dataclass=CrossEntropyConfig)
+class CrossEntropy(BaseCriterion):
+    def __init__(self, cfg: CrossEntropyConfig):
+        super().__init__(cfg)
+
+        if self.task_names is not None and len(self.task_names) > 1:
+            raise ValueError(
+                "cross_entropy only supports single task training. if you want to train multiple"
+                " tasks, use multi_task_criterion instead."
+            )
+
+        self.report_auc = cfg.report_auc
+        self.ignore_index = cfg.ignore_index
+
+    def compute_loss(
+        self, logits: torch.Tensor, targets: torch.Tensor, sample=None, net_output=None, model=None
+    ) -> Tuple[torch.Tensor, List[float]]:
+        """Compute the loss given the logits and targets from the model."""
+        logits = logits.view(-1, logits.size(-1))
+        targets = targets.view(-1).long()
+
+        if torch.all(targets == self.ignore_index):
+            return logits.new_tensor(0.0), [0.0]
+
+        loss = F.cross_entropy(logits, targets, reduction="sum", ignore_index=self.ignore_index)
+
+        return loss, [loss.detach().item()]
+
+    def get_sample_size(self, sample, targets: torch.Tensor) -> int:
+        if "sample_size" in sample:
+            sample_size = sample["sample_size"]
+        else:
+            sample_size = targets.numel()
+        return sample_size
+
+    def get_logging_outputs(
+        self, logging_output, logits: torch.Tensor, targets: torch.Tensor, sample=None
+    ) -> List[Dict[str, Any]]:
+        with torch.no_grad():
+            logits = logits.view(-1, logits.size(-1))
+            targets = targets.view(-1).long()
+
+            valid_indices = torch.where(targets != self.ignore_index)
+            if len(valid_indices[0]) == 0:
+                return {}
+
+            logits = logits[valid_indices]
+            targets = targets[valid_indices]
+
+            preds = logits.argmax(dim=-1)
+            count = targets.numel()
+            corr = (preds == targets).sum().item()
+
+        logging_output["correct"] = corr
+        logging_output["count"] = count
+
+        # report aucs only in eval mode
+        if self.report_auc and not self.training:
+            probs = torch.sigmoid(logits).view(-1)
+            targets = F.one_hot(targets, logits.size(-1)).float().view(-1)
+
+            logging_output["_y_true"] = targets.cpu().numpy()
+            logging_output["_y_score"] = probs.cpu().numpy()
+
+        return logging_output
+
+    # def forward(self, model, sample):
+    #     net_output = model(**sample['net_input'])
+    #     if isinstance(model, DistributedDataParallel):
+    #         logits = model.module.get_outputs(
+    #             net_output,
+    #             task=self.args.train_task,
+    #             normalize=False
+    #         )
+    #         targets = model.module.get_targets(sample, net_output, self.args.train_task)
+    #     else:
+    #         logits = model.get_outputs(
+    #             net_output,
+    #             task=self.args.train_task,
+    #             normalize=False
+    #         )
+    #         targets = model.get_targets(sample, net_output, self.args.train_task)
+
+    #     loss_dict = {}
+    #     logging_output = {}
+
+    #     if self.args.train_task == 'pretrain' and self.args.pretrain_task in ['mlm', 'spanmlm']:
+    #         B, S= targets['input_label'].shape
+    #         for victim in self.args.mask_list:
+    #             loss = F.cross_entropy(
+    #                 logits[victim+'_ids'].view(B*S, -1),
+    #                 targets[victim+'_label'].view(-1)
+    #             )
+    #             loss_dict[victim+'_loss'] = loss
+
+    #             with torch.no_grad():
+    #                 preds = torch.argmax(logits[victim+'_ids'], dim=-1).view(-1).detach().cpu()
+    #                 target_label = targets[victim+'_label'].view(-1).detach().cpu()
+    #                 mask_idcs = (target_label != -100) & (target_label != 0)
+    #                 total = mask_idcs.sum()
+    #                 correct = (preds[mask_idcs] == target_label[mask_idcs]).sum().float()
+
+    #                 logging_output[victim+'_correct'] = correct
+    #                 logging_output[victim+'_total'] = total
+
+    #         loss = sum(loss_dict.values())
+    #         sample_size = len(sample)
+    #         logging_output['loss'] = loss.item()
+    #         logging_output['sample_size'] = sample_size
+
+    #     elif self.args.train_task in ['finetune', 'scratch']:
+
+    #         sample_size = len(targets)
+    #         loss = F.cross_entropy(
+    #             logits, F.one_hot(
+    #                 targets.long(),
+    #                 self.multi_label_dict[self.args.pred_src][self.args.pred_target]
+    #             ).float().to(logits.device),
+    #             reduction=self.ce_reduction_mode
+    #         )
+
+    #         logging_output['loss'] = loss.item()
+    #         logging_output['sample_size'] = sample_size
+
+    #         with torch.no_grad():
+    #             probs = torch.sigmoid(logits).view(-1).detach()
+    #             targets = self.mlb.transform(np.expand_dims(targets.view(-1).cpu(), axis=1)).flatten()
+
+    #             logging_output["_y_true"] = targets
+    #             logging_output["_y_score"] = probs.cpu().numpy()
+
+    #     return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs: List[Dict[str, Any]], prefix: str = None) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        if prefix is None:
+            prefix = ""
+        elif prefix is not None and not prefix.endswith("_"):
+            prefix = prefix + "_"
+
+        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+
+        sample_size = utils.item(sum(log.get("sample_size", 0) for log in logging_outputs))
+
+        metrics.log_scalar(f"{prefix}loss", loss_sum / (sample_size or 1) / math.log(2), sample_size, round=3)
+
+        if "_y_true" in logging_outputs[0] and "_y_score" in logging_outputs[0]:
+            y_true = np.concatenate([log.get("_y_true", []) for log in logging_outputs])
+            y_score = np.concatenate([log.get("_y_score", []) for log in logging_outputs])
+
+            metrics.log_custom(meters.AUCMeter, f"_{prefix}auc", y_score, y_true)
+
+        correct = sum(log.get("correct", 0) for log in logging_outputs)
+        metrics.log_scalar(f"_{prefix}correct", correct)
+
+        total = sum(log.get("count", 0) for log in logging_outputs)
+        metrics.log_scalar(f"_{prefix}total", total)
+
+        if total > 0:
+            metrics.log_derived(
+                f"{prefix}accuracy",
+                lambda meters: safe_round(meters[f"_{prefix}correct"].sum / meters[f"_{prefix}total"].sum, 5)
+                if meters[f"_{prefix}total"].sum > 0
+                else float("nan"),
+            )
+
+    def post_validate(self, stats, agg, **kwargs):
+        for key in agg.keys():
+            if key.startswith("_") and key.endswith("auc"):
+                stats[key[1:-3] + "auroc"] = agg[key].auroc
+                stats[key[1:-3] + "auprc"] = agg[key].auprc
+
+        return stats
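The behavior worth calling out in this criterion is the ignore_index path: logits and targets are flattened, a batch made entirely of ignored positions contributes a zero loss, and otherwise the sum-reduced cross entropy skips ignored positions. A self-contained sketch of that masking follows; ignore_index = -100 is an assumption made for the example, since the package interpolates the real value from dataset.ignore_index.

# Illustrative only: the ignore_index handling of compute_loss above, on toy data.
import torch
import torch.nn.functional as F

ignore_index = -100                              # assumed value for this sketch
logits = torch.randn(4, 3)                       # 4 positions, 3 classes
targets = torch.tensor([0, 2, ignore_index, 1])  # one position is ignored

if torch.all(targets == ignore_index):
    loss = logits.new_tensor(0.0)                # nothing to learn from this batch
else:
    # sum-reduced cross entropy; ignored positions contribute nothing to the sum
    loss = F.cross_entropy(logits, targets, reduction="sum", ignore_index=ignore_index)

valid = targets != ignore_index
accuracy = (logits[valid].argmax(dim=-1) == targets[valid]).float().mean()
print(loss.item(), accuracy.item())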
genhpf/criterions/multi_task_criterion.py
@@ -0,0 +1,177 @@
+import math
+import re
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+import genhpf.utils.utils as utils
+from genhpf.criterions import BaseCriterion, register_criterion
+from genhpf.criterions.criterion import CriterionConfig
+from genhpf.loggings import metrics
+from genhpf.models.genhpf import GenHPF
+
+from . import build_criterion
+
+
+@dataclass
+class MultiTaskCriterionConfig(CriterionConfig):
+    task_loss_weights: Optional[List[float]] = field(
+        default=None,
+        metadata={
+            "help": "weights for each loss term. if given, has to be a float list of size " "n_criterions"
+        },
+    )
+    args: Any = field(
+        default=None,
+        metadata={
+            "help": "configurations for each criterion where the name of each argument should "
+            "match with the corresponding task name."
+        },
+    )
+
+
+@register_criterion("multi_task_criterion", dataclass=MultiTaskCriterionConfig)
+class MultiTaskCriterion(BaseCriterion):
+    def __init__(self, cfg: MultiTaskCriterionConfig):
+        super().__init__(cfg)
+
+        criterions = {}
+        for task_name in self.task_names:
+            criterion_cfg = getattr(cfg.args, task_name)
+            criterions[task_name] = build_criterion(criterion_cfg)
+        self.criterions = criterions
+
+        if cfg.task_loss_weights is None:
+            self.task_loss_weights = [1.0] * len(criterions)
+        else:
+            self.task_loss_weights = cfg.task_loss_weights
+
+    def forward(self, model: GenHPF, sample, return_net_output=False):
+        net_output = model(**sample["net_input"])
+        logits = model.get_logits(sample, net_output)
+        targets = model.get_targets(sample, net_output)
+
+        if not isinstance(logits, dict):
+            logits = {self.task_names[0]: logits}
+        if not isinstance(targets, dict):
+            targets = {self.task_names[0]: targets}
+
+        if len(logits) != len(self.task_names) or len(targets) != len(self.task_names):
+            raise ValueError(
+                "number of logits and targets should be equal to the number of tasks. "
+                f"got {len(logits)} logits and {len(targets)} targets for "
+                f"{len(self.task_names)} tasks"
+            )
+
+        loss = 0.0
+        logging_outputs = dict()
+        for i, task_name in enumerate(self.task_names):
+            criterion = self.criterions[task_name]
+            assert (
+                task_name in logits and task_name in targets
+            ), f"task name {task_name} not found in logits or targets"
+            task_logits = logits[task_name]
+            task_targets = targets[task_name]
+            task_loss, task_losses_to_log = criterion.compute_loss(
+                logits=task_logits, targets=task_targets, sample=sample, net_output=net_output, model=model
+            )
+            task_loss *= self.task_loss_weights[i]
+            sample_size = criterion.get_sample_size(sample, task_targets)
+
+            logging_outputs[f"<{task_name}>_criterion_cls"] = criterion.__class__
+            if len(task_losses_to_log) > 1:
+                logging_outputs[f"{task_name}_loss"] = task_loss.item()
+                for j, l in enumerate(task_losses_to_log):
+                    logging_outputs[f"<{task_name}>_loss_{j}"] = l
+            else:
+                logging_outputs[f"<{task_name}>_loss"] = task_losses_to_log[0]
+            logging_outputs[f"<{task_name}>_sample_size"] = sample_size
+
+            task_logging_output = criterion.get_logging_outputs({}, task_logits, task_targets, sample)
+            for log, value in task_logging_output.items():
+                if log.startswith("_"):
+                    log = log[1:]
+                    logging_outputs[f"_<{task_name}>_{log}"] = value
+                else:
+                    logging_outputs[f"<{task_name}>_{log}"] = value
+
+            # divide task loss by the sample size beforehand to handle different sample
+            # sizes for multiple criterions
+            loss += task_loss / logging_outputs[f"<{task_name}>_sample_size"]
+
+        # manipulate sample_size to be 1 to avoid double-dividing gradients in optimizer later
+        sample_size = 1
+
+        if return_net_output:
+            return loss, sample_size, logging_outputs, net_output
+        else:
+            return loss, sample_size, logging_outputs
+
+    @staticmethod
+    def reduce_metrics(logging_outputs: List[Dict[str, Any]]) -> None:
+        log_keys = logging_outputs[0].keys()
+
+        grouped_log_keys = defaultdict(list)
+        for lk in log_keys:
+            group = re.search(r"\<.*\>", lk)
+            offset = group.end() + 1
+            group = group.group()[1:-1]
+            key = lk[offset:]
+            if lk.startswith("_"):
+                key = "_" + key
+            grouped_log_keys[group].append(key)
+
+        total_loss = 0
+        for group, log_keys in grouped_log_keys.items():
+            criterion_cls = logging_outputs[0][f"<{group}>_criterion_cls"]
+            logging_output = []
+            for log in logging_outputs:
+                log_dict = {}
+                for log_key in set(log_keys) - {"criterion_cls"}:
+                    if log_key.startswith("_") and f"_<{group}>{log_key}" in log:
+                        log_dict[log_key] = log[f"_<{group}>{log_key}"]
+                    elif f"<{group}>_{log_key}" in log:
+                        log_dict[log_key] = log[f"<{group}>_{log_key}"]
+                logging_output.append(log_dict)
+            criterion_cls.reduce_metrics(logging_output, prefix=group)
+
+            loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_output))
+            sample_size = utils.item(sum(log.get("sample_size", 0) for log in logging_output))
+
+            total_loss += loss_sum / (sample_size or 1) / math.log(2)
+
+        metrics.log_scalar("loss", total_loss, 1, round=3)
+
+    def post_validate(self, stats, agg, **kwargs):
+        task_agg = {}
+        for key in agg:
+            for task_name in self.task_names:
+                if key.startswith(task_name) or key[1:].startswith(task_name):
+                    if task_name not in task_agg:
+                        task_agg[task_name] = {}
+                    task_agg[task_name][key] = agg[key]
+                    break
+
+        for task_name, task_agg in task_agg.items():
+            if hasattr(self.criterions[task_name], "post_validate"):
+                stats = self.criterions[task_name].post_validate(stats, task_agg, **kwargs)
+
+        for key in list(stats.keys()):
+            for task_name in self.task_names:
+                if key.startswith(task_name):
+                    stat_key = key[len(task_name) + 1 :]
+                    if f"avg_{stat_key}" not in stats:
+                        stats[f"avg_{stat_key}"] = []
+                    stats[f"avg_{stat_key}"].append(stats[key])
+                    break
+
+        for key in list(stats.keys()):
+            if key.startswith("avg_"):
+                stats[key] = sum(stats[key]) / len(stats[key])
+        return stats
+
+    def eval(self):
+        super().eval()
+        for criterion in self.criterions.values():
+            criterion.eval()
+        return self
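The detail that distinguishes this forward from the base class is its normalization: each task's (already weighted) loss is divided by that task's own sample size before the tasks are summed, and the returned sample_size is pinned to 1 so the trainer does not divide the combined loss again. A toy numeric sketch of that bookkeeping follows, with made-up task names and values.

# Illustrative arithmetic for the per-task normalization in MultiTaskCriterion.forward.
task_losses = {"mortality": 12.0, "los_7day": 30.0}    # sum-reduced losses (made up)
sample_sizes = {"mortality": 8, "los_7day": 24}         # labels contributing to each task
task_loss_weights = {"mortality": 1.0, "los_7day": 1.0}

# each task is averaged over its own sample size before tasks are combined,
# so a task with more labels does not dominate just because its summed loss is larger
loss = sum(
    task_loss_weights[t] * task_losses[t] / sample_sizes[t] for t in task_losses
)

# the criterion then reports sample_size = 1, so the optimizer/trainer
# does not divide the combined loss a second time
sample_size = 1
print(loss, sample_size)  # 1.5 + 1.25 = 2.75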
genhpf/criterions/simclr_criterion.py
@@ -0,0 +1,84 @@
+import math
+from dataclasses import dataclass, field
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+
+import genhpf.utils.utils as utils
+from genhpf.criterions import BaseCriterion, register_criterion
+from genhpf.criterions.criterion import CriterionConfig
+from genhpf.loggings import metrics
+
+
+@dataclass
+class SimCLRCriterionConfig(CriterionConfig):
+    temp: float = field(default=0.1, metadata={"help": "temperature to divide logits by"})
+
+
+@register_criterion("simclr_criterion", dataclass=SimCLRCriterionConfig)
+class SimCLRCriterion(BaseCriterion):
+    def __init__(self, cfg: SimCLRCriterionConfig):
+        super().__init__(cfg)
+
+        self.temp = cfg.temp
+
+    def compute_loss(
+        self, logits: torch.Tensor, targets: torch.Tensor = None, sample=None, net_output=None, model=None
+    ) -> Tuple[torch.Tensor, List[float]]:
+        """Compute the loss given the logits and targets from the model."""
+        logits = F.normalize(logits, dim=1)  # normalize logits
+
+        bsz = int(logits.shape[0] / 2)
+
+        mask = 1 - torch.eye(bsz * 2, dtype=torch.uint8).to(logits.device)
+        pos_ind = (
+            torch.arange(bsz * 2).to(logits.device),
+            2
+            * torch.arange(bsz, dtype=torch.long)
+            .unsqueeze(1)
+            .repeat(1, 2)
+            .view(-1, 1)
+            .squeeze()
+            .to(logits.device),
+        )
+        neg_mask = torch.ones((bsz * 2, bsz * 2 - 1), dtype=torch.uint8).to(logits.device)
+        neg_mask[pos_ind] = 0
+
+        # Cosine similarity computation
+        sim_matrix = torch.matmul(logits, logits.T)  # cosine similarity computation
+
+        # Eliminate similarity between same view
+        sim_matrix = torch.masked_select(sim_matrix, mask.bool()).view(sim_matrix.size(0), -1)
+
+        positives = sim_matrix[pos_ind].unsqueeze(1)
+        negatives = torch.masked_select(sim_matrix, neg_mask.bool()).view(sim_matrix.size(0), -1)
+
+        logits = torch.cat((positives, negatives), dim=1)
+        logits /= self.temp
+
+        target = torch.zeros((logits.size(0),), dtype=torch.long).to(logits.device)
+
+        loss = F.cross_entropy(logits, target, reduction="sum")
+
+        return loss, [loss.detach().item()]
+
+    def get_sample_size(self, sample, targets: torch.Tensor = None) -> int:
+        return sample["net_input"]["input_ids"].size(0)
+
+    def get_logging_outputs(self, logging_output, logits, target, sample=None, net_output=None):
+        return logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs, prefix: str = None) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        if prefix is None:
+            prefix = ""
+        elif prefix is not None and not prefix.endswith("_"):
+            prefix = prefix + "_"
+
+        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+
+        sample_size = utils.item(sum(log.get("sample_size", 0) for log in logging_outputs))
+
+        metrics.log_scalar(f"{prefix}loss", loss_sum / (sample_size or 1) / math.log(2), sample_size, round=3)
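Reading the pos_ind construction in compute_loss, this criterion appears to assume the two augmented views of each example are interleaved along the batch axis ([x0_v1, x0_v2, x1_v1, x1_v2, ...]) and computes a sum-reduced NT-Xent loss with the positive pair placed at index 0 of each row. The sketch below computes the same quantity in a more direct way, as a cross-check; the embeddings, batch size, and temperature are arbitrary, and the interleaving is an assumption inferred from the code rather than stated by the package.

# Cross-check sketch for compute_loss above (not part of the package).
import torch
import torch.nn.functional as F

torch.manual_seed(0)
bsz, dim, temp = 3, 8, 0.1
z = F.normalize(torch.randn(bsz * 2, dim), dim=1)   # 2N normalized embeddings

sim = z @ z.T / temp                                 # cosine similarities / temperature
sim.fill_diagonal_(float("-inf"))                    # a view is never its own candidate

# row 2k's positive is row 2k+1 and vice versa (the interleaving assumption)
pos_idx = torch.arange(bsz * 2) ^ 1                  # 0<->1, 2<->3, 4<->5, ...
loss = F.cross_entropy(sim, pos_idx, reduction="sum")
print(loss.item())

Masking the diagonal with -inf is equivalent to the masked_select formulation above, since exp(-inf) contributes nothing to the softmax denominator, so both versions should yield the same loss for the same embeddings.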