adaptive-harmony 0.1.24__py3-none-any.whl → 0.1.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adaptive_harmony/artifacts/__init__.py +4 -0
- adaptive_harmony/common/env_grpo.py +18 -10
- adaptive_harmony/common/env_gspo.py +190 -0
- {adaptive_harmony-0.1.24.dist-info → adaptive_harmony-0.1.25.dist-info}/METADATA +2 -2
- {adaptive_harmony-0.1.24.dist-info → adaptive_harmony-0.1.25.dist-info}/RECORD +7 -5
- {adaptive_harmony-0.1.24.dist-info → adaptive_harmony-0.1.25.dist-info}/WHEEL +0 -0
- {adaptive_harmony-0.1.24.dist-info → adaptive_harmony-0.1.25.dist-info}/top_level.txt +0 -0
--- adaptive_harmony/common/env_grpo.py
+++ adaptive_harmony/common/env_grpo.py
@@ -1,5 +1,6 @@
 import os
 from dataclasses import dataclass
+from itertools import groupby
 from typing import Callable, Sequence
 
 import numpy as np
@@ -7,6 +8,7 @@ import numpy as np
 from adaptive_harmony import (
     CosineScheduler,
     DataSet,
+    InferenceModel,
     JobNotifier,
     Logger,
     StageNotifier,
@@ -21,19 +23,23 @@ from adaptive_harmony.environment import EnvironmentFactory, TrajectoryScore
 from adaptive_harmony.metric_logger import StdoutLogger
 
 
-def compute_advantages(
+async def compute_advantages(
     scores: list[TrajectoryScore],
     logprobs: list[list[float]],
     samples: list[TokenizedThread],
     num_generated_turns: list[int],
+    model: InferenceModel,
 ) -> list[list[float]]:
-    def get_assistant_lengths(samples: list[TokenizedThread], num_generated_turns: list[int]) -> list[list[int]]:
-
-
-
-
-
-
+    async def get_assistant_lengths(samples: list[TokenizedThread], num_generated_turns: list[int]) -> list[list[int]]:
+        async def get_number_weight_per_assistant_turn(thread: TokenizedThread):
+            # you cannot rely on the tokens of the thread because templates can modify
+            # the number of tokens that will get weights and you need to match the weights
+            # for the advantages
+            weights = (await model.serialize_tokenized_thread(thread))[2]
+            return [len(list(group)) for key, group in groupby(weights, key=bool) if key]
+
+        all_lengths = await async_map(get_number_weight_per_assistant_turn, samples)
+        return [lengths[-num_gen:] for lengths, num_gen in zip(all_lengths, num_generated_turns)]
 
     # FROM https://arxiv.org/pdf/2402.03300 -> Process Supervision RL with GRPO
     # HERE PADDING DOES NOT PLAYS A ROLE IN ADVANTAGE COMPUTATION. SINCE nan are ignored.
@@ -68,7 +74,7 @@ def compute_advantages(
         for adv, score in zip(score_level_advantage, scores)
     ]
 
-    assistant_lengths = get_assistant_lengths(samples, num_generated_turns)
+    assistant_lengths = await get_assistant_lengths(samples, num_generated_turns)
    assert all([len(lp) == sum(al) for lp, al in zip(logprobs, assistant_lengths)])
 
     token_level_advantage = [np.repeat(adv, al).tolist() for adv, al in zip(turn_level_advantage, assistant_lengths)]
@@ -227,7 +233,9 @@ class ENVGRPO:
         ref_logprobs = await async_map(self.model_ref.logprobs_per_token, all_samples)
 
         all_trajectory_scores = [score for _, score, _ in trajs_and_scores]
-        advantages = compute_advantages(
+        advantages = await compute_advantages(
+            all_trajectory_scores, logprobs, all_samples, num_generated_turns_list, self.model
+        )
 
         kl = [
             (np.array(lp, dtype=np.float32) - np.array(ref_lp, dtype=np.float32)).tolist()
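In short, `compute_advantages` is now a coroutine that receives the `InferenceModel`, so per-assistant-turn lengths are read from the weight mask of the serialized thread rather than from raw token counts. The sketch below illustrates the `groupby` step used in the hunk above; the `weights` list is a made-up example of such a mask, not taken from the package.

```python
from itertools import groupby

# Hypothetical per-token weight mask, standing in for the third element returned
# by model.serialize_tokenized_thread(thread): non-zero entries mark tokens that
# belong to assistant turns and will receive advantages.
weights = [0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1]

# Group consecutive tokens by the truthiness of their weight and keep only the
# weighted runs: one length per assistant turn, in order of appearance.
turn_lengths = [len(list(group)) for key, group in groupby(weights, key=bool) if key]

print(turn_lengths)  # [3, 2, 4]
```

The last `num_generated_turns` of these lengths per thread are then the ones matched against the turn-level advantages.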
--- /dev/null
+++ adaptive_harmony/common/env_gspo.py
@@ -0,0 +1,190 @@
+from dataclasses import dataclass
+from typing import Callable, Sequence, TypeAlias
+
+import numpy as np
+from numpy.typing import NDArray
+
+from adaptive_harmony import (
+    JobNotifier,
+    Logger,
+    StageNotifier,
+    StringThread,
+    TokenizedThread,
+    TrainingModel,
+)
+from adaptive_harmony.common import RecipeCallback
+from adaptive_harmony.common.env_grpo import ENVGRPO
+from adaptive_harmony.core.utils import (
+    async_map,
+    hash_hyperparams,
+    log_args,
+)
+from adaptive_harmony.environment import EnvironmentFactory, TrajectoryScore
+from adaptive_harmony.metric_logger import StdoutLogger
+
+FloatArray: TypeAlias = NDArray[np.float32]
+
+
+@dataclass
+class Sample:
+    sample: TokenizedThread
+    logprobs: list[float]
+    ref_logprobs: list[float]
+    advantage: float
+    kl_div: list[float]
+    # for logging
+    score: float
+    gen_len: float
+
+
+ENVGSPO_HYPERPARAMS = {
+    "max_num_grpo_steps",
+    "completions_per_sample",
+    "lr",
+    "lr_scheduler",
+    "samples_per_batch",
+    "samples_per_mini_batch",
+    "mini_epochs_per_batch",
+    "max_grad_norm",
+    "clip_range",
+    "kl_beta",
+    "weight_decays",
+    "skip_nan_gradients",
+}
+
+
+class ENVGSPO(ENVGRPO):
+    @log_args
+    @hash_hyperparams(include=ENVGSPO_HYPERPARAMS)
+    def __init__(
+        self,
+        dataset: list[StringThread],
+        model: TrainingModel,
+        environment_factory: EnvironmentFactory,
+        logger: Logger = StdoutLogger(),
+        stage_notifier: StageNotifier = JobNotifier().stage_notifier("ENVGSPO Training"),
+        callbacks: Sequence[RecipeCallback] = [],
+        validation_dataset: list[StringThread] | None = None,
+        validation_frequency: float = 0.2,
+        max_num_grpo_steps: int | None = None,
+        completions_per_sample=8,
+        lr: float = 7.5e-7,
+        lr_scheduler: Callable[[float], float] | None = None,
+        samples_per_batch=128,
+        samples_per_mini_batch=128,
+        mini_epochs_per_batch=1,
+        max_grad_norm=1.0,
+        clip_range=0.01,
+        kl_beta=0.1,
+        weight_decays: float = 0.0,
+        skip_nan_gradients: bool = False,
+        restart_from_checkpoint: str | None = None,
+        checkpoint_frequency: float = 0.2,
+        data_seed: int = 42,
+    ):
+        super().__init__(
+            dataset,
+            model,
+            environment_factory,
+            logger,
+            stage_notifier,
+            callbacks,
+            max_num_grpo_steps=max_num_grpo_steps,
+            completions_per_sample=completions_per_sample,
+            lr=lr,
+            lr_scheduler=lr_scheduler,
+            samples_per_batch=samples_per_batch,
+            samples_per_mini_batch=samples_per_mini_batch,
+            mini_epochs_per_batch=mini_epochs_per_batch,
+            max_grad_norm=max_grad_norm,
+            clip_range=clip_range,
+            kl_beta=kl_beta,
+            weight_decays=weight_decays,
+            data_seed=data_seed,
+        )
+
+    async def gen_data(self, sample: StringThread) -> list[Sample]:
+        # need to override gen data due to the single reward check
+        async def generate_trajectory(
+            prompt: StringThread,
+        ) -> tuple[TokenizedThread, TrajectoryScore, int]:
+            # this create the environment for the first turn.
+            environment = self.environment_factory.create_environment(prompt.metadata)
+            prompt = await environment.bootstrap_prompt(prompt)
+
+            # Count assistant turns in the context (before generation)
+            nb_context_assistant_turns = sum(1 for turn in prompt.get_turns() if turn.role == "assistant")
+
+            string_trajectory = await self.model.generate(prompt)  # generate the first response from the agent.
+            num_generated_turns = 1
+            # we loop until the environment returns a score.
+            # notice how the environment can return a score or a tool or user response.
+            while not isinstance(
+                environment_response := await environment.react_to(string_trajectory),
+                TrajectoryScore,
+            ):
+                for env_role, env_content in environment_response:
+                    if not isinstance(env_content, str):
+                        raise ValueError(f"env_content should be a str, got {env_content}")
+                    if env_role == "user":
+                        string_trajectory = string_trajectory.user(env_content)
+                    elif env_role == "tool":
+                        string_trajectory = string_trajectory.tool(env_content)
+                    else:
+                        raise ValueError
+                string_trajectory = await self.model.generate(string_trajectory)
+                num_generated_turns += 1
+
+            tokenized_trajectory = (
+                await self.model.tokenize_thread(string_trajectory)
+            ).with_weight_assistant_turns_from_index(nb_context_assistant_turns)
+
+            return tokenized_trajectory, environment_response, num_generated_turns
+
+        assert self.model_ref is not None, "Calling `gen_data` before reference model has been set"
+
+        trajs_and_scores = await async_map(generate_trajectory, [sample] * self.completions_per_sample)
+        all_samples = [traj for traj, _, _ in trajs_and_scores]
+        logprobs = await async_map(self.model.logprobs_per_token, all_samples)
+        ref_logprobs = await async_map(self.model_ref.logprobs_per_token, all_samples)
+
+        all_trajectory_scores = [score for _, score, _ in trajs_and_scores]
+        assert all(len(traj_score.scores) == 1 for traj_score in all_trajectory_scores)
+        all_scores = np.array(
+            [traj_score.scores[0].score for traj_score in all_trajectory_scores],
+            dtype=np.float32,
+        )
+
+        advantages: FloatArray = all_scores - all_scores.mean()
+        advantages /= advantages.std() + 1e-8
+
+        kl = [
+            (np.array(lp, dtype=np.float32) - np.array(ref_lp, dtype=np.float32)).tolist()
+            for lp, ref_lp in zip(logprobs, ref_logprobs)
+        ]
+
+        samples = []
+        for i in range(len(logprobs)):
+            samples.append(
+                Sample(
+                    sample=all_samples[i],
+                    logprobs=logprobs[i],
+                    ref_logprobs=ref_logprobs[i],
+                    advantage=advantages[i],
+                    kl_div=kl[i],
+                    score=all_trajectory_scores[i].cumulative_score,
+                    gen_len=all_samples[i].len_last_turn(),
+                )
+            )
+        return samples
+
+    async def train_sample(self, sample: Sample):
+        await self.model.train_gspo(
+            sample.sample,
+            sample.logprobs,
+            sample.ref_logprobs,
+            advantage=[sample.advantage],
+            left_clip=self.clip_range,
+            right_clip=self.clip_range,
+            kl_beta=self.kl_beta,
+        )
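The new `ENVGSPO` recipe reuses the ENVGRPO rollout loop but assigns a single sequence-level advantage per trajectory: the scores of the `completions_per_sample` rollouts for one prompt are mean-centered and divided by their standard deviation, and each trajectory then carries that one scalar into `train_gspo` under a symmetric clip range, in contrast to the per-token advantages built by `compute_advantages` in `env_grpo.py`. A minimal numpy sketch of that normalization, using made-up scores, looks like this:

```python
import numpy as np

# Hypothetical environment scores for the 8 completions of one prompt
# (completions_per_sample=8 is the recipe default).
all_scores = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0], dtype=np.float32)

# Group-normalized, sequence-level advantages as in ENVGSPO.gen_data:
# center on the group mean, then scale by the group std (epsilon for stability).
advantages = all_scores - all_scores.mean()
advantages /= advantages.std() + 1e-8

print(advantages)  # roughly [-1, 1, 1, -1, 1, -1, -1, 1]
```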
--- adaptive_harmony-0.1.24.dist-info/METADATA
+++ adaptive_harmony-0.1.25.dist-info/METADATA
@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: adaptive-harmony
-Version: 0.1.24
+Version: 0.1.25
 Summary: Adaptive Harmony training recipes and utilities for LLM fine-tuning
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Requires-Python: >=3.12
-Requires-Dist: harmony-client~=0.
+Requires-Dist: harmony-client~=0.2.0
 Requires-Dist: rich>=13.7.0
 Requires-Dist: datasets>=2.14.0
 Requires-Dist: hf-xet>=1.1.2
--- adaptive_harmony-0.1.24.dist-info/RECORD
+++ adaptive_harmony-0.1.25.dist-info/RECORD
@@ -2,11 +2,13 @@ adaptive_harmony/__init__.py,sha256=_KoDEWVU-mCtXWp7ZXXlWcTWSVVkE6_r8xlJDXyOxRw,
 adaptive_harmony/logging_table.py,sha256=kN5jS-PO0Y1B6KFicv3BnSyXz5OfThV4L1pCY3_kUmk,56
 adaptive_harmony/metric_logger.py,sha256=6KAp7UhhygiHgWj5l9Bhwc7Sg9cIhxSzAilpxp_7iZM,16619
 adaptive_harmony/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+adaptive_harmony/artifacts/__init__.py,sha256=6iZNKLr3gDC9swbmlwPaUyev5N7lY-ukLZgtV93BeZQ,224
 adaptive_harmony/common/__init__.py,sha256=qebnYmwNBurtouGDbK27mtwt9zLm3P0tHR_M9LnFZT4,967
 adaptive_harmony/common/callbacks.py,sha256=Q5qxVOAdnQRUZxy_ZcBAVxXTmSNA3o7L-cfEZ3JPnWs,8636
 adaptive_harmony/common/checkpointing.py,sha256=rNfzwTEvWzNbUMjkl4CUD3zfsYdsWU_ksR3Lqn-Ghck,6569
 adaptive_harmony/common/dpo.py,sha256=ioionFEnxzagfBVnIvLBh6rb6-d8WeWtVHgp-VDBKf8,3463
-adaptive_harmony/common/env_grpo.py,sha256=
+adaptive_harmony/common/env_grpo.py,sha256=HR5CFrK1MiVXNljV1uE5ylTDeryF8hbdqWskqs_BqBE,15440
+adaptive_harmony/common/env_gspo.py,sha256=AVkh8qTZ-IGgPrPYGifb1t-3mBsn-blW3aM5vZj-joA,6877
 adaptive_harmony/common/grpo.py,sha256=LlG0NxpTtFga06YguTNDnEOVfBjRYHJoRyz4fbAFCRc,10384
 adaptive_harmony/common/gspo.py,sha256=O4z-BrKLusGeM8P6LWz77h8i0HrUhLR7_wxrAluxdxQ,2407
 adaptive_harmony/common/ppo.py,sha256=owJlajLDnOxq4LpjjIn-dLXJVmKlsQh3wMG0zfnbUxU,12393
@@ -61,7 +63,7 @@ adaptive_harmony/runtime/decorators.py,sha256=zDNnG_fNz-zgHnb-d5WCPNLMMKFRtL_ncz
 adaptive_harmony/runtime/model_artifact_save.py,sha256=1Ui-Q1hP_eDAhKBFOXpEVix5Q3TY9_d11viXs0xsk3o,137
 adaptive_harmony/runtime/runner.py,sha256=70lNz2pe2dGEgqH8Igwp8ppGLDLxHVwNmxcyV4Y6HMM,898
 adaptive_harmony/runtime/simple_notifier.py,sha256=iVXtZwfcOvkZlWQgFC0qjE1P-yA6Y7Wx0SxQ9FoJ-0s,129
-adaptive_harmony-0.1.
-adaptive_harmony-0.1.
-adaptive_harmony-0.1.
-adaptive_harmony-0.1.
+adaptive_harmony-0.1.25.dist-info/METADATA,sha256=LnlRqVeKLSw07fnI5_nFRLOP1pjMsL2wPAQ-6xYCt5A,1436
+adaptive_harmony-0.1.25.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+adaptive_harmony-0.1.25.dist-info/top_level.txt,sha256=ZEmoKxkFM4M7H2mgH15wQ4Tf0Eb13FBmghRvC2seacU,17
+adaptive_harmony-0.1.25.dist-info/RECORD,,
File without changes
|
|
File without changes
|