project-llm-trainer 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of project-llm-trainer might be problematic.

llm_trainer/dataset.py CHANGED
@@ -3,9 +3,10 @@ import os.path
 import torch
 from torch.utils.data import Dataset
 import pickle
+import csv
 
 from .tools import TrainerTools
-from .utils import extra_image_tag_and_repeat_image_tok
+from .utils import repeat_image_tok
 
 
 def _try_load_pkl(file_path: str):
@@ -23,7 +24,12 @@ class TextDataset(Dataset):
     """
     Used in the pretrain stage
     """
-    def __init__(self, file_path, block_size, stride):
+    def __init__(
+            self,
+            file_path,
+            block_size,
+            stride
+    ):
         super().__init__()
 
         self.input_ids = []
@@ -56,12 +62,19 @@ class LineByLineTextDataset(Dataset):
     """
    Used in the SFT stage
     """
-    def __init__(self, file_path, max_len, tokens_per_image=-1):
+    def __init__(
+            self,
+            file_path,
+            max_len,
+            image_tags_file_path=None,
+            tokens_per_image=-1
+    ):
         super().__init__()
 
         self.max_len = max_len
         self.tokens_per_image = tokens_per_image
         self.input_ids = []
+        self.image_tags = []
 
         tokens = _try_load_pkl(file_path)
         if not tokens:
@@ -79,19 +92,26 @@ class LineByLineTextDataset(Dataset):
 
         self.input_ids = tokens
 
+        if image_tags_file_path:
+            with open(image_tags_file_path, 'r') as f:
+                csv_reader = csv.reader(f)
+                for line in csv_reader:
+                    self.image_tags.append(line[0])
+
     def __len__(self):
         return len(self.input_ids)
 
     def __getitem__(self, item):
-        inputs = self.input_ids[item]
+        inputs = torch.tensor(self.input_ids[item]).long()
+        image_tag = self.image_tags[item] if self.image_tags else None
         if self.tokens_per_image != -1:
-            inputs, image_tag = extra_image_tag_and_repeat_image_tok(inputs, self.tokens_per_image)
+            inputs = repeat_image_tok(inputs, self.tokens_per_image)
         else:
             image_tag = None
 
         inputs = inputs[:self.max_len]
 
-        return {'inputs': torch.tensor(inputs).long(), 'image_tag': image_tag}
+        return {'inputs': inputs, 'image_tag': image_tag}
 
 
 class DPODataset(Dataset):
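
For readers following the dataset change: `LineByLineTextDataset` now reads per-sample image tags from a one-column CSV instead of extracting a tag token from the input ids. A minimal sketch of the expected input; the file names and tag values are hypothetical, only the constructor signature comes from the diff above:

```python
import csv

# One row per training sample; the loader above keeps only column 0 (line[0]).
with open('image_tags.csv', 'w', newline='') as f:
    csv.writer(f).writerows([['img_0001'], ['img_0002'], ['img_0003']])

# Constructing the dataset then pairs input_ids[i] with image_tags[i]:
# ds = LineByLineTextDataset(
#     'sft_tokens.pkl',                       # pickled token sequences
#     max_len=1024,
#     image_tags_file_path='image_tags.csv',
#     tokens_per_image=64,                    # VLMConfig.tokens_per_image
# )
# ds[0] -> {'inputs': LongTensor, 'image_tag': 'img_0001'}
```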
llm_trainer/dpo_trainer.py CHANGED
@@ -28,7 +28,7 @@ class DPOTrainer(Trainer):
         *,
         train_config: TrainConfig,
         eval_prompts: List[str],
-        eval_image_tags: Optional[List[int]] = None
+        eval_image_tags: Optional[List[str]] = None
     ):
         super().__init__(
             train_config=train_config,
@@ -112,9 +112,10 @@ class DPOTrainer(Trainer):
 
         return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
 
-    def _create_dataset(self, file_path) -> Dataset:
+    def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
+        file_path = self.train_config.file_dataset[file_idx]
         max_position_embeddings = self.train_config.model_config.max_position_embeddings
-        return DPODataset(file_path, max_position_embeddings)
+        return DPODataset(file_path, max_position_embeddings), file_path
 
     def _calc_loss(self, inputs, attention_mask, logits, labels): ...
 
@@ -184,9 +185,7 @@ class DPOTrainer(Trainer):
         file_count = len(self.train_config.file_dataset)
 
         for file_idx in range(file_count):
-            file_path = self.train_config.file_dataset[file_idx]
-
-            dataset = self._create_dataset(file_path)
+            dataset, file_path = self._create_dataset(file_idx)
             train_data_loader = TrainerTools().parallel.process_dataloader(
                 dataset=dataset,
                 data_loader_kwargs=self.data_loader_kwargs,
llm_trainer/grpo_trainer.py CHANGED
@@ -30,7 +30,7 @@ class GRPOTrainer(Trainer):
         train_config: TrainConfig,
         reward_func: Callable[[torch.Tensor, torch.Tensor, torch.Tensor], List[float]],
         eval_prompts: List[str],
-        eval_image_tags: Optional[List[int]] = None
+        eval_image_tags: Optional[List[str]] = None
     ):
         super().__init__(
             train_config=train_config,
@@ -90,8 +90,9 @@ class GRPOTrainer(Trainer):
 
         return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
 
-    def _create_dataset(self, file_path) -> Dataset:
-        return GRPORolloutDataset(file_path)
+    def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
+        file_path = self.train_config.file_dataset[file_idx]
+        return GRPORolloutDataset(file_path), file_path
 
     def _calc_loss(self, inputs, attention_mask, logits, labels): ...
 
@@ -302,8 +303,7 @@ class GRPOTrainer(Trainer):
         file_count = len(self.train_config.file_dataset)
 
         for file_idx in range(file_count):
-            file_path = self.train_config.file_dataset[file_idx]
-            dataset = self._create_dataset(file_path)
+            dataset, file_path = self._create_dataset(file_idx)
 
             train_data_loader = TrainerTools().parallel.process_dataloader(
                 dataset=dataset,
llm_trainer/sft_trainer.py CHANGED
@@ -14,7 +14,7 @@ class SFTTrainer(Trainer):
         *,
         train_config: TrainConfig,
         eval_prompts: List[str],
-        eval_image_tags: Optional[List[int]] = None
+        eval_image_tags: Optional[List[str]] = None
     ):
         super().__init__(
             train_config=train_config,
@@ -29,11 +29,14 @@ class SFTTrainer(Trainer):
 
         return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
 
-    def _create_dataset(self, file_path) -> Dataset:
+    def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
+        file_path = self.train_config.file_dataset[file_idx]
         max_position_embeddings = self.train_config.model_config.max_position_embeddings
         if isinstance(self.train_config.model_config, VLMConfig):
+            image_tag_file_path = self.train_config.image_tags_file_dataset[file_idx]
             tokens_per_image = self.train_config.model_config.tokens_per_image
         else:
+            image_tag_file_path = None
             tokens_per_image = -1
 
-        return LineByLineTextDataset(file_path, max_position_embeddings, tokens_per_image)
+        return LineByLineTextDataset(file_path, max_position_embeddings, image_tag_file_path, tokens_per_image), file_path
llm_trainer/train_configs.py CHANGED
@@ -408,6 +408,7 @@ class TrainConfig:
         *,
         model_config: Union[ModelConfig, VLMConfig],
         file_dataset: FileDataset,
+        image_tags_file_dataset: Optional[FileDataset] = None,
         mask_prompt: bool = True,
         gradient_accumulation_steps: int = 0,
         eval_batch_interval: int = 100,
@@ -419,7 +420,7 @@ class TrainConfig:
         fsdp_config: FsdpConfig = FsdpConfig(),
         data_loader_config: DataLoaderConfig = DataLoaderConfig(),
         kd_config: Optional[KDConfig] = None,
-        pixel_values_provider: Optional[Callable[[list[int]], torch.Tensor]] = None,
+        pixel_values_provider: Optional[Callable[[list[str]], torch.Tensor]] = None,
         init_state_dict: Optional[Mapping[str, Any]] = None,
         eval_config: EvalConfig = EvalConfig()
     ):
@@ -427,6 +428,7 @@ class TrainConfig:
         self.batch_size = batch_size
         self.model_config = model_config
         self.file_dataset = file_dataset
+        self.image_tags_file_dataset = image_tags_file_dataset
         self.mask_prompt = mask_prompt
         self.gradient_accumulation_steps = gradient_accumulation_steps
         self.eval_batch_interval = eval_batch_interval
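
Because image tags are now strings rather than token ids, `pixel_values_provider` receives the tag strings and must map them to image tensors. A sketch under that assumption; `load_image` and the commented wiring names are placeholders, not part of the package:

```python
import torch

def load_image(tag: str) -> torch.Tensor:
    # Placeholder: look up and preprocess the image identified by `tag`.
    return torch.zeros(3, 224, 224)

def pixel_values_provider(image_tags: list[str]) -> torch.Tensor:
    # Stack one preprocessed image per tag into [batch, C, H, W].
    return torch.stack([load_image(tag) for tag in image_tags])

# config = TrainConfig(
#     model_config=vlm_config,
#     file_dataset=file_dataset,                        # token files, indexed by file_idx
#     image_tags_file_dataset=image_tags_file_dataset,  # CSVs aligned 1:1 with file_dataset
#     pixel_values_provider=pixel_values_provider,
# )
```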
llm_trainer/trainer.py CHANGED
@@ -52,7 +52,7 @@ class Trainer:
         *,
         train_config: TrainConfig,
         eval_prompts: List[str],
-        eval_image_tags: Optional[List[int]] = None
+        eval_image_tags: Optional[List[str]] = None
     ):
         set_seed()
 
@@ -318,9 +318,10 @@ class Trainer:
 
         return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
 
-    def _create_dataset(self, file_path) -> Dataset:
+    def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
+        file_path = self.train_config.file_dataset[file_idx]
         max_position_embeddings = self.train_config.model_config.max_position_embeddings
-        return TextDataset(file_path, max_position_embeddings, max_position_embeddings)
+        return TextDataset(file_path, max_position_embeddings, max_position_embeddings), file_path
 
     def _calc_loss(self, inputs, attention_mask, logits, labels):
         # calc loss
@@ -353,7 +354,7 @@ class Trainer:
 
         TrainerTools().parallel.synchronize()
 
-    def _get_eval_data(self) -> Tuple[str, Optional[int]]:
+    def _get_eval_data(self) -> Tuple[str, Optional[str]]:
         if len(self.eval_prompts) == 0:
             return '', None
 
@@ -458,9 +459,7 @@ class Trainer:
         file_count = len(self.train_config.file_dataset)
 
         for file_idx in range(file_count):
-            file_path = self.train_config.file_dataset[file_idx]
-
-            dataset = self._create_dataset(file_path)
+            dataset, file_path = self._create_dataset(file_idx)
             train_data_loader = TrainerTools().parallel.process_dataloader(
                 dataset=dataset,
                 data_loader_kwargs=self.data_loader_kwargs,
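
The refactor repeated across DPOTrainer, GRPOTrainer, and SFTTrainer originates here in the base class: `_create_dataset` now takes a file index and returns `(dataset, file_path)`, so overrides can resolve per-file side inputs (such as the SFT image-tag CSVs) from the same index. A self-contained sketch of the control flow, using stand-in classes rather than the package's real ones:

```python
from typing import List, Tuple

class TinyDataset:
    """Stand-in for TextDataset / LineByLineTextDataset / DPODataset."""
    def __init__(self, file_path: str):
        self.file_path = file_path

class TinyTrainer:
    def __init__(self, file_dataset: List[str]):
        self.file_dataset = file_dataset

    def _create_dataset(self, file_idx: int) -> Tuple[TinyDataset, str]:
        # Resolve the path from the index; overrides can also resolve
        # index-aligned extras (e.g. image_tags_file_dataset[file_idx]).
        file_path = self.file_dataset[file_idx]
        return TinyDataset(file_path), file_path

    def train(self):
        for file_idx in range(len(self.file_dataset)):
            dataset, file_path = self._create_dataset(file_idx)
            print(f'epoch file: {file_path}')

TinyTrainer(['chunk_0.pkl', 'chunk_1.pkl']).train()
```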
llm_trainer/utils.py CHANGED
@@ -15,45 +15,6 @@ def set_seed(seed=42):
     torch.cuda.manual_seed_all(seed)
 
 
-def extra_image_tag_and_repeat_image_tok(
-        inputs: list[int],
-        tokens_per_image: int
-) -> Tuple[list[int], Optional[int]]:
-    # tokens_per_image=3 -> <image>{image_tag}...xxxx -> <image><image><image>...xxx
-    image_tok = TrainerTools().tokenizer.image
-    if image_tok not in inputs:
-        return inputs, None
-
-    image_tok_idx = inputs.index(image_tok)
-    image_tag_idx = image_tok_idx + 1
-
-    if image_tag_idx < len(inputs):
-        # remove it
-        image_tag = inputs.pop(image_tag_idx)
-    else:
-        image_tag = None
-
-    # repeat image_tok
-    new_inputs = inputs[:image_tok_idx] + [image_tok] * tokens_per_image + inputs[image_tok_idx + 1:]
-    return new_inputs, image_tag
-
-
-def batch_extra_image_tag_and_repeat_image_tok(
-        tokens: torch.Tensor,
-        tokens_per_image: int
-) -> Tuple[torch.Tensor, list[int]]:
-    new_tokens = []
-    image_tags = []
-
-    tokens_list = tokens.cpu().detach().tolist()
-    for token in tokens_list:
-        new_token, image_tag = extra_image_tag_and_repeat_image_tok(token, tokens_per_image)
-        new_tokens.append(new_token)
-        image_tags.append(image_tag)
-
-    return torch.tensor(new_tokens, dtype=tokens.dtype, device=tokens.device), image_tags
-
-
 def repeat_image_tok(
         tokens: torch.Tensor,
         tokens_per_image: int
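
The removed helpers both extracted an inline image-tag token and repeated the `<image>` placeholder; with tags now supplied via CSV, only the repetition remains in `repeat_image_tok`, whose body is not shown in this diff. An illustrative re-implementation of the repetition step on a single sequence, with a hypothetical token id:

```python
import torch

IMAGE_TOK = 7  # hypothetical id of the <image> token

def repeat_image_tok_sketch(tokens: torch.Tensor, tokens_per_image: int) -> torch.Tensor:
    # tokens_per_image=3: <image> xxx -> <image><image><image> xxx
    toks = tokens.tolist()
    if IMAGE_TOK not in toks:
        return tokens
    i = toks.index(IMAGE_TOK)
    out = toks[:i] + [IMAGE_TOK] * tokens_per_image + toks[i + 1:]
    return torch.tensor(out, dtype=tokens.dtype)

print(repeat_image_tok_sketch(torch.tensor([1, 7, 2, 3]), 3))
# -> tensor([1, 7, 7, 7, 2, 3])
```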
{project_llm_trainer-0.3.1.dist-info → project_llm_trainer-0.3.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: project_llm_trainer
-Version: 0.3.1
+Version: 0.3.2
 Summary: LLM and VLM trainer
 Author: qibin
 Author-email: qibin0506@gmail.com
{project_llm_trainer-0.3.1.dist-info → project_llm_trainer-0.3.2.dist-info}/RECORD CHANGED
@@ -1,12 +1,12 @@
 llm_trainer/__init__.py,sha256=HWgtTEVeQSnZmEyYQm2K6eFEG4X2QAoigMlB5Z2tcXE,260
 llm_trainer/checkpoint.py,sha256=Dlkcit0o7Gx6S9QUrIrVp2pTurP9X0zVA7w7ImSuVQU,6049
-llm_trainer/dataset.py,sha256=uz1TTd87ikf7CZPdGxmR95TSQTFWPPTilgWLBWO46_I,3916
+llm_trainer/dataset.py,sha256=4QlOo0SFB5816BUYegQjgobUqTUMQvdmZMM_OEAMSjE,4347
 llm_trainer/dcp.py,sha256=PkD97DyrOtoTKn4FJsfL3VqAy4dxufgjdzJEz8-Cnoc,3635
-llm_trainer/dpo_trainer.py,sha256=6rm8Jq0rI0xazcl_bCOun8rnd34Tb_PKgezowhwoiCM,13150
+llm_trainer/dpo_trainer.py,sha256=q3JZ1iKzmiuwUV-DTrSXUea2d39g6f5x1oUuF1QzBGA,13173
 llm_trainer/ds_checkpoint.py,sha256=_svpzqRaa43--DKPputoXAelc6X9vPM0gNQu-hlh6NI,2153
 llm_trainer/eval.py,sha256=sCvdYnqWWf5_nuDQN5BHb_YivXLOQW-V0ET9mPu0tPU,2389
 llm_trainer/generate_utils.py,sha256=4iM0vyc_1C_iTL31GlS9PR4eZtYaELPRZ02KDSPZA9U,15158
-llm_trainer/grpo_trainer.py,sha256=gWDX8vRZ7hLKl_483X5ua92nst1m617BrqnzLhwr87g,16390
+llm_trainer/grpo_trainer.py,sha256=_k9pik-kpbE8g9taQyG9w3dTLAHilgVBTUa4Y90Wae4,16414
 llm_trainer/log.py,sha256=LxqTGRNZUGMTSQCePRpk-rYyxSnSIbT4kOdP8Fbzr0M,462
 llm_trainer/loss.py,sha256=Yv3fsaVuZ5AhnGPJOr5vEMb_tM2urR6mCb4DBbrHHI8,6030
 llm_trainer/parallel.py,sha256=2VJtW3Gq2c1yS_LdcrNhk7B12prFwBmFnKhvV8FS2d8,4428
@@ -15,20 +15,20 @@ llm_trainer/parallel_ds.py,sha256=W_PkczyAlgffCRcQadN-Pf7H7HM7TU26v5W63jKELFM,99
 llm_trainer/parallel_fsdp.py,sha256=u9XbbVTzcsMcaf-aQFrC_QwWsDRGoEpRmgvu1cKNtgk,3887
 llm_trainer/parallel_none.py,sha256=a6tt3aBmCq5rSP7n2I-sF-hsZ992BbLbpbxutDCFJfs,607
 llm_trainer/scheduler.py,sha256=Xz8HhwoRMjRe41sf_NHhpZfkTlEs0I2MYusvMY6hCVw,3531
-llm_trainer/sft_trainer.py,sha256=T9CujoEp8D5I65fLF2wgV6SPjzhGFbAI4We5NwL4O-M,1443
+llm_trainer/sft_trainer.py,sha256=WWmg8YOwr-w90otmeMjXvK9sa_DSPKlfgAPg3kHyRF4,1672
 llm_trainer/tokenizer.py,sha256=A7TYYUbtPf75kjCvWP7yBui4xZBObMk2aPem62YpwpY,6776
 llm_trainer/tools.py,sha256=AhfjN9oln5Pyif1SgCWwgQg-Q5acTCd9xpz4L26QUjA,3039
-llm_trainer/train_configs.py,sha256=FAlylSYVeh_oJGTy2fcMNUV8JLD6B70hMuk-iKx14iI,15748
-llm_trainer/trainer.py,sha256=mq51d-2ADUpcWCArszhYnOSTveatt3_x43hcC7IZgYk,24330
-llm_trainer/utils.py,sha256=04XiMENVotNgbNRBn9wadHu-cJHPxj0Xq-zzLJmNgZQ,8062
-project_llm_trainer-0.3.1.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
-project_llm_trainer-0.3.1.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
-project_llm_trainer-0.3.1.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
-project_llm_trainer-0.3.1.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
-project_llm_trainer-0.3.1.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
-project_llm_trainer-0.3.1.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
-project_llm_trainer-0.3.1.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
-project_llm_trainer-0.3.1.dist-info/METADATA,sha256=LJl2lNqTIIQZpTt7iVqzQJ2NhAvTUOwS9w44_XxIn0Y,195
-project_llm_trainer-0.3.1.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
-project_llm_trainer-0.3.1.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
-project_llm_trainer-0.3.1.dist-info/RECORD,,
+llm_trainer/train_configs.py,sha256=cadfo8RgxNUR-L3ZLyjiRXTQvhjUl4A1qENaq-ol8h4,15878
+llm_trainer/trainer.py,sha256=153F8FzsKh6k9XLm9i6JzmwN4Vwva5mWr9rVoge_3bY,24353
+llm_trainer/utils.py,sha256=-ivhMF0d999va13S1wt2uBvtVw8Nvr3uBzhaUFKL04Q,6826
+project_llm_trainer-0.3.2.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
+project_llm_trainer-0.3.2.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
+project_llm_trainer-0.3.2.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
+project_llm_trainer-0.3.2.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
+project_llm_trainer-0.3.2.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
+project_llm_trainer-0.3.2.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
+project_llm_trainer-0.3.2.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
+project_llm_trainer-0.3.2.dist-info/METADATA,sha256=NQpGh0Xy09euhzVTSBcC6m5P23ATvRKQ-zmkE0o__6g,195
+project_llm_trainer-0.3.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+project_llm_trainer-0.3.2.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
+project_llm_trainer-0.3.2.dist-info/RECORD,,