data-forager 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
@@ -1,28 +1,46 @@
- from typing import Callable, List, Optional
+ """
+ Factory function for creating tokenization and indexing pipelines.

- import logging
- import os
- from pathlib import Path
+ This module provides a convenience function for setting up the complete pipeline
+ to tokenize JSONL text files and create an index for random access.
+ """

- import json
+ from typing import Callable, Dict, List, Optional

- import numpy as np
+ import json
+ import logging
+ import os

- from basics.base import Base
  from basics.logging import get_logger

- module_logger = get_logger(os.path.basename(__file__))
-
  from data_forager.index_stores.common import IndexStoreInterface
  from data_forager.index_stores.fs_based import IndexStore as FSBasedIndexStore
- from data_forager.indexers.text_lines import SampleData, FileTextLinesIndexer, SampleGeneratorInterface
+ from data_forager.indexers.text_lines import FileTextLinesIndexer
+ from data_forager.sample_generators.tokenization import (
+ TokenizedSampleGenerator,
+ TokenizerFunc,
+ ProcessTextLineFunc,
+ )
+ from data_forager.sample_generators.aux.common import Part, AuxDataGenerator
  from data_forager.utils import find_files_recursive, natural_sort

- TokenizerFunc = Callable[[str], List[int]]
- ProcessTextLineFunc = Callable[[bytes], str]
+
+ ProcessPartsFunc = Callable[[bytes], List[Part]]
+
+
+ module_logger = get_logger(os.path.basename(__file__))


  def get_text_from_jsonl(jsonl_bytes: bytes, text_key: str = "text", text_encoding: str = "utf-8") -> str:
+ """
+ Extract text from a JSONL line.
+
+ :param jsonl_bytes: Raw bytes of the JSONL line.
+ :param text_key: Key in the JSON object containing the text.
+ :param text_encoding: Text encoding to use for decoding.
+
+ :return: The extracted text string.
+ """
  jsonl_text = jsonl_bytes.decode(text_encoding)
  data = json.loads(jsonl_text)
  return data[text_key]
@@ -44,9 +62,9 @@ def create_tokenize_and_index_jsonl_text_func(
  Create a pipeline to tokenize text from JSONL files and create an index for random access.

  The pipeline:
- * Tokenizes text from input JSONL objects
- * Stores the token data in bin files under "tokenized-samples" folder
- * Stores index data under "index" folder
+ * Tokenizes text from input JSONL objects
+ * Stores the token data in bin files under "tokenized-samples" folder
+ * Stores index data under "index" folder

  Usage:
  ```python
@@ -165,179 +183,128 @@ def create_tokenize_and_index_jsonl_text_func(
  )


- class TokenizedSampleGenerator(Base, SampleGeneratorInterface):
-
- def __init__(
- self,
- process_text_line_func: ProcessTextLineFunc,
- tokenizer_func: TokenizerFunc,
- eos_idx: int,
- token_dtype: np.dtype = np.uint16,
- sample_size: Optional[int] = None,
- base_output_path: str = None,
- file_name_postfix: str = "tokenized-samples",
- name: Optional[str] = None
- ):
- """
- Tokenizes and indexed text into fixed length (`sample_size` not None) samples or
- samples variable of variable size, depending on the text (`sample_size` is not None).
-
- This callable performs the following steps:
- ## prepare ##
- * In the preparation step, create file to store tokenized samples, based on the input `text_file_path` and
- the given `base_output_path` and `file_name_postfix`
- * If the `base_output_path` is not given the `text_file_path` will be used + "/tokenized-samples"
- ## create_samples ##
- * To create tokenized text samples, processes incoming text line using `process_text_line_func`,
- e.g. convert JSONL in to dict and retrieve the sample text from it.
- * The resulting text is tokenized using `tokenizer_func`.
- * If a `sample_size` is given:
- The tokenized text is split into samples of length `sample_size` and stored in the
- file opened in the prepare step. Here `token_dtype` is used.
- - Trailing tokens will be combined with samples of a next text line
- - Tokens from different text samples will be separated by `eos_idx`
- * If a `sample_size` is not given:
- The tokenized text is immediately stored as is, in the file opened in the prepare step.
- Here `token_dtype` is used.
- ## finish ##
- * After all text lines are processed, the file holding the tokenized text samples is closed.
- When `sample_size` not None: Any final trailing tokens will be discarded, but only when the last
- input text file was processed.
-
- :param tokenizer_func:
- :param name:
- """
- super().__init__(pybase_logger_name=name)
-
- if sample_size is None:
- self._log.info(f"Tokenized text will NOT be broken in to samples of fixed length.")
-
- self._process_text_line_func = process_text_line_func
- self._tokenizer_func = tokenizer_func
- self._eos_idx = eos_idx
- self._token_dtype = token_dtype
- self._sample_size = sample_size
- self._base_output_path = base_output_path
- self._file_name_postfix = file_name_postfix
-
- self._current_samples_path = None
- self._current_samples_file = None
-
- self._rest_tokens = None
-
- def prepare(self, text_file_path: str):
- """
- ## prepare ##
- * In the preparation step, create file to store tokenized samples, based on the input `text_file_path` and
- the given `base_output_path` and `file_name_postfix`
- * If the `base_output_path` is not given the `text_file_path` will be used + "/tokenized-samples"
-
- :param text_file_path: path to text file
-
- :return:
- """
- input_file_path = os.path.dirname(text_file_path)
- input_file_name = Path(text_file_path).stem
- output_file_name = f"{input_file_name}-{self._file_name_postfix}.bin"
-
- output_path = self._base_output_path
- if self._base_output_path is None:
- output_path = os.path.join(input_file_path, "tokenized-samples")
-
- os.makedirs(output_path, exist_ok=True)
-
- output_file_path = os.path.join(output_path, output_file_name)
- if os.path.exists(output_file_path):
- raise FileExistsError(f"Tokenized samples file already exists: \n{output_file_path}")
-
- self._current_samples_path = output_file_path
- self._current_samples_file = open(output_file_path, "wb")
-
- self._log.debug(f"Tokenized samples file opened: \n"
- f"{output_file_path}")
-
- def create_samples(self, text_line: bytes) -> List[SampleData]:
- """
-
- ## create_samples ##
- * To create tokenized text samples, processes incoming text line using `process_text_line_func`,
- e.g. convert JSONL in to dict and retrieve the sample text from it.
- * The resulting text is tokenized using `tokenizer_func`.
- * If a `sample_size` is given:
- The tokenized text is split into samples of length `sample_size` and stored in the
- file opened in the prepare step. Here `token_dtype` is used.
- - Trailing tokens will be combined with samples of a next text line
- - Tokens from different text samples will be separated by `eos_idx`
- * If a `sample_size` is not given:
- The tokenized text is immediately stored as is, in the file opened in the prepare step.
- Here `token_dtype` is used.
-
- :param text_line: JSONL text line
-
- :return: List of DataSample objects. For each created sample the following is given:
- * Its representation in bytes, as used to store the sample
- * The file path to where the sample is stored
-
- """
-
- input_text = self._process_text_line_func(text_line)
- tokenized_text = self._tokenizer_func(input_text)
-
- if self._sample_size is not None:
- # Always append EOS after each document to mark document boundary
- tokenized_text = tokenized_text + [self._eos_idx]
-
- # Prepend any leftover tokens from previous document
- if self._rest_tokens is not None:
- tokenized_text = self._rest_tokens + tokenized_text
- self._rest_tokens = None
-
- num_tokens = len(tokenized_text)
- num_samples = num_tokens // self._sample_size
- num_rest_tokens = num_tokens % self._sample_size
-
- if num_rest_tokens > 0:
- # Store remainder tokens (includes EOS from this document)
- self._rest_tokens = tokenized_text[-num_rest_tokens:]
- tokenized_text = tokenized_text[:num_samples * self._sample_size]
-
- tokenized_samples = np.array(tokenized_text, dtype=self._token_dtype)
- tokenized_samples = tokenized_samples.reshape(-1, self._sample_size)
- else:
- tokenized_samples = np.array([tokenized_text], dtype=self._token_dtype)
-
- # Store tokenized_samples
- sample_data = []
- for sample_idx in range(tokenized_samples.shape[0]):
- sample_bytes = tokenized_samples[sample_idx, :].tobytes()
- sample_data.append(SampleData(
- sample_bytes, self._current_samples_path,
- ))
-
- self._current_samples_file.write(sample_bytes)
-
- return sample_data
-
- def finish(self, is_last_file: bool):
- """
- ## finish ##
- * After all text lines are processed, the file holding the tokenized text samples is closed.
- When `sample_size` not None: Any final trailing tokens will be discarded, but only when the last
- input text file was processed.
-
- :param is_last_file:
- :return:
- """
- self._close_current_samples_file()
-
- if is_last_file and self._rest_tokens is not None:
- self._log.debug(f"Cut off {len(self._rest_tokens)} unused tokens")
-
- def _close_current_samples_file(self):
- if self._current_samples_file:
- self._log.debug(f"Closing tokenized samples file: \n{self._current_samples_path}")
- self._current_samples_file.close()
- self._current_samples_file = None
-
- def __del__(self):
- self._close_current_samples_file()
+ def create_tokenize_and_index_with_aux_func(
+ process_parts_func: ProcessPartsFunc,
+ tokenizer_func: TokenizerFunc,
+ eos_idx: int,
+ aux_generators: Dict[str, AuxDataGenerator],
+ input_base_path: Optional[str] = None,
+ input_file_paths: Optional[List[str]] = None,
+ output_base_path: Optional[str] = None,
+ index_store: Optional[IndexStoreInterface] = None,
+ logger: Optional[logging.Logger] = None,
+ name: Optional[str] = None,
+ **sample_generator_kwargs,
+ ) -> FileTextLinesIndexer:
+ """
+ Create a pipeline to tokenize structured samples with auxiliary data.
+
+ This function creates a pipeline that:
+ * Processes structured input (parts with types) from JSONL files
+ * Tokenizes each part and generates auxiliary data (e.g., loss masks)
+ * Stores concatenated token + aux data in bin files
+ * Creates an index with schema for random access
+
+ Usage:
+ ```python
+ from data_forager.sample_generators.aux import Part, LossMaskGenerator
+
+ def parse_parts(line_bytes: bytes) -> List[Part]:
+ data = json.loads(line_bytes.decode('utf-8'))
+ return [Part(type=p['type'], text=p['text']) for p in data['parts']]
+
+ indexer = create_tokenize_and_index_with_aux_func(
+ process_parts_func=parse_parts,
+ tokenizer_func=tokenizer.encode,
+ eos_idx=tokenizer.eos_token_id,
+ aux_generators={'loss_mask': LossMaskGenerator()},
+ input_base_path='./data',
+ sample_size=4096,
+ )
+
+ indexer()
+ ```
+
+ :param process_parts_func: Function to extract typed parts from input bytes.
+ Takes JSONL bytes and returns List[Part].
+ :param tokenizer_func: Function used to tokenize text.
+ :param eos_idx: EOS token index, known by the used Tokenizer.
+ :param aux_generators: Dict mapping names to AuxDataGenerator instances.
+ Example: {'loss_mask': LossMaskGenerator()}
+ :param input_base_path: Path to directory containing JSONL files.
+ :param input_file_paths: List of file paths to process.
+ :param output_base_path: Base path for output (index and tokenized samples).
+ :param index_store: Index store to use. Must support set_sample_schema().
+ :param logger: Logger to use.
+ :param name: Name of the indexer for logging.
+ :param sample_generator_kwargs: Other kwargs passed to TokenizedSampleWithAuxGenerator
+ (e.g., sample_size, token_dtype).
+
+ :raises ValueError: If both input_base_path and input_file_paths are None.
+ :raises ValueError: If output destination cannot be determined.
+
+ :return: FileTextLinesIndexer instance that can be called to run the pipeline.
+ """
+ # Import here to avoid circular dependency
+ from data_forager.sample_generators.tokenization_with_aux import (
+ TokenizedSampleWithAuxGenerator,
+ )
+
+ if logger is None:
+ logger = module_logger
+
+ # Validate input source
+ if input_base_path is None and input_file_paths is None:
+ raise ValueError(
+ "Either input_base_path or input_file_paths must be provided"
+ )
+
+ # Determine output base path
+ effective_output_base_path = output_base_path or input_base_path
+
+ # Validate output destination
+ if index_store is None and effective_output_base_path is None:
+ raise ValueError(
+ "Either index_store, output_base_path, or input_base_path must be provided "
+ "to determine where to store the index"
+ )
+
+ logger.info(f"Output base path: {effective_output_base_path}")
+
+ if index_store is None:
+ index_store = FSBasedIndexStore(
+ base_path=effective_output_base_path,
+ )
+
+ if input_file_paths is None:
+ logger.info(f"Scanning for JSONL files in: {input_base_path}")
+ input_file_paths = find_files_recursive(
+ input_base_path,
+ extension_patterns=['*.jsonl', '*.JSONL']
+ )
+ input_file_paths = natural_sort(input_file_paths)
+ logger.info(f"Found {len(input_file_paths)} JSONL file(s)")
+
+ # Set default base_output_path for tokenized samples if not provided
+ if 'base_output_path' not in sample_generator_kwargs:
+ default_base_output_path = os.path.join(
+ effective_output_base_path, "tokenized-samples"
+ )
+ logger.info(f"Tokenized samples output path: {default_base_output_path}")
+ sample_generator_kwargs['base_output_path'] = default_base_output_path
+
+ sample_generator = TokenizedSampleWithAuxGenerator(
+ process_parts_func=process_parts_func,
+ tokenizer_func=tokenizer_func,
+ eos_idx=eos_idx,
+ aux_generators=aux_generators,
+ **sample_generator_kwargs
+ )
+
+ return FileTextLinesIndexer(
+ input_file_paths=input_file_paths,
+ index_store=index_store,
+ sample_generator=sample_generator,
+ description="Tokenizing with aux data and indexing",
+ name=name,
+ )
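
For context, the new `create_tokenize_and_index_with_aux_func` factory expects each JSONL line to reduce to a list of typed `Part` objects via `process_parts_func`. A minimal sketch of what such an input line and parser could look like, following the `parse_parts` example in the docstring above; the exact JSON layout is an assumption for illustration, not something this diff prescribes:

```python
import json
from typing import List

from data_forager.sample_generators.aux.common import Part

# Hypothetical JSONL line with typed parts (field layout assumed for illustration).
line = b'{"parts": [{"type": "prompt", "text": "What is 2 + 2?"}, {"type": "response", "text": "4"}]}'


def parse_parts(line_bytes: bytes) -> List[Part]:
    # Mirrors the parse_parts example in the docstring above.
    data = json.loads(line_bytes.decode("utf-8"))
    return [Part(type=p["type"], text=p["text"]) for p in data["parts"]]


parts = parse_parts(line)
# -> [Part(type='prompt', text='What is 2 + 2?'), Part(type='response', text='4')]
```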
@@ -0,0 +1,30 @@
+ """
+ Sample generators for transforming input data into samples.
+
+ This package contains:
+ - SampleGeneratorInterface: Protocol for sample generators
+ - SampleData: Data class for sample information
+ - SampleSchema, ArraySpec: Schema classes for structured samples
+ - TokenizedSampleGenerator: Tokenizes text into fixed-length samples
+ - TokenizedSampleWithAuxGenerator: Tokenizes with auxiliary data (loss masks, etc.)
+ """
+
+ from data_forager.sample_generators.common import (
+ SampleData,
+ SampleGeneratorInterface,
+ NOOPSampleGenerator,
+ noop_sample_processing,
+ )
+ from data_forager.sample_generators.schema import (
+ ArraySpec,
+ SampleSchema,
+ )
+
+ __all__ = [
+ "ArraySpec",
+ "NOOPSampleGenerator",
+ "SampleData",
+ "SampleGeneratorInterface",
+ "SampleSchema",
+ "noop_sample_processing",
+ ]
@@ -0,0 +1,18 @@
+ """
+ Auxiliary data generators for sample generators.
+
+ This subpackage provides generators for auxiliary data that accompanies
+ tokenized samples, such as loss masks for selective training.
+ """
+
+ from data_forager.sample_generators.aux.common import (
+ AuxDataGenerator,
+ Part,
+ )
+ from data_forager.sample_generators.aux.loss_mask import LossMaskGenerator
+
+ __all__ = [
+ "AuxDataGenerator",
+ "LossMaskGenerator",
+ "Part",
+ ]
@@ -0,0 +1,77 @@
+ """
+ Common interfaces for auxiliary data generators.
+
+ This module provides the protocol and data structures for generating auxiliary
+ data (e.g., loss masks) alongside tokenized samples.
+ """
+
+ from typing import List, Protocol
+
+ from dataclasses import dataclass
+
+ import numpy as np
+
+
+ @dataclass
+ class Part:
+ """
+ A typed part of a structured sample.
+
+ Used to represent segments of text with semantic types (e.g., prompt,
+ response, system) that determine how auxiliary data is generated.
+
+ :param type: Semantic type of this part (e.g., "system", "prompt",
+ "response", "thinking", "text").
+ :param text: The text content of this part.
+ """
+
+ type: str
+ text: str
+
+
+ class AuxDataGenerator(Protocol):
+ """
+ Protocol for generating auxiliary data for tokenized samples.
+
+ Auxiliary data generators produce per-token data (e.g., loss masks)
+ based on the semantic type of each part in a structured sample.
+ """
+
+ @property
+ def dtype(self) -> np.dtype:
+ """
+ Return the NumPy dtype for this auxiliary data.
+
+ :return: NumPy dtype (e.g., np.uint8 for loss masks).
+ """
+ ...
+
+ def generate(
+ self,
+ part_type: str,
+ num_tokens: int,
+ *,
+ part_tokens: List[int] | None = None,
+ ) -> List[int]:
+ """
+ Generate auxiliary data values for a tokenized part.
+
+ :param part_type: Semantic type of the part (e.g., "prompt", "response").
+ :param num_tokens: Number of tokens in this part.
+ :param part_tokens: Optional list of actual token IDs, for generators
+ that need token-level information.
+
+ :return: List of auxiliary data values, one per token.
+ """
+ ...
+
+ def generate_for_eos(self) -> int:
+ """
+ Generate the auxiliary data value for an EOS token.
+
+ EOS tokens are inserted between documents. This method returns the
+ value to use for these boundary tokens.
+
+ :return: Single auxiliary data value for the EOS token.
+ """
+ ...
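
Because `AuxDataGenerator` is a `typing.Protocol`, any object exposing a matching `dtype` property, `generate` method, and `generate_for_eos` method satisfies it structurally; no subclassing is required. A minimal sketch of a hypothetical custom generator against the protocol added above (the class and its type-to-id mapping are illustrative assumptions, not part of the package):

```python
from typing import List

import numpy as np


class PartTypeIdGenerator:
    """Hypothetical generator: tags every token with a numeric id for its part type."""

    _TYPE_IDS = {"system": 0, "prompt": 1, "response": 2, "thinking": 3, "text": 4}

    @property
    def dtype(self) -> np.dtype:
        # One byte per token is enough for a handful of part types.
        return np.dtype(np.uint8)

    def generate(
        self,
        part_type: str,
        num_tokens: int,
        *,
        part_tokens: List[int] | None = None,
    ) -> List[int]:
        # Same value for every token in the part; unknown types fall back to "text".
        return [self._TYPE_IDS.get(part_type, self._TYPE_IDS["text"])] * num_tokens

    def generate_for_eos(self) -> int:
        # Treat the document-boundary EOS token as plain text.
        return self._TYPE_IDS["text"]
```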
@@ -0,0 +1,78 @@
+ """
+ Loss mask generator for selective training.
+
+ This module provides LossMaskGenerator which creates loss masks based on
+ part types, enabling selective training on specific parts of samples
+ (e.g., train on responses but not prompts).
+ """
+
+ from typing import List, Set
+
+ import numpy as np
+
+ from data_forager.sample_generators.aux.common import AuxDataGenerator
+
+
+ class LossMaskGenerator(AuxDataGenerator):
+ """
+ Generates loss masks based on part types.
+
+ Loss mask semantics:
+ - mask=1: Excluded from loss (masked out, don't train)
+ - mask=0: Included in loss (train on these tokens)
+
+ Default masked types: "system", "prompt"
+ Default unmasked types: "text", "response", "thinking"
+
+ :param masked_types: Set of part types to mask (exclude from loss).
+ :param mask_eos: Whether to mask EOS tokens. Default False (train on EOS).
+ """
+
+ def __init__(
+ self,
+ masked_types: Set[str] | None = None,
+ mask_eos: bool = False,
+ ):
+ """
+ Initialize the loss mask generator.
+
+ :param masked_types: Set of part types to mask. Defaults to {"system", "prompt"}.
+ :param mask_eos: Whether to mask EOS tokens. Default False.
+ """
+ if masked_types is None:
+ masked_types = {"system", "prompt"}
+
+ self._masked_types = masked_types
+ self._mask_eos = mask_eos
+
+ @property
+ def dtype(self) -> np.dtype:
+ """Return uint8 dtype for loss masks."""
+ return np.dtype(np.uint8)
+
+ def generate(
+ self,
+ part_type: str,
+ num_tokens: int,
+ *,
+ part_tokens: List[int] | None = None,
+ ) -> List[int]:
+ """
+ Generate loss mask values for a tokenized part.
+
+ :param part_type: Semantic type of the part.
+ :param num_tokens: Number of tokens in this part.
+ :param part_tokens: Unused, accepted for protocol compatibility.
+
+ :return: List of mask values (1=masked, 0=train).
+ """
+ mask_value = 1 if part_type in self._masked_types else 0
+ return [mask_value] * num_tokens
+
+ def generate_for_eos(self) -> int:
+ """
+ Generate the loss mask value for an EOS token.
+
+ :return: 1 if mask_eos is True, 0 otherwise.
+ """
+ return 1 if self._mask_eos else 0
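
To make the mask semantics above concrete, a short usage sketch against `LossMaskGenerator` as added in this diff (token counts are arbitrary):

```python
from data_forager.sample_generators.aux.loss_mask import LossMaskGenerator

gen = LossMaskGenerator()  # defaults: mask "system" and "prompt", train on EOS

gen.generate("prompt", num_tokens=4)    # -> [1, 1, 1, 1]  excluded from loss
gen.generate("response", num_tokens=3)  # -> [0, 0, 0]     included in loss
gen.generate_for_eos()                  # -> 0             EOS trained on by default

# With mask_eos=True the document-boundary EOS token is excluded as well.
LossMaskGenerator(mask_eos=True).generate_for_eos()  # -> 1
```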