data-forager 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_forager-0.1.2 → data_forager-0.1.4}/PKG-INFO +1 -1
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/datasets/common.py +51 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/indexers/tokenization_indexer.py +98 -29
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager.egg-info/PKG-INFO +1 -1
- {data_forager-0.1.2 → data_forager-0.1.4}/pyproject.toml +1 -1
- {data_forager-0.1.2 → data_forager-0.1.4}/LICENSE +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/README.md +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/__init__.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/datasets/__init__.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/datasets/jsonl.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/datasets/tokens.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/index_stores/__init__.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/index_stores/common.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/index_stores/fs_based.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/indexers/__init__.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/indexers/jsonl_indexer.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/indexers/text_lines.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/sample_index.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/unzip_files.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager/utils.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager.egg-info/SOURCES.txt +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager.egg-info/dependency_links.txt +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager.egg-info/requires.txt +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/data_forager.egg-info/top_level.txt +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/setup.cfg +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/tests/test_jsonl_indexing.py +0 -0
- {data_forager-0.1.2 → data_forager-0.1.4}/tests/test_tokenizing_indexing_jsonl.py +0 -0
data_forager/datasets/common.py

@@ -1,6 +1,8 @@
 import abc
 from typing import Optional, Union, Dict, List, Protocol, Any
 
+import numpy as np
+
 from basics.base import Base
 
 from data_forager.sample_index import SampleIndex, SampleLocation
@@ -128,3 +130,52 @@ class Dataset(Base, metaclass=abc.ABCMeta):
 
     def __del__(self):
         self._close_files()
+
+
+class SubsampledDataset:
+    """
+    Wrapper that provides a subsampled view of a dataset.
+
+    Randomly selects a subset of indices from the wrapped dataset, allowing
+    for faster iteration through epochs when testing or debugging.
+
+    :param dataset: The dataset to wrap (must support __len__ and __getitem__).
+    :param subsample_factor: Fraction of the dataset to use (must be between 0 and 1).
+    :param seed: Random seed for reproducibility. If None, sampling is random.
+    :param random_order: If False (default), indices are sorted for better disk
+        read locality. If True, indices are kept in random order, which can be
+        used as a randomizer.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        subsample_factor: float,
+        seed: int | None = None,
+        random_order: bool = False,
+    ):
+        if not 0 < subsample_factor <= 1:
+            raise ValueError(
+                f"subsample_factor must be between 0 (exclusive) and 1 (inclusive), "
+                f"got {subsample_factor}"
+            )
+
+        self._dataset = dataset
+        self._subsample_factor = subsample_factor
+
+        n_full = len(dataset)
+        n_sub = int(subsample_factor * n_full)
+
+        # Sample indices without replacement
+        rng = np.random.default_rng(seed)
+        self._indices = rng.choice(n_full, size=n_sub, replace=False)
+
+        # Sort for cache locality unless random order is requested
+        if not random_order:
+            self._indices.sort()
+
+    def __len__(self) -> int:
+        return len(self._indices)
+
+    def __getitem__(self, idx: int):
+        return self._dataset[self._indices[idx]]
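The new `SubsampledDataset` only relies on `__len__` and `__getitem__` of the wrapped object, so any map-style dataset works. Below is a minimal usage sketch (not part of the diff); the plain Python list stands in for a real data-forager dataset.

```python
from data_forager.datasets.common import SubsampledDataset

# Stand-in for a real map-style dataset; anything with len() and integer
# indexing works, e.g. a data-forager token dataset.
full_dataset = list(range(10_000))

# Keep 10% of the samples, reproducibly. Indices are sorted by default,
# which keeps disk reads roughly sequential on a real dataset.
debug_dataset = SubsampledDataset(full_dataset, subsample_factor=0.1, seed=42)

print(len(debug_dataset))   # 1000
print(debug_dataset[0])     # delegates to full_dataset[<first selected index>]
```

Because it exposes only `__len__` and `__getitem__`, the wrapper can also be handed directly to a PyTorch `DataLoader` for quick debug epochs.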
data_forager/indexers/tokenization_indexer.py

@@ -1,5 +1,6 @@
 from typing import Callable, List, Optional
 
+import logging
 import os
 from pathlib import Path
 
@@ -8,7 +9,11 @@ import json
 import numpy as np
 
 from basics.base import Base
+from basics.logging import get_logger
 
+module_logger = get_logger(os.path.basename(__file__))
+
+from data_forager.index_stores.common import IndexStoreInterface
 from data_forager.index_stores.fs_based import IndexStore as FSBasedIndexStore
 from data_forager.indexers.text_lines import SampleData, FileTextLinesIndexer, SampleGeneratorInterface
 from data_forager.utils import find_files_recursive, natural_sort
@@ -24,61 +29,125 @@ def get_text_from_jsonl(jsonl_bytes: bytes, text_key: str = "text", text_encodin
 
 
 def create_tokenize_and_index_jsonl_text_func(
-    input_base_path: str,
     tokenizer_func: TokenizerFunc,
     eos_idx: int,
+    input_base_path: Optional[str] = None,
+    input_file_paths: Optional[List[str]] = None,
+    output_base_path: Optional[str] = None,
+    index_store: Optional[IndexStoreInterface] = None,
     process_text_line_func: Optional[ProcessTextLineFunc] = None,
+    logger: Optional[logging.Logger] = None,
     name: Optional[str] = None,
     **sample_generator_kwargs,
 ) -> FileTextLinesIndexer:
     """
-    Create
-
-
-    *
+    Create a pipeline to tokenize text from JSONL files and create an index for random access.
+
+    The pipeline:
+    * Tokenizes text from input JSONL objects
+    * Stores the token data in bin files under "tokenized-samples" folder
+    * Stores index data under "index" folder
 
     Usage:
-
+    ```python
     import tiktoken
 
     enc = tiktoken.get_encoding("gpt2")
     def tokenize_text(text: str) -> List[int]:
-        return
+        return enc.encode_ordinary(text)
+
+    # Option 1: Scan directory for JSONL files, output to same directory
+    indexer = create_tokenize_and_index_jsonl_text_func(
+        tokenizer_func=tokenize_text,
+        eos_idx=enc.eot_token,
+        input_base_path='./data',
+        sample_size=1024,
+    )
 
-
-
+    # Option 2: Explicit input files and output path
+    indexer = create_tokenize_and_index_jsonl_text_func(
         tokenizer_func=tokenize_text,
-
+        eos_idx=enc.eot_token,
+        input_file_paths=['./data/train.jsonl'],
+        output_base_path='./output',
+        sample_size=1024,
     )
 
-    #
-
+    # Run tokenization and indexing
+    indexer()
+    ```
 
-    :param input_base_path: Path to directory containing JSONL files (searched recursively).
     :param tokenizer_func: Function used to tokenize text.
-    :param eos_idx: EOS token index, known by the used Tokenizer
+    :param eos_idx: EOS token index, known by the used Tokenizer.
+    :param input_base_path: Path to directory containing JSONL files (searched recursively).
+        Used as fallback for output if `output_base_path` is not provided.
+    :param input_file_paths: List of file paths to process. If provided, these are used
+        instead of scanning `input_base_path` for JSONL files.
+    :param output_base_path: Base path for output (index and tokenized samples).
+        If not provided, `input_base_path` is used.
+    :param index_store: Index store to use. If provided, this is used instead of
+        creating a new FSBasedIndexStore.
     :param process_text_line_func: Function used to process text lines.
         By default, this converts input JSON lines to dicts and returns the "text" field.
         See function get_text_from_jsonl().
-    :param
-    :param name:
+    :param logger: Logger to use. If not provided, uses module logger.
+    :param name: Name of the indexer, used for logging purposes.
+    :param sample_generator_kwargs: Other kwargs passed to TokenizedSampleGenerator
+        (e.g., sample_size, token_dtype, base_output_path).
+
+    :raises ValueError: If both `input_base_path` and `input_file_paths` are None.
+    :raises ValueError: If `index_store` is None and both `output_base_path` and
+        `input_base_path` are None.
 
-    :return: FileTextLinesIndexer instance that can be
-
+    :return: FileTextLinesIndexer instance that can be called to run tokenization
+        and indexing.
     """
-    if
-
+    if logger is None:
+        logger = module_logger
 
-
-
-
-
-
-
-
+    # Validate input source
+    if input_base_path is None and input_file_paths is None:
+        raise ValueError(
+            "Either input_base_path or input_file_paths must be provided"
+        )
+
+    # Determine output base path
+    effective_output_base_path = output_base_path or input_base_path
 
-    #
-
+    # Validate output destination
+    if index_store is None and effective_output_base_path is None:
+        raise ValueError(
+            "Either index_store, output_base_path, or input_base_path must be provided "
+            "to determine where to store the index"
+        )
+
+    logger.info(f"Output base path: {effective_output_base_path}")
+
+    if process_text_line_func is None:
+        process_text_line_func = get_text_from_jsonl
+
+    if index_store is None:
+        index_store = FSBasedIndexStore(
+            base_path=effective_output_base_path,
+        )
+
+    if input_file_paths is None:
+        logger.info(f"Scanning for JSONL files in: {input_base_path}")
+        input_file_paths = find_files_recursive(
+            input_base_path,
+            extension_patterns=['*.jsonl', '*.JSONL']
+        )
+        # Assuming numbered files
+        input_file_paths = natural_sort(input_file_paths)
+        logger.info(f"Found {len(input_file_paths)} JSONL file(s)")
+
+    # Set default base_output_path for tokenized samples if not provided in kwargs
+    if 'base_output_path' not in sample_generator_kwargs:
+        default_base_output_path = os.path.join(
+            effective_output_base_path, "tokenized-samples"
+        )
+        logger.info(f"Tokenized samples output path: {default_base_output_path}")
+        sample_generator_kwargs['base_output_path'] = default_base_output_path
 
     sample_generator = TokenizedSampleGenerator(
         process_text_line_func=process_text_line_func,
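The factory keeps `process_text_line_func` pluggable: the default, `get_text_from_jsonl`, takes the raw JSONL line as bytes and returns the text to tokenize (see the hunk header above). A minimal sketch of a custom replacement follows; the `"title"`/`"body"` field names and the toy `my_tokenize` function are illustrative assumptions, not part of the package.

```python
import json
from typing import List

from data_forager.indexers.tokenization_indexer import (
    create_tokenize_and_index_jsonl_text_func,
)


def title_and_body_from_jsonl(jsonl_bytes: bytes) -> str:
    # Same bytes-in / text-out contract as get_text_from_jsonl; the
    # "title"/"body" keys are assumptions about the input data.
    record = json.loads(jsonl_bytes.decode("utf-8"))
    return f'{record.get("title", "")}\n\n{record.get("body", "")}'


def my_tokenize(text: str) -> List[int]:
    # Toy tokenizer for illustration; swap in e.g. tiktoken as in the docstring.
    return [ord(c) for c in text]


indexer = create_tokenize_and_index_jsonl_text_func(
    tokenizer_func=my_tokenize,
    eos_idx=0,
    input_base_path="./data",
    process_text_line_func=title_and_body_from_jsonl,
    sample_size=1024,
)
indexer()  # scans ./data for *.jsonl files, then tokenizes and indexes them
```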
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "data-forager"
-version = "0.1.2"
+version = "0.1.4"
 description = "Enabling random access to large datasets on disk for PyTorch training and other use cases"
 readme = "README.md"
 license = "MIT"