megatron-core 0.13.0rc1__tar.gz → 0.14.0rc0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (307)
  1. {megatron_core-0.13.0rc1/megatron_core.egg-info → megatron_core-0.14.0rc0}/PKG-INFO +4 -3
  2. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/bert_dataset.py +5 -7
  3. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_dataset.py +4 -3
  4. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_megatron_dataset_builder.py +1 -0
  5. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/gpt_dataset.py +6 -4
  6. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/helpers.py +3 -1
  7. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/indexed_dataset.py +8 -8
  8. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/masked_dataset.py +1 -2
  9. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/megatron_dataset.py +1 -1
  10. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/megatron_tokenizer.py +0 -1
  11. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/bert_embedders.py +3 -2
  12. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/build.py +40 -24
  13. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/dataset.py +12 -3
  14. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/utils.py +42 -11
  15. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/external_libs.py +1 -3
  16. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/build.py +31 -5
  17. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/index.py +26 -9
  18. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/indexes/faiss_base.py +34 -5
  19. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +54 -9
  20. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/validate.py +15 -12
  21. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +14 -6
  22. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/query.py +71 -15
  23. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/retro_dataset.py +21 -8
  24. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/utils.py +21 -5
  25. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/t5_dataset.py +2 -2
  26. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/utils.py +8 -3
  27. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/dict_utils.py +14 -14
  28. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/exchange_utils.py +35 -32
  29. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/mapping.py +54 -52
  30. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/serialization.py +13 -2
  31. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +36 -27
  32. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/resharding.py +9 -7
  33. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/tensorstore.py +26 -12
  34. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/zarr.py +62 -47
  35. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +38 -21
  36. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/utils.py +14 -1
  37. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/validation.py +38 -37
  38. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/__init__.py +4 -1
  39. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +1 -0
  40. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +1 -1
  41. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/param_and_grad_buffer.py +25 -20
  42. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +37 -25
  43. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trt_model_config.py +10 -1
  44. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_helper.py +60 -49
  45. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +15 -4
  46. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +28 -18
  47. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/extensions/kitchen.py +106 -44
  48. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/extensions/transformer_engine.py +170 -129
  49. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fp8_utils.py +24 -12
  50. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_indices_converter.py +16 -6
  51. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +16 -6
  52. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_pad_routing_map.py +23 -4
  53. megatron_core-0.14.0rc0/megatron/core/hyper_comm_grid.py +239 -0
  54. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/dynamic_context.py +29 -8
  55. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/dynamic_engine.py +98 -25
  56. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/static_engine.py +17 -1
  57. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/inference_request.py +1 -0
  58. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +1 -0
  59. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +74 -21
  60. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/jit.py +10 -2
  61. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/T5/t5_spec.py +6 -5
  62. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/backends.py +4 -4
  63. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/bert_layer_specs.py +5 -5
  64. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/rope_utils.py +3 -9
  65. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/gpt_layer_specs.py +15 -14
  66. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +6 -5
  67. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/clip_model.py +19 -3
  68. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/module.py +20 -1
  69. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/qwen_model.py +20 -3
  70. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/llava_spec.py +2 -1
  71. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/decoder_attention.py +4 -7
  72. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/decoder_spec.py +2 -1
  73. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/encoder_spec.py +1 -0
  74. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/radio.py +18 -6
  75. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/vit_layer_specs.py +6 -5
  76. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/distrib_optimizer.py +50 -1
  77. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/optimizer.py +59 -14
  78. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/optimizer_config.py +5 -1
  79. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/package_info.py +2 -2
  80. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/parallel_state.py +148 -139
  81. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/layers.py +23 -11
  82. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/quantization/quant_config.py +22 -15
  83. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/rerun_state_machine.py +82 -83
  84. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_context_parallel.py +8 -3
  85. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +5 -5
  86. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_mixer.py +55 -40
  87. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/layers.py +47 -41
  88. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/mappings.py +9 -5
  89. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/utils.py +6 -3
  90. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/timers.py +6 -3
  91. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/cuda_graphs.py +53 -34
  92. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/mlp.py +6 -6
  93. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/multi_latent_attention.py +2 -1
  94. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/transformer_config.py +96 -69
  95. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/utils.py +91 -27
  96. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0/megatron_core.egg-info}/PKG-INFO +4 -3
  97. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron_core.egg-info/SOURCES.txt +1 -0
  98. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron_core.egg-info/requires.txt +5 -2
  99. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/pyproject.toml +7 -6
  100. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/LICENSE +0 -0
  101. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/MANIFEST.in +0 -0
  102. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/README.md +0 -0
  103. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/README.md +0 -0
  104. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/__init__.py +0 -0
  105. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/config.py +0 -0
  106. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/config_logger.py +0 -0
  107. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/__init__.py +0 -0
  108. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  109. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/helpers.cpp +0 -0
  110. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/multimodal_dataset.py +0 -0
  111. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/object_storage_utils.py +0 -0
  112. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/__init__.py +0 -0
  113. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/__init__.py +0 -0
  114. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/config.py +0 -0
  115. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  116. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  117. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/__init__.py +0 -0
  118. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/__init__.py +0 -0
  119. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/factory.py +0 -0
  120. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  121. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/utils.py +0 -0
  122. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/__init__.py +0 -0
  123. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  124. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/utils.py +0 -0
  125. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/utils_object_storage.py +0 -0
  126. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/utils_s3.py +0 -0
  127. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/__init__.py +0 -0
  128. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/core.py +0 -0
  129. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  130. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  131. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  132. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  133. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  134. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  135. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  136. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  137. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  138. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  139. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  140. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
  141. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/data_parallel_base.py +0 -0
  142. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  143. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  144. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/finalize_model_grads.py +0 -0
  145. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  146. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  147. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/energy_monitor.py +0 -0
  148. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/enums.py +0 -0
  149. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/__init__.py +0 -0
  150. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/data_type.py +0 -0
  151. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/export_config.py +0 -0
  152. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/model_type.py +0 -0
  153. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/__init__.py +0 -0
  154. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  155. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  156. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  157. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  158. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  159. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  160. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  161. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/extensions/__init__.py +0 -0
  162. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  163. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/__init__.py +0 -0
  164. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  165. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  166. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  167. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  168. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  169. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_layer_norm.py +0 -0
  170. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_softmax.py +0 -0
  171. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/__init__.py +0 -0
  172. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/async_stream.py +0 -0
  173. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/common_inference_params.py +0 -0
  174. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/communication_utils.py +0 -0
  175. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/__init__.py +0 -0
  176. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/base_context.py +0 -0
  177. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  178. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/static_context.py +0 -0
  179. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/__init__.py +0 -0
  180. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/abstract_engine.py +0 -0
  181. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/mcore_engine.py +0 -0
  182. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  183. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  184. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  185. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  186. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  187. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  188. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  189. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/sampling_params.py +0 -0
  190. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/scheduler.py +0 -0
  191. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  192. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  193. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  194. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  195. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/utils.py +0 -0
  196. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference_params.py +0 -0
  197. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/model_parallel_config.py +0 -0
  198. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/T5/__init__.py +0 -0
  199. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/T5/t5_model.py +0 -0
  200. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/__init__.py +0 -0
  201. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/__init__.py +0 -0
  202. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/bert_lm_head.py +0 -0
  203. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/bert_model.py +0 -0
  204. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/pooler.py +0 -0
  205. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/__init__.py +0 -0
  206. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/__init__.py +0 -0
  207. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  208. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  209. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  210. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  211. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/language_module/__init__.py +0 -0
  212. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/language_module/language_module.py +0 -0
  213. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/vision_module/__init__.py +0 -0
  214. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  215. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/__init__.py +0 -0
  216. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  217. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/gpt_model.py +0 -0
  218. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  219. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/__init__.py +0 -0
  220. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mamba/__init__.py +0 -0
  221. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  222. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mamba/mamba_model.py +0 -0
  223. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/__init__.py +0 -0
  224. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/config/__init__.py +0 -0
  225. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/config/base_configs.py +0 -0
  226. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/model/__init__.py +0 -0
  227. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/model/base.py +0 -0
  228. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/submodules/audio.py +0 -0
  229. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/submodules/base.py +0 -0
  230. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/submodules/vision.py +0 -0
  231. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/__init__.py +0 -0
  232. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/context_parallel.py +0 -0
  233. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/llava_model.py +0 -0
  234. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/__init__.py +0 -0
  235. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/base_attention.py +0 -0
  236. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/config.py +0 -0
  237. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/encoder_attention.py +0 -0
  238. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/model.py +0 -0
  239. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/utils.py +0 -0
  240. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/__init__.py +0 -0
  241. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/clip_vit_model.py +0 -0
  242. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/multimodal_projector.py +0 -0
  243. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/msc_utils.py +0 -0
  244. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/num_microbatches_calculator.py +0 -0
  245. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/__init__.py +0 -0
  246. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/clip_grads.py +0 -0
  247. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  248. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  249. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/grad_scaler.py +0 -0
  250. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer_param_scheduler.py +0 -0
  251. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/packed_seq_params.py +0 -0
  252. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/__init__.py +0 -0
  253. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  254. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/schedules.py +0 -0
  255. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/utils.py +0 -0
  256. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/__init__.py +0 -0
  257. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/__init__.py +0 -0
  258. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  259. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  260. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  261. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  262. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  263. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/process_groups_config.py +0 -0
  264. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/quantization/__init__.py +0 -0
  265. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/quantization/utils.py +0 -0
  266. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/requirements.txt +0 -0
  267. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/__init__.py +0 -0
  268. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_block.py +0 -0
  269. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_layer.py +0 -0
  270. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mlp_layer.py +0 -0
  271. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/triton_cache_manager.py +0 -0
  272. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/__init__.py +0 -0
  273. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  274. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/data.py +0 -0
  275. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/random.py +0 -0
  276. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/__init__.py +0 -0
  277. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/attention.py +0 -0
  278. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  279. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  280. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/dot_product_attention.py +0 -0
  281. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/enums.py +0 -0
  282. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  283. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  284. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/identity_op.py +0 -0
  285. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/module.py +0 -0
  286. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/__init__.py +0 -0
  287. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/experts.py +0 -0
  288. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  289. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  290. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/moe_layer.py +0 -0
  291. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/moe_utils.py +0 -0
  292. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/router.py +0 -0
  293. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/shared_experts.py +0 -0
  294. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  295. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  296. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/multi_token_prediction.py +0 -0
  297. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  298. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/spec_utils.py +0 -0
  299. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/torch_layer_norm.py +0 -0
  300. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/torch_norm.py +0 -0
  301. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/transformer_block.py +0 -0
  302. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/transformer_layer.py +0 -0
  303. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/utils.py +0 -0
  304. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron_core.egg-info/dependency_links.txt +0 -0
  305. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron_core.egg-info/top_level.txt +0 -0
  306. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/setup.cfg +0 -0
  307. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: megatron-core
- Version: 0.13.0rc1
+ Version: 0.14.0rc0
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -30,6 +30,7 @@ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: torch
+ Requires-Dist: numpy<2.0.0
  Provides-Extra: mlm
  Requires-Dist: flask-restful; extra == "mlm"
  Requires-Dist: sentencepiece; extra == "mlm"
@@ -40,16 +41,16 @@ Requires-Dist: tqdm; extra == "dev"
  Requires-Dist: einops; extra == "dev"
  Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "dev"
  Requires-Dist: nvtx; extra == "dev"
- Requires-Dist: numpy<2.0.0; extra == "dev"
  Requires-Dist: transformers; extra == "dev"
  Requires-Dist: multi-storage-client; extra == "dev"
  Requires-Dist: setuptools<80.0.0; extra == "dev"
+ Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
+ Requires-Dist: megatron-energon[av_decode]<7; extra == "dev"
  Provides-Extra: lts
  Requires-Dist: tqdm; extra == "lts"
  Requires-Dist: einops; extra == "lts"
  Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "lts"
  Requires-Dist: nvtx; extra == "lts"
- Requires-Dist: numpy<2.0.0; extra == "lts"
  Requires-Dist: transformers; extra == "lts"
  Requires-Dist: zarr; extra == "lts"
  Requires-Dist: setuptools<80.0.0; extra == "lts"
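
The notable metadata change above is that numpy<2.0.0 moves from the "dev" and "lts" extras into the core requirements, so every install of 0.14.0rc0 pins NumPy below 2.0. A minimal sketch, not part of the package, for checking whether an existing environment already satisfies the new constraint before upgrading:

    # Hypothetical pre-upgrade check; assumes "packaging" and numpy are installed.
    from importlib.metadata import version
    from packaging.specifiers import SpecifierSet

    installed = version("numpy")
    print(installed, installed in SpecifierSet("<2.0.0"))  # True means compatible
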
@@ -31,16 +31,13 @@ class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
  """The BERT dataset that assumes WordPiece tokenization

  Args:
- indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset
-
+ indexed_dataset (IndexedDataset): The IndexedDataset around which
+ to build the MegatronDataset
  dataset_path (str): The real path on disk to the dataset, for bookkeeping
-
  indexed_indices (numpy.ndarray): The set of the documents indices to expose
-
- num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch.
-
+ num_samples (Optional[int]): The number of samples to draw from the indexed dataset.
+ When None, build as many samples as correspond to one epoch.
  index_split (Split): The indexed_indices Split
-
  config (BERTMaskedWordPieceDatasetConfig): The config
  """

@@ -83,6 +80,7 @@ class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
  Returns:
  Dict[str, Union[int, numpy.ndarray]]: The
  """
+
  idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
  sample = [self.dataset[i] for i in range(idx_beg, idx_end)]
  numpy_random_state = numpy.random.RandomState(seed=(self.config.random_seed + idx) % 2**32)
@@ -80,7 +80,7 @@ class BlendedDataset(torch.utils.data.Dataset):
  unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
  )
  self.unique_description_hash = hashlib.md5(
- self.unique_description.encode("utf-8")
+ self.unique_description.encode("utf-8"), usedforsecurity=False
  ).hexdigest()

  self.dataset_index, self.dataset_sample_index = self._build_indices()
@@ -103,6 +103,7 @@ class BlendedDataset(torch.utils.data.Dataset):
  Returns:
  Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index
  """
+
  path_to_cache = self.config.path_to_cache

  if path_to_cache:
@@ -192,7 +193,7 @@ class BlendedDataset(torch.utils.data.Dataset):
  logger, logging.INFO, f"\tLoad the dataset index from {path_to_dataset_index}"
  )
  t_beg = time.time()
- dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode='r')
+ dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode="r")
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

@@ -203,7 +204,7 @@ class BlendedDataset(torch.utils.data.Dataset):
  )
  t_beg = time.time()
  dataset_sample_index = numpy.load(
- path_to_dataset_sample_index, allow_pickle=True, mmap_mode='r'
+ path_to_dataset_sample_index, allow_pickle=True, mmap_mode="r"
  )
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
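
The hashlib change above keeps the cache-key digest working on FIPS-restricted builds, where a plain hashlib.md5(...) call can raise. A minimal standalone sketch of the same pattern (the description string here is made up):

    import hashlib

    description = '{"class": "BlendedDataset", "split": "train"}'  # illustrative value only

    # usedforsecurity=False (Python 3.9+) declares the digest non-cryptographic,
    # so it is permitted even under FIPS-restricted OpenSSL builds.
    cache_key = hashlib.md5(description.encode("utf-8"), usedforsecurity=False).hexdigest()
    print(cache_key)
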
@@ -529,6 +529,7 @@ def _get_size_per_split_per_dataset(
  Returns:
  List[List[int]]: The number of samples to request per MegatronDataset per split
  """
+
  assert numpy.isclose(sum(normalized_weights), 1.0)

  # Use margin as buffer to ensure we satiate the request
@@ -19,6 +19,7 @@ from megatron.core.utils import log_single_rank

  logger = logging.getLogger(__name__)

+
  _PAD_TOKEN_ID = -1


@@ -356,7 +357,6 @@ class GPTDataset(MegatronDataset):
  not cache_hit
  and (not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0)
  ):
-
  log_single_rank(
  logger,
  logging.INFO,
@@ -494,7 +494,7 @@ class GPTDataset(MegatronDataset):
  f"\tLoad the document index from {os.path.basename(path_to_document_index)}",
  )
  t_beg = time.time()
- document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode='r')
+ document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode="r")
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

@@ -504,7 +504,7 @@ class GPTDataset(MegatronDataset):
  f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}",
  )
  t_beg = time.time()
- sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r')
+ sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode="r")
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

@@ -514,7 +514,7 @@ class GPTDataset(MegatronDataset):
  f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}",
  )
  t_beg = time.time()
- shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode='r')
+ shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode="r")
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

@@ -575,6 +575,7 @@ def _build_document_index(
  Returns:
  numpy.ndarray: The document index
  """
+
  if not separate_final_epoch or num_epochs == 1:
  document_index = numpy.mgrid[0:num_epochs, 0 : len(documents)][1]
  document_index[:] = documents
@@ -604,6 +605,7 @@ def _build_shuffle_index(
  Returns:
  numpy.ndarray: The shuffle index
  """
+
  dtype_ = numpy.uint32
  if total_size >= (numpy.iinfo(numpy.uint32).max - 1):
  dtype_ = numpy.int64
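
These hunks only normalize quote style, but the indices they touch are all reopened with numpy.load(..., mmap_mode="r"). A small self-contained sketch of that memory-mapped load (paths and values are made up):

    import os
    import tempfile
    import numpy

    # Save a toy index, then reopen it memory-mapped the way the dataset indices above are.
    path = os.path.join(tempfile.mkdtemp(), "document_index.npy")
    numpy.save(path, numpy.arange(10, dtype=numpy.int64))

    document_index = numpy.load(path, allow_pickle=True, mmap_mode="r")
    print(type(document_index), document_index[:3])  # numpy.memmap; pages are read lazily
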
@@ -1,9 +1,10 @@
  # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

- import numpy

  # Implicit imports for backwards compatibility
  # Explicit imports for readability
+ import numpy
+
  from megatron.core.datasets.helpers_cpp import *
  from megatron.core.datasets.helpers_cpp import build_sample_idx_int32, build_sample_idx_int64

@@ -39,6 +40,7 @@ def build_sample_idx(
  Returns:
  numpy.ndarray: The 2-D sample index
  """
+
  sample_idx_max = max(document_indices.shape[0], sizes.max())
  if sample_idx_max <= numpy.iinfo(numpy.int32).max:
  sample_idx = build_sample_idx_int32(
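
build_sample_idx picks a 32-bit or 64-bit index builder depending on whether the largest index fits in int32. A rough standalone illustration of that dtype switch (the helper name is invented for the example):

    import numpy

    def choose_index_dtype(max_value: int):
        # Same idea as the int32/int64 branch above: use the narrow dtype when it fits.
        if max_value <= numpy.iinfo(numpy.int32).max:
            return numpy.int32
        return numpy.int64

    print(choose_index_dtype(2_000_000_000))  # int32
    print(choose_index_dtype(3_000_000_000))  # int64
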
@@ -17,12 +17,13 @@ from itertools import accumulate
  from types import TracebackType
  from typing import List, Optional, Tuple, Type, Union

+ import numpy
+
  try:
  import boto3
  except ModuleNotFoundError:
  pass

- import numpy
  import torch

  from megatron.core.datasets.object_storage_utils import S3Config # pylint: disable=unused-import
@@ -204,7 +205,7 @@ class _IndexWriter(object):

  # the mode per sequence
  if sequence_modes is not None:
- self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order='C'))
+ self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))

  def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
  """Build the sequence pointers per the sequence lengths and dtype size
@@ -234,7 +235,6 @@ class _IndexReader(object):
  """

  def __init__(self, idx_path: str, multimodal: bool) -> None:
-
  log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} from {idx_path}")

  with open(idx_path, "rb") as stream:
@@ -435,11 +435,11 @@ class _FileBinReader(_BinReader):
  sequence = numpy.empty(count, dtype=dtype)
  if MultiStorageClientFeature.is_enabled():
  msc = MultiStorageClientFeature.import_package()
- with msc.open(self._bin_path, mode='rb', buffering=0) as bin_buffer_file:
+ with msc.open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
  bin_buffer_file.seek(offset)
  bin_buffer_file.readinto(sequence)
  else:
- with open(self._bin_path, mode='rb', buffering=0) as bin_buffer_file:
+ with open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
  bin_buffer_file.seek(offset)
  bin_buffer_file.readinto(sequence)
  return sequence
@@ -520,8 +520,8 @@ class _S3BinReader(_BinReader):
  Bucket=self._s3_bucket,
  Key=self._s3_key,
  # Subtract 1, because the end of Range is inclusive.
- Range=f'bytes={bytes_start}-{bytes_end-1}',
- )['Body'].read()
+ Range=f"bytes={bytes_start}-{bytes_end - 1}",
+ )["Body"].read()
  self._cache_bytes_start = bytes_start
  self._cache_bytes_end = bytes_end
  return numpy.frombuffer(self._extract_from_cache(offset, size), dtype=dtype)
@@ -551,7 +551,7 @@ class _MultiStorageClientBinReader(_BinReader):


  # Map of object storage access to the corresponding bin reader
- OBJECT_STORAGE_BIN_READERS = {'s3': _S3BinReader, 'msc': _MultiStorageClientBinReader}
+ OBJECT_STORAGE_BIN_READERS = {"s3": _S3BinReader, "msc": _MultiStorageClientBinReader}


  class IndexedDataset(torch.utils.data.Dataset):
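
The _S3BinReader hunk is again only a quote-style change, but it shows the ranged S3 read the reader relies on. A minimal standalone sketch of that access pattern (bucket, key, and helper name are hypothetical):

    import boto3  # optional dependency; only needed on the S3 reader path

    def read_byte_range(bucket: str, key: str, start: int, end: int) -> bytes:
        """Fetch bytes [start, end) of an S3 object; the Range header's end offset is inclusive."""
        client = boto3.client("s3")
        response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end - 1}")
        return response["Body"].read()
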
@@ -355,7 +355,6 @@ class MaskedWordPieceDataset(MegatronDataset):
  numpy_random_state.shuffle(candidate_ngrams)

  if self.config.masking_do_permutation:
-
  n_swappings = n_maskings

  permuted_indices = set()
@@ -417,7 +416,7 @@ class MaskedWordPieceDataset(MegatronDataset):

  masked_spans = sorted(masked_spans, key=lambda x: x[0][0])

- return masked_token_ids, masked_positions, masked_labels, boundaries, masked_spans
+ return (masked_token_ids, masked_positions, masked_labels, boundaries, masked_spans)

  @abstractmethod
  def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> Optional[int]:
@@ -63,7 +63,7 @@ class MegatronDataset(ABC, torch.utils.data.Dataset):
  self.unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
  )
  self.unique_description_hash = hashlib.md5(
- self.unique_description.encode("utf-8")
+ self.unique_description.encode("utf-8"), usedforsecurity=False
  ).hexdigest()

  @staticmethod
@@ -20,7 +20,6 @@ class MegatronTokenizer(ABC):
  """

  def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
-
  self.unique_identifiers = OrderedDict()
  self.unique_identifiers["class"] = type(self).__name__
  self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths)
@@ -4,7 +4,6 @@

  import abc
  from dataclasses import dataclass
- from typing import Any

  import numpy as np
  import torch
@@ -22,7 +21,9 @@ class Embedder(abc.ABC):
  """Embed a text dataset.

  Args:
- text_dataset (torch.utils.data.Dataset): Text dataset to embed. Each sample of the text dataset should output a dict with a key 'text' and a string value.
+ text_dataset (torch.utils.data.Dataset): Text dataset to embed.
+ Each sample of the text dataset should output a dict with a key 'text'
+ and a string value.

  Returns:
  A 2D ndarray with shape (len(text_dataset), dimension(embedder)).
@@ -11,7 +11,6 @@ Building a chunk database consists of.
  - Save chunk offsets to disk for each indexed dataset.
  """

- import glob
  import os
  import types
  from concurrent.futures import ProcessPoolExecutor, as_completed
@@ -19,11 +18,9 @@ from typing import Dict, List, Tuple

  import numpy as np
  import torch
- from tqdm import tqdm

  from megatron.core.datasets.indexed_dataset import IndexedDataset
  from megatron.core.datasets.retro.config import RetroPreprocessingConfig
- from megatron.core.datasets.retro.external_libs import h5py
  from megatron.core.datasets.retro.utils import (
  extract_data_config,
  get_blocks_by_rank,
@@ -40,10 +37,23 @@ from .utils import (
  get_individual_doc_offsets,
  get_merged_db_path_map,
  init_indexed_dataset_infos,
- load_indexed_datasets,
  save_indexed_dataset_infos,
  )

+ try:
+ from tqdm import tqdm
+
+ HAVE_TQDM = True
+ except ImportError:
+ HAVE_TQDM = False
+
+ try:
+ import h5py
+
+ HAVE_H5PY = True
+ except ImportError:
+ HAVE_H5PY = False
+


  def build_partial_db(
@@ -64,7 +74,8 @@ def build_partial_db(
  from each document.

  Args:
- config (types.SimpleNamespace): Subset of Retro config, containing 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
+ config (types.SimpleNamespace): Subset of Retro config, containing
+ 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
  dataset_idx (int): Index of this dataset out of all blended datasets.
  n_datasets (int): Total number of blended datasets.
  indexed_dataset (IndexedDataset): Indexed dataset to be chunked.
@@ -83,6 +94,9 @@ def build_partial_db(
  - Dict mapping document ID to number of valid chunks.
  """

+ if not HAVE_TQDM:
+ raise ImportError("tqdm is required to use the RetroDataset. Please install tqdm.")
+
  # Document start/end indexes.
  doc_range = block["range"]
  n_docs = doc_range[1] - doc_range[0]
@@ -111,7 +125,6 @@ def build_partial_db(
  chunk_db_invalid: List[Tuple] = []
  doc_size_map = {}
  for doc_id in pbar:
-
  # Progress description.
  try:
  pbar.set_description(
@@ -142,7 +155,6 @@ def build_partial_db(
  # Re-tokenize each chunk to Bert/Wordpiece (empty bert -> 'invalid').
  doc_size_map[doc_id] = 0
  for i, chunk_start_idx in enumerate(chunk_start_idxs):
-
  # Re-tokenize.
  chunk_end_idx = chunk_end_idxs[i]
  gpt_token_ids = indexed_dataset.get(
@@ -176,12 +188,13 @@ def build_block_db(
  """Split each document within block into consecutive retro_gpt_chunk_length size chunks.

  Args:
- config (RetroPreprocessingConfig): For DB building, we make use of attributes 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
+ config (RetroPreprocessingConfig): For DB building, we make use of attributes
+ 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
  dataset_idx (int): Index of this dataset out of all blended datasets.
  n_datasets (int): Total number of blended datasets.
  indexed_dataset (IndexedDataset): Indexed dataset to be chunked.
  n_procs (int): Total number of parallel processes.
- executor (ProcessPoolExecutor): Executor for launching parallel processes.
+ executor (ProcessPoolExecutor): Executor for launching parallel processes.
  n_missing_blocks (int): Total number of blocks to be processed.
  block_idx (int): Block index out of all blocks to be processed.
  block (dict): Range information such as start/end points for chunking idnexed dataset.
@@ -195,7 +208,7 @@ def build_block_db(
  """

  # Build partial dbs.
- log_retro_rank_0(' > build partial dbs.')
+ log_retro_rank_0(" > build partial dbs.")
  futures = []
  for proc_id in range(n_procs): # not true process id
  futures.append(
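
build_block_db fans chunking work out over a process pool and gathers the partial DBs back as the futures complete. A stripped-down standalone sketch of that fan-out/fan-in (the worker here is a toy stand-in, not the real build_partial_db):

    from concurrent.futures import ProcessPoolExecutor, as_completed

    def chunk_worker(block_id):
        # Stand-in for the real per-process worker; returns a (block_id, result) pair.
        return block_id, block_id * block_id

    if __name__ == "__main__":
        with ProcessPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(chunk_worker, i) for i in range(8)]
            results = dict(f.result() for f in as_completed(futures))
        print(results)
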
@@ -232,7 +245,7 @@ def build_block_db(
  ]

  # Convert to numpy.
- log_retro_rank_0(' > converting chunk db to numpy.')
+ log_retro_rank_0(" > converting chunk db to numpy.")
  chunk_db_valid = np.array(chunk_db_valid, dtype="uint32")
  chunk_db_invalid = np.array(chunk_db_invalid, dtype="uint32")

@@ -261,6 +274,9 @@ def save_block_db(
  chunk_db_invalid (np.ndarray): Array of invalid chunk indexes.
  doc_offsets (np.ndarray): Array of document offsets by chunks.
  """
+ if not HAVE_H5PY:
+ raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
+
  log_retro_rank_0(" > saving individual db.")
  with h5py.File(block["path"], "w") as f:
  dset = f.create_dataset("chunks_valid", data=chunk_db_valid)
@@ -277,7 +293,8 @@ def build_individual_db(
  config (RetroPreprocessingConfig): Retro preprocessing config.
  dataset_idx (int): Dataset index within blended dataset.
  n_datasets (int): Total number of datasets within blended dataset.
- dataset_info (dict): Metadata for dataset (see `save_indexed_dataset_infos()` in `utils.py` for more detail).
+ dataset_info (dict): Metadata for dataset
+ (see `save_indexed_dataset_infos()` in `utils.py` for more detail).
  """

  # Make directory.
@@ -323,9 +340,7 @@ def build_individual_db(
  # Process documents in parallel.
  with ProcessPoolExecutor(max_workers=n_procs) as executor:
  for block_idx, block in enumerate(active_blocks):
-
  if block is not None:
-
  # Build block DB.
  chunk_db_valid, chunk_db_invalid, doc_offsets = build_block_db(
  config=config,
@@ -349,7 +364,6 @@ def build_individual_db(
  )

  else:
-
  # Load existing block DB.
  with h5py.File(block["path"]) as f:
  existing_chunks_valid = np.copy(f["chunks_valid"])
@@ -382,7 +396,6 @@ def build_individual_dbs(
  # Build individual DBs.
  log_retro_rank_0(" > build individual chunk dbs.")
  for ds_idx, ds_info in enumerate(indexed_dataset_infos):
-
  # Progress.
  log_retro_rank_0(
  " > building individual db, dataset %d / %d ... '%s'."
@@ -400,7 +413,8 @@ def update_chunk_counts(

  Args:
  config (RetroPreprocessingConfig): Retro preprocessing config.
- indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
+ indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+ (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
  """

  if torch.distributed.get_rank() != 0:
@@ -416,7 +430,6 @@ def update_chunk_counts(
  # Set n_chunks (including n_chunks_sampled for unambiguity).
  log_retro_rank_0(" > compute n_chunks.")
  for ds_index, ds_info in enumerate(indexed_dataset_infos):
-
  db_paths = get_individual_db_paths(config.retro_project_dir, ds_info["prefix"])

  # Update counts.
@@ -457,10 +470,14 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)

  Args:
  project_dir (str): Retro project dir.
- indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
+ indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+ (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
  db_type (str): DB type (e.g., 'sampled', 'train', or 'valid').
  """

+ if not HAVE_H5PY:
+ raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
+
  if torch.distributed.get_rank() != 0:
  return

@@ -489,9 +506,7 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)

  # Delete existing chunk db if incorrect size.
  if os.path.exists(db_path):
-
  try:
-
  f = h5py.File(db_path)
  n_alloc = len(f["chunks"]) # total allocated
  n_written = f["n_written"][0].item() # total written
@@ -511,7 +526,6 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)

  # Build merged chunk db.
  if not os.path.exists(db_path):
-
  os.makedirs(os.path.dirname(db_path), exist_ok=True)
  f = h5py.File(db_path, "w")

@@ -589,7 +603,8 @@ def build_merged_dbs(project_dir: str, indexed_dataset_infos: List[Dict]) -> Non

  Args:
  project_dir (str): Retro project dir.
- indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
+ indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+ (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
  """
  merge_dbs(project_dir, indexed_dataset_infos, "sampled")
  merge_dbs(project_dir, indexed_dataset_infos, "train")
@@ -599,7 +614,8 @@ def build_merged_dbs(project_dir: str, indexed_dataset_infos: List[Dict]) -> Non
  def build_db(config: RetroPreprocessingConfig) -> None:
  """Extract token chunks from each indexed dataset.

- Iterate each document of each indexed dataset, extract that document's chunks, and save to a 'DB' (hdf5 file).
+ Iterate each document of each indexed dataset, extract that document's chunks,
+ and save to a 'DB' (hdf5 file).

  Args:
  config (RetroPreprocessingConfig): Retro preprocessing config.
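
merge_dbs decides whether to rebuild a merged chunk DB by comparing a preallocated "chunks" dataset against an "n_written" counter stored alongside it. A toy sketch of writing and re-checking that layout with h5py (the path and sizes are invented for illustration):

    import h5py
    import numpy as np

    path = "/tmp/example_chunk_db.hdf5"  # hypothetical location

    # Preallocate the chunk table and record how many rows were actually written.
    with h5py.File(path, "w") as f:
        f.create_dataset("chunks", shape=(1000, 5), dtype="uint32")
        f.create_dataset("n_written", shape=(1,), dtype="uint64")
        f["chunks"][:10] = np.ones((10, 5), dtype="uint32")
        f["n_written"][0] = 10

    # The size check mirrors the one in merge_dbs: rows allocated vs. rows written.
    with h5py.File(path) as f:
        n_alloc = len(f["chunks"])
        n_written = f["n_written"][0].item()
    print(n_alloc, n_written)  # 1000 10
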
@@ -10,10 +10,16 @@ from typing import List

  import numpy as np
  import torch
- from tqdm import tqdm

  from megatron.core.datasets.indexed_dataset import IndexedDataset

+ try:
+ from tqdm import tqdm
+
+ HAVE_TQDM = True
+ except ImportError:
+ HAVE_TQDM = False
+

  class DBDataset(torch.utils.data.Dataset):
  """Dataset for iterating chunks.
@@ -21,7 +27,8 @@ class DBDataset(torch.utils.data.Dataset):
  Args:
  db_path (str): Path of HDF5-format chunk database.
  indexed_datasets (List[IndexedDataset]): Indexed datasets used to build database.
- chunks (np.ndarray): Array of chunk indexes, for indexing into indexed datasets. Format [dataset_idx, doc_id, start_idx, end_idx, bert_length].
+ chunks (np.ndarray): Array of chunk indexes, for indexing into indexed datasets.
+ Format [dataset_idx, doc_id, start_idx, end_idx, bert_length].
  chunk_length (int): Max GPT chunk length (e.g., 64).
  eod_token_id (int): EOD token ID.
  """
@@ -34,7 +41,6 @@ class DBDataset(torch.utils.data.Dataset):
  chunk_length: int,
  eod_token_id: int,
  ):
-
  assert chunks.shape[1] == 5, (
  "expected 5 columns (dataset_idx, "
  "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); "
@@ -93,6 +99,9 @@ class DBDataset(torch.utils.data.Dataset):
  Load the dataset id & document id of each chunk in the database, to
  be used for causality filtering during querying.
  """
+ if not HAVE_TQDM:
+ raise ImportError("tqdm is required to use the DBDataset. Please install tqdm.")
+
  self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32")
  block_size = int(1e6)
  for start_idx in tqdm(