npm - dicom-curate - Versions diffs - 0.26.2 → 0.28.0 - Mend

dicom-curate 0.26.2 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/dist/esm/applyMappingsWorker.js +72 -32
package/dist/esm/collectMappings.js +3 -3
package/dist/esm/config/dicom/tagConversion.js +1 -1
package/dist/esm/config/sampleCompositeSpecification.js +1 -1
package/dist/esm/curateDict.js +3 -3
package/dist/esm/curateOne.js +71 -32
package/dist/esm/deidentifyPS315E.js +3 -3
package/dist/esm/hash.js +28 -4
package/dist/esm/index.js +113 -38
package/dist/esm/mappingWorkerPool.js +32 -5
package/dist/esm/s3Client.js +1 -1
package/dist/esm/scanDirectoryWorker.js +37 -3
package/dist/types/applyMappingsWorker.d.ts +1 -0
package/dist/types/curateOne.d.ts +2 -1
package/dist/types/hash.d.ts +1 -1
package/dist/types/mappingWorkerPool.d.ts +12 -1
package/dist/types/scanDirectoryWorker.d.ts +2 -0
package/dist/types/types.d.ts +3 -1
package/dist/umd/dicom-curate.umd.js +300 -83
package/dist/umd/dicom-curate.umd.js.map +1 -1
package/dist/umd/dicom-curate.umd.min.js +7 -7
package/dist/umd/dicom-curate.umd.min.js.map +1 -1
package/package.json +1 -1

package/dist/umd/dicom-curate.umd.js CHANGED Viewed

@@ -50,7 +50,7 @@
                 const { createRequire } = await Promise.resolve().then(function () { return _polyfillNode_module; });
                 const req = createRequire((_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('scanDirectoryWorker.js', document.baseURI).href));
                 const mod = req('@aws-sdk/client-s3');
-                cachedS3Client = mod?.default || mod;
+                cachedS3Client = mod?.default ?? mod;
             }
             else {
                 // browser-friendly dynamic import -> code-split chunk
@@ -95,6 +95,31 @@
             '.ds_store',
         ];
         let keepScanning = true;
+        // Backpressure gate: when the main thread signals 'stop', the scan worker
+        // awaits this promise before emitting the next file. 'resume' resolves it.
+        let pauseResolve = null;
+        let pausePromise = null;
+        function pauseScanning() {
+            if (!pausePromise) {
+                pausePromise = new Promise((resolve) => {
+                    pauseResolve = resolve;
+                });
+            }
+        }
+        function resumeScanning() {
+            if (pauseResolve) {
+                pauseResolve();
+                pauseResolve = null;
+                pausePromise = null;
+            }
+        }
+        /** If paused, wait until resumed. Returns false if scanning was aborted. */
+        async function waitIfPaused() {
+            if (pausePromise) {
+                await pausePromise;
+            }
+            return keepScanning;
+        }
         let excludedFiletypes = [];
         // Compiled regexes from glob patterns, used to exclude files by path
         let excludedPathRegexes = [];
@@ -272,7 +297,13 @@
                         break;
                     }
                     case 'stop': {
-                        keepScanning = false;
+                        // Pause scanning — the scan loop will await waitIfPaused()
+                        pauseScanning();
+                        break;
+                    }
+                    case 'resume': {
+                        // Resume scanning after a pause
+                        resumeScanning();
                         break;
                     }
                     default:
@@ -356,6 +387,9 @@
                 for await (const entry of dir.values()) {
                     if (!keepScanning)
                         return;
+                    // Backpressure: if the main thread paused us, wait here until resumed
+                    if (!(await waitIfPaused()))
+                        return;
                     if (entry.kind === 'file') {
                         const file = await entry.getFile();
                         const fileAnomalies = [];
@@ -422,6 +456,9 @@
                     for (const entry of entries) {
                         if (!keepScanning)
                             return;
+                        // Backpressure: if the main thread paused us, wait here until resumed
+                        if (!(await waitIfPaused()))
+                            return;
                         if (entry.isFile()) {
                             const filePath = path.join(currentPath, entry.name);
                             const stats = await fs.stat(filePath);
@@ -28715,7 +28752,7 @@
           // For private tags (which don't have keywords), keep as-is
           const tagId = isPrivateTag(keyword)
               ? keyword
-              : data$1.DicomMetaDictionary.nameMap[keyword]?.tag || keyword;
+              : (data$1.DicomMetaDictionary.nameMap[keyword]?.tag ?? keyword);
           // Remove parentheses and commas, convert to the format used in dictionary keys
           return tagId.replace(/[(),]/g, '').toLowerCase();
       }
@@ -46996,7 +47033,7 @@
       // Deal with dcmjs quirk of labeling retired tags with a
       // "RETIRED_" prefix
       function getVr(keyword) {
-          const element = nameMap[keyword] || nameMap[`RETIRED_${keyword}`];
+          const element = nameMap[keyword] ?? nameMap[`RETIRED_${keyword}`];
           return element?.vr;
       }
       function temporalVr(vr) {
@@ -47071,7 +47108,7 @@
                       }
                   }
               }
-              return current[tagName] || null;
+              return current[tagName] ?? null;
           }
           const { cleanDescriptorsOption, cleanDescriptorsExceptions, retainLongitudinalTemporalInformationOptions, retainPatientCharacteristicsOption, retainDeviceIdentityOption, retainUIDsOption, retainSafePrivateOption, retainInstitutionIdentityOption, } = dicomPS315EOptions;
           const taggedps315EEls = ps315EElements.reduce((acc, item) => {
@@ -48717,17 +48754,20 @@
       var crcExports = requireCrc();
-      async function hash(buffer, hashMethod) {
+      const DEFAULT_HASH_PART_SIZE = 5 * 1024 * 1024; // 5 MB — matches @aws-sdk/lib-storage default
+      async function hash(buffer, hashMethod, hashPartSize) {
           switch (hashMethod) {
               case 'sha256':
                   return await sha256Hex(buffer);
               case 'crc32':
                   return crc32Hex(buffer);
-              case 'md5':
-                  return md5Hex(buffer);
               case 'crc64':
-              default:
                   return crc64Hex(buffer);
+              case 'aws-s3-etag-2025':
+                  return awsS3Etag(buffer, hashPartSize ?? DEFAULT_HASH_PART_SIZE);
+              case 'md5':
+              default:
+                  return md5Hex(buffer);
           }
       }
       // helper: compute sha256 hex
@@ -48739,6 +48779,49 @@
       function md5Hex(buffer) {
           return md5(new Uint8Array(buffer));
       }
+      /**
+       * Compute a hash that matches the S3 ETag for the given buffer.
+       *
+       * - Single-part (buffer.byteLength <= partSize): plain MD5 hex string.
+       *   This matches the documented S3 ETag behaviour for objects created via
+       *   PUT Object with SSE-S3 (AES256) encryption.
+       *
+       * - Multi-part (buffer.byteLength > partSize): the undocumented but stable
+       *   composite format  md5(concat(md5_raw(part1) … md5_raw(partN)))-N
+       *   that S3 returns for objects created via the Multipart Upload API.
+       */
+      function awsS3Etag(buffer, partSize) {
+          if (buffer.byteLength <= partSize) {
+              return md5Hex(buffer);
+          }
+          return multipartMd5(buffer, partSize);
+      }
+      /**
+       * Reproduce the S3 multipart ETag for a buffer given a known part size.
+       *
+       * Algorithm (empirically stable since ~2006, undocumented by AWS):
+       *   1. Split buffer into ceil(size / partSize) chunks
+       *   2. Compute raw MD5 (16 bytes) of each chunk
+       *   3. Concatenate all raw digests
+       *   4. Compute MD5 of the concatenation → hex
+       *   5. Append "-" + number of parts
+       */
+      function multipartMd5(buffer, partSize) {
+          const totalSize = buffer.byteLength;
+          const partCount = Math.ceil(totalSize / partSize);
+          const rawDigests = new Uint8Array(partCount * 16);
+          for (let i = 0; i < partCount; i++) {
+              const start = i * partSize;
+              const end = Math.min(start + partSize, totalSize);
+              const partBuffer = buffer.slice(start, end);
+              // md5() returns a 32-char hex string; convert to 16 raw bytes
+              const hex = md5(new Uint8Array(partBuffer));
+              for (let j = 0; j < 16; j++) {
+                  rawDigests[i * 16 + j] = parseInt(hex.slice(j * 2, j * 2 + 2), 16);
+              }
+          }
+          return `${md5(rawDigests)}-${partCount}`;
+      }
       // helper: compute crc32 hex (use js-crc). Accepts ArrayBuffer and returns
       // lowercase, zero-padded 8-character hex string.
       // Accept ArrayBuffer, Uint8Array or Node Buffer and always compute the CRC32
@@ -48827,7 +48910,7 @@
               const { createRequire } = await Promise.resolve().then(function () { return _polyfillNode_module; });
               const req = createRequire((_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('applyMappingsWorker.js', document.baseURI).href));
               const mod = req('@aws-sdk/client-s3');
-              cachedS3Client = mod?.default || mod;
+              cachedS3Client = mod?.default ?? mod;
           }
           else {
               // browser-friendly dynamic import -> code-split chunk
@@ -48836,7 +48919,7 @@
           return cachedS3Client;
       }
-      async function curateOne({ fileInfo, outputTarget, mappingOptions, hashMethod, previousSourceFileInfo, previousMappedFileInfo, }) {
+      async function curateOne({ fileInfo, outputTarget, mappingOptions, hashMethod, hashPartSize, previousSourceFileInfo, previousMappedFileInfo, }) {
           const startTime = performance.now();
           let mtime;
           // 1) Read the file (from handle or blob)
@@ -48864,7 +48947,7 @@
                   throw new Error(`Failed to fetch ${fileInfo.url}: ${resp.status} ${resp.statusText}`);
               }
               file = await resp.blob();
-              const lastModifiedHeader = resp.headers.get('last-modified') || undefined;
+              const lastModifiedHeader = resp.headers.get('last-modified');
               if (lastModifiedHeader) {
                   mtime = new Date(lastModifiedHeader).toISOString();
               }
@@ -48937,7 +49020,9 @@
               }
           }
           // 3) read bytes (needed for deep hash)
-          const fileArrayBuffer = await file.arrayBuffer();
+          // Use let so we can null the reference after the last use, allowing GC to
+          // reclaim the buffer while the rest of the function (upload, hashing) runs.
+          let fileArrayBuffer = await file.arrayBuffer();
           let preMappedHash;
           let postMappedHash;
           const postMappedHashHeader = 'x-source-file-hash';
@@ -48945,8 +49030,8 @@
           let canSkip = false;
           if (previousSourceFileInfo?.preMappedHash !== undefined) {
               try {
-                  // choose hashing algorithm: default to crc64 (nvme-style) for compatibility
-                  preMappedHash = await hash(fileArrayBuffer, hashMethod || 'crc64');
+                  // choose hashing algorithm: default to md5 for S3 ETag compatibility
+                  preMappedHash = await hash(fileArrayBuffer, hashMethod ?? 'md5', hashPartSize);
               }
               catch (e) {
                   console.warn(`Failed to compute preMappedHash for ${fileInfo.name}`, e);
@@ -49059,8 +49144,8 @@
           // If we didn't compute preMappedHash yet, do it now
           if (!preMappedHash) {
               try {
-                  // choose hashing algorithm: default to crc64 (nvme-style) for compatibility
-                  preMappedHash = await hash(fileArrayBuffer, hashMethod || 'crc64');
+                  // choose hashing algorithm: default to md5 for S3 ETag compatibility
+                  preMappedHash = await hash(fileArrayBuffer, hashMethod ?? 'md5', hashPartSize);
               }
               catch (e) {
                   console.warn(`Failed to compute preMappedHash for ${fileInfo.name}`, e);
@@ -49077,7 +49162,13 @@
                   allowInvalidVRLength: true,
               });
               // Always calculate post-mapped hash even if deep compare is not requested
-              postMappedHash = await hash(modifiedArrayBuffer, hashMethod || 'crc64');
+              postMappedHash = await hash(modifiedArrayBuffer, hashMethod ?? 'md5', hashPartSize);
+              // Release the original file buffer — the modifiedArrayBuffer is all we
+              // need from this point. In the passthrough case (no header changes),
+              // modifiedArrayBuffer === fileArrayBuffer so the data stays alive through
+              // that reference; in the modified case, fileArrayBuffer is a separate
+              // allocation that can now be GC'd.
+              fileArrayBuffer = null;
               const previousPostMappedHash = previousMappedFileInfo
                   ? previousMappedFileInfo(clonedMapResults.outputFilePath)?.postMappedHash
                   : undefined;
@@ -49116,15 +49207,16 @@
                   const fullFilePath = path.join(fullDirPath, fileName);
                   await fs.writeFile(fullFilePath, new DataView(modifiedArrayBuffer));
               }
-              else {
+              else if (!outputTarget?.http && !outputTarget?.s3) {
+                  // Only create mappedBlob when there is no output target at all (no
+                  // directory, no HTTP endpoint, no S3 bucket). When an upload target is
+                  // present the upload builds its own request body from modifiedArrayBuffer,
+                  // and keeping a blob on the result would retain the full file content in
+                  // memory for every processed file, causing OOM crashes at scale.
                   clonedMapResults.mappedBlob = new Blob([modifiedArrayBuffer], {
                       type: 'application/octet-stream',
                   });
               }
-              // If no directory or even if directory present, expose mappedBlob for consumers
-              clonedMapResults.mappedBlob = new Blob([modifiedArrayBuffer], {
-                  type: 'application/octet-stream',
-              });
               // If upload URL (bucket) is provided, perform an HTTP PUT upload to the server
               if (outputTarget?.http) {
                   try {
@@ -49137,32 +49229,36 @@
                       const uploadUrl = `${outputTarget.http.url}/${key}`;
                       // Create headers per helper described by the user
                       const headers = {
-                          'Content-Type': clonedMapResults.mappedBlob.type || 'application/octet-stream',
+                          'Content-Type': 'application/octet-stream',
                           'X-File-Name': fileName,
-                          'X-File-Type': clonedMapResults.mappedBlob.type || 'application/octet-stream',
+                          'X-File-Type': 'application/octet-stream',
                           'X-File-Size': String(modifiedArrayBuffer.byteLength),
-                          'X-Source-File-Size': String(clonedMapResults.fileInfo?.size || ''),
-                          'X-Source-File-Modified-Time': mtime || '',
-                          'X-Source-File-Hash': preMappedHash || '',
+                          'X-Source-File-Size': String(clonedMapResults.fileInfo?.size ?? ''),
+                          'X-Source-File-Modified-Time': mtime ?? '',
+                          'X-Source-File-Hash': preMappedHash ?? '',
                       };
                       if (outputTarget.http.headers) {
                           Object.assign(headers, outputTarget.http.headers);
                       }
                       if (postMappedHashHeader && postMappedHash)
                           headers[postMappedHashHeader] = postMappedHash;
+                      // Wrap the buffer in a Blob only for this request instead of storing the
+                      // Blob on clonedMapResults, so the result object does not hold on to it.
                       const resp = await fetchWithRetry(uploadUrl, {
                           method: 'PUT',
                           headers,
-                          body: clonedMapResults.mappedBlob,
+                          body: new Blob([modifiedArrayBuffer], {
+                              type: 'application/octet-stream',
+                          }),
                       });
                       if (!resp.ok) {
                           console.error(`Upload failed for ${uploadUrl}: ${resp.status} ${resp.statusText}`);
-                          clonedMapResults.errors = clonedMapResults.errors || [];
+                          clonedMapResults.errors = clonedMapResults.errors ?? [];
                           clonedMapResults.errors.push(`Upload failed: ${resp.status} ${resp.statusText}`);
                       }
                       else {
                           // attach upload info if available
-                          clonedMapResults.outputUpload = clonedMapResults.outputUpload || {
+                          clonedMapResults.outputUpload = clonedMapResults.outputUpload ?? {
                               url: uploadUrl,
                               status: resp.status,
                           };
@@ -49170,7 +49266,7 @@
                   }
                   catch (e) {
                       console.error('Upload error', e);
-                      clonedMapResults.errors = clonedMapResults.errors || [];
+                      clonedMapResults.errors = clonedMapResults.errors ?? [];
                       clonedMapResults.errors.push(`Upload error: ${e instanceof Error ? e.message : String(e)}`);
                   }
               }
@@ -49193,12 +49289,14 @@
                       await client.send(new s3.PutObjectCommand({
                           Bucket: outputTarget.s3.bucketName,
                           Key: key,
-                          Body: await clonedMapResults.mappedBlob.arrayBuffer(),
-                          ContentType: clonedMapResults.mappedBlob.type || 'application/octet-stream',
+                          // Use the ArrayBuffer directly — going through Blob.arrayBuffer()
+                          // would create yet another copy of the data in memory.
+                          Body: new Uint8Array(modifiedArrayBuffer),
+                          ContentType: 'application/octet-stream',
                           Metadata: {
-                              'source-file-size': String(clonedMapResults.fileInfo?.size || ''),
-                              'source-file-modified-time': mtime || '',
-                              'source-file-hash': preMappedHash || '',
+                              'source-file-size': String(clonedMapResults.fileInfo?.size ?? ''),
+                              'source-file-modified-time': mtime ?? '',
+                              'source-file-hash': preMappedHash ?? '',
                               ...(postMappedHash
                                   ? { 'source-file-post-mapped-hash': postMappedHash }
                                   : {}),
@@ -49213,7 +49311,7 @@
                   }
                   catch (e) {
                       console.error('S3 Upload error', e);
-                      clonedMapResults.errors = clonedMapResults.errors || [];
+                      clonedMapResults.errors = clonedMapResults.errors ?? [];
                       clonedMapResults.errors.push(`S3 Upload error: ${e instanceof Error ? e.message : String(e)}`);
                   }
               }
@@ -62464,6 +62562,7 @@
                               fileInfo,
                               outputTarget: event.data.outputTarget ?? {},
                               hashMethod: event.data.hashMethod,
+                              hashPartSize: event.data.hashPartSize,
                               mappingOptions,
                               previousSourceFileInfo: event.data.previousFileInfo,
                               previousMappedFileInfo: (targetName) => {
@@ -90771,7 +90870,7 @@
         // For private tags (which don't have keywords), keep as-is
         const tagId = isPrivateTag(keyword)
             ? keyword
-            : data$1.DicomMetaDictionary.nameMap[keyword]?.tag || keyword;
+            : (data$1.DicomMetaDictionary.nameMap[keyword]?.tag ?? keyword);
         // Remove parentheses and commas, convert to the format used in dictionary keys
         return tagId.replace(/[(),]/g, '').toLowerCase();
     }
@@ -109053,7 +109152,7 @@
     // Deal with dcmjs quirk of labeling retired tags with a
     // "RETIRED_" prefix
     function getVr(keyword) {
-        const element = nameMap[keyword] || nameMap[`RETIRED_${keyword}`];
+        const element = nameMap[keyword] ?? nameMap[`RETIRED_${keyword}`];
         return element?.vr;
     }
     function temporalVr(vr) {
@@ -109128,7 +109227,7 @@
                     }
                 }
             }
-            return current[tagName] || null;
+            return current[tagName] ?? null;
         }
         const { cleanDescriptorsOption, cleanDescriptorsExceptions, retainLongitudinalTemporalInformationOptions, retainPatientCharacteristicsOption, retainDeviceIdentityOption, retainUIDsOption, retainSafePrivateOption, retainInstitutionIdentityOption, } = dicomPS315EOptions;
         const taggedps315EEls = ps315EElements.reduce((acc, item) => {
@@ -110768,17 +110867,20 @@
     var crcExports = requireCrc();
-    async function hash(buffer, hashMethod) {
+    const DEFAULT_HASH_PART_SIZE = 5 * 1024 * 1024; // 5 MB — matches @aws-sdk/lib-storage default
+    async function hash(buffer, hashMethod, hashPartSize) {
         switch (hashMethod) {
             case 'sha256':
                 return await sha256Hex(buffer);
             case 'crc32':
                 return crc32Hex(buffer);
-            case 'md5':
-                return md5Hex(buffer);
             case 'crc64':
-            default:
                 return crc64Hex(buffer);
+            case 'aws-s3-etag-2025':
+                return awsS3Etag(buffer, hashPartSize ?? DEFAULT_HASH_PART_SIZE);
+            case 'md5':
+            default:
+                return md5Hex(buffer);
         }
     }
     // helper: compute sha256 hex
@@ -110790,6 +110892,49 @@
     function md5Hex(buffer) {
         return md5(new Uint8Array(buffer));
     }
+    /**
+     * Compute a hash that matches the S3 ETag for the given buffer.
+     *
+     * - Single-part (buffer.byteLength <= partSize): plain MD5 hex string.
+     *   This matches the documented S3 ETag behaviour for objects created via
+     *   PUT Object with SSE-S3 (AES256) encryption.
+     *
+     * - Multi-part (buffer.byteLength > partSize): the undocumented but stable
+     *   composite format  md5(concat(md5_raw(part1) … md5_raw(partN)))-N
+     *   that S3 returns for objects created via the Multipart Upload API.
+     */
+    function awsS3Etag(buffer, partSize) {
+        if (buffer.byteLength <= partSize) {
+            return md5Hex(buffer);
+        }
+        return multipartMd5(buffer, partSize);
+    }
+    /**
+     * Reproduce the S3 multipart ETag for a buffer given a known part size.
+     *
+     * Algorithm (empirically stable since ~2006, undocumented by AWS):
+     *   1. Split buffer into ceil(size / partSize) chunks
+     *   2. Compute raw MD5 (16 bytes) of each chunk
+     *   3. Concatenate all raw digests
+     *   4. Compute MD5 of the concatenation → hex
+     *   5. Append "-" + number of parts
+     */
+    function multipartMd5(buffer, partSize) {
+        const totalSize = buffer.byteLength;
+        const partCount = Math.ceil(totalSize / partSize);
+        const rawDigests = new Uint8Array(partCount * 16);
+        for (let i = 0; i < partCount; i++) {
+            const start = i * partSize;
+            const end = Math.min(start + partSize, totalSize);
+            const partBuffer = buffer.slice(start, end);
+            // md5() returns a 32-char hex string; convert to 16 raw bytes
+            const hex = md5(new Uint8Array(partBuffer));
+            for (let j = 0; j < 16; j++) {
+                rawDigests[i * 16 + j] = parseInt(hex.slice(j * 2, j * 2 + 2), 16);
+            }
+        }
+        return `${md5(rawDigests)}-${partCount}`;
+    }
     // helper: compute crc32 hex (use js-crc). Accepts ArrayBuffer and returns
     // lowercase, zero-padded 8-character hex string.
     // Accept ArrayBuffer, Uint8Array or Node Buffer and always compute the CRC32
@@ -110878,7 +111023,7 @@
             const { createRequire } = await Promise.resolve().then(function () { return _polyfillNode_module; });
             const req = createRequire((typeof document === 'undefined' && typeof location === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : typeof document === 'undefined' ? location.href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('dicom-curate.umd.js', document.baseURI).href)));
             const mod = req('@aws-sdk/client-s3');
-            cachedS3Client = mod?.default || mod;
+            cachedS3Client = mod?.default ?? mod;
         }
         else {
             // browser-friendly dynamic import -> code-split chunk
@@ -110887,7 +111032,7 @@
         return cachedS3Client;
     }
-    async function curateOne({ fileInfo, outputTarget, mappingOptions, hashMethod, previousSourceFileInfo, previousMappedFileInfo, }) {
+    async function curateOne({ fileInfo, outputTarget, mappingOptions, hashMethod, hashPartSize, previousSourceFileInfo, previousMappedFileInfo, }) {
         const startTime = performance.now();
         let mtime;
         // 1) Read the file (from handle or blob)
@@ -110915,7 +111060,7 @@
                 throw new Error(`Failed to fetch ${fileInfo.url}: ${resp.status} ${resp.statusText}`);
             }
             file = await resp.blob();
-            const lastModifiedHeader = resp.headers.get('last-modified') || undefined;
+            const lastModifiedHeader = resp.headers.get('last-modified');
             if (lastModifiedHeader) {
                 mtime = new Date(lastModifiedHeader).toISOString();
             }
@@ -110988,7 +111133,9 @@
             }
         }
         // 3) read bytes (needed for deep hash)
-        const fileArrayBuffer = await file.arrayBuffer();
+        // Use let so we can null the reference after the last use, allowing GC to
+        // reclaim the buffer while the rest of the function (upload, hashing) runs.
+        let fileArrayBuffer = await file.arrayBuffer();
         let preMappedHash;
         let postMappedHash;
         const postMappedHashHeader = 'x-source-file-hash';
@@ -110996,8 +111143,8 @@
         let canSkip = false;
         if (previousSourceFileInfo?.preMappedHash !== undefined) {
             try {
-                // choose hashing algorithm: default to crc64 (nvme-style) for compatibility
-                preMappedHash = await hash(fileArrayBuffer, hashMethod || 'crc64');
+                // choose hashing algorithm: default to md5 for S3 ETag compatibility
+                preMappedHash = await hash(fileArrayBuffer, hashMethod ?? 'md5', hashPartSize);
             }
             catch (e) {
                 console.warn(`Failed to compute preMappedHash for ${fileInfo.name}`, e);
@@ -111110,8 +111257,8 @@
         // If we didn't compute preMappedHash yet, do it now
         if (!preMappedHash) {
             try {
-                // choose hashing algorithm: default to crc64 (nvme-style) for compatibility
-                preMappedHash = await hash(fileArrayBuffer, hashMethod || 'crc64');
+                // choose hashing algorithm: default to md5 for S3 ETag compatibility
+                preMappedHash = await hash(fileArrayBuffer, hashMethod ?? 'md5', hashPartSize);
             }
             catch (e) {
                 console.warn(`Failed to compute preMappedHash for ${fileInfo.name}`, e);
@@ -111128,7 +111275,13 @@
                 allowInvalidVRLength: true,
             });
             // Always calculate post-mapped hash even if deep compare is not requested
-            postMappedHash = await hash(modifiedArrayBuffer, hashMethod || 'crc64');
+            postMappedHash = await hash(modifiedArrayBuffer, hashMethod ?? 'md5', hashPartSize);
+            // Release the original file buffer — the modifiedArrayBuffer is all we
+            // need from this point. In the passthrough case (no header changes),
+            // modifiedArrayBuffer === fileArrayBuffer so the data stays alive through
+            // that reference; in the modified case, fileArrayBuffer is a separate
+            // allocation that can now be GC'd.
+            fileArrayBuffer = null;
             const previousPostMappedHash = previousMappedFileInfo
                 ? previousMappedFileInfo(clonedMapResults.outputFilePath)?.postMappedHash
                 : undefined;
@@ -111167,15 +111320,16 @@
                 const fullFilePath = path.join(fullDirPath, fileName);
                 await fs.writeFile(fullFilePath, new DataView(modifiedArrayBuffer));
             }
-            else {
+            else if (!outputTarget?.http && !outputTarget?.s3) {
+                // Only create mappedBlob when there is no output target at all (no
+                // directory, no HTTP endpoint, no S3 bucket). When an upload target is
+                // present the upload builds its own request body from modifiedArrayBuffer,
+                // and keeping a blob on the result would retain the full file content in
+                // memory for every processed file, causing OOM crashes at scale.
                 clonedMapResults.mappedBlob = new Blob([modifiedArrayBuffer], {
                     type: 'application/octet-stream',
                 });
             }
-            // If no directory or even if directory present, expose mappedBlob for consumers
-            clonedMapResults.mappedBlob = new Blob([modifiedArrayBuffer], {
-                type: 'application/octet-stream',
-            });
             // If upload URL (bucket) is provided, perform an HTTP PUT upload to the server
             if (outputTarget?.http) {
                 try {
@@ -111188,32 +111342,36 @@
                     const uploadUrl = `${outputTarget.http.url}/${key}`;
                     // Create headers per helper described by the user
                     const headers = {
-                        'Content-Type': clonedMapResults.mappedBlob.type || 'application/octet-stream',
+                        'Content-Type': 'application/octet-stream',
                         'X-File-Name': fileName,
-                        'X-File-Type': clonedMapResults.mappedBlob.type || 'application/octet-stream',
+                        'X-File-Type': 'application/octet-stream',
                         'X-File-Size': String(modifiedArrayBuffer.byteLength),
-                        'X-Source-File-Size': String(clonedMapResults.fileInfo?.size || ''),
-                        'X-Source-File-Modified-Time': mtime || '',
-                        'X-Source-File-Hash': preMappedHash || '',
+                        'X-Source-File-Size': String(clonedMapResults.fileInfo?.size ?? ''),
+                        'X-Source-File-Modified-Time': mtime ?? '',
+                        'X-Source-File-Hash': preMappedHash ?? '',
                     };
                     if (outputTarget.http.headers) {
                         Object.assign(headers, outputTarget.http.headers);
                     }
                     if (postMappedHashHeader && postMappedHash)
                         headers[postMappedHashHeader] = postMappedHash;
+                    // Wrap the ArrayBuffer in a short-lived Blob for this request only —
+                    // nothing is stored on clonedMapResults, so it is not retained.
                     const resp = await fetchWithRetry(uploadUrl, {
                         method: 'PUT',
                         headers,
-                        body: clonedMapResults.mappedBlob,
+                        body: new Blob([modifiedArrayBuffer], {
+                            type: 'application/octet-stream',
+                        }),
                     });
                     if (!resp.ok) {
                         console.error(`Upload failed for ${uploadUrl}: ${resp.status} ${resp.statusText}`);
-                        clonedMapResults.errors = clonedMapResults.errors || [];
+                        clonedMapResults.errors = clonedMapResults.errors ?? [];
                         clonedMapResults.errors.push(`Upload failed: ${resp.status} ${resp.statusText}`);
                     }
                     else {
                         // attach upload info if available
-                        clonedMapResults.outputUpload = clonedMapResults.outputUpload || {
+                        clonedMapResults.outputUpload = clonedMapResults.outputUpload ?? {
                             url: uploadUrl,
                             status: resp.status,
                         };
@@ -111221,7 +111379,7 @@
                 }
                 catch (e) {
                     console.error('Upload error', e);
-                    clonedMapResults.errors = clonedMapResults.errors || [];
+                    clonedMapResults.errors = clonedMapResults.errors ?? [];
                     clonedMapResults.errors.push(`Upload error: ${e instanceof Error ? e.message : String(e)}`);
                 }
             }
@@ -111244,12 +111402,14 @@
                     await client.send(new s3.PutObjectCommand({
                         Bucket: outputTarget.s3.bucketName,
                         Key: key,
-                        Body: await clonedMapResults.mappedBlob.arrayBuffer(),
-                        ContentType: clonedMapResults.mappedBlob.type || 'application/octet-stream',
+                        // Use the ArrayBuffer directly — going through Blob.arrayBuffer()
+                        // would create yet another copy of the data in memory.
+                        Body: new Uint8Array(modifiedArrayBuffer),
+                        ContentType: 'application/octet-stream',
                         Metadata: {
-                            'source-file-size': String(clonedMapResults.fileInfo?.size || ''),
-                            'source-file-modified-time': mtime || '',
-                            'source-file-hash': preMappedHash || '',
+                            'source-file-size': String(clonedMapResults.fileInfo?.size ?? ''),
+                            'source-file-modified-time': mtime ?? '',
+                            'source-file-hash': preMappedHash ?? '',
                             ...(postMappedHash
                                 ? { 'source-file-post-mapped-hash': postMappedHash }
                                 : {}),
@@ -111264,7 +111424,7 @@
                 }
                 catch (e) {
                     console.error('S3 Upload error', e);
-                    clonedMapResults.errors = clonedMapResults.errors || [];
+                    clonedMapResults.errors = clonedMapResults.errors ?? [];
                     clonedMapResults.errors.push(`S3 Upload error: ${e instanceof Error ? e.message : String(e)}`);
                 }
             }
@@ -126760,16 +126920,40 @@
     let scanAnomalies = [];
     // Callbacks set by curateMany, stored here for use by the dispatch loop.
     let progressCallback = () => { };
+    // Callback to resume the scan worker when the processing queue drains below
+    // the low-water mark. Set by curateMany via setScanResumeCallback().
+    let scanResumeCallback = null;
+    let scanPaused = false;
+    /**
+     * Low-water mark for the file processing queue. When the queue size drops
+     * below this threshold after a dispatch, the scan worker is resumed.
+     */
+    const LOW_WATER_MARK = 50;
     // -------------------------------------------------------------------------
     // Public API
     // -------------------------------------------------------------------------
     function setMappingWorkerOptions(opts) {
         mappingWorkerOptions = opts;
     }
+    /**
+     * Register a callback that resumes the scan worker. Called by curateMany
+     * after the scan worker is created.
+     */
+    function setScanResumeCallback(cb) {
+        scanResumeCallback = cb;
+        scanPaused = false;
+    }
+    /**
+     * Mark the scan as paused. Called from the scan worker message handler in
+     * index.ts when the queue exceeds the high-water mark.
+     */
+    function markScanPaused() {
+        scanPaused = true;
+    }
     /**
      * Initialize the mapping worker pool. Call once per curateMany invocation.
      */
-    async function initializeMappingWorkers(skipCollectingMappings, fileInfoIndex, progressCb) {
+    async function initializeMappingWorkers(skipCollectingMappings, fileInfoIndex, progressCb, workerCount) {
         mappingWorkerOptions = {};
         workersActive = 0;
         mapResultsList = skipCollectingMappings ? undefined : [];
@@ -126783,8 +126967,8 @@
         scanAnomalies = [];
         if (progressCb)
             progressCallback = progressCb;
-        const workerCount = navigator.hardwareConcurrency;
-        const workers = await Promise.all(Array.from({ length: workerCount }, () => createMappingWorker(fileInfoIndex)));
+        const effectiveWorkerCount = workerCount ?? Math.min(await getHardwareConcurrency(), 8);
+        const workers = await Promise.all(Array.from({ length: effectiveWorkerCount }, () => createMappingWorker(fileInfoIndex)));
         availableMappingWorkers.push(...workers);
     }
     /**
@@ -126799,7 +126983,7 @@
             // Track which file this worker is processing so we can identify it
             // if the worker crashes.
             workerCurrentFile.set(mappingWorker, fileInfo);
-            const { outputTarget, hashMethod, ...mappingOptions } =
+            const { outputTarget, hashMethod, hashPartSize, ...mappingOptions } =
             // Not partial anymore.
             mappingWorkerOptions;
             mappingWorker.postMessage({
@@ -126808,10 +126992,20 @@
                 outputTarget: await getHttpOutputHeaders(outputTarget),
                 previousFileInfo,
                 hashMethod,
+                hashPartSize,
                 serializedMappingOptions: serializeMappingOptions(mappingOptions),
             });
             workersActive += 1;
         }
+        // Backpressure: resume the scan worker when the queue drains below the
+        // low-water mark. This prevents the queue from staying empty while the
+        // scan worker is paused.
+        if (scanPaused &&
+            filesToProcess.length < LOW_WATER_MARK &&
+            scanResumeCallback) {
+            scanPaused = false;
+            scanResumeCallback();
+        }
         if (workersActive === 0 &&
             pendingReplacements === 0 &&
             directoryScanFinished &&
@@ -126851,6 +127045,18 @@
     // -------------------------------------------------------------------------
     // Internal helpers
     // -------------------------------------------------------------------------
+    /**
+     * Return the number of logical CPUs available, working in both browser and
+     * Node.js environments. Falls back to `os.cpus().length` when
+     * `navigator.hardwareConcurrency` is missing or falsy (e.g. Node.js < 21).
+     */
+    async function getHardwareConcurrency() {
+        if (typeof navigator !== 'undefined' && navigator.hardwareConcurrency) {
+            return navigator.hardwareConcurrency;
+        }
+        const { cpus } = await import('node:os');
+        return cpus().length;
+    }
     /**
      * Recover from a mapping worker crash. Returns the worker slot, counts the
      * in-flight file as a mapping error, and re-dispatches. Called from onerror,
@@ -127049,10 +127255,14 @@
                         scanAnomalies: [], // Files sent to processing have no scan anomalies
                         previousFileInfo,
                     });
-                    // Could do some throttling:
-                    // if (filesToProcess.length > 10) {
-                    //   fileListWorker.postMessage({ request: 'stop' })
-                    // }
+                    // Backpressure: when the queue grows too large, pause the scan
+                    // worker so file handles don't accumulate unboundedly in memory.
+                    // The scan worker supports 'stop' and 'resume' commands.
+                    const HIGH_WATER_MARK = 100;
+                    if (filesToProcess.length > HIGH_WATER_MARK) {
+                        fileListWorker.postMessage({ request: 'stop' });
+                        markScanPaused();
+                    }
                     dispatchMappingJobs();
                     break;
                 }
@@ -127123,6 +127333,7 @@
         const skipModifications = organizeOptions.skipModifications ?? false;
         const skipValidation = organizeOptions.skipValidation ?? false;
         const hashMethod = organizeOptions.hashMethod;
+        const hashPartSize = organizeOptions.hashPartSize;
         const dateOffset = organizeOptions.dateOffset;
         if (requiresDateOffset(deIdOpts) && !dateOffset?.match(iso8601)) {
             throw new Error('When using "Offset" for retainLongitudinalTemporalInformationOptions, an iso8601 compatible dateOffset must be provided.');
@@ -127136,6 +127347,7 @@
             skipValidation,
             dateOffset,
             hashMethod,
+            hashPartSize,
         };
     }
     function queueFilesForMapping(organizeOptions) {
@@ -127215,7 +127427,7 @@
             };
             try {
                 // create the mapping workers
-                await initializeMappingWorkers(organizeOptions.skipCollectingMappings, organizeOptions.fileInfoIndex, progressCallback);
+                await initializeMappingWorkers(organizeOptions.skipCollectingMappings, organizeOptions.fileInfoIndex, progressCallback, organizeOptions.workerCount);
                 // Set global mappingWorkerOptions
                 setMappingWorkerOptions((await collectMappingOptions(organizeOptions)));
                 //
@@ -127228,6 +127440,11 @@
                     organizeOptions.inputType === 'path' ||
                     organizeOptions.inputType === 's3') {
                     const fileListWorker = await initializeFileListWorker(rejectCallback);
+                    // Wire up backpressure resume: when the dispatch loop drains the
+                    // queue below the low-water mark, it calls this to resume scanning.
+                    setScanResumeCallback(() => {
+                        fileListWorker.postMessage({ request: 'resume' });
+                    });
                     let specExcludedFiletypes;
                     let noDicomSignatureCheck = false;
                     let noDefaultExclusions = false;