dicom-curate 0.26.2 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -50,7 +50,7 @@
50
50
  const { createRequire } = await Promise.resolve().then(function () { return _polyfillNode_module; });
51
51
  const req = createRequire((_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('scanDirectoryWorker.js', document.baseURI).href));
52
52
  const mod = req('@aws-sdk/client-s3');
53
- cachedS3Client = mod?.default || mod;
53
+ cachedS3Client = mod?.default ?? mod;
54
54
  }
55
55
  else {
56
56
  // browser-friendly dynamic import -> code-split chunk
@@ -95,6 +95,31 @@
95
95
  '.ds_store',
96
96
  ];
97
97
  let keepScanning = true;
98
+ // Backpressure gate: when the main thread signals 'stop', the scan worker
99
+ // awaits this promise before emitting the next file. 'resume' resolves it.
100
+ let pauseResolve = null;
101
+ let pausePromise = null;
102
+ function pauseScanning() {
103
+ if (!pausePromise) {
104
+ pausePromise = new Promise((resolve) => {
105
+ pauseResolve = resolve;
106
+ });
107
+ }
108
+ }
109
+ function resumeScanning() {
110
+ if (pauseResolve) {
111
+ pauseResolve();
112
+ pauseResolve = null;
113
+ pausePromise = null;
114
+ }
115
+ }
116
+ /** If paused, wait until resumed. Returns false if scanning was aborted. */
117
+ async function waitIfPaused() {
118
+ if (pausePromise) {
119
+ await pausePromise;
120
+ }
121
+ return keepScanning;
122
+ }
98
123
  let excludedFiletypes = [];
99
124
  // Compiled regexes from glob patterns, used to exclude files by path
100
125
  let excludedPathRegexes = [];
@@ -272,7 +297,13 @@
272
297
  break;
273
298
  }
274
299
  case 'stop': {
275
- keepScanning = false;
300
+ // Pause scanning — the scan loop will await waitIfPaused()
301
+ pauseScanning();
302
+ break;
303
+ }
304
+ case 'resume': {
305
+ // Resume scanning after a pause
306
+ resumeScanning();
276
307
  break;
277
308
  }
278
309
  default:
@@ -356,6 +387,9 @@
356
387
  for await (const entry of dir.values()) {
357
388
  if (!keepScanning)
358
389
  return;
390
+ // Backpressure: if the main thread paused us, wait here until resumed
391
+ if (!(await waitIfPaused()))
392
+ return;
359
393
  if (entry.kind === 'file') {
360
394
  const file = await entry.getFile();
361
395
  const fileAnomalies = [];
@@ -422,6 +456,9 @@
422
456
  for (const entry of entries) {
423
457
  if (!keepScanning)
424
458
  return;
459
+ // Backpressure: if the main thread paused us, wait here until resumed
460
+ if (!(await waitIfPaused()))
461
+ return;
425
462
  if (entry.isFile()) {
426
463
  const filePath = path.join(currentPath, entry.name);
427
464
  const stats = await fs.stat(filePath);
@@ -28715,7 +28752,7 @@
28715
28752
  // For private tags (which don't have keywords), keep as-is
28716
28753
  const tagId = isPrivateTag(keyword)
28717
28754
  ? keyword
28718
- : data$1.DicomMetaDictionary.nameMap[keyword]?.tag || keyword;
28755
+ : (data$1.DicomMetaDictionary.nameMap[keyword]?.tag ?? keyword);
28719
28756
  // Remove parentheses and commas, convert to the format used in dictionary keys
28720
28757
  return tagId.replace(/[(),]/g, '').toLowerCase();
28721
28758
  }
@@ -46996,7 +47033,7 @@
46996
47033
  // Deal with dcmjs quirk of labeling retired tags with a
46997
47034
  // "RETIRED_" prefix
46998
47035
  function getVr(keyword) {
46999
- const element = nameMap[keyword] || nameMap[`RETIRED_${keyword}`];
47036
+ const element = nameMap[keyword] ?? nameMap[`RETIRED_${keyword}`];
47000
47037
  return element?.vr;
47001
47038
  }
47002
47039
  function temporalVr(vr) {
@@ -47071,7 +47108,7 @@
47071
47108
  }
47072
47109
  }
47073
47110
  }
47074
- return current[tagName] || null;
47111
+ return current[tagName] ?? null;
47075
47112
  }
47076
47113
  const { cleanDescriptorsOption, cleanDescriptorsExceptions, retainLongitudinalTemporalInformationOptions, retainPatientCharacteristicsOption, retainDeviceIdentityOption, retainUIDsOption, retainSafePrivateOption, retainInstitutionIdentityOption, } = dicomPS315EOptions;
47077
47114
  const taggedps315EEls = ps315EElements.reduce((acc, item) => {
@@ -48717,17 +48754,20 @@
48717
48754
 
48718
48755
  var crcExports = requireCrc();
48719
48756
 
48720
- async function hash(buffer, hashMethod) {
48757
+ const DEFAULT_HASH_PART_SIZE = 5 * 1024 * 1024; // 5 MB — matches @aws-sdk/lib-storage default
48758
+ async function hash(buffer, hashMethod, hashPartSize) {
48721
48759
  switch (hashMethod) {
48722
48760
  case 'sha256':
48723
48761
  return await sha256Hex(buffer);
48724
48762
  case 'crc32':
48725
48763
  return crc32Hex(buffer);
48726
- case 'md5':
48727
- return md5Hex(buffer);
48728
48764
  case 'crc64':
48729
- default:
48730
48765
  return crc64Hex(buffer);
48766
+ case 'aws-s3-etag-2025':
48767
+ return awsS3Etag(buffer, hashPartSize ?? DEFAULT_HASH_PART_SIZE);
48768
+ case 'md5':
48769
+ default:
48770
+ return md5Hex(buffer);
48731
48771
  }
48732
48772
  }
48733
48773
  // helper: compute sha256 hex
@@ -48739,6 +48779,49 @@
48739
48779
  function md5Hex(buffer) {
48740
48780
  return md5(new Uint8Array(buffer));
48741
48781
  }
48782
+ /**
48783
+ * Compute a hash that matches the S3 ETag for the given buffer.
48784
+ *
48785
+ * - Single-part (buffer.byteLength <= partSize): plain MD5 hex string.
48786
+ * This matches the documented S3 ETag behaviour for objects created via
48787
+ * PUT Object with SSE-S3 (AES256) encryption.
48788
+ *
48789
+ * - Multi-part (buffer.byteLength > partSize): the undocumented but stable
48790
+ * composite format md5(concat(md5_raw(part1) … md5_raw(partN)))-N
48791
+ * that S3 returns for objects created via the Multipart Upload API.
48792
+ */
48793
+ function awsS3Etag(buffer, partSize) {
48794
+ if (buffer.byteLength <= partSize) {
48795
+ return md5Hex(buffer);
48796
+ }
48797
+ return multipartMd5(buffer, partSize);
48798
+ }
48799
+ /**
48800
+ * Reproduce the S3 multipart ETag for a buffer given a known part size.
48801
+ *
48802
+ * Algorithm (empirically stable since multipart upload launched in 2010, undocumented by AWS):
48803
+ * 1. Split buffer into ceil(size / partSize) chunks
48804
+ * 2. Compute raw MD5 (16 bytes) of each chunk
48805
+ * 3. Concatenate all raw digests
48806
+ * 4. Compute MD5 of the concatenation → hex
48807
+ * 5. Append "-" + number of parts
48808
+ */
48809
+ function multipartMd5(buffer, partSize) {
48810
+ const totalSize = buffer.byteLength;
48811
+ const partCount = Math.ceil(totalSize / partSize);
48812
+ const rawDigests = new Uint8Array(partCount * 16);
48813
+ for (let i = 0; i < partCount; i++) {
48814
+ const start = i * partSize;
48815
+ const end = Math.min(start + partSize, totalSize);
48816
+ const partBuffer = buffer.slice(start, end);
48817
+ // md5() returns a 32-char hex string; convert to 16 raw bytes
48818
+ const hex = md5(new Uint8Array(partBuffer));
48819
+ for (let j = 0; j < 16; j++) {
48820
+ rawDigests[i * 16 + j] = parseInt(hex.slice(j * 2, j * 2 + 2), 16);
48821
+ }
48822
+ }
48823
+ return `${md5(rawDigests)}-${partCount}`;
48824
+ }
48742
48825
  // helper: compute crc32 hex (use js-crc). Accepts ArrayBuffer and returns
48743
48826
  // lowercase, zero-padded 8-character hex string.
48744
48827
  // Accept ArrayBuffer, Uint8Array or Node Buffer and always compute the CRC32
@@ -48827,7 +48910,7 @@
48827
48910
  const { createRequire } = await Promise.resolve().then(function () { return _polyfillNode_module; });
48828
48911
  const req = createRequire((_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('applyMappingsWorker.js', document.baseURI).href));
48829
48912
  const mod = req('@aws-sdk/client-s3');
48830
- cachedS3Client = mod?.default || mod;
48913
+ cachedS3Client = mod?.default ?? mod;
48831
48914
  }
48832
48915
  else {
48833
48916
  // browser-friendly dynamic import -> code-split chunk
@@ -48836,7 +48919,7 @@
48836
48919
  return cachedS3Client;
48837
48920
  }
48838
48921
 
48839
- async function curateOne({ fileInfo, outputTarget, mappingOptions, hashMethod, previousSourceFileInfo, previousMappedFileInfo, }) {
48922
+ async function curateOne({ fileInfo, outputTarget, mappingOptions, hashMethod, hashPartSize, previousSourceFileInfo, previousMappedFileInfo, }) {
48840
48923
  const startTime = performance.now();
48841
48924
  let mtime;
48842
48925
  // 1) Read the file (from handle or blob)
@@ -48864,7 +48947,7 @@
48864
48947
  throw new Error(`Failed to fetch ${fileInfo.url}: ${resp.status} ${resp.statusText}`);
48865
48948
  }
48866
48949
  file = await resp.blob();
48867
- const lastModifiedHeader = resp.headers.get('last-modified') || undefined;
48950
+ const lastModifiedHeader = resp.headers.get('last-modified');
48868
48951
  if (lastModifiedHeader) {
48869
48952
  mtime = new Date(lastModifiedHeader).toISOString();
48870
48953
  }
@@ -48937,7 +49020,9 @@
48937
49020
  }
48938
49021
  }
48939
49022
  // 3) read bytes (needed for deep hash)
48940
- const fileArrayBuffer = await file.arrayBuffer();
49023
+ // Use let so we can null the reference after the last use, allowing GC to
49024
+ // reclaim the buffer while the rest of the function (upload, hashing) runs.
49025
+ let fileArrayBuffer = await file.arrayBuffer();
48941
49026
  let preMappedHash;
48942
49027
  let postMappedHash;
48943
49028
  const postMappedHashHeader = 'x-source-file-hash';
@@ -48945,8 +49030,8 @@
48945
49030
  let canSkip = false;
48946
49031
  if (previousSourceFileInfo?.preMappedHash !== undefined) {
48947
49032
  try {
48948
- // choose hashing algorithm: default to crc64 (nvme-style) for compatibility
48949
- preMappedHash = await hash(fileArrayBuffer, hashMethod || 'crc64');
49033
+ // choose hashing algorithm: default to md5 for S3 ETag compatibility
49034
+ preMappedHash = await hash(fileArrayBuffer, hashMethod ?? 'md5', hashPartSize);
48950
49035
  }
48951
49036
  catch (e) {
48952
49037
  console.warn(`Failed to compute preMappedHash for ${fileInfo.name}`, e);
@@ -49059,8 +49144,8 @@
49059
49144
  // If we didn't compute preMappedHash yet, do it now
49060
49145
  if (!preMappedHash) {
49061
49146
  try {
49062
- // choose hashing algorithm: default to crc64 (nvme-style) for compatibility
49063
- preMappedHash = await hash(fileArrayBuffer, hashMethod || 'crc64');
49147
+ // choose hashing algorithm: default to md5 for S3 ETag compatibility
49148
+ preMappedHash = await hash(fileArrayBuffer, hashMethod ?? 'md5', hashPartSize);
49064
49149
  }
49065
49150
  catch (e) {
49066
49151
  console.warn(`Failed to compute preMappedHash for ${fileInfo.name}`, e);
@@ -49077,7 +49162,13 @@
49077
49162
  allowInvalidVRLength: true,
49078
49163
  });
49079
49164
  // Always calculate post-mapped hash even if deep compare is not requested
49080
- postMappedHash = await hash(modifiedArrayBuffer, hashMethod || 'crc64');
49165
+ postMappedHash = await hash(modifiedArrayBuffer, hashMethod ?? 'md5', hashPartSize);
49166
+ // Release the original file buffer — the modifiedArrayBuffer is all we
49167
+ // need from this point. In the passthrough case (no header changes),
49168
+ // modifiedArrayBuffer === fileArrayBuffer so the data stays alive through
49169
+ // that reference; in the modified case, fileArrayBuffer is a separate
49170
+ // allocation that can now be GC'd.
49171
+ fileArrayBuffer = null;
49081
49172
  const previousPostMappedHash = previousMappedFileInfo
49082
49173
  ? previousMappedFileInfo(clonedMapResults.outputFilePath)?.postMappedHash
49083
49174
  : undefined;
@@ -49116,15 +49207,16 @@
49116
49207
  const fullFilePath = path.join(fullDirPath, fileName);
49117
49208
  await fs.writeFile(fullFilePath, new DataView(modifiedArrayBuffer));
49118
49209
  }
49119
- else {
49210
+ else if (!outputTarget?.http && !outputTarget?.s3) {
49211
+ // Only create mappedBlob when there is no output target at all (no
49212
+ // directory, no HTTP endpoint, no S3 bucket). When an upload target is
49213
+ // present the upload builds its own body, and keeping a blob around
49214
+ // retains the full file content in memory for every processed file,
49215
+ // causing OOM crashes at scale.
49120
49216
  clonedMapResults.mappedBlob = new Blob([modifiedArrayBuffer], {
49121
49217
  type: 'application/octet-stream',
49122
49218
  });
49123
49219
  }
49124
- // If no directory or even if directory present, expose mappedBlob for consumers
49125
- clonedMapResults.mappedBlob = new Blob([modifiedArrayBuffer], {
49126
- type: 'application/octet-stream',
49127
- });
49128
49220
  // If upload URL (bucket) is provided, perform an HTTP PUT upload to the server
49129
49221
  if (outputTarget?.http) {
49130
49222
  try {
@@ -49137,32 +49229,36 @@
49137
49229
  const uploadUrl = `${outputTarget.http.url}/${key}`;
49138
49230
  // Create headers per helper described by the user
49139
49231
  const headers = {
49140
- 'Content-Type': clonedMapResults.mappedBlob.type || 'application/octet-stream',
49232
+ 'Content-Type': 'application/octet-stream',
49141
49233
  'X-File-Name': fileName,
49142
- 'X-File-Type': clonedMapResults.mappedBlob.type || 'application/octet-stream',
49234
+ 'X-File-Type': 'application/octet-stream',
49143
49235
  'X-File-Size': String(modifiedArrayBuffer.byteLength),
49144
- 'X-Source-File-Size': String(clonedMapResults.fileInfo?.size || ''),
49145
- 'X-Source-File-Modified-Time': mtime || '',
49146
- 'X-Source-File-Hash': preMappedHash || '',
49236
+ 'X-Source-File-Size': String(clonedMapResults.fileInfo?.size ?? ''),
49237
+ 'X-Source-File-Modified-Time': mtime ?? '',
49238
+ 'X-Source-File-Hash': preMappedHash ?? '',
49147
49239
  };
49148
49240
  if (outputTarget.http.headers) {
49149
49241
  Object.assign(headers, outputTarget.http.headers);
49150
49242
  }
49151
49243
  if (postMappedHashHeader && postMappedHash)
49152
49244
  headers[postMappedHashHeader] = postMappedHash;
49245
+ // Build the request body from modifiedArrayBuffer at send time instead of
49246
+ // reusing a mappedBlob kept on the result object.
49153
49247
  const resp = await fetchWithRetry(uploadUrl, {
49154
49248
  method: 'PUT',
49155
49249
  headers,
49156
- body: clonedMapResults.mappedBlob,
49250
+ body: new Blob([modifiedArrayBuffer], {
49251
+ type: 'application/octet-stream',
49252
+ }),
49157
49253
  });
49158
49254
  if (!resp.ok) {
49159
49255
  console.error(`Upload failed for ${uploadUrl}: ${resp.status} ${resp.statusText}`);
49160
- clonedMapResults.errors = clonedMapResults.errors || [];
49256
+ clonedMapResults.errors = clonedMapResults.errors ?? [];
49161
49257
  clonedMapResults.errors.push(`Upload failed: ${resp.status} ${resp.statusText}`);
49162
49258
  }
49163
49259
  else {
49164
49260
  // attach upload info if available
49165
- clonedMapResults.outputUpload = clonedMapResults.outputUpload || {
49261
+ clonedMapResults.outputUpload = clonedMapResults.outputUpload ?? {
49166
49262
  url: uploadUrl,
49167
49263
  status: resp.status,
49168
49264
  };
@@ -49170,7 +49266,7 @@
49170
49266
  }
49171
49267
  catch (e) {
49172
49268
  console.error('Upload error', e);
49173
- clonedMapResults.errors = clonedMapResults.errors || [];
49269
+ clonedMapResults.errors = clonedMapResults.errors ?? [];
49174
49270
  clonedMapResults.errors.push(`Upload error: ${e instanceof Error ? e.message : String(e)}`);
49175
49271
  }
49176
49272
  }
@@ -49193,12 +49289,14 @@
49193
49289
  await client.send(new s3.PutObjectCommand({
49194
49290
  Bucket: outputTarget.s3.bucketName,
49195
49291
  Key: key,
49196
- Body: await clonedMapResults.mappedBlob.arrayBuffer(),
49197
- ContentType: clonedMapResults.mappedBlob.type || 'application/octet-stream',
49292
+ // Use the ArrayBuffer directly — going through Blob.arrayBuffer()
49293
+ // would create yet another copy of the data in memory.
49294
+ Body: new Uint8Array(modifiedArrayBuffer),
49295
+ ContentType: 'application/octet-stream',
49198
49296
  Metadata: {
49199
- 'source-file-size': String(clonedMapResults.fileInfo?.size || ''),
49200
- 'source-file-modified-time': mtime || '',
49201
- 'source-file-hash': preMappedHash || '',
49297
+ 'source-file-size': String(clonedMapResults.fileInfo?.size ?? ''),
49298
+ 'source-file-modified-time': mtime ?? '',
49299
+ 'source-file-hash': preMappedHash ?? '',
49202
49300
  ...(postMappedHash
49203
49301
  ? { 'source-file-post-mapped-hash': postMappedHash }
49204
49302
  : {}),
@@ -49213,7 +49311,7 @@
49213
49311
  }
49214
49312
  catch (e) {
49215
49313
  console.error('S3 Upload error', e);
49216
- clonedMapResults.errors = clonedMapResults.errors || [];
49314
+ clonedMapResults.errors = clonedMapResults.errors ?? [];
49217
49315
  clonedMapResults.errors.push(`S3 Upload error: ${e instanceof Error ? e.message : String(e)}`);
49218
49316
  }
49219
49317
  }
@@ -62464,6 +62562,7 @@
62464
62562
  fileInfo,
62465
62563
  outputTarget: event.data.outputTarget ?? {},
62466
62564
  hashMethod: event.data.hashMethod,
62565
+ hashPartSize: event.data.hashPartSize,
62467
62566
  mappingOptions,
62468
62567
  previousSourceFileInfo: event.data.previousFileInfo,
62469
62568
  previousMappedFileInfo: (targetName) => {
@@ -90771,7 +90870,7 @@
90771
90870
  // For private tags (which don't have keywords), keep as-is
90772
90871
  const tagId = isPrivateTag(keyword)
90773
90872
  ? keyword
90774
- : data$1.DicomMetaDictionary.nameMap[keyword]?.tag || keyword;
90873
+ : (data$1.DicomMetaDictionary.nameMap[keyword]?.tag ?? keyword);
90775
90874
  // Remove parentheses and commas, convert to the format used in dictionary keys
90776
90875
  return tagId.replace(/[(),]/g, '').toLowerCase();
90777
90876
  }
@@ -109053,7 +109152,7 @@
109053
109152
  // Deal with dcmjs quirk of labeling retired tags with a
109054
109153
  // "RETIRED_" prefix
109055
109154
  function getVr(keyword) {
109056
- const element = nameMap[keyword] || nameMap[`RETIRED_${keyword}`];
109155
+ const element = nameMap[keyword] ?? nameMap[`RETIRED_${keyword}`];
109057
109156
  return element?.vr;
109058
109157
  }
109059
109158
  function temporalVr(vr) {
@@ -109128,7 +109227,7 @@
109128
109227
  }
109129
109228
  }
109130
109229
  }
109131
- return current[tagName] || null;
109230
+ return current[tagName] ?? null;
109132
109231
  }
109133
109232
  const { cleanDescriptorsOption, cleanDescriptorsExceptions, retainLongitudinalTemporalInformationOptions, retainPatientCharacteristicsOption, retainDeviceIdentityOption, retainUIDsOption, retainSafePrivateOption, retainInstitutionIdentityOption, } = dicomPS315EOptions;
109134
109233
  const taggedps315EEls = ps315EElements.reduce((acc, item) => {
@@ -110768,17 +110867,20 @@
110768
110867
 
110769
110868
  var crcExports = requireCrc();
110770
110869
 
110771
- async function hash(buffer, hashMethod) {
110870
+ const DEFAULT_HASH_PART_SIZE = 5 * 1024 * 1024; // 5 MB — matches @aws-sdk/lib-storage default
110871
+ async function hash(buffer, hashMethod, hashPartSize) {
110772
110872
  switch (hashMethod) {
110773
110873
  case 'sha256':
110774
110874
  return await sha256Hex(buffer);
110775
110875
  case 'crc32':
110776
110876
  return crc32Hex(buffer);
110777
- case 'md5':
110778
- return md5Hex(buffer);
110779
110877
  case 'crc64':
110780
- default:
110781
110878
  return crc64Hex(buffer);
110879
+ case 'aws-s3-etag-2025':
110880
+ return awsS3Etag(buffer, hashPartSize ?? DEFAULT_HASH_PART_SIZE);
110881
+ case 'md5':
110882
+ default:
110883
+ return md5Hex(buffer);
110782
110884
  }
110783
110885
  }
110784
110886
  // helper: compute sha256 hex
@@ -110790,6 +110892,49 @@
110790
110892
  function md5Hex(buffer) {
110791
110893
  return md5(new Uint8Array(buffer));
110792
110894
  }
110895
+ /**
110896
+ * Compute a hash that matches the S3 ETag for the given buffer.
110897
+ *
110898
+ * - Single-part (buffer.byteLength <= partSize): plain MD5 hex string.
110899
+ * This matches the documented S3 ETag behaviour for objects created via
110900
+ * PUT Object with SSE-S3 (AES256) encryption.
110901
+ *
110902
+ * - Multi-part (buffer.byteLength > partSize): the undocumented but stable
110903
+ * composite format md5(concat(md5_raw(part1) … md5_raw(partN)))-N
110904
+ * that S3 returns for objects created via the Multipart Upload API.
110905
+ */
110906
+ function awsS3Etag(buffer, partSize) {
110907
+ if (buffer.byteLength <= partSize) {
110908
+ return md5Hex(buffer);
110909
+ }
110910
+ return multipartMd5(buffer, partSize);
110911
+ }
110912
+ /**
110913
+ * Reproduce the S3 multipart ETag for a buffer given a known part size.
110914
+ *
110915
+ * Algorithm (empirically stable since multipart upload launched in 2010, undocumented by AWS):
110916
+ * 1. Split buffer into ceil(size / partSize) chunks
110917
+ * 2. Compute raw MD5 (16 bytes) of each chunk
110918
+ * 3. Concatenate all raw digests
110919
+ * 4. Compute MD5 of the concatenation → hex
110920
+ * 5. Append "-" + number of parts
110921
+ */
110922
+ function multipartMd5(buffer, partSize) {
110923
+ const totalSize = buffer.byteLength;
110924
+ const partCount = Math.ceil(totalSize / partSize);
110925
+ const rawDigests = new Uint8Array(partCount * 16);
110926
+ for (let i = 0; i < partCount; i++) {
110927
+ const start = i * partSize;
110928
+ const end = Math.min(start + partSize, totalSize);
110929
+ const partBuffer = buffer.slice(start, end);
110930
+ // md5() returns a 32-char hex string; convert to 16 raw bytes
110931
+ const hex = md5(new Uint8Array(partBuffer));
110932
+ for (let j = 0; j < 16; j++) {
110933
+ rawDigests[i * 16 + j] = parseInt(hex.slice(j * 2, j * 2 + 2), 16);
110934
+ }
110935
+ }
110936
+ return `${md5(rawDigests)}-${partCount}`;
110937
+ }
110793
110938
  // helper: compute crc32 hex (use js-crc). Accepts ArrayBuffer and returns
110794
110939
  // lowercase, zero-padded 8-character hex string.
110795
110940
  // Accept ArrayBuffer, Uint8Array or Node Buffer and always compute the CRC32
@@ -110878,7 +111023,7 @@
110878
111023
  const { createRequire } = await Promise.resolve().then(function () { return _polyfillNode_module; });
110879
111024
  const req = createRequire((typeof document === 'undefined' && typeof location === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : typeof document === 'undefined' ? location.href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('dicom-curate.umd.js', document.baseURI).href)));
110880
111025
  const mod = req('@aws-sdk/client-s3');
110881
- cachedS3Client = mod?.default || mod;
111026
+ cachedS3Client = mod?.default ?? mod;
110882
111027
  }
110883
111028
  else {
110884
111029
  // browser-friendly dynamic import -> code-split chunk
@@ -110887,7 +111032,7 @@
110887
111032
  return cachedS3Client;
110888
111033
  }
110889
111034
 
110890
- async function curateOne({ fileInfo, outputTarget, mappingOptions, hashMethod, previousSourceFileInfo, previousMappedFileInfo, }) {
111035
+ async function curateOne({ fileInfo, outputTarget, mappingOptions, hashMethod, hashPartSize, previousSourceFileInfo, previousMappedFileInfo, }) {
110891
111036
  const startTime = performance.now();
110892
111037
  let mtime;
110893
111038
  // 1) Read the file (from handle or blob)
@@ -110915,7 +111060,7 @@
110915
111060
  throw new Error(`Failed to fetch ${fileInfo.url}: ${resp.status} ${resp.statusText}`);
110916
111061
  }
110917
111062
  file = await resp.blob();
110918
- const lastModifiedHeader = resp.headers.get('last-modified') || undefined;
111063
+ const lastModifiedHeader = resp.headers.get('last-modified');
110919
111064
  if (lastModifiedHeader) {
110920
111065
  mtime = new Date(lastModifiedHeader).toISOString();
110921
111066
  }
@@ -110988,7 +111133,9 @@
110988
111133
  }
110989
111134
  }
110990
111135
  // 3) read bytes (needed for deep hash)
110991
- const fileArrayBuffer = await file.arrayBuffer();
111136
+ // Use let so we can null the reference after the last use, allowing GC to
111137
+ // reclaim the buffer while the rest of the function (upload, hashing) runs.
111138
+ let fileArrayBuffer = await file.arrayBuffer();
110992
111139
  let preMappedHash;
110993
111140
  let postMappedHash;
110994
111141
  const postMappedHashHeader = 'x-source-file-hash';
@@ -110996,8 +111143,8 @@
110996
111143
  let canSkip = false;
110997
111144
  if (previousSourceFileInfo?.preMappedHash !== undefined) {
110998
111145
  try {
110999
- // choose hashing algorithm: default to crc64 (nvme-style) for compatibility
111000
- preMappedHash = await hash(fileArrayBuffer, hashMethod || 'crc64');
111146
+ // choose hashing algorithm: default to md5 for S3 ETag compatibility
111147
+ preMappedHash = await hash(fileArrayBuffer, hashMethod ?? 'md5', hashPartSize);
111001
111148
  }
111002
111149
  catch (e) {
111003
111150
  console.warn(`Failed to compute preMappedHash for ${fileInfo.name}`, e);
@@ -111110,8 +111257,8 @@
111110
111257
  // If we didn't compute preMappedHash yet, do it now
111111
111258
  if (!preMappedHash) {
111112
111259
  try {
111113
- // choose hashing algorithm: default to crc64 (nvme-style) for compatibility
111114
- preMappedHash = await hash(fileArrayBuffer, hashMethod || 'crc64');
111260
+ // choose hashing algorithm: default to md5 for S3 ETag compatibility
111261
+ preMappedHash = await hash(fileArrayBuffer, hashMethod ?? 'md5', hashPartSize);
111115
111262
  }
111116
111263
  catch (e) {
111117
111264
  console.warn(`Failed to compute preMappedHash for ${fileInfo.name}`, e);
@@ -111128,7 +111275,13 @@
111128
111275
  allowInvalidVRLength: true,
111129
111276
  });
111130
111277
  // Always calculate post-mapped hash even if deep compare is not requested
111131
- postMappedHash = await hash(modifiedArrayBuffer, hashMethod || 'crc64');
111278
+ postMappedHash = await hash(modifiedArrayBuffer, hashMethod ?? 'md5', hashPartSize);
111279
+ // Release the original file buffer — the modifiedArrayBuffer is all we
111280
+ // need from this point. In the passthrough case (no header changes),
111281
+ // modifiedArrayBuffer === fileArrayBuffer so the data stays alive through
111282
+ // that reference; in the modified case, fileArrayBuffer is a separate
111283
+ // allocation that can now be GC'd.
111284
+ fileArrayBuffer = null;
111132
111285
  const previousPostMappedHash = previousMappedFileInfo
111133
111286
  ? previousMappedFileInfo(clonedMapResults.outputFilePath)?.postMappedHash
111134
111287
  : undefined;
@@ -111167,15 +111320,16 @@
111167
111320
  const fullFilePath = path.join(fullDirPath, fileName);
111168
111321
  await fs.writeFile(fullFilePath, new DataView(modifiedArrayBuffer));
111169
111322
  }
111170
- else {
111323
+ else if (!outputTarget?.http && !outputTarget?.s3) {
111324
+ // Only create mappedBlob when there is no output target at all (no
111325
+ // directory, no HTTP endpoint, no S3 bucket). When an upload target is
111326
+ // present the upload builds its own body, and keeping a blob around
111327
+ // retains the full file content in memory for every processed file,
111328
+ // causing OOM crashes at scale.
111171
111329
  clonedMapResults.mappedBlob = new Blob([modifiedArrayBuffer], {
111172
111330
  type: 'application/octet-stream',
111173
111331
  });
111174
111332
  }
111175
- // If no directory or even if directory present, expose mappedBlob for consumers
111176
- clonedMapResults.mappedBlob = new Blob([modifiedArrayBuffer], {
111177
- type: 'application/octet-stream',
111178
- });
111179
111333
  // If upload URL (bucket) is provided, perform an HTTP PUT upload to the server
111180
111334
  if (outputTarget?.http) {
111181
111335
  try {
@@ -111188,32 +111342,36 @@
111188
111342
  const uploadUrl = `${outputTarget.http.url}/${key}`;
111189
111343
  // Create headers per helper described by the user
111190
111344
  const headers = {
111191
- 'Content-Type': clonedMapResults.mappedBlob.type || 'application/octet-stream',
111345
+ 'Content-Type': 'application/octet-stream',
111192
111346
  'X-File-Name': fileName,
111193
- 'X-File-Type': clonedMapResults.mappedBlob.type || 'application/octet-stream',
111347
+ 'X-File-Type': 'application/octet-stream',
111194
111348
  'X-File-Size': String(modifiedArrayBuffer.byteLength),
111195
- 'X-Source-File-Size': String(clonedMapResults.fileInfo?.size || ''),
111196
- 'X-Source-File-Modified-Time': mtime || '',
111197
- 'X-Source-File-Hash': preMappedHash || '',
111349
+ 'X-Source-File-Size': String(clonedMapResults.fileInfo?.size ?? ''),
111350
+ 'X-Source-File-Modified-Time': mtime ?? '',
111351
+ 'X-Source-File-Hash': preMappedHash ?? '',
111198
111352
  };
111199
111353
  if (outputTarget.http.headers) {
111200
111354
  Object.assign(headers, outputTarget.http.headers);
111201
111355
  }
111202
111356
  if (postMappedHashHeader && postMappedHash)
111203
111357
  headers[postMappedHashHeader] = postMappedHash;
111358
+ // Build the request body from modifiedArrayBuffer at send time instead of
111359
+ // reusing a mappedBlob kept on the result object.
111204
111360
  const resp = await fetchWithRetry(uploadUrl, {
111205
111361
  method: 'PUT',
111206
111362
  headers,
111207
- body: clonedMapResults.mappedBlob,
111363
+ body: new Blob([modifiedArrayBuffer], {
111364
+ type: 'application/octet-stream',
111365
+ }),
111208
111366
  });
111209
111367
  if (!resp.ok) {
111210
111368
  console.error(`Upload failed for ${uploadUrl}: ${resp.status} ${resp.statusText}`);
111211
- clonedMapResults.errors = clonedMapResults.errors || [];
111369
+ clonedMapResults.errors = clonedMapResults.errors ?? [];
111212
111370
  clonedMapResults.errors.push(`Upload failed: ${resp.status} ${resp.statusText}`);
111213
111371
  }
111214
111372
  else {
111215
111373
  // attach upload info if available
111216
- clonedMapResults.outputUpload = clonedMapResults.outputUpload || {
111374
+ clonedMapResults.outputUpload = clonedMapResults.outputUpload ?? {
111217
111375
  url: uploadUrl,
111218
111376
  status: resp.status,
111219
111377
  };
@@ -111221,7 +111379,7 @@
111221
111379
  }
111222
111380
  catch (e) {
111223
111381
  console.error('Upload error', e);
111224
- clonedMapResults.errors = clonedMapResults.errors || [];
111382
+ clonedMapResults.errors = clonedMapResults.errors ?? [];
111225
111383
  clonedMapResults.errors.push(`Upload error: ${e instanceof Error ? e.message : String(e)}`);
111226
111384
  }
111227
111385
  }
@@ -111244,12 +111402,14 @@
111244
111402
  await client.send(new s3.PutObjectCommand({
111245
111403
  Bucket: outputTarget.s3.bucketName,
111246
111404
  Key: key,
111247
- Body: await clonedMapResults.mappedBlob.arrayBuffer(),
111248
- ContentType: clonedMapResults.mappedBlob.type || 'application/octet-stream',
111405
+ // Use the ArrayBuffer directly — going through Blob.arrayBuffer()
111406
+ // would create yet another copy of the data in memory.
111407
+ Body: new Uint8Array(modifiedArrayBuffer),
111408
+ ContentType: 'application/octet-stream',
111249
111409
  Metadata: {
111250
- 'source-file-size': String(clonedMapResults.fileInfo?.size || ''),
111251
- 'source-file-modified-time': mtime || '',
111252
- 'source-file-hash': preMappedHash || '',
111410
+ 'source-file-size': String(clonedMapResults.fileInfo?.size ?? ''),
111411
+ 'source-file-modified-time': mtime ?? '',
111412
+ 'source-file-hash': preMappedHash ?? '',
111253
111413
  ...(postMappedHash
111254
111414
  ? { 'source-file-post-mapped-hash': postMappedHash }
111255
111415
  : {}),
@@ -111264,7 +111424,7 @@
111264
111424
  }
111265
111425
  catch (e) {
111266
111426
  console.error('S3 Upload error', e);
111267
- clonedMapResults.errors = clonedMapResults.errors || [];
111427
+ clonedMapResults.errors = clonedMapResults.errors ?? [];
111268
111428
  clonedMapResults.errors.push(`S3 Upload error: ${e instanceof Error ? e.message : String(e)}`);
111269
111429
  }
111270
111430
  }
@@ -126760,16 +126920,40 @@
126760
126920
  let scanAnomalies = [];
126761
126921
  // Callbacks set by curateMany, stored here for use by the dispatch loop.
126762
126922
  let progressCallback = () => { };
126923
+ // Callback to resume the scan worker when the processing queue drains below
126924
+ // the low-water mark. Set by curateMany via setScanResumeCallback().
126925
+ let scanResumeCallback = null;
126926
+ let scanPaused = false;
126927
+ /**
126928
+ * Low-water mark for the file processing queue. When the queue size drops
126929
+ * below this threshold after a dispatch, the scan worker is resumed.
126930
+ */
126931
+ const LOW_WATER_MARK = 50;
126763
126932
  // -------------------------------------------------------------------------
126764
126933
  // Public API
126765
126934
  // -------------------------------------------------------------------------
126766
126935
  function setMappingWorkerOptions(opts) {
126767
126936
  mappingWorkerOptions = opts;
126768
126937
  }
126938
+ /**
126939
+ * Register a callback that resumes the scan worker. Called by curateMany
126940
+ * after the scan worker is created.
126941
+ */
126942
+ function setScanResumeCallback(cb) {
126943
+ scanResumeCallback = cb;
126944
+ scanPaused = false;
126945
+ }
126946
+ /**
126947
+ * Mark the scan as paused. Called from the scan worker message handler in
126948
+ * index.ts when the queue exceeds the high-water mark.
126949
+ */
126950
+ function markScanPaused() {
126951
+ scanPaused = true;
126952
+ }
126769
126953
  /**
126770
126954
  * Initialize the mapping worker pool. Call once per curateMany invocation.
126771
126955
  */
126772
- async function initializeMappingWorkers(skipCollectingMappings, fileInfoIndex, progressCb) {
126956
+ async function initializeMappingWorkers(skipCollectingMappings, fileInfoIndex, progressCb, workerCount) {
126773
126957
  mappingWorkerOptions = {};
126774
126958
  workersActive = 0;
126775
126959
  mapResultsList = skipCollectingMappings ? undefined : [];
@@ -126783,8 +126967,8 @@
126783
126967
  scanAnomalies = [];
126784
126968
  if (progressCb)
126785
126969
  progressCallback = progressCb;
126786
- const workerCount = navigator.hardwareConcurrency;
126787
- const workers = await Promise.all(Array.from({ length: workerCount }, () => createMappingWorker(fileInfoIndex)));
126970
+ const effectiveWorkerCount = workerCount ?? Math.min(await getHardwareConcurrency(), 8);
126971
+ const workers = await Promise.all(Array.from({ length: effectiveWorkerCount }, () => createMappingWorker(fileInfoIndex)));
126788
126972
  availableMappingWorkers.push(...workers);
126789
126973
  }
126790
126974
  /**
@@ -126799,7 +126983,7 @@
126799
126983
  // Track which file this worker is processing so we can identify it
126800
126984
  // if the worker crashes.
126801
126985
  workerCurrentFile.set(mappingWorker, fileInfo);
126802
- const { outputTarget, hashMethod, ...mappingOptions } =
126986
+ const { outputTarget, hashMethod, hashPartSize, ...mappingOptions } =
126803
126987
  // Not partial anymore.
126804
126988
  mappingWorkerOptions;
126805
126989
  mappingWorker.postMessage({
@@ -126808,10 +126992,20 @@
126808
126992
  outputTarget: await getHttpOutputHeaders(outputTarget),
126809
126993
  previousFileInfo,
126810
126994
  hashMethod,
126995
+ hashPartSize,
126811
126996
  serializedMappingOptions: serializeMappingOptions(mappingOptions),
126812
126997
  });
126813
126998
  workersActive += 1;
126814
126999
  }
127000
+ // Backpressure: resume the scan worker when the queue drains below the
127001
+ // low-water mark. This prevents the queue from staying empty while the
127002
+ // scan worker is paused.
127003
+ if (scanPaused &&
127004
+ filesToProcess.length < LOW_WATER_MARK &&
127005
+ scanResumeCallback) {
127006
+ scanPaused = false;
127007
+ scanResumeCallback();
127008
+ }
126815
127009
  if (workersActive === 0 &&
126816
127010
  pendingReplacements === 0 &&
126817
127011
  directoryScanFinished &&
@@ -126851,6 +127045,18 @@
126851
127045
  // -------------------------------------------------------------------------
126852
127046
  // Internal helpers
126853
127047
  // -------------------------------------------------------------------------
127048
+ /**
127049
+ * Return the number of logical CPUs available, working in both browser and
127050
+ * Node.js environments. Falls back to `os.cpus().length` when the global
127051
+ * `navigator` object is not available (Node.js < 21).
127052
+ */
127053
+ async function getHardwareConcurrency() {
127054
+ if (typeof navigator !== 'undefined' && navigator.hardwareConcurrency) {
127055
+ return navigator.hardwareConcurrency;
127056
+ }
127057
+ const { cpus } = await import('node:os');
127058
+ return cpus().length;
127059
+ }
126854
127060
  /**
126855
127061
  * Recover from a mapping worker crash. Returns the worker slot, counts the
126856
127062
  * in-flight file as a mapping error, and re-dispatches. Called from onerror,
@@ -127049,10 +127255,14 @@
127049
127255
  scanAnomalies: [], // Files sent to processing have no scan anomalies
127050
127256
  previousFileInfo,
127051
127257
  });
127052
- // Could do some throttling:
127053
- // if (filesToProcess.length > 10) {
127054
- // fileListWorker.postMessage({ request: 'stop' })
127055
- // }
127258
+ // Backpressure: when the queue grows too large, pause the scan
127259
+ // worker so file handles don't accumulate unboundedly in memory.
127260
+ // The scan worker supports 'stop' and 'resume' commands.
127261
+ const HIGH_WATER_MARK = 100;
127262
+ if (filesToProcess.length > HIGH_WATER_MARK) {
127263
+ fileListWorker.postMessage({ request: 'stop' });
127264
+ markScanPaused();
127265
+ }
127056
127266
  dispatchMappingJobs();
127057
127267
  break;
127058
127268
  }
@@ -127123,6 +127333,7 @@
127123
127333
  const skipModifications = organizeOptions.skipModifications ?? false;
127124
127334
  const skipValidation = organizeOptions.skipValidation ?? false;
127125
127335
  const hashMethod = organizeOptions.hashMethod;
127336
+ const hashPartSize = organizeOptions.hashPartSize;
127126
127337
  const dateOffset = organizeOptions.dateOffset;
127127
127338
  if (requiresDateOffset(deIdOpts) && !dateOffset?.match(iso8601)) {
127128
127339
  throw new Error('When using "Offset" for retainLongitudinalTemporalInformationOptions, an iso8601 compatible dateOffset must be provided.');
@@ -127136,6 +127347,7 @@
127136
127347
  skipValidation,
127137
127348
  dateOffset,
127138
127349
  hashMethod,
127350
+ hashPartSize,
127139
127351
  };
127140
127352
  }
127141
127353
  function queueFilesForMapping(organizeOptions) {
@@ -127215,7 +127427,7 @@
127215
127427
  };
127216
127428
  try {
127217
127429
  // create the mapping workers
127218
- await initializeMappingWorkers(organizeOptions.skipCollectingMappings, organizeOptions.fileInfoIndex, progressCallback);
127430
+ await initializeMappingWorkers(organizeOptions.skipCollectingMappings, organizeOptions.fileInfoIndex, progressCallback, organizeOptions.workerCount);
127219
127431
  // Set global mappingWorkerOptions
127220
127432
  setMappingWorkerOptions((await collectMappingOptions(organizeOptions)));
127221
127433
  //
@@ -127228,6 +127440,11 @@
127228
127440
  organizeOptions.inputType === 'path' ||
127229
127441
  organizeOptions.inputType === 's3') {
127230
127442
  const fileListWorker = await initializeFileListWorker(rejectCallback);
127443
+ // Wire up backpressure resume: when the dispatch loop drains the
127444
+ // queue below the low-water mark, it calls this to resume scanning.
127445
+ setScanResumeCallback(() => {
127446
+ fileListWorker.postMessage({ request: 'resume' });
127447
+ });
127231
127448
  let specExcludedFiletypes;
127232
127449
  let noDicomSignatureCheck = false;
127233
127450
  let noDefaultExclusions = false;