@cloudant/couchbackup 2.9.17 → 2.10.0-206

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
  #!/usr/bin/env node
- // Copyright © 2017, 2021 IBM Corp. All rights reserved.
+ // Copyright © 2017, 2024 IBM Corp. All rights reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
@@ -49,9 +49,7 @@ try {
  opts,
  error.terminationCallback
  ).on('restored', function(obj) {
- restoreBatchDebug('restored', obj.total);
- }).on('error', function(e) {
- restoreDebug('ERROR', e);
+ restoreBatchDebug('Restored batch ID:', obj.batch, 'Total document revisions restored:', obj.total, 'Time:', obj.time);
  }).on('finished', function(obj) {
  restoreDebug('finished', obj);
  });
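The reworked 'restored' event now carries the batch number and elapsed time alongside the running total of restored document revisions. A minimal sketch of listening for that event through the package's restore API follows; the backup file name, target URL and options are placeholders rather than values taken from this diff:

    const fs = require('fs');
    const couchbackup = require('@cloudant/couchbackup');

    couchbackup.restore(
      fs.createReadStream('backup.txt'),              // hypothetical backup file
      'https://examples.cloudant.com/restore-target', // hypothetical target database URL
      { parallelism: 2 },
      (err, data) => {
        if (err) {
          console.error('Restore failed', err);
        } else {
          console.log('Restore complete', data);
        }
      }
    ).on('restored', (obj) => {
      // obj.batch, obj.total and obj.time mirror the fields logged by the new debug line
      console.log(`Restored batch ${obj.batch}: ${obj.total} document revisions in ${obj.time}s`);
    });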
@@ -0,0 +1,53 @@
+ // Copyright © 2023 IBM Corp. All rights reserved.
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ // http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+ 'use strict';
+
+ const debug = require('debug')('couchbackup:alldocsgenerator');
+ const { BackupError } = require('./error.js');
+
+ /**
+ * Async generator function for paginating _all_docs for shallow backups.
+ *
+ * @param {object} dbClient - object for connection to source database containing name, service and url
+ * @param {object} options - backup configuration
+ * @yields {object} a "done" type backup batch {command: d, batch: #, docs: [{_id: id, ...}, ...]}
+ */
+ module.exports = async function * (dbClient, options = {}) {
+ let batch = 0;
+ let lastPage = false;
+ let startKey = null;
+ const opts = { db: dbClient.dbName, limit: options.bufferSize, includeDocs: true };
+ do {
+ if (startKey) opts.startKey = startKey;
+ yield dbClient.service.postAllDocs(opts).then(response => {
+ if (!(response.result && response.result.rows)) {
+ throw new BackupError('AllDocsError', 'Invalid all docs response');
+ }
+ debug(`Got page from start key '${startKey}'`);
+ const docs = response.result.rows;
+ debug(`Received ${docs.length} docs`);
+ lastPage = docs.length < opts.limit;
+ if (docs.length > 0) {
+ const lastKey = docs[docs.length - 1].id;
+ debug(`Received up to key ${lastKey}`);
+ // To avoid double fetching a document solely for the purposes of getting
+ // the next ID to use as a startKey for the next page we instead use the
+ // last ID of the current page and append the lowest unicode sort
+ // character.
+ startKey = `${lastKey}\0`;
+ }
+ return { command: 'd', batch: batch++, docs: docs.map(doc => { return doc.doc; }) };
+ });
+ } while (!lastPage);
+ };
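The new generator pages through _all_docs, yielding one "done" batch per page and advancing startKey to the last ID of the previous page plus '\0' (the lowest Unicode sort character) so the boundary document is not fetched twice. A small sketch of driving a generator with this shape, assuming a dbClient stand-in that provides the dbName and service.postAllDocs used above:

    const allDocsGenerator = require('./allDocsGenerator.js');

    async function shallowBackupToConsole(dbClient) {
      // bufferSize caps the rows requested per _all_docs page
      const options = { bufferSize: 500 };
      let documents = 0;
      for await (const backupBatch of allDocsGenerator(dbClient, options)) {
        // each yielded batch is { command: 'd', batch: <number>, docs: [...] }
        documents += backupBatch.docs.length;
        console.log(`Batch ${backupBatch.batch}: ${backupBatch.docs.length} docs (running total ${documents})`);
      }
      return documents;
    }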
@@ -1,4 +1,4 @@
- // Copyright © 2017, 2021 IBM Corp. All rights reserved.
+ // Copyright © 2017, 2024 IBM Corp. All rights reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
@@ -13,269 +13,125 @@
  // limitations under the License.
  'use strict';

- const async = require('async');
- const events = require('events');
- const fs = require('fs');
- const error = require('./error.js');
+ const { createWriteStream } = require('node:fs');
+ const { pipeline } = require('node:stream/promises');
+ const { Backup } = require('./backupMappings.js');
+ const { BackupError } = require('./error.js');
+ const logFileSummary = require('./logfilesummary.js');
+ const logFileGetBatches = require('./logfilegetbatches.js');
  const spoolchanges = require('./spoolchanges.js');
- const logfilesummary = require('./logfilesummary.js');
- const logfilegetbatches = require('./logfilegetbatches.js');
+ const { MappingStream, WritableWithPassThrough, DelegateWritable } = require('./transforms.js');
+ const allDocsGenerator = require('./allDocsGenerator.js');

  /**
- * Read documents from a database to be backed up.
+ * Validate /_bulk_get support for a specified database.
  *
- * @param {string} db - `@cloudant/cloudant` DB object for source database.
- * @param {number} blocksize - number of documents to download in single request
- * @param {number} parallelism - number of concurrent downloads
- * @param {string} log - path to log file to use
- * @param {boolean} resume - whether to resume from an existing log file
- * @returns EventEmitter with following events:
- * - `received` - called with a block of documents to write to backup
- * - `error` - on error
- * - `finished` - when backup process is finished (either complete or errored)
+ * @param {object} dbClient - object for connection to source database containing name, service and url
  */
- module.exports = function(db, options) {
- const ee = new events.EventEmitter();
- const start = new Date().getTime(); // backup start time
- const batchesPerDownloadSession = 50; // max batches to read from log file for download at a time (prevent OOM)
-
- function proceedWithBackup() {
- if (options.resume) {
- // pick up from existing log file from previous run
- downloadRemainingBatches(options.log, db, ee, start, batchesPerDownloadSession, options.parallelism);
+ async function validateBulkGetSupport(dbClient) {
+ try {
+ await dbClient.service.postBulkGet({ db: dbClient.dbName, docs: [] });
+ } catch (err) {
+ // if _bulk_get isn't available throw a special error
+ if (err.status === 404) {
+ throw new BackupError('BulkGetError', 'Database does not support /_bulk_get endpoint');
  } else {
- // create new log file and process
- spoolchanges(db, options.log, options.bufferSize, ee, function(err) {
- if (err) {
- ee.emit('error', err);
- } else {
- downloadRemainingBatches(options.log, db, ee, start, batchesPerDownloadSession, options.parallelism);
- }
- });
+ throw err;
  }
  }
-
- validateBulkGetSupport(db, function(err) {
- if (err) {
- return ee.emit('error', err);
- } else {
- proceedWithBackup();
- }
- });
-
- return ee;
- };
-
- /**
- * Validate /_bulk_get support for a specified database.
- *
- * @param {string} db - nodejs-cloudant db
- * @param {function} callback - called on completion with signature (err)
- */
- function validateBulkGetSupport(db, callback) {
- db.service.postBulkGet({ db: db.db, docs: [] }).then(() => { callback(); }).catch(err => {
- err = error.convertResponseError(err, function(err) {
- switch (err.status) {
- case undefined:
- // There was no status code on the error
- return err;
- case 404:
- return new error.BackupError('BulkGetError', 'Database does not support /_bulk_get endpoint');
- default:
- return new error.HTTPError(err);
- }
- });
- callback(err);
- });
  }

  /**
- * Download remaining batches in a log file, splitting batches into sets
- * to avoid enqueueing too many in one go.
+ * Read documents from a database to be backed up.
  *
- * @param {string} log - log file name to maintain download state
- * @param {string} db - nodejs-cloudant db
- * @param {events.EventEmitter} ee - event emitter to emit received events on
- * @param {time} startTime - start time for backup process
- * @param {number} batchesPerDownloadSession - max batches to enqueue for
- * download at a time. As batches contain many doc IDs, this helps avoid
- * exhausting memory.
- * @param {number} parallelism - number of concurrent downloads
- * @returns function to call do download remaining batches with signature
- * (err, {batches: batch, docs: doccount}) {@see spoolchanges}.
+ * @param {object} dbClient - object for connection to source database containing name, service and url
+ * @param {number} options - backup configuration
+ * @param {Writable} targetStream - destination for the backup contents
+ * @param {EventEmitter} ee - user facing event emitter
+ * @returns pipeline promise that resolves for a successful backup or rejects on failure
  */
- function downloadRemainingBatches(log, db, ee, startTime, batchesPerDownloadSession, parallelism) {
- let total = 0; // running total of documents downloaded so far
- let noRemainingBatches = false;
-
- // Generate a set of batches (up to batchesPerDownloadSession) to download from the
- // log file and download them. Set noRemainingBatches to `true` for last batch.
- function downloadSingleBatchSet(done) {
- // Fetch the doc IDs for the batches in the current set to
- // download them.
- function batchSetComplete(err, data) {
- if (!err) {
- total = data.total;
- }
- done(err);
- }
- function processRetrievedBatches(err, batches) {
- if (!err) {
- // process them in parallelised queue
- processBatchSet(db, parallelism, log, batches, ee, startTime, total, batchSetComplete);
- } else {
- batchSetComplete(err);
- }
- }
-
- readBatchSetIdsFromLogFile(log, batchesPerDownloadSession, function(err, batchSetIds) {
- if (err) {
- ee.emit('error', err);
- // Stop processing changes file for fatal errors
- noRemainingBatches = true;
- done();
+ module.exports = function(dbClient, options, targetStream, ee) {
+ const start = new Date().getTime(); // backup start time
+ let total = 0; // total documents backed up
+
+ // Full backups use _bulk_get, validate it is available, shallow skips that check
+ return (options.mode === 'full' ? validateBulkGetSupport(dbClient) : Promise.resolve())
+ // Check if the backup is new or resuming and configure the source
+ .then(async() => {
+ if (options.mode === 'shallow') {
+ // shallow backup, start from async _all_docs generator
+ return [
+ allDocsGenerator(dbClient, options)
+ ];
  } else {
- if (batchSetIds.length === 0) {
- noRemainingBatches = true;
- return done();
+ // Full backup, we'll return a stream over a completed changes log file
+ if (!options.resume) {
+ // Not resuming, start spooling changes to create a log file
+ await spoolchanges(dbClient, options.log, (backupBatch) => {
+ ee.emit('changes', backupBatch.batch);
+ }, options.bufferSize);
  }
- logfilegetbatches(log, batchSetIds, processRetrievedBatches);
- }
- });
- }
-
- // Return true if all batches in log file have been downloaded
- function isFinished(callback) { callback(null, noRemainingBatches); }
-
- function onComplete() {
- ee.emit('finished', { total: total });
- }
-
- async.doUntil(downloadSingleBatchSet, isFinished, onComplete);
- }
-
- /**
- * Return a set of uncompleted download batch IDs from the log file.
- *
- * @param {string} log - log file path
- * @param {number} batchesPerDownloadSession - maximum IDs to return
- * @param {function} callback - sign (err, batchSetIds array)
- */
- function readBatchSetIdsFromLogFile(log, batchesPerDownloadSession, callback) {
- logfilesummary(log, function processSummary(err, summary) {
- if (!err) {
- if (!summary.changesComplete) {
- callback(new error.BackupError('IncompleteChangesInLogFile',
- 'WARNING: Changes did not finish spooling'));
- return;
- }
- if (Object.keys(summary.batches).length === 0) {
- return callback(null, []);
+ // At this point we should be changes complete because spooling has finished
+ // or because we resumed a backup that had already completed spooling (and
+ // potentially already downloaded some batches)
+ // Get the log file summary to validate changes complete and obtain the
+ // [remaining] batch numbers to backup
+ const summary = await logFileSummary(options.log);
+ if (!summary.changesComplete) {
+ // We must only backup if changes finished spooling
+ throw new BackupError('IncompleteChangesInLogFile',
+ 'WARNING: Changes did not finish spooling, a backup can only be resumed if changes finished spooling. Start a new backup.');
+ }
+ return logFileGetBatches(options.log, summary.batches);
  }
-
- // batch IDs are the property names of summary.batches
- const batchSetIds = getPropertyNames(summary.batches, batchesPerDownloadSession);
- callback(null, batchSetIds);
- } else {
- callback(err);
- }
- });
- }
-
- /**
- * Download a set of batches retrieved from a log file. When a download is
- * complete, add a line to the logfile indicating such.
- *
- * @param {any} db - nodejs-cloudant database
- * @param {any} parallelism - number of concurrent requests to make
- * @param {any} log - log file to drive downloads from
- * @param {any} batches - batches to download
- * @param {any} ee - event emitter for progress. This funciton emits
- * received and error events.
- * @param {any} start - time backup started, to report deltas
- * @param {any} grandtotal - count of documents downloaded prior to this set
- * of batches
- * @param {any} callback - completion callback, (err, {total: number}).
- */
- function processBatchSet(db, parallelism, log, batches, ee, start, grandtotal, callback) {
- let hasErrored = false;
- let total = grandtotal;
-
- // queue to process the fetch requests in an orderly fashion using _bulk_get
- const q = async.queue(function(payload, done) {
- const output = [];
- const thisBatch = payload.batch;
- delete payload.batch;
- delete payload.command;
-
- function logCompletedBatch(batch) {
- if (log) {
- fs.appendFile(log, ':d batch' + thisBatch + '\n', done);
+ })
+ // Create a pipeline of the source streams and the backup mappings
+ .then((srcStreams) => {
+ const backup = new Backup(dbClient);
+ const postWrite = (backupBatch) => {
+ total += backupBatch.docs.length;
+ const totalRunningTimeSec = (new Date().getTime() - start) / 1000;
+ ee.emit('written', { total, time: totalRunningTimeSec, batch: backupBatch.batch });
+ };
+
+ const destinationStreams = [];
+ if (options.mode === 'shallow') {
+ // shallow mode writes only to backup file
+ destinationStreams.push(
+ new DelegateWritable(
+ 'backup', // Name for debug
+ targetStream, // backup file
+ null, // no last chunk to write
+ backup.backupBatchToBackupFileLine, // map the backup batch to a string for the backup file
+ postWrite // post write function emits the written event
+ ) // DelegateWritable writes the log file done lines
+ );
  } else {
- done();
+ // full mode needs to fetch spooled changes and writes a backup file then finally a log file
+ destinationStreams.push(...[
+ new MappingStream(backup.pendingToFetched, options.parallelism), // fetch the batches at the configured concurrency
+ new WritableWithPassThrough(
+ 'backup', // name for logging
+ targetStream, // backup file
+ null, // no need to write a last chunk
+ backup.backupBatchToBackupFileLine // map the backup batch to a string for the backup file
+ ), // WritableWithPassThrough writes the fetched docs to the backup file and passes on the result metadata
+ new DelegateWritable(
+ 'logFileDoneWriter', // Name for debug
+ createWriteStream(options.log, { flags: 'a' }), // log file for appending
+ null, // no last chunk to write
+ backup.backupBatchToLogFileLine, // Map the backed up batch result to a log file "done" line
+ postWrite // post write function emits the written event
+ ) // DelegateWritable writes the log file done lines
+ ]);
  }
- }

- // do the /db/_bulk_get request
- db.service.postBulkGet({
- db: db.db,
- revs: true,
- docs: payload.docs
- }).then(response => {
- // create an output array with the docs returned
- response.result.results.forEach(function(d) {
- if (d.docs) {
- d.docs.forEach(function(doc) {
- if (doc.ok) {
- output.push(doc.ok);
- }
- });
- }
- });
- total += output.length;
- const t = (new Date().getTime() - start) / 1000;
- ee.emit('received', {
- batch: thisBatch,
- data: output,
- length: output.length,
- time: t,
- total: total
- }, q, logCompletedBatch);
- }).catch(err => {
- if (!hasErrored) {
- hasErrored = true;
- err = error.convertResponseError(err);
- // Kill the queue for fatal errors
- q.kill();
- ee.emit('error', err);
- }
- done();
+ return pipeline(
+ ...srcStreams, // the source streams from the previous block (all docs async generator for shallow or for full either spool changes or resumed log)
+ ...destinationStreams // the appropriate destination streams for the mode
+ );
+ })
+ .then(() => {
+ return { total };
  });
- }, parallelism);
-
- for (const i in batches) {
- q.push(batches[i]);
- }
-
- q.drain(function() {
- callback(null, { total: total });
- });
- }
-
- /**
- * Returns first N properties on an object.
- *
- * @param {object} obj - object with properties
- * @param {number} count - number of properties to return
- */
- function getPropertyNames(obj, count) {
- // decide which batch numbers to deal with
- const batchestofetch = [];
- let j = 0;
- for (const i in obj) {
- batchestofetch.push(parseInt(i));
- j++;
- if (j >= count) break;
- }
- return batchestofetch;
- }
+ };
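The rewrite above drops the async.queue and EventEmitter plumbing in favour of a single stream.pipeline whose stages map pending batches to fetched batches and then to backup-file and log-file lines. The sketch below shows that composition pattern with plain Node.js core streams standing in for the package's MappingStream, WritableWithPassThrough and DelegateWritable classes (which are not part of this diff), so it illustrates the shape of the refactor rather than the actual implementation:

    const { pipeline } = require('node:stream/promises');
    const { Transform, Writable } = require('node:stream');

    // Source: an async iterable of backup batches, like the shallow-mode generator above.
    async function * source() {
      for (let batch = 0; batch < 3; batch++) {
        yield { command: 'd', batch, docs: [{ _id: `doc-${batch}` }] };
      }
    }

    // Mapping stage: stands in for turning a batch into a backup file line.
    const toBackupLine = new Transform({
      objectMode: true,
      transform(backupBatch, _encoding, callback) {
        callback(null, JSON.stringify(backupBatch.docs) + '\n');
      }
    });

    // Destination: stands in for the DelegateWritable that writes the backup target.
    const writeToStdout = new Writable({
      write(line, _encoding, callback) {
        process.stdout.write(line, callback);
      }
    });

    // pipeline resolves once every stage finishes, or rejects on the first error,
    // which is how the new module reports success or failure to its caller.
    pipeline(source(), toBackupLine, writeToStdout)
      .then(() => console.log('backup pipeline finished'))
      .catch((err) => console.error('backup pipeline failed', err));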