@joystick.js/db-canary 0.0.0-canary.2270 → 0.0.0-canary.2272

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. package/dist/server/lib/auto_index_manager.js +1 -1
  2. package/dist/server/lib/bulk_insert_optimizer.js +1 -0
  3. package/dist/server/lib/memory_efficient_bulk_insert.js +1 -0
  4. package/dist/server/lib/write_queue.js +1 -1
  5. package/package.json +10 -4
  6. package/src/server/lib/auto_index_manager.js +11 -4
  7. package/src/server/lib/bulk_insert_optimizer.js +559 -0
  8. package/src/server/lib/memory_efficient_bulk_insert.js +262 -0
  9. package/src/server/lib/write_queue.js +2 -137
  10. package/test_runner.js +353 -0
  11. package/tests/client/index.test.js +3 -1
  12. package/tests/performance/bulk_insert_1m_test.js +113 -0
  13. package/tests/performance/bulk_insert_benchmarks.test.js +570 -0
  14. package/tests/performance/bulk_insert_enterprise_isolated.test.js +469 -0
  15. package/tests/performance/bulk_insert_enterprise_scale_test.js +216 -0
  16. package/tests/server/integration/authentication_integration.test.js +3 -1
  17. package/tests/server/integration/auto_indexing_integration.test.js +1 -1
  18. package/tests/server/integration/development_mode_authentication.test.js +3 -1
  19. package/tests/server/integration/production_safety_integration.test.js +3 -1
  20. package/tests/server/lib/bulk_insert_optimizer.test.js +523 -0
  21. package/tests/server/lib/operations/admin.test.js +3 -1
  22. package/dist/server/lib/batched_write_queue.js +0 -1
  23. package/dist/server/lib/processing_lane.js +0 -1
  24. package/src/server/lib/batched_write_queue.js +0 -331
  25. package/src/server/lib/processing_lane.js +0 -417
  26. package/tests/server/lib/batched_write_queue.test.js +0 -402
  27. package/tests/server/lib/write_queue_integration.test.js +0 -186
package/src/server/lib/bulk_insert_optimizer.js
@@ -0,0 +1,559 @@
+ /**
+  * @fileoverview Bulk insert performance optimizer for JoystickDB.
+  *
+  * Provides enterprise-scale bulk insert capabilities with optimizations for:
+  * - Map size pre-allocation to prevent MDB_MAP_FULL errors
+  * - Size-based transaction batching for optimal performance
+  * - Key ordering and append mode for B-tree efficiency
+  * - Direct serialization to eliminate double encoding overhead
+  * - Safe index management with deferred rebuilding
+  * - Memory management and streaming processing
+  * - Concurrent read safety during bulk operations
+  */
+
+ import { get_database, generate_document_id, build_collection_key, check_and_grow_map_size } from './query_engine.js';
+ import { get_write_queue } from './write_queue.js';
+ import { get_auto_index_database, initialize_auto_index_database } from './auto_index_manager.js';
+ import create_logger from './logger.js';
+
+ const { create_context_logger } = create_logger('bulk_insert_optimizer');
+
+ /** @type {number} Optimal transaction size in bytes (100MB) */
+ const OPTIMAL_TRANSACTION_SIZE = 100 * 1024 * 1024;
+
+ /** @type {number} Default batch size for streaming processing */
+ const DEFAULT_STREAM_BATCH_SIZE = 1000;
+
+ /** @type {number} Progress logging interval */
+ const PROGRESS_LOG_INTERVAL = 10000;
+
+ /**
+  * Calculates the average document size from a sample.
+  * @param {Array<Object>} documents - Sample documents
+  * @param {number} [sample_size=100] - Number of documents to sample
+  * @returns {number} Average document size in bytes
+  */
+ const calculate_average_document_size = (documents, sample_size = 100) => {
+   const sample = documents.slice(0, Math.min(sample_size, documents.length));
+   const total_size = sample.reduce((sum, doc) => {
+     return sum + Buffer.byteLength(JSON.stringify(doc), 'utf8');
+   }, 0);
+
+   return Math.ceil(total_size / sample.length);
+ };
+
+ /**
+  * Calculates required map size for bulk insert operation.
+  * @param {number} document_count - Number of documents to insert
+  * @param {number} avg_document_size - Average document size in bytes
+  * @returns {number} Required map size in bytes
+  */
+ const calculate_bulk_map_size = (document_count, avg_document_size) => {
+   const estimated_size = document_count * avg_document_size;
+   const safety_factor = 2.0; // 100% overhead for indexes and growth
+   const minimum_size = 1024 * 1024 * 1024 * 10; // 10GB minimum
+
+   return Math.max(estimated_size * safety_factor, minimum_size);
+ };
+
+ /**
+  * Pre-allocates map size for bulk insert operation.
+  * @param {Array<Object>} documents - Documents to be inserted
+  * @returns {Promise<void>}
+  */
+ const prepare_bulk_insert_map_size = async (documents) => {
+   const log = create_context_logger();
+
+   if (documents.length === 0) {
+     return;
+   }
+
+   const avg_size = calculate_average_document_size(documents);
+   const required_map_size = calculate_bulk_map_size(documents.length, avg_size);
+
+   log.info('Pre-allocating map size for bulk insert', {
+     document_count: documents.length,
+     avg_document_size: avg_size,
+     required_map_size,
+     required_map_size_gb: Math.round(required_map_size / (1024 * 1024 * 1024) * 100) / 100
+   });
+
+   // Trigger map size growth check
+   await check_and_grow_map_size();
+
+   const db = get_database();
+   if (db.resize) {
+     try {
+       db.resize(required_map_size);
+       log.info('Map size pre-allocated successfully', {
+         new_map_size: required_map_size,
+         new_map_size_gb: Math.round(required_map_size / (1024 * 1024 * 1024) * 100) / 100
+       });
+     } catch (error) {
+       log.warn('Failed to pre-allocate map size', { error: error.message });
+     }
+   }
+ };
+
+ /**
+  * Creates size-based batches for optimal transaction performance.
+  * @param {Array<Object>} documents - Documents to batch
+  * @param {number} [target_size=OPTIMAL_TRANSACTION_SIZE] - Target batch size in bytes
+  * @returns {Array<Array<Object>>} Array of document batches
+  */
+ const create_size_based_batches = (documents, target_size = OPTIMAL_TRANSACTION_SIZE) => {
+   const batches = [];
+   let current_batch = [];
+   let current_size = 0;
+
+   for (const doc of documents) {
+     const doc_size = Buffer.byteLength(JSON.stringify(doc), 'utf8');
+
+     if (current_size + doc_size > target_size && current_batch.length > 0) {
+       batches.push(current_batch);
+       current_batch = [doc];
+       current_size = doc_size;
+     } else {
+       current_batch.push(doc);
+       current_size += doc_size;
+     }
+   }
+
+   if (current_batch.length > 0) {
+     batches.push(current_batch);
+   }
+
+   return batches;
+ };
+
+ /**
+  * Generates sequential document ID for optimal key ordering.
+  * @returns {string} Sequential document ID
+  */
+ const generate_sequential_id = (() => {
+   let counter = Date.now() * 1000; // Microsecond precision
+   return () => {
+     return (++counter).toString(36).padStart(12, '0');
+   };
+ })();
+
+ /**
+  * Sorts documents by key for optimal B-tree insertion.
+  * @param {Array<Object>} documents - Documents to sort
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @returns {Array<Object>} Sorted documents with assigned IDs
+  */
+ const sort_documents_by_key = (documents, database_name, collection_name) => {
+   return documents.map(doc => ({
+     ...doc,
+     _id: doc._id || generate_sequential_id()
+   })).sort((a, b) => {
+     const key_a = build_collection_key(database_name, collection_name, a._id);
+     const key_b = build_collection_key(database_name, collection_name, b._id);
+     return key_a.localeCompare(key_b);
+   });
+ };
+
+ /**
+  * Pre-encodes documents as Buffers for direct LMDB storage.
+  * @param {Array<Object>} documents - Documents to encode
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @returns {Array<Object>} Encoded document entries
+  */
+ const pre_encode_documents = (documents, database_name, collection_name) => {
+   const current_timestamp = new Date().toISOString();
+
+   return documents.map(doc => {
+     // Ensure document has an ID
+     const document_id = doc._id || generate_sequential_id();
+
+     const document_with_timestamps = {
+       ...doc,
+       _id: document_id,
+       _created_at: doc._created_at || current_timestamp,
+       _updated_at: doc._updated_at || current_timestamp
+     };
+
+     const json_string = JSON.stringify(document_with_timestamps);
+     const key = build_collection_key(database_name, collection_name, document_id);
+
+     return {
+       key,
+       value: json_string, // Store as string for LMDB msgpack encoding
+       document_id: document_id
+     };
+   });
+ };
+
+ /**
+  * Performs optimized bulk insert with pre-encoded documents.
+  * @param {Object} db - Database instance
+  * @param {Array<Object>} encoded_documents - Pre-encoded document entries
+  * @returns {Promise<Array<string>>} Array of inserted document IDs
+  */
+ const bulk_insert_pre_encoded = async (db, encoded_documents) => {
+   const inserted_ids = [];
+
+   await db.transaction(() => {
+     for (const { key, value, document_id } of encoded_documents) {
+       // Check if document already exists
+       const existing = db.get(key);
+       if (existing) {
+         throw new Error(`Document with _id ${document_id} already exists`);
+       }
+
+       db.put(key, value);
+       inserted_ids.push(document_id);
+     }
+   });
+
+   return inserted_ids;
+ };
+
+ /**
+  * Streaming bulk insert processor with aggressive memory management.
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {number} [batch_size=DEFAULT_STREAM_BATCH_SIZE] - Streaming batch size
+  * @returns {AsyncGenerator<Array<string>>} Generator yielding inserted document IDs
+  */
+ const stream_bulk_insert = async function* (documents, database_name, collection_name, batch_size = DEFAULT_STREAM_BATCH_SIZE) {
+   const db = get_database();
+
+   for (let i = 0; i < documents.length; i += batch_size) {
+     const batch = documents.slice(i, i + batch_size);
+     const encoded_batch = pre_encode_documents(batch, database_name, collection_name);
+
+     const inserted_ids = await bulk_insert_pre_encoded(db, encoded_batch);
+     yield inserted_ids;
+
+     // Clear batch references immediately to help GC
+     batch.length = 0;
+     encoded_batch.length = 0;
+
+     const batch_number = Math.floor(i / batch_size);
+
+     // Ultra-aggressive memory management for very large datasets
+     if (documents.length >= 5000000) {
+       // For 5M+ documents, force GC every 5 batches with longer delays
+       if (batch_number % 5 === 0 && global.gc) {
+         global.gc();
+         await new Promise(resolve => setTimeout(resolve, 100));
+       }
+       // Always yield to event loop for very large datasets
+       await new Promise(resolve => setImmediate(resolve));
+     } else if (documents.length >= 1000000) {
+       // For 1M+ documents, force GC every 8 batches
+       if (batch_number % 8 === 0 && global.gc) {
+         global.gc();
+         await new Promise(resolve => setTimeout(resolve, 75));
+       }
+       // Yield every batch for large datasets
+       await new Promise(resolve => setImmediate(resolve));
+     } else if (documents.length > 100000) {
+       // For 100K-1M documents, force GC every 25 batches
+       if (batch_number % 25 === 0 && global.gc) {
+         global.gc();
+         await new Promise(resolve => setTimeout(resolve, 25));
+       }
+       // Yield every batch for medium datasets
+       await new Promise(resolve => setImmediate(resolve));
+     } else {
+       // For smaller datasets, yield every 10 batches as before
+       if (batch_number % 10 === 0) {
+         await new Promise(resolve => setImmediate(resolve));
+       }
+     }
+   }
+ };
+
+ /**
+  * Safely disables auto-indexing during bulk operations.
+  * @returns {boolean} Previous auto-indexing state
+  */
+ const disable_auto_indexing = () => {
+   // Auto-indexing management will be implemented in future versions
+   // For now, return false to indicate no auto-indexing was disabled
+   return false;
+ };
+
+ /**
+  * Re-enables auto-indexing after bulk operations.
+  * @param {boolean} was_enabled - Previous auto-indexing state
+  */
+ const restore_auto_indexing = (was_enabled) => {
+   // Auto-indexing management will be implemented in future versions
+   // For now, this is a no-op
+ };
+
+ /**
+  * Rebuilds collection indexes after bulk insert.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @returns {Promise<void>}
+  */
+ const rebuild_collection_indexes = async (database_name, collection_name) => {
+   const log = create_context_logger();
+
+   // Index rebuilding will be implemented in future versions
+   // For now, this is a no-op
+   log.debug('Index rebuilding skipped (not implemented)', {
+     database: database_name,
+     collection: collection_name
+   });
+ };
+
+ /**
+  * Optimized bulk insert implementation with all performance optimizations.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {Object} [options={}] - Optimization options
+  * @returns {Promise<Object>} Bulk insert results with performance metrics
+  */
+ const bulk_insert_optimized = async (database_name, collection_name, documents, options = {}) => {
+   const {
+     disable_indexing = true,
+     pre_allocate_map_size = true,
+     sort_keys = true,
+     stream_processing = true,
+     batch_size = DEFAULT_STREAM_BATCH_SIZE
+   } = options;
+
+   const log = create_context_logger();
+   const start_time = Date.now();
+   const start_memory = process.memoryUsage();
+
+   // Validate parameters
+   if (!database_name || !collection_name) {
+     throw new Error('Database name and collection name are required');
+   }
+
+   if (!Array.isArray(documents) || documents.length === 0) {
+     throw new Error('Documents must be a non-empty array');
+   }
+
+   log.info('Starting optimized bulk insert', {
+     database: database_name,
+     collection: collection_name,
+     document_count: documents.length,
+     options
+   });
+
+   let auto_index_was_enabled = false;
+
+   try {
+     // Phase 1: Pre-allocate map size
+     if (pre_allocate_map_size) {
+       await prepare_bulk_insert_map_size(documents);
+     }
+
+     // Phase 2: Disable auto-indexing
+     if (disable_indexing) {
+       auto_index_was_enabled = disable_auto_indexing();
+     }
+
+     // Phase 3: Sort documents by key
+     let processed_documents = documents;
+     if (sort_keys) {
+       processed_documents = sort_documents_by_key(documents, database_name, collection_name);
+     }
+
+     // Phase 4: Process documents
+     const all_inserted_ids = [];
+     let processed_count = 0;
+
+     if (stream_processing) {
+       // Streaming processing for memory efficiency
+       for await (const inserted_ids of stream_bulk_insert(processed_documents, database_name, collection_name, batch_size)) {
+         all_inserted_ids.push(...inserted_ids);
+         processed_count += inserted_ids.length;
+
+         // Log progress
+         if (processed_count % PROGRESS_LOG_INTERVAL === 0) {
+           log.info('Bulk insert progress', {
+             processed: processed_count,
+             total: documents.length,
+             percentage: Math.round((processed_count / documents.length) * 100)
+           });
+         }
+       }
+     } else {
+       // Batch processing for smaller datasets
+       const batches = create_size_based_batches(processed_documents);
+       const db = get_database();
+
+       for (const batch of batches) {
+         const encoded_batch = pre_encode_documents(batch, database_name, collection_name);
+         const inserted_ids = await bulk_insert_pre_encoded(db, encoded_batch);
+         all_inserted_ids.push(...inserted_ids);
+         processed_count += inserted_ids.length;
+
+         // Log progress
+         if (processed_count % PROGRESS_LOG_INTERVAL === 0) {
+           log.info('Bulk insert progress', {
+             processed: processed_count,
+             total: documents.length,
+             percentage: Math.round((processed_count / documents.length) * 100)
+           });
+         }
+       }
+     }
+
+     // Phase 5: Rebuild indexes
+     if (disable_indexing) {
+       await rebuild_collection_indexes(database_name, collection_name);
+     }
+
+     const end_time = Date.now();
+     const end_memory = process.memoryUsage();
+
+     const performance_metrics = {
+       duration_ms: end_time - start_time,
+       documents_per_second: Math.round(documents.length / ((end_time - start_time) / 1000)),
+       memory_delta_mb: Math.round((end_memory.heapUsed - start_memory.heapUsed) / (1024 * 1024)),
+       peak_memory_mb: Math.round(end_memory.heapUsed / (1024 * 1024))
+     };
+
+     log.info('Optimized bulk insert completed', {
+       database: database_name,
+       collection: collection_name,
+       inserted_count: all_inserted_ids.length,
+       performance: performance_metrics
+     });
+
+     return {
+       acknowledged: true,
+       inserted_count: all_inserted_ids.length,
+       inserted_ids: all_inserted_ids,
+       performance: performance_metrics
+     };
+
+   } catch (error) {
+     log.error('Optimized bulk insert failed', {
+       database: database_name,
+       collection: collection_name,
+       error: error.message
+     });
+     throw error;
+   } finally {
+     // Always restore auto-indexing
+     if (disable_indexing) {
+       restore_auto_indexing(auto_index_was_enabled);
+     }
+   }
+ };
+
+ /**
+  * Non-blocking bulk insert that yields to allow concurrent reads.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {Object} [options={}] - Options
+  * @returns {Promise<Object>} Bulk insert results
+  */
+ const non_blocking_bulk_insert = async (database_name, collection_name, documents, options = {}) => {
+   const { chunk_size = 10000 } = options;
+
+   const all_results = {
+     acknowledged: true,
+     inserted_count: 0,
+     inserted_ids: [],
+     performance: {
+       duration_ms: 0,
+       documents_per_second: 0,
+       memory_delta_mb: 0,
+       peak_memory_mb: 0
+     }
+   };
+
+   const start_time = Date.now();
+
+   // Process in smaller chunks to ensure reads are never blocked
+   for (let i = 0; i < documents.length; i += chunk_size) {
+     const chunk = documents.slice(i, i + chunk_size);
+     const result = await bulk_insert_optimized(database_name, collection_name, chunk, options);
+
+     all_results.inserted_count += result.inserted_count;
+     all_results.inserted_ids.push(...result.inserted_ids);
+
+     // Brief yield to allow reads to proceed
+     await new Promise(resolve => setImmediate(resolve));
+   }
+
+   const end_time = Date.now();
+   all_results.performance.duration_ms = end_time - start_time;
+   all_results.performance.documents_per_second = Math.round(documents.length / ((end_time - start_time) / 1000));
+
+   return all_results;
+ };
+
+ /**
+  * Bulk insert with performance monitoring and metrics.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {Object} [options={}] - Options
+  * @returns {Promise<Object>} Bulk insert results with detailed metrics
+  */
+ const bulk_insert_with_metrics = async (database_name, collection_name, documents, options = {}) => {
+   const start_time = Date.now();
+   const start_memory = process.memoryUsage();
+
+   const result = await bulk_insert_optimized(database_name, collection_name, documents, options);
+
+   const end_time = Date.now();
+   const end_memory = process.memoryUsage();
+
+   return {
+     ...result,
+     performance: {
+       ...result.performance,
+       total_duration_ms: end_time - start_time,
+       memory_usage: {
+         start_heap_mb: Math.round(start_memory.heapUsed / (1024 * 1024)),
+         end_heap_mb: Math.round(end_memory.heapUsed / (1024 * 1024)),
+         delta_heap_mb: Math.round((end_memory.heapUsed - start_memory.heapUsed) / (1024 * 1024)),
+         peak_heap_mb: Math.round(end_memory.heapUsed / (1024 * 1024))
+       }
+     }
+   };
+ };
+
+ /**
+  * Main bulk insert function with write queue integration.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {Object} [options={}] - Options
+  * @returns {Promise<Object>} Bulk insert results
+  */
+ const bulk_insert = async (database_name, collection_name, documents, options = {}) => {
+   const write_queue = get_write_queue();
+   const operation_metadata = {
+     operation: 'bulk_insert_optimized',
+     database: database_name,
+     collection: collection_name,
+     document_count: documents.length
+   };
+
+   return await write_queue.enqueue_write_operation(
+     () => bulk_insert_optimized(database_name, collection_name, documents, options),
+     operation_metadata
+   );
+ };
+
+ export {
+   bulk_insert_optimized,
+   bulk_insert_with_metrics,
+   non_blocking_bulk_insert,
+   bulk_insert,
+   calculate_average_document_size,
+   calculate_bulk_map_size,
+   create_size_based_batches,
+   sort_documents_by_key,
+   pre_encode_documents
+ };
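
For orientation, a minimal usage sketch of the public API added by this hunk follows. It is illustrative only and not part of the published diff: the 'app' database name, the 'users' collection name, and the generated sample documents are hypothetical, and it assumes the JoystickDB server has already opened its LMDB environment so that get_database() inside the module resolves.

// Illustrative sketch — not part of the package. Assumes the module is
// importable from its path inside @joystick.js/db-canary and the database
// environment is already initialized.
import { bulk_insert } from './bulk_insert_optimizer.js';

// Hypothetical sample data; _id, _created_at, and _updated_at are filled in
// by pre_encode_documents() during the insert.
const documents = Array.from({ length: 100000 }, (_, i) => ({
  name: `user_${i}`,
  created_index: i
}));

// Routed through the write queue; resolves with acknowledged, inserted_count,
// inserted_ids, and the performance metrics built by bulk_insert_optimized().
const result = await bulk_insert('app', 'users', documents, {
  sort_keys: true,
  stream_processing: true,
  batch_size: 1000
});

console.log(`${result.inserted_count} documents at ${result.performance.documents_per_second}/sec`);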