@joystick.js/db-canary 0.0.0-canary.2270 → 0.0.0-canary.2272

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. package/dist/server/lib/auto_index_manager.js +1 -1
  2. package/dist/server/lib/bulk_insert_optimizer.js +1 -0
  3. package/dist/server/lib/memory_efficient_bulk_insert.js +1 -0
  4. package/dist/server/lib/write_queue.js +1 -1
  5. package/package.json +10 -4
  6. package/src/server/lib/auto_index_manager.js +11 -4
  7. package/src/server/lib/bulk_insert_optimizer.js +559 -0
  8. package/src/server/lib/memory_efficient_bulk_insert.js +262 -0
  9. package/src/server/lib/write_queue.js +2 -137
  10. package/test_runner.js +353 -0
  11. package/tests/client/index.test.js +3 -1
  12. package/tests/performance/bulk_insert_1m_test.js +113 -0
  13. package/tests/performance/bulk_insert_benchmarks.test.js +570 -0
  14. package/tests/performance/bulk_insert_enterprise_isolated.test.js +469 -0
  15. package/tests/performance/bulk_insert_enterprise_scale_test.js +216 -0
  16. package/tests/server/integration/authentication_integration.test.js +3 -1
  17. package/tests/server/integration/auto_indexing_integration.test.js +1 -1
  18. package/tests/server/integration/development_mode_authentication.test.js +3 -1
  19. package/tests/server/integration/production_safety_integration.test.js +3 -1
  20. package/tests/server/lib/bulk_insert_optimizer.test.js +523 -0
  21. package/tests/server/lib/operations/admin.test.js +3 -1
  22. package/dist/server/lib/batched_write_queue.js +0 -1
  23. package/dist/server/lib/processing_lane.js +0 -1
  24. package/src/server/lib/batched_write_queue.js +0 -331
  25. package/src/server/lib/processing_lane.js +0 -417
  26. package/tests/server/lib/batched_write_queue.test.js +0 -402
  27. package/tests/server/lib/write_queue_integration.test.js +0 -186
package/src/server/lib/bulk_insert_optimizer.js
@@ -0,0 +1,559 @@
+ /**
+  * @fileoverview Bulk insert performance optimizer for JoystickDB.
+  *
+  * Provides enterprise-scale bulk insert capabilities with optimizations for:
+  * - Map size pre-allocation to prevent MDB_MAP_FULL errors
+  * - Size-based transaction batching for optimal performance
+  * - Key ordering and append mode for B-tree efficiency
+  * - Direct serialization to eliminate double encoding overhead
+  * - Safe index management with deferred rebuilding
+  * - Memory management and streaming processing
+  * - Concurrent read safety during bulk operations
+  */
+
+ import { get_database, generate_document_id, build_collection_key, check_and_grow_map_size } from './query_engine.js';
+ import { get_write_queue } from './write_queue.js';
+ import { get_auto_index_database, initialize_auto_index_database } from './auto_index_manager.js';
+ import create_logger from './logger.js';
+
+ const { create_context_logger } = create_logger('bulk_insert_optimizer');
+
+ /** @type {number} Optimal transaction size in bytes (100MB) */
+ const OPTIMAL_TRANSACTION_SIZE = 100 * 1024 * 1024;
+
+ /** @type {number} Default batch size for streaming processing */
+ const DEFAULT_STREAM_BATCH_SIZE = 1000;
+
+ /** @type {number} Progress logging interval */
+ const PROGRESS_LOG_INTERVAL = 10000;
+
+ /**
+  * Calculates the average document size from a sample.
+  * @param {Array<Object>} documents - Sample documents
+  * @param {number} [sample_size=100] - Number of documents to sample
+  * @returns {number} Average document size in bytes
+  */
+ const calculate_average_document_size = (documents, sample_size = 100) => {
+   const sample = documents.slice(0, Math.min(sample_size, documents.length));
+   const total_size = sample.reduce((sum, doc) => {
+     return sum + Buffer.byteLength(JSON.stringify(doc), 'utf8');
+   }, 0);
+
+   return Math.ceil(total_size / sample.length);
+ };
+
+ /**
+  * Calculates required map size for bulk insert operation.
+  * @param {number} document_count - Number of documents to insert
+  * @param {number} avg_document_size - Average document size in bytes
+  * @returns {number} Required map size in bytes
+  */
+ const calculate_bulk_map_size = (document_count, avg_document_size) => {
+   const estimated_size = document_count * avg_document_size;
+   const safety_factor = 2.0; // 100% overhead for indexes and growth
+   const minimum_size = 1024 * 1024 * 1024 * 10; // 10GB minimum
+
+   return Math.max(estimated_size * safety_factor, minimum_size);
+ };
+
+ /**
+  * Pre-allocates map size for bulk insert operation.
+  * @param {Array<Object>} documents - Documents to be inserted
+  * @returns {Promise<void>}
+  */
+ const prepare_bulk_insert_map_size = async (documents) => {
+   const log = create_context_logger();
+
+   if (documents.length === 0) {
+     return;
+   }
+
+   const avg_size = calculate_average_document_size(documents);
+   const required_map_size = calculate_bulk_map_size(documents.length, avg_size);
+
+   log.info('Pre-allocating map size for bulk insert', {
+     document_count: documents.length,
+     avg_document_size: avg_size,
+     required_map_size,
+     required_map_size_gb: Math.round(required_map_size / (1024 * 1024 * 1024) * 100) / 100
+   });
+
+   // Trigger map size growth check
+   await check_and_grow_map_size();
+
+   const db = get_database();
+   if (db.resize) {
+     try {
+       db.resize(required_map_size);
+       log.info('Map size pre-allocated successfully', {
+         new_map_size: required_map_size,
+         new_map_size_gb: Math.round(required_map_size / (1024 * 1024 * 1024) * 100) / 100
+       });
+     } catch (error) {
+       log.warn('Failed to pre-allocate map size', { error: error.message });
+     }
+   }
+ };
+
+ /**
+  * Creates size-based batches for optimal transaction performance.
+  * @param {Array<Object>} documents - Documents to batch
+  * @param {number} [target_size=OPTIMAL_TRANSACTION_SIZE] - Target batch size in bytes
+  * @returns {Array<Array<Object>>} Array of document batches
+  */
+ const create_size_based_batches = (documents, target_size = OPTIMAL_TRANSACTION_SIZE) => {
+   const batches = [];
+   let current_batch = [];
+   let current_size = 0;
+
+   for (const doc of documents) {
+     const doc_size = Buffer.byteLength(JSON.stringify(doc), 'utf8');
+
+     if (current_size + doc_size > target_size && current_batch.length > 0) {
+       batches.push(current_batch);
+       current_batch = [doc];
+       current_size = doc_size;
+     } else {
+       current_batch.push(doc);
+       current_size += doc_size;
+     }
+   }
+
+   if (current_batch.length > 0) {
+     batches.push(current_batch);
+   }
+
+   return batches;
+ };
+
+ /**
+  * Generates sequential document ID for optimal key ordering.
+  * @returns {string} Sequential document ID
+  */
+ const generate_sequential_id = (() => {
+   let counter = Date.now() * 1000; // Microsecond precision
+   return () => {
+     return (++counter).toString(36).padStart(12, '0');
+   };
+ })();
+
+ /**
+  * Sorts documents by key for optimal B-tree insertion.
+  * @param {Array<Object>} documents - Documents to sort
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @returns {Array<Object>} Sorted documents with assigned IDs
+  */
+ const sort_documents_by_key = (documents, database_name, collection_name) => {
+   return documents.map(doc => ({
+     ...doc,
+     _id: doc._id || generate_sequential_id()
+   })).sort((a, b) => {
+     const key_a = build_collection_key(database_name, collection_name, a._id);
+     const key_b = build_collection_key(database_name, collection_name, b._id);
+     return key_a.localeCompare(key_b);
+   });
+ };
+
+ /**
+  * Pre-encodes documents as Buffers for direct LMDB storage.
+  * @param {Array<Object>} documents - Documents to encode
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @returns {Array<Object>} Encoded document entries
+  */
+ const pre_encode_documents = (documents, database_name, collection_name) => {
+   const current_timestamp = new Date().toISOString();
+
+   return documents.map(doc => {
+     // Ensure document has an ID
+     const document_id = doc._id || generate_sequential_id();
+
+     const document_with_timestamps = {
+       ...doc,
+       _id: document_id,
+       _created_at: doc._created_at || current_timestamp,
+       _updated_at: doc._updated_at || current_timestamp
+     };
+
+     const json_string = JSON.stringify(document_with_timestamps);
+     const key = build_collection_key(database_name, collection_name, document_id);
+
+     return {
+       key,
+       value: json_string, // Store as string for LMDB msgpack encoding
+       document_id: document_id
+     };
+   });
+ };
+
+ /**
+  * Performs optimized bulk insert with pre-encoded documents.
+  * @param {Object} db - Database instance
+  * @param {Array<Object>} encoded_documents - Pre-encoded document entries
+  * @returns {Promise<Array<string>>} Array of inserted document IDs
+  */
+ const bulk_insert_pre_encoded = async (db, encoded_documents) => {
+   const inserted_ids = [];
+
+   await db.transaction(() => {
+     for (const { key, value, document_id } of encoded_documents) {
+       // Check if document already exists
+       const existing = db.get(key);
+       if (existing) {
+         throw new Error(`Document with _id ${document_id} already exists`);
+       }
+
+       db.put(key, value);
+       inserted_ids.push(document_id);
+     }
+   });
+
+   return inserted_ids;
+ };
+
+ /**
+  * Streaming bulk insert processor with aggressive memory management.
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {number} [batch_size=DEFAULT_STREAM_BATCH_SIZE] - Streaming batch size
+  * @returns {AsyncGenerator<Array<string>>} Generator yielding inserted document IDs
+  */
+ const stream_bulk_insert = async function* (documents, database_name, collection_name, batch_size = DEFAULT_STREAM_BATCH_SIZE) {
+   const db = get_database();
+
+   for (let i = 0; i < documents.length; i += batch_size) {
+     const batch = documents.slice(i, i + batch_size);
+     const encoded_batch = pre_encode_documents(batch, database_name, collection_name);
+
+     const inserted_ids = await bulk_insert_pre_encoded(db, encoded_batch);
+     yield inserted_ids;
+
+     // Clear batch references immediately to help GC
+     batch.length = 0;
+     encoded_batch.length = 0;
+
+     const batch_number = Math.floor(i / batch_size);
+
+     // Ultra-aggressive memory management for very large datasets
+     if (documents.length >= 5000000) {
+       // For 5M+ documents, force GC every 5 batches with longer delays
+       if (batch_number % 5 === 0 && global.gc) {
+         global.gc();
+         await new Promise(resolve => setTimeout(resolve, 100));
+       }
+       // Always yield to event loop for very large datasets
+       await new Promise(resolve => setImmediate(resolve));
+     } else if (documents.length >= 1000000) {
+       // For 1M+ documents, force GC every 8 batches
+       if (batch_number % 8 === 0 && global.gc) {
+         global.gc();
+         await new Promise(resolve => setTimeout(resolve, 75));
+       }
+       // Yield every batch for large datasets
+       await new Promise(resolve => setImmediate(resolve));
+     } else if (documents.length > 100000) {
+       // For 100K-1M documents, force GC every 25 batches
+       if (batch_number % 25 === 0 && global.gc) {
+         global.gc();
+         await new Promise(resolve => setTimeout(resolve, 25));
+       }
+       // Yield every batch for medium datasets
+       await new Promise(resolve => setImmediate(resolve));
+     } else {
+       // For smaller datasets, yield every 10 batches as before
+       if (batch_number % 10 === 0) {
+         await new Promise(resolve => setImmediate(resolve));
+       }
+     }
+   }
+ };
+
+ /**
+  * Safely disables auto-indexing during bulk operations.
+  * @returns {boolean} Previous auto-indexing state
+  */
+ const disable_auto_indexing = () => {
+   // Auto-indexing management will be implemented in future versions
+   // For now, return false to indicate no auto-indexing was disabled
+   return false;
+ };
+
+ /**
+  * Re-enables auto-indexing after bulk operations.
+  * @param {boolean} was_enabled - Previous auto-indexing state
+  */
+ const restore_auto_indexing = (was_enabled) => {
+   // Auto-indexing management will be implemented in future versions
+   // For now, this is a no-op
+ };
+
+ /**
+  * Rebuilds collection indexes after bulk insert.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @returns {Promise<void>}
+  */
+ const rebuild_collection_indexes = async (database_name, collection_name) => {
+   const log = create_context_logger();
+
+   // Index rebuilding will be implemented in future versions
+   // For now, this is a no-op
+   log.debug('Index rebuilding skipped (not implemented)', {
+     database: database_name,
+     collection: collection_name
+   });
+ };
+
+ /**
+  * Optimized bulk insert implementation with all performance optimizations.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {Object} [options={}] - Optimization options
+  * @returns {Promise<Object>} Bulk insert results with performance metrics
+  */
+ const bulk_insert_optimized = async (database_name, collection_name, documents, options = {}) => {
+   const {
+     disable_indexing = true,
+     pre_allocate_map_size = true,
+     sort_keys = true,
+     stream_processing = true,
+     batch_size = DEFAULT_STREAM_BATCH_SIZE
+   } = options;
+
+   const log = create_context_logger();
+   const start_time = Date.now();
+   const start_memory = process.memoryUsage();
+
+   // Validate parameters
+   if (!database_name || !collection_name) {
+     throw new Error('Database name and collection name are required');
+   }
+
+   if (!Array.isArray(documents) || documents.length === 0) {
+     throw new Error('Documents must be a non-empty array');
+   }
+
+   log.info('Starting optimized bulk insert', {
+     database: database_name,
+     collection: collection_name,
+     document_count: documents.length,
+     options
+   });
+
+   let auto_index_was_enabled = false;
+
+   try {
+     // Phase 1: Pre-allocate map size
+     if (pre_allocate_map_size) {
+       await prepare_bulk_insert_map_size(documents);
+     }
+
+     // Phase 2: Disable auto-indexing
+     if (disable_indexing) {
+       auto_index_was_enabled = disable_auto_indexing();
+     }
+
+     // Phase 3: Sort documents by key
+     let processed_documents = documents;
+     if (sort_keys) {
+       processed_documents = sort_documents_by_key(documents, database_name, collection_name);
+     }
+
+     // Phase 4: Process documents
+     const all_inserted_ids = [];
+     let processed_count = 0;
+
+     if (stream_processing) {
+       // Streaming processing for memory efficiency
+       for await (const inserted_ids of stream_bulk_insert(processed_documents, database_name, collection_name, batch_size)) {
+         all_inserted_ids.push(...inserted_ids);
+         processed_count += inserted_ids.length;
+
+         // Log progress
+         if (processed_count % PROGRESS_LOG_INTERVAL === 0) {
+           log.info('Bulk insert progress', {
+             processed: processed_count,
+             total: documents.length,
+             percentage: Math.round((processed_count / documents.length) * 100)
+           });
+         }
+       }
+     } else {
+       // Batch processing for smaller datasets
+       const batches = create_size_based_batches(processed_documents);
+       const db = get_database();
+
+       for (const batch of batches) {
+         const encoded_batch = pre_encode_documents(batch, database_name, collection_name);
+         const inserted_ids = await bulk_insert_pre_encoded(db, encoded_batch);
+         all_inserted_ids.push(...inserted_ids);
+         processed_count += inserted_ids.length;
+
+         // Log progress
+         if (processed_count % PROGRESS_LOG_INTERVAL === 0) {
+           log.info('Bulk insert progress', {
+             processed: processed_count,
+             total: documents.length,
+             percentage: Math.round((processed_count / documents.length) * 100)
+           });
+         }
+       }
+     }
+
+     // Phase 5: Rebuild indexes
+     if (disable_indexing) {
+       await rebuild_collection_indexes(database_name, collection_name);
+     }
+
+     const end_time = Date.now();
+     const end_memory = process.memoryUsage();
+
+     const performance_metrics = {
+       duration_ms: end_time - start_time,
+       documents_per_second: Math.round(documents.length / ((end_time - start_time) / 1000)),
+       memory_delta_mb: Math.round((end_memory.heapUsed - start_memory.heapUsed) / (1024 * 1024)),
+       peak_memory_mb: Math.round(end_memory.heapUsed / (1024 * 1024))
+     };
+
+     log.info('Optimized bulk insert completed', {
+       database: database_name,
+       collection: collection_name,
+       inserted_count: all_inserted_ids.length,
+       performance: performance_metrics
+     });
+
+     return {
+       acknowledged: true,
+       inserted_count: all_inserted_ids.length,
+       inserted_ids: all_inserted_ids,
+       performance: performance_metrics
+     };
+
+   } catch (error) {
+     log.error('Optimized bulk insert failed', {
+       database: database_name,
+       collection: collection_name,
+       error: error.message
+     });
+     throw error;
+   } finally {
+     // Always restore auto-indexing
+     if (disable_indexing) {
+       restore_auto_indexing(auto_index_was_enabled);
+     }
+   }
+ };
+
+ /**
+  * Non-blocking bulk insert that yields to allow concurrent reads.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {Object} [options={}] - Options
+  * @returns {Promise<Object>} Bulk insert results
+  */
+ const non_blocking_bulk_insert = async (database_name, collection_name, documents, options = {}) => {
+   const { chunk_size = 10000 } = options;
+
+   const all_results = {
+     acknowledged: true,
+     inserted_count: 0,
+     inserted_ids: [],
+     performance: {
+       duration_ms: 0,
+       documents_per_second: 0,
+       memory_delta_mb: 0,
+       peak_memory_mb: 0
+     }
+   };
+
+   const start_time = Date.now();
+
+   // Process in smaller chunks to ensure reads are never blocked
+   for (let i = 0; i < documents.length; i += chunk_size) {
+     const chunk = documents.slice(i, i + chunk_size);
+     const result = await bulk_insert_optimized(database_name, collection_name, chunk, options);
+
+     all_results.inserted_count += result.inserted_count;
+     all_results.inserted_ids.push(...result.inserted_ids);
+
+     // Brief yield to allow reads to proceed
+     await new Promise(resolve => setImmediate(resolve));
+   }
+
+   const end_time = Date.now();
+   all_results.performance.duration_ms = end_time - start_time;
+   all_results.performance.documents_per_second = Math.round(documents.length / ((end_time - start_time) / 1000));
+
+   return all_results;
+ };
+
+ /**
+  * Bulk insert with performance monitoring and metrics.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {Object} [options={}] - Options
+  * @returns {Promise<Object>} Bulk insert results with detailed metrics
+  */
+ const bulk_insert_with_metrics = async (database_name, collection_name, documents, options = {}) => {
+   const start_time = Date.now();
+   const start_memory = process.memoryUsage();
+
+   const result = await bulk_insert_optimized(database_name, collection_name, documents, options);
+
+   const end_time = Date.now();
+   const end_memory = process.memoryUsage();
+
+   return {
+     ...result,
+     performance: {
+       ...result.performance,
+       total_duration_ms: end_time - start_time,
+       memory_usage: {
+         start_heap_mb: Math.round(start_memory.heapUsed / (1024 * 1024)),
+         end_heap_mb: Math.round(end_memory.heapUsed / (1024 * 1024)),
+         delta_heap_mb: Math.round((end_memory.heapUsed - start_memory.heapUsed) / (1024 * 1024)),
+         peak_heap_mb: Math.round(end_memory.heapUsed / (1024 * 1024))
+       }
+     }
+   };
+ };
+
+ /**
+  * Main bulk insert function with write queue integration.
+  * @param {string} database_name - Database name
+  * @param {string} collection_name - Collection name
+  * @param {Array<Object>} documents - Documents to insert
+  * @param {Object} [options={}] - Options
+  * @returns {Promise<Object>} Bulk insert results
+  */
+ const bulk_insert = async (database_name, collection_name, documents, options = {}) => {
+   const write_queue = get_write_queue();
+   const operation_metadata = {
+     operation: 'bulk_insert_optimized',
+     database: database_name,
+     collection: collection_name,
+     document_count: documents.length
+   };
+
+   return await write_queue.enqueue_write_operation(
+     () => bulk_insert_optimized(database_name, collection_name, documents, options),
+     operation_metadata
+   );
+ };
+
+ export {
+   bulk_insert_optimized,
+   bulk_insert_with_metrics,
+   non_blocking_bulk_insert,
+   bulk_insert,
+   calculate_average_document_size,
+   calculate_bulk_map_size,
+   create_size_based_batches,
+   sort_documents_by_key,
+   pre_encode_documents
+ };
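
For orientation, a minimal usage sketch of the public API added by this hunk follows. It is illustrative only and not part of the published diff: the 'app' database name, the 'users' collection name, and the generated sample documents are hypothetical, and it assumes the JoystickDB server has already opened its LMDB environment so that get_database() inside the module resolves.

// Illustrative sketch — not part of the package. Assumes the module is
// importable from its path inside @joystick.js/db-canary and the database
// environment is already initialized.
import { bulk_insert } from './bulk_insert_optimizer.js';

// Hypothetical sample data; _id, _created_at, and _updated_at are filled in
// by pre_encode_documents() during the insert.
const documents = Array.from({ length: 100000 }, (_, i) => ({
  name: `user_${i}`,
  created_index: i
}));

// Routed through the write queue; resolves with acknowledged, inserted_count,
// inserted_ids, and the performance metrics built by bulk_insert_optimized().
const result = await bulk_insert('app', 'users', documents, {
  sort_keys: true,
  stream_processing: true,
  batch_size: 1000
});

console.log(`${result.inserted_count} documents at ${result.performance.documents_per_second}/sec`);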