@tb.p/dd 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,541 @@
1
+ import { calculateContentHash } from './fileHasher.js';
2
+ import { pathExists, getFileSize } from './fileSystemUtils.js';
3
+ import { createConnection } from '../database/dbConnection.js';
4
+ import { createOperations } from '../database/dbOperations.js';
5
+
6
+ /**
7
+ * Candidate Detection for @tb.p/dd
8
+ * Groups files by size and hashes candidates for duplicate detection
9
+ */
10
/**
 * Candidate Detection for @tb.p/dd
 *
 * Finds active files that share a size with at least one other unhashed
 * active file ("candidates"), hashes their content in batches with bounded
 * concurrency, and writes the hashes back to the database so exact
 * duplicates can be resolved later.
 */
class CandidateDetection {
  /**
   * @param {Object} dbOperations - Database operations object; must expose
   *   `.db.query(sql)` / `.db.queryOne(sql)` and `.updateFileHash(id, hash)`.
   * @param {Object} [options]
   * @param {string} [options.hashAlgorithm='blake3'] - Algorithm passed to calculateContentHash.
   * @param {number} [options.batchSize=100] - Candidates processed per batch.
   * @param {number} [options.maxConcurrency=5] - Parallel hash operations within a batch.
   * @param {boolean} [options.verbose=false] - Emit console progress logs.
   * @param {?Function} [options.onProgress=null] - Callback receiving progress snapshots.
   */
  constructor(dbOperations, options = {}) {
    this.db = dbOperations;
    this.options = {
      hashAlgorithm: 'blake3',
      batchSize: 100,
      maxConcurrency: 5,
      verbose: false,
      onProgress: null,
      ...options
    };
    // Statistics for the most recent run.
    this.stats = {
      totalFiles: 0,       // files counted across all size groups
      uniqueFiles: 0,      // number of distinct file sizes seen
      candidatesFound: 0,  // files in size groups with more than one member
      hashedFiles: 0,
      skippedFiles: 0,
      errors: 0,
      startTime: null,
      endTime: null
    };
  }

  /**
   * Detect candidates and, when any are found, hash and persist them.
   *
   * @param {Object} [options] - Per-call overrides merged over this.options.
   * @returns {Promise<Object>} `{ success: true, detection, processing, stats }`
   *   on success, `{ success: false, error, stats }` on failure. Never throws.
   */
  async detectAndProcessCandidates(options = {}) {
    const config = { ...this.options, ...options };

    if (config.verbose) {
      console.log('🔍 Starting candidate detection and processing...');
    }

    this.stats.startTime = Date.now();

    try {
      // Phase 1: detection.
      const detectionResult = await this.findCandidates();

      if (!detectionResult.success) {
        return {
          success: false,
          error: `Candidate detection failed: ${detectionResult.error}`,
          stats: this.stats
        };
      }

      if (config.verbose) {
        console.log(`📊 Found ${detectionResult.totalCandidates} candidates in ${detectionResult.candidateGroups} size groups`);
      }

      // Nothing to hash: report success with an empty processing summary.
      // (Fixed mojibake emoji in the log messages below: ℹ️ / ⏭️ / ⚠️.)
      if (detectionResult.totalCandidates === 0) {
        if (config.verbose) {
          console.log('ℹ️ No candidates found for processing');
        }

        return {
          success: true,
          detection: detectionResult,
          processing: { hashedFiles: 0, skippedFiles: 0, errors: 0 },
          stats: this.stats
        };
      }

      // Phase 2: hashing.
      const processResult = await this.processCandidates();

      if (!processResult.success) {
        return {
          success: false,
          error: `Candidate processing failed: ${processResult.error}`,
          stats: this.stats
        };
      }

      if (config.verbose) {
        console.log(`✅ Processed ${processResult.hashedFiles} files successfully`);
        if (processResult.skippedFiles > 0) {
          console.log(`⏭️ ${processResult.skippedFiles} files already had hashes, skipped`);
        }
        if (processResult.errors > 0) {
          console.log(`⚠️ ${processResult.errors} files had errors during processing`);
        }
      }

      return {
        success: true,
        detection: detectionResult,
        processing: processResult,
        stats: this.stats
      };
    } catch (error) {
      return {
        success: false,
        error: error.message,
        stats: this.stats
      };
    }
  }

  /**
   * Find all unhashed active files whose size matches at least one other
   * file in the same table (candidate size groups).
   *
   * @returns {Promise<Object>} `{ success, stats, candidateGroups, totalCandidates }`;
   *   `{ success: false, error, stats }` on failure.
   */
  async findCandidates() {
    this.stats.totalFiles = 0;
    this.stats.uniqueFiles = 0;
    this.stats.candidatesFound = 0;

    try {
      // Fast path: bail out when every active file is already hashed.
      const totalUnhashed = await this.db.db.queryOne('SELECT COUNT(*) as total FROM copies WHERE file_hash IS NULL AND active = 1');

      if (totalUnhashed.total === 0) {
        console.log('ℹ️ No unhashed active files found - all active files already processed');
        this.stats.endTime = Date.now();
        return {
          success: true,
          stats: this.stats,
          candidateGroups: 0,
          totalCandidates: 0,
          message: 'No unhashed active files found'
        };
      }

      // Group unhashed active files by size.
      const sizeGroups = await this.getFilesBySize();

      // Only sizes shared by more than one file can contain duplicates.
      const candidateGroups = sizeGroups.filter(group => group.count > 1);

      this.stats.candidatesFound = candidateGroups.reduce((total, group) => total + group.count, 0);
      this.stats.totalFiles = sizeGroups.reduce((total, group) => total + group.count, 0);
      this.stats.uniqueFiles = sizeGroups.length;

      if (this.options.onProgress) {
        this.options.onProgress({
          phase: 'candidate_detection',
          candidatesFound: this.stats.candidatesFound,
          totalFiles: this.stats.totalFiles,
          candidateGroups: candidateGroups.length
        });
      }

      return {
        success: true,
        stats: this.stats,
        candidateGroups: candidateGroups.length,
        totalCandidates: this.stats.candidatesFound
      };
    } catch (error) {
      this.stats.endTime = Date.now();
      return {
        success: false,
        error: error.message,
        stats: this.stats
      };
    }
  }

  /**
   * Hash all candidate files and persist the hashes to the database.
   *
   * @returns {Promise<Object>} `{ success, stats, hashedFiles, skippedFiles,
   *   errors, processingTime }`; `{ success: false, error, stats }` on failure.
   */
  async processCandidates() {
    try {
      // Guard: when called standalone (without detectAndProcessCandidates),
      // startTime would be null and processingTime would come out NaN.
      if (this.stats.startTime === null) {
        this.stats.startTime = Date.now();
      }

      // Fast path: nothing left to hash.
      const totalUnhashed = await this.db.db.queryOne('SELECT COUNT(*) as total FROM copies WHERE file_hash IS NULL AND active = 1');

      if (totalUnhashed.total === 0) {
        console.log('ℹ️ No unhashed active files found - all active files already processed');
        this.stats.endTime = Date.now();
        return {
          success: true,
          message: 'No unhashed active files found - all active files already processed',
          stats: this.stats,
          hashedFiles: 0,
          skippedFiles: 0,
          errors: 0
        };
      }

      // Candidates: unhashed active files sharing a size with another one.
      const candidates = await this.getCandidatesForHashing();

      if (candidates.length === 0) {
        return {
          success: true,
          message: 'No candidates found for hashing',
          stats: this.stats,
          hashedFiles: 0,
          skippedFiles: 0,
          errors: 0
        };
      }

      // Hash in batches with bounded concurrency.
      const results = await this.processCandidatesInBatches(candidates);

      this.stats.endTime = Date.now();
      this.stats.hashedFiles = results.hashedFiles;
      this.stats.skippedFiles = results.skippedFiles;
      this.stats.errors = results.errors;

      return {
        success: true,
        stats: this.stats,
        hashedFiles: results.hashedFiles,
        skippedFiles: results.skippedFiles,
        errors: results.errors,
        processingTime: this.stats.endTime - this.stats.startTime
      };
    } catch (error) {
      this.stats.endTime = Date.now();
      return {
        success: false,
        error: error.message,
        stats: this.stats
      };
    }
  }

  /**
   * Get unhashed active files grouped by size (largest sizes first).
   *
   * @returns {Promise<Array<{size: number, count: number, fileIds: number[]}>>}
   */
  async getFilesBySize() {
    const sql = `
      SELECT
        file_size,
        COUNT(id) as count,
        GROUP_CONCAT(id, ',') as file_ids
      FROM copies
      WHERE file_hash IS NULL AND active = 1
      GROUP BY file_size
      ORDER BY file_size DESC
    `;

    const results = await this.db.db.query(sql);

    return results.map(row => {
      // Always pass a radix to parseInt.
      const count = parseInt(row.count, 10) || 0;

      // Debug logging for negative counts (should never happen; left in to
      // diagnose driver/row-shape surprises).
      if (count < 0) {
        console.error('Negative count detected in getFilesBySize:');
        console.error('  Row data:', row);
        console.error('  Parsed count:', count);
        console.error('  Raw count value:', row.count);
        console.error('  Type of count:', typeof row.count);
      }

      return {
        size: row.file_size,
        count: Math.max(0, count), // Ensure count is never negative
        fileIds: row.file_ids ? row.file_ids.split(',').map(id => parseInt(id, 10)) : []
      };
    });
  }

  /**
   * Get candidate rows that need hashing: active unhashed files whose size
   * matches at least one other active unhashed file.
   *
   * Fixed: both sides now filter on `active = 1`, keeping this query
   * consistent with the counts in findCandidates/getFilesBySize — inactive
   * rows no longer get hashed, nor promote another file to candidate status.
   *
   * @returns {Promise<Array<Object>>} Full `copies` rows, largest sizes first.
   */
  async getCandidatesForHashing() {
    const sql = `
      SELECT c1.*
      FROM copies c1
      WHERE c1.file_hash IS NULL
        AND c1.active = 1
        AND EXISTS (
          SELECT 1
          FROM copies c2
          WHERE c2.file_size = c1.file_size
            AND c2.id != c1.id
            AND c2.file_hash IS NULL
            AND c2.active = 1
        )
      ORDER BY c1.file_size DESC, c1.file_path
    `;

    return await this.db.db.query(sql);
  }

  /**
   * Process candidates in batches, reporting progress after each batch.
   *
   * @param {Array<Object>} candidates - Rows from getCandidatesForHashing().
   * @returns {Promise<{hashedFiles: number, skippedFiles: number, errors: number}>}
   */
  async processCandidatesInBatches(candidates) {
    const results = {
      hashedFiles: 0,
      skippedFiles: 0,
      errors: 0
    };

    for (let i = 0; i < candidates.length; i += this.options.batchSize) {
      const batch = candidates.slice(i, i + this.options.batchSize);

      // Each batch internally limits concurrency to maxConcurrency.
      const batchResults = await this.processBatch(batch);

      results.hashedFiles += batchResults.hashedFiles;
      results.skippedFiles += batchResults.skippedFiles;
      results.errors += batchResults.errors;

      if (this.options.onProgress) {
        this.options.onProgress({
          phase: 'hashing',
          processed: i + batch.length,
          total: candidates.length,
          hashedFiles: results.hashedFiles,
          errors: results.errors,
          percentage: ((i + batch.length) / candidates.length) * 100
        });
      }
    }

    return results;
  }

  /**
   * Process a single batch of candidates, at most maxConcurrency at a time.
   *
   * @param {Array<Object>} batch - Slice of candidate rows.
   * @returns {Promise<{hashedFiles: number, skippedFiles: number, errors: number}>}
   */
  async processBatch(batch) {
    const results = {
      hashedFiles: 0,
      skippedFiles: 0,
      errors: 0
    };

    // Split the batch into chunks of maxConcurrency and await each chunk.
    const chunks = [];
    for (let i = 0; i < batch.length; i += this.options.maxConcurrency) {
      chunks.push(batch.slice(i, i + this.options.maxConcurrency));
    }

    for (const chunk of chunks) {
      // processCandidate never throws (it returns { success: false, ... }),
      // so Promise.all will not reject here.
      const chunkResults = await Promise.all(
        chunk.map(candidate => this.processCandidate(candidate))
      );

      chunkResults.forEach(result => {
        if (!result.success) {
          results.errors++;
        } else if (result.skipped) {
          results.skippedFiles++;
        } else {
          results.hashedFiles++;
        }
      });
    }

    return results;
  }

  /**
   * Hash one candidate file and record the hash in the database.
   *
   * Skips files that already carry a hash; fails (without throwing) when the
   * file is missing, its size changed since scanning, hashing fails, or the
   * database update fails.
   *
   * @param {Object} candidate - A `copies` row (id, file_path, file_size, file_hash).
   * @returns {Promise<Object>} `{ success, candidateId, ... }`; `skipped: true`
   *   when the file already had a hash.
   */
  async processCandidate(candidate) {
    try {
      // Surface the file currently being worked on.
      if (this.options.onProgress) {
        this.options.onProgress({
          phase: 'hashing',
          currentFile: candidate.file_path
        });
      }

      // Already hashed — nothing to do.
      if (candidate.file_hash !== null && candidate.file_hash !== undefined) {
        return {
          success: true,
          candidateId: candidate.id,
          filePath: candidate.file_path,
          message: 'File already hashed, skipping',
          skipped: true
        };
      }

      // The file may have been deleted since it was scanned.
      if (!(await pathExists(candidate.file_path))) {
        return {
          success: false,
          error: `File not found: ${candidate.file_path}`,
          candidateId: candidate.id
        };
      }

      // The file may have been modified since it was scanned; a changed size
      // means the stored size grouping is stale, so don't hash it.
      const currentSize = await getFileSize(candidate.file_path);
      if (currentSize !== candidate.file_size) {
        return {
          success: false,
          error: `File size changed: ${candidate.file_path} (expected: ${candidate.file_size}, actual: ${currentSize})`,
          candidateId: candidate.id
        };
      }

      const hash = await calculateContentHash(candidate.file_path, this.options.hashAlgorithm);

      const updateResult = await this.db.updateFileHash(candidate.id, hash);

      if (!updateResult.success) {
        return {
          success: false,
          error: `Database update failed: ${updateResult.error}`,
          candidateId: candidate.id
        };
      }

      return {
        success: true,
        candidateId: candidate.id,
        filePath: candidate.file_path,
        hash: hash,
        size: candidate.file_size
      };
    } catch (error) {
      return {
        success: false,
        error: error.message,
        candidateId: candidate.id
      };
    }
  }

  /**
   * Get a summary of current candidate size groups (no hashing performed).
   *
   * @returns {Promise<Object>} `{ success: true, details }` with group counts
   *   and a size distribution, or `{ success: false, error }`.
   */
  async getCandidateDetails() {
    try {
      const sizeGroups = await this.getFilesBySize();
      const candidateGroups = sizeGroups.filter(group => group.count > 1);

      const details = {
        totalSizeGroups: sizeGroups.length,
        candidateGroups: candidateGroups.length,
        totalCandidates: candidateGroups.reduce((total, group) => total + group.count, 0),
        sizeDistribution: candidateGroups.map(group => ({
          size: group.size,
          count: group.count,
          sizeFormatted: this.formatFileSize(group.size)
        })).sort((a, b) => b.size - a.size)
      };

      return {
        success: true,
        details
      };
    } catch (error) {
      return {
        success: false,
        error: error.message
      };
    }
  }

  /**
   * Format a byte count as a human-readable string, e.g. "1.50 KB".
   *
   * @param {number} bytes - Byte count (binary units: 1 KB = 1024 B).
   * @returns {string}
   */
  formatFileSize(bytes) {
    const units = ['B', 'KB', 'MB', 'GB', 'TB'];
    let size = bytes;
    let unitIndex = 0;

    while (size >= 1024 && unitIndex < units.length - 1) {
      size /= 1024;
      unitIndex++;
    }

    return `${size.toFixed(2)} ${units[unitIndex]}`;
  }
}
484
+
485
+ /**
486
+ * Run candidate detection with database connection
487
+ */
488
/**
 * Run candidate detection against a duplicate-detection database.
 *
 * Opens a connection to the database referenced by the CLI options, runs
 * CandidateDetection.detectAndProcessCandidates(), and prints a summary.
 * Errors are reported but never rethrown, so the caller can continue
 * without candidate detection.
 *
 * @param {Object} options - Processed CLI options.
 * @param {string} [options.saveDb] - Preferred database path.
 * @param {string} [options.resumeDb] - Fallback database path.
 * @param {string} [options.database] - Last-resort database path.
 * @param {string} [options.hashAlgorithm] - Hash algorithm (default 'blake3').
 * @param {string|number} [options.batchSize] - Batch size (default 100).
 * @param {string|number} [options.maxConcurrency] - Concurrency (default 5).
 * @param {boolean} [options.verbose] - Verbose logging.
 * @returns {Promise<void>}
 */
export async function runCandidateDetection(options) {
  const databasePath = options.saveDb || options.resumeDb || options.database;

  if (!databasePath) {
    console.log('ℹ️ No database path available for candidate detection');
    return;
  }

  let db;
  try {
    console.log('\n🔍 Starting candidate detection...');

    db = createConnection(databasePath);
    await db.connect();

    const dbOps = createOperations(db);

    const detector = new CandidateDetection(dbOps, {
      hashAlgorithm: options.hashAlgorithm || 'blake3',
      // Always pass a radix; NaN falls through to the default via ||.
      batchSize: parseInt(options.batchSize, 10) || 100,
      maxConcurrency: parseInt(options.maxConcurrency, 10) || 5,
      verbose: options.verbose || false
    });

    const result = await detector.detectAndProcessCandidates();

    if (result.success) {
      console.log('\n✅ Candidate detection completed successfully!');
      console.log(`📊 Statistics:`);
      console.log(`  - Total files: ${result.stats.totalFiles}`);
      console.log(`  - Unique files: ${result.stats.uniqueFiles}`);
      console.log(`  - Candidates found: ${result.stats.candidatesFound}`);
      console.log(`  - Files hashed: ${result.stats.hashedFiles}`);
      console.log(`  - Files skipped: ${result.stats.skippedFiles}`);
      console.log(`  - Errors: ${result.stats.errors}`);
      // Fixed: processingTime lives on the processing result, not on stats
      // (the original read result.stats.processingTime, which is never set,
      // so the elapsed time was never printed).
      const processingTime = result.processing?.processingTime;
      if (processingTime) {
        console.log(`  - Processing time: ${(processingTime / 1000).toFixed(2)}s`);
      }
    } else {
      console.error(`❌ Candidate detection failed: ${result.error}`);
      console.log('Continuing without candidate detection...');
    }
  } catch (error) {
    console.error(`❌ Candidate detection error: ${error.message}`);
    console.log('Continuing without candidate detection...');
  } finally {
    // Always release the connection, even if the summary logging throws.
    if (db) {
      await db.close();
    }
  }
}
540
+
541
+ export { CandidateDetection };
@@ -0,0 +1,140 @@
1
+ import path from 'path';
2
+ import { createDatabaseManager } from '../database/index.js';
3
+ import { moveDuplicateFiles, moveDuplicateFilesWithPathPreservation } from './fileMover.js';
4
+ import { createDirectory } from './fileSystemUtils.js';
5
+
6
+ /**
7
+ * Move duplicated files to a duplicates directory
8
+ * @param {Object} options - Processed options containing database path and preservePaths setting
9
+ * @returns {Promise<void>}
10
+ */
11
/**
 * Move duplicated files into a "!@duplicates" directory inside each target.
 *
 * Reads duplicate groups from the database, keeps the first file of each
 * group, and moves the remaining copies into the duplicates directory of
 * the target they live under (optionally preserving their relative paths).
 *
 * @param {Object} options - Processed options.
 * @param {string} [options.saveDb] - Database path (takes precedence).
 * @param {string} [options.resumeDb] - Fallback database path.
 * @param {string[]} options.targets - Target root directories.
 * @param {boolean} [options.preservePaths=true] - Preserve relative paths when moving.
 * @param {boolean} [options.verbose] - Log each individual move.
 * @returns {Promise<void>}
 */
async function moveDuplicates(options) {
  const preservePaths = options.preservePaths !== false;

  // Determine database path
  const dbPath = options.saveDb || options.resumeDb;
  if (!dbPath) {
    console.error('Error: No database path available for move operation');
    return;
  }

  console.log('Starting duplicate file move operation...');
  console.log(`Path preservation: ${preservePaths ? 'enabled' : 'disabled'}`);

  let dbManager;
  try {
    // Initialize database connection
    dbManager = createDatabaseManager(dbPath);
    await dbManager.initialize();

    // Get duplicate groups from database
    const duplicateGroups = await dbManager.getOperations().getDuplicateGroups();

    if (duplicateGroups.length === 0) {
      console.log('No duplicate files found to move.');
      return;
    }

    console.log(`Found ${duplicateGroups.length} duplicate groups`);

    // Create a "!@duplicates" directory in each target root.
    const duplicatesDirs = {};
    for (const target of options.targets) {
      const duplicatesDir = path.join(target, '!@duplicates');
      await createDirectory(duplicatesDir);
      duplicatesDirs[target] = duplicatesDir;
    }

    let totalMoved = 0;
    let totalErrors = 0;

    // Process each duplicate group
    for (const group of duplicateGroups) {
      const filePaths = group.file_paths.split(';');
      const duplicateFiles = filePaths.map(filePath => ({
        path: filePath,
        size: group.file_size,
        hash: group.file_hash
      }));

      // Keep the first file in each group; move the rest.
      const filesToMove = duplicateFiles.slice(1);

      if (filesToMove.length === 0) {
        continue;
      }

      // Group files by the target directory they live under.
      const filesByTarget = {};
      for (const file of filesToMove) {
        // Fixed: resolve BOTH sides and use path.relative for containment.
        // The original compared an unresolved dirname via startsWith against
        // a resolved target, which both misses relative paths and
        // false-matches sibling dirs sharing a prefix ("/data/foo-bar" would
        // match target "/data/foo").
        const absoluteDir = path.resolve(path.dirname(file.path));
        let targetDir = null;
        for (const target of options.targets) {
          const rel = path.relative(path.resolve(target), absoluteDir);
          if (rel === '' || (!rel.startsWith('..') && !path.isAbsolute(rel))) {
            targetDir = target;
            break;
          }
        }

        // Files outside every target fall back to the first target.
        if (!targetDir) {
          targetDir = options.targets[0];
        }

        if (!filesByTarget[targetDir]) {
          filesByTarget[targetDir] = [];
        }
        filesByTarget[targetDir].push(file);
      }

      // Move files to their respective target duplicates directories.
      for (const [targetDir, files] of Object.entries(filesByTarget)) {
        try {
          const moveOptions = {
            dryRun: false,
            overwrite: false,
            onProgress: (progress) => {
              if (options.verbose) {
                console.log(`Moving: ${progress.source} -> ${progress.destination}`);
              }
            },
            // Log only. Fixed: the original also incremented totalErrors
            // here AND counted failures again from moveResults below,
            // double-counting every failed move.
            onError: (error, result) => {
              console.error(`Error moving file ${result.sourcePath}: ${error.message}`);
            }
          };

          const moveResults = preservePaths
            ? await moveDuplicateFilesWithPathPreservation(files, duplicatesDirs[targetDir], targetDir, moveOptions)
            : await moveDuplicateFiles(files, duplicatesDirs[targetDir], moveOptions);

          const successfulMoves = moveResults.filter(result => result.success);
          totalMoved += successfulMoves.length;
          totalErrors += moveResults.length - successfulMoves.length;
        } catch (error) {
          console.error(`Error processing duplicate group for target ${targetDir}: ${error.message}`);
          totalErrors += files.length;
        }
      }
    }

    console.log(`Move operation completed:`);
    console.log(`  - Files moved: ${totalMoved}`);
    console.log(`  - Errors: ${totalErrors}`);
    console.log(`  - Duplicates directories created:`);
    // Fixed: print the actual joined directory path instead of rebuilding it
    // by string concatenation (the destructured value was previously unused).
    for (const duplicatesDir of Object.values(duplicatesDirs)) {
      console.log(`    - ${duplicatesDir}`);
    }
  } catch (error) {
    console.error(`Error during move operation: ${error.message}`);
  } finally {
    if (dbManager) {
      await dbManager.close();
    }
  }
}
139
+
140
+ export { moveDuplicates };