tm-extractor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/README.md +64 -0
  2. package/dist/constants/scm-activities.d.ts +35 -0
  3. package/dist/constants/scm-activities.d.ts.map +1 -0
  4. package/dist/constants/tma-formats.d.ts +77 -0
  5. package/dist/constants/tma-formats.d.ts.map +1 -0
  6. package/dist/constants/typology-definitions.d.ts +519 -0
  7. package/dist/constants/typology-definitions.d.ts.map +1 -0
  8. package/dist/core/data-transformer.d.ts +44 -0
  9. package/dist/core/data-transformer.d.ts.map +1 -0
  10. package/dist/core/pdf-processor.d.ts +48 -0
  11. package/dist/core/pdf-processor.d.ts.map +1 -0
  12. package/dist/extractors/branding-extractor.d.ts +21 -0
  13. package/dist/extractors/branding-extractor.d.ts.map +1 -0
  14. package/dist/extractors/scm-extractor.d.ts +96 -0
  15. package/dist/extractors/scm-extractor.d.ts.map +1 -0
  16. package/dist/extractors/strength-extractor.d.ts +21 -0
  17. package/dist/extractors/strength-extractor.d.ts.map +1 -0
  18. package/dist/extractors/talent-extractor.d.ts +25 -0
  19. package/dist/extractors/talent-extractor.d.ts.map +1 -0
  20. package/dist/extractors/typology-extractor.d.ts +25 -0
  21. package/dist/extractors/typology-extractor.d.ts.map +1 -0
  22. package/dist/index.cjs +1502 -0
  23. package/dist/index.cjs.map +1 -0
  24. package/dist/index.d.ts +37 -0
  25. package/dist/index.d.ts.map +1 -0
  26. package/dist/index.js +1475 -0
  27. package/dist/index.js.map +1 -0
  28. package/dist/types/tma-types.d.ts +133 -0
  29. package/dist/types/tma-types.d.ts.map +1 -0
  30. package/dist/utils/error-handling.d.ts +46 -0
  31. package/dist/utils/error-handling.d.ts.map +1 -0
  32. package/dist/utils/format-detection.d.ts +48 -0
  33. package/dist/utils/format-detection.d.ts.map +1 -0
  34. package/package.json +67 -0
package/dist/index.cjs ADDED
@@ -0,0 +1,1502 @@
1
+ 'use strict';
2
+
3
+ Object.defineProperty(exports, '__esModule', { value: true });
4
+
5
+ var pdfjs = require('pdfjs-dist');
6
+
7
+ var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
8
/**
 * Bundler interop helper: wraps a required module in a frozen,
 * prototype-less namespace object.
 *
 * Every own enumerable key of `e` except `default` is exposed through a
 * getter (the module's own accessor descriptor when it has one), so reads
 * always reflect the current value on `e`. The module itself is exposed as
 * `default`.
 *
 * @param {*} e - The required module (may be null/undefined).
 * @returns {object} Frozen namespace object.
 */
function _interopNamespaceDefault(e) {
    const namespace = Object.create(null);
    const exportedKeys = e ? Object.keys(e) : [];
    for (const key of exportedKeys) {
        if (key === 'default') {
            continue;
        }
        const descriptor = Object.getOwnPropertyDescriptor(e, key);
        // Reuse an existing accessor as-is; otherwise install a live getter.
        const liveBinding = {
            enumerable: true,
            get: function () { return e[key]; }
        };
        Object.defineProperty(namespace, key, descriptor.get ? descriptor : liveBinding);
    }
    namespace.default = e;
    return Object.freeze(namespace);
}
24
+
25
+ var pdfjs__namespace = /*#__PURE__*/_interopNamespaceDefault(pdfjs);
26
+
27
/**
 * TMA format configurations, keyed by the PDF's total page count (as a string).
 *
 * - `pages`: the three page numbers read by PdfProcessor.extractPageTexts, in
 *   the order [talent order, strength, typology + personal branding].
 * - `scmPage`: the page number returned by PdfProcessor.getScmPageNumber.
 * - `format`: the label returned by detectTmaFormat.
 */
const TMA_FORMATS = {
    "49": {
        pages: [9, 11, 13],
        scmPage: 12,
        format: "49-page",
    },
    "54": {
        pages: [11, 13, 15],
        scmPage: 14,
        format: "54-page",
    },
    "46": {
        pages: [12, 14, 16],
        scmPage: 15,
        format: "46-page",
    },
    "6": {
        pages: [2, 4, 6],
        scmPage: 5,
        format: "6-page",
    },
    "22": {
        pages: [2, 4, 6],
        scmPage: 5,
        format: "22-page",
    },
};
/**
 * Talent extraction regex patterns.
 *
 * - `format49`: captures (theme, number) — the theme comes first.
 * - `default` / `pdfParse`: capture (number, theme); `pdfParse` expects no
 *   whitespace between the two.
 *
 * All patterns carry the `g` flag and therefore hold `lastIndex` state when
 * used with `exec`/`test`.
 */
const TALENT_REGEX_PATTERNS = {
    // Earlier variants, kept for reference. The active `default`/`pdfParse`
    // patterns accept any digit run; the 1-34 range is enforced after matching
    // by isValidTalentNumber.
    // format49: /([A-Z-]+\.?)\s+(\d+)\./g,
    // default: /^([1-9]|[12][0-9]|3[0-4])\s+([A-Z-]+)/gm,
    // pdfParse: /([1-9]|[12][0-9]|3[0-4])([A-Z]+)/g
    format49: /([A-Z-]+\.?)\s+(\d+)\./g,
    default: /(\d+)\s+([A-Z-]+)/g,
    pdfParse: /(\d+)([A-Z]+)/g,
};
/**
 * Strength extraction regex patterns.
 *
 * - `format49`: "NAME <number>" followed by whitespace or end of input.
 * - `default`: "<number>. name".
 */
const STRENGTH_REGEX_PATTERNS = {
    format49: /([A-Z]+)\s+(\d+)(?:\s+|$)/g,
    default: /\d+\.\s*([\w-]+)/g,
};
/**
 * Personal branding extraction regex patterns.
 *
 * - `format49`: an upper-case label of 3+ letters (or "QUALITY CONTROLLER"),
 *   an optional hyphen (ASCII or U+2010), then a possibly negative decimal score.
 * - `default`: one or more upper-case words immediately followed by "Anda".
 */
const PERSONAL_BRANDING_REGEX_PATTERNS = {
    format49: /(QUALITY\s+CONTROLLER|[A-Z]{3,})\s*[‐-]?\s*(-?\d+(?:\.\d+)?)/g,
    default: /([A-Z]+(?:\s+[A-Z]+)*)\s+Anda/g,
};
/**
 * Inclusive range of valid talent numbers (1-34).
 */
const TALENT_RANGE = { min: 1, max: 34 };
/**
 * Maximum number of personal branding items to extract.
 */
const PERSONAL_BRANDING_LIMIT = 5;
90
+
91
/**
 * Maps a PDF page count to its TMA format label.
 *
 * @param {number} pageCount - Total number of pages in the PDF.
 * @returns {string} The matching `format` label from TMA_FORMATS, or
 *   'unknown' when the count is not a positive number or has no entry.
 */
function detectTmaFormat(pageCount) {
    const isUsableCount = typeof pageCount === 'number' && pageCount > 0;
    if (!isUsableCount) {
        return 'unknown';
    }
    const entry = TMA_FORMATS[String(pageCount)];
    return (entry && entry.format) || 'unknown';
}
102
/**
 * Lists every page count that has an entry in TMA_FORMATS.
 *
 * @returns {number[]} Supported page counts, in TMA_FORMATS key order.
 */
function getSupportedPageCounts() {
    const counts = [];
    for (const key of Object.keys(TMA_FORMATS)) {
        counts.push(Number.parseInt(key, 10));
    }
    return counts;
}
108
/**
 * Gets the format configuration for a given page count.
 *
 * Only own entries of TMA_FORMATS are considered, so inherited object
 * members (e.g. "constructor") are never returned, and a null/undefined
 * page count yields null instead of throwing.
 *
 * @param {number|string} pageCount - Total number of pages in the PDF.
 * @returns {{pages: number[], scmPage: number, format: string}|null} The
 *   matching configuration, or null when the page count is not supported.
 */
function getFormatInfo(pageCount) {
    if (pageCount === null || pageCount === undefined) {
        return null;
    }
    const formatKey = pageCount.toString();
    if (!Object.prototype.hasOwnProperty.call(TMA_FORMATS, formatKey)) {
        return null;
    }
    return TMA_FORMATS[formatKey] || null;
}
115
/**
 * Checks whether a page count is supported and, if so, returns its format
 * details.
 *
 * @param {number} pageCount - Total number of pages in the PDF.
 * @returns {object} `{ isValid: true, format, pages, scmPage }` on success,
 *   otherwise `{ isValid: false, format: 'unknown', error }`.
 */
function validateAndGetFormat(pageCount) {
    const reject = (error) => ({ isValid: false, format: 'unknown', error });
    // Five-page documents are rejected with a dedicated message.
    if (pageCount === 5) {
        return reject('Page count 5 is explicitly not supported');
    }
    const info = getFormatInfo(pageCount);
    if (!info) {
        return reject(`Unsupported page count: ${pageCount}. Supported formats: ${getSupportedPageCounts().join(', ')}`);
    }
    const { format, pages, scmPage } = info;
    return { isValid: true, format, pages, scmPage };
}
141
+
142
/**
 * Error type raised by TMA extraction operations.
 *
 * Carries a machine-readable `code` and, optionally, the underlying error
 * that triggered it in `originalError`.
 */
class TmaExtractionError extends Error {
    /**
     * @param {string} message - Human-readable description.
     * @param {string} code - Error code identifying the failure category.
     * @param {*} [originalError] - The underlying error, if any.
     */
    constructor(message, code, originalError) {
        super(message);
        Object.assign(this, {
            code,
            originalError,
            name: 'TmaExtractionError'
        });
        // V8-only: start the stack trace at the throw site, not this constructor.
        if (Error.captureStackTrace) {
            Error.captureStackTrace(this, TmaExtractionError);
        }
    }
}
157
/**
 * Error codes used throughout the package.
 *
 * Each key maps to an identical string value, so a code can be referenced
 * either as ERROR_CODES.NAME or as the literal 'NAME'.
 */
const ERROR_CODES_BASE = {
    // File validation errors
    INVALID_FILE_TYPE: 'INVALID_FILE_TYPE',
    NO_FILES: 'NO_FILES',
    TOO_MANY_FILES: 'TOO_MANY_FILES',
    FILE_TOO_LARGE: 'FILE_TOO_LARGE',
    // PDF processing errors
    PDF_PROCESSING_ERROR: 'PDF_PROCESSING_ERROR',
    PAGE_TEXT_EXTRACTION_ERROR: 'PAGE_TEXT_EXTRACTION_ERROR',
    INVALID_PAGE_NUMBERS: 'INVALID_PAGE_NUMBERS',
    INVALID_PAGE_NUMBER: 'INVALID_PAGE_NUMBER',
    // Format errors
    UNSUPPORTED_FORMAT: 'UNSUPPORTED_FORMAT',
    // Extraction errors
    TALENT_EXTRACTION_ERROR: 'TALENT_EXTRACTION_ERROR',
    STRENGTH_EXTRACTION_ERROR: 'STRENGTH_EXTRACTION_ERROR',
    TYPOLOGY_EXTRACTION_ERROR: 'TYPOLOGY_EXTRACTION_ERROR',
    BRANDING_EXTRACTION_ERROR: 'BRANDING_EXTRACTION_ERROR',
    // Validation errors
    VALIDATION_ERROR: 'VALIDATION_ERROR',
    NO_VALID_RESULTS: 'NO_VALID_RESULTS',
    INSUFFICIENT_VALID_FILES: 'INSUFFICIENT_VALID_FILES',
    // Configuration errors
    WORKER_CONFIGURATION_ERROR: 'WORKER_CONFIGURATION_ERROR',
    TIMEOUT_ERROR: 'TIMEOUT_ERROR',
    // Server side error
    SERVER_SIDE_ERROR: 'SERVER_SIDE_ERROR'
};
// Object.freeze freezes in place and returns its argument, so ERROR_CODES and
// ERROR_CODES_BASE are the same (now immutable) object.
const ERROR_CODES = Object.freeze(ERROR_CODES_BASE);
189
/**
 * Creates a TmaExtractionError from an error code.
 *
 * Known codes are resolved through ERROR_CODES. A code that is not an own
 * key of ERROR_CODES is passed through unchanged, so the caller's code is
 * never silently replaced by `undefined`.
 *
 * @param {string} code - Key of ERROR_CODES (or a custom code).
 * @param {string} message - Human-readable description.
 * @param {*} [originalError] - The underlying error, if any.
 * @returns {TmaExtractionError} The constructed error (not thrown).
 */
function createError(code, message, originalError) {
    const isKnownCode = Object.prototype.hasOwnProperty.call(ERROR_CODES, code);
    const resolvedCode = isKnownCode ? ERROR_CODES[code] : code;
    return new TmaExtractionError(message, resolvedCode, originalError);
}
195
/**
 * Type guard for TmaExtractionError.
 *
 * @param {*} error - Any value.
 * @returns {boolean} True when `error` is a TmaExtractionError instance.
 */
function isTmaExtractionError(error) {
    const isExtractionError = error instanceof TmaExtractionError;
    return isExtractionError;
}
201
/**
 * Normalizes any thrown value into a TmaExtractionError.
 *
 * - A TmaExtractionError is returned untouched.
 * - Any other Error is wrapped, keeping its message and attaching it as the
 *   original error.
 * - Anything else produces an error carrying `defaultMessage`.
 *
 * @param {*} error - The caught value.
 * @param {string} [defaultCode='PDF_PROCESSING_ERROR'] - Code for wrapped errors.
 * @param {string} [defaultMessage='An unexpected error occurred'] - Message
 *   used when `error` is not an Error instance.
 * @returns {TmaExtractionError}
 */
function wrapError(error, defaultCode = 'PDF_PROCESSING_ERROR', defaultMessage = 'An unexpected error occurred') {
    if (isTmaExtractionError(error)) {
        return error;
    }
    const isNativeError = error instanceof Error;
    return isNativeError
        ? createError(defaultCode, error.message, error)
        : createError(defaultCode, defaultMessage);
}
213
+
214
/**
 * PDF processing class that wraps PDF.js functionality for TMA extraction.
 *
 * Each method that loads a document reads the whole file into memory via
 * `file.arrayBuffer()` and opens a fresh PDF.js loading task.
 */
class PdfProcessor {
    /**
     * @param {{ workerSrc?: string }} [config] - Optional settings.
     *   `workerSrc` overrides the PDF.js worker URL; "auto" or an omitted
     *   value selects the default behaviour of configurePdfWorker.
     */
    constructor(config) {
        this.workerSrc = config?.workerSrc;
        this.configurePdfWorker();
    }
    /**
     * Configures the PDF.js worker source.
     *
     * An explicit `workerSrc` wins. Otherwise, when a `window` global exists,
     * the worker URL is resolved relative to this script. When neither
     * applies, GlobalWorkerOptions is left untouched.
     */
    configurePdfWorker() {
        if (this.workerSrc && this.workerSrc !== "auto") {
            pdfjs__namespace.GlobalWorkerOptions.workerSrc = this.workerSrc;
        }
        else if (typeof window !== "undefined") {
            // Browser environment - use default worker
            pdfjs__namespace.GlobalWorkerOptions.workerSrc = new URL("pdfjs-dist/build/pdf.worker.min.mjs", (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('index.cjs', document.baseURI).href))).toString();
        }
    }
    /**
     * Gets the total number of pages in a PDF file.
     *
     * @param {{ arrayBuffer(): Promise<ArrayBuffer> }} file - PDF file/blob.
     * @returns {Promise<number>} Page count.
     * @throws {TmaExtractionError} PDF_PROCESSING_ERROR when the file cannot
     *   be read or parsed.
     */
    async getPageCount(file) {
        let loadingTask;
        try {
            const arrayBuffer = await file.arrayBuffer();
            loadingTask = pdfjs__namespace.getDocument(arrayBuffer);
            const pdfDoc = await loadingTask.promise;
            return pdfDoc.numPages;
        }
        catch (error) {
            throw new TmaExtractionError("Failed to get PDF page count", "PDF_PROCESSING_ERROR", error);
        }
        finally {
            // Only the page count is needed; release the document right away.
            await destroyPdfLoadingTask(loadingTask);
        }
    }
    /**
     * Detects TMA format based on page count.
     *
     * @param {number} pageCount
     * @returns {string} Format label, or 'unknown'.
     */
    detectFormat(pageCount) {
        return detectTmaFormat(pageCount);
    }
    /**
     * Gets the page numbers to extract for a given page count.
     *
     * @param {number} pageCount
     * @returns {number[]} The `pages` entry of the matching format.
     * @throws {TmaExtractionError} UNSUPPORTED_FORMAT when no format matches
     *   (including a null/undefined page count).
     */
    getPageNumbers(pageCount) {
        // `== null` covers both null and undefined, which have no toString().
        const format = pageCount == null ? undefined : TMA_FORMATS[String(pageCount)];
        if (!format) {
            throw new TmaExtractionError(`Unsupported TMA format: ${pageCount} pages`, "UNSUPPORTED_FORMAT");
        }
        return format.pages;
    }
    /**
     * Gets the SCM page number for a given page count.
     *
     * @param {number} pageCount
     * @returns {number} The `scmPage` entry of the matching format.
     * @throws {TmaExtractionError} UNSUPPORTED_FORMAT when no format matches
     *   (including a null/undefined page count).
     */
    getScmPageNumber(pageCount) {
        const format = pageCount == null ? undefined : TMA_FORMATS[String(pageCount)];
        if (!format) {
            throw new TmaExtractionError(`Unsupported TMA format for SCM: ${pageCount} pages`, "UNSUPPORTED_FORMAT");
        }
        return format.scmPage;
    }
    /**
     * Extracts text content from a single PDF page.
     *
     * @param {object} page - PDF.js page proxy.
     * @returns {Promise<string>} All text items joined with single spaces.
     * @throws {TmaExtractionError} PAGE_TEXT_EXTRACTION_ERROR on failure.
     */
    async getPageText(page) {
        try {
            const content = await page.getTextContent();
            return content.items.map((item) => item.str).join(" ");
        }
        catch (error) {
            throw new TmaExtractionError("Failed to extract text from PDF page", "PAGE_TEXT_EXTRACTION_ERROR", error);
        }
    }
    /**
     * Extracts text content from the three TMA pages of a PDF.
     *
     * @param {{ arrayBuffer(): Promise<ArrayBuffer> }} file - PDF file/blob.
     * @param {number[]} pageNumbers - 1-based page numbers in the order
     *   [talent order, strength, typology + branding]; at least three
     *   entries are required and only the first three pages are read.
     * @returns {Promise<{talentOrder: string, strength: string, typologyAndBranding: string}>}
     * @throws {TmaExtractionError} INVALID_PAGE_NUMBERS for a malformed or
     *   out-of-range page list; PAGE_TEXT_EXTRACTION_ERROR or
     *   PDF_PROCESSING_ERROR for read/parse failures.
     */
    async extractPageTexts(file, pageNumbers) {
        let loadingTask;
        try {
            // Fail fast with a precise code instead of a late text-extraction
            // failure on an undefined page.
            if (!Array.isArray(pageNumbers) || pageNumbers.length < 3) {
                throw new TmaExtractionError("Expected at least 3 page numbers (talent order, strength, typology)", "INVALID_PAGE_NUMBERS");
            }
            const arrayBuffer = await file.arrayBuffer();
            loadingTask = pdfjs__namespace.getDocument(arrayBuffer);
            const pdfDoc = await loadingTask.promise;
            // Validate page numbers
            const maxPage = pdfDoc.numPages;
            const invalidPages = pageNumbers.filter((num) => !Number.isInteger(num) || num > maxPage || num < 1);
            if (invalidPages.length > 0) {
                throw new TmaExtractionError(`Invalid page numbers: ${invalidPages.join(", ")} (PDF has ${maxPage} pages)`, "INVALID_PAGE_NUMBERS");
            }
            // Get specific pages
            const pages = await Promise.all(pageNumbers.map((pageNum) => pdfDoc.getPage(pageNum)));
            const [talentPage, strengthPage, typologyPage] = pages;
            return {
                talentOrder: await this.getPageText(talentPage),
                strength: await this.getPageText(strengthPage),
                typologyAndBranding: await this.getPageText(typologyPage),
            };
        }
        catch (error) {
            if (error instanceof TmaExtractionError) {
                throw error;
            }
            throw new TmaExtractionError("Failed to extract page texts from PDF", "PDF_PROCESSING_ERROR", error);
        }
        finally {
            // All text has been copied into plain strings; the document is no
            // longer needed.
            await destroyPdfLoadingTask(loadingTask);
        }
    }
    /**
     * Gets a specific PDF page for external processing (e.g., SCM).
     *
     * The underlying document is intentionally NOT destroyed here: the
     * returned page proxy is handed to the caller and must stay usable.
     *
     * @param {{ arrayBuffer(): Promise<ArrayBuffer> }} file - PDF file/blob.
     * @param {number} pageNumber - 1-based page number.
     * @returns {Promise<object>} PDF.js page proxy.
     * @throws {TmaExtractionError} INVALID_PAGE_NUMBER when out of range;
     *   PDF_PROCESSING_ERROR for read/parse failures.
     */
    async getPage(file, pageNumber) {
        try {
            const arrayBuffer = await file.arrayBuffer();
            const loadingTask = pdfjs__namespace.getDocument(arrayBuffer);
            const pdfDoc = await loadingTask.promise;
            if (pageNumber > pdfDoc.numPages || pageNumber < 1) {
                throw new TmaExtractionError(`Invalid page number: ${pageNumber} (PDF has ${pdfDoc.numPages} pages)`, "INVALID_PAGE_NUMBER");
            }
            return await pdfDoc.getPage(pageNumber);
        }
        catch (error) {
            if (error instanceof TmaExtractionError) {
                throw error;
            }
            throw new TmaExtractionError(`Failed to get PDF page ${pageNumber}`, "PDF_PROCESSING_ERROR", error);
        }
    }
    /**
     * Validates that a file looks like a usable PDF, based on its `type`
     * and `size` properties only (the content is not inspected).
     *
     * @param {{ type: string, size: number }|null|undefined} file
     * @returns {{ isValid: boolean, error?: string }}
     */
    validatePdfFile(file) {
        if (!file) {
            return { isValid: false, error: "No file provided" };
        }
        if (file.type !== "application/pdf") {
            return { isValid: false, error: "File must be a PDF" };
        }
        if (file.size === 0) {
            return { isValid: false, error: "File is empty" };
        }
        // Basic size check (max 50MB)
        if (file.size > 50 * 1024 * 1024) {
            return { isValid: false, error: "File size exceeds 50MB limit" };
        }
        return { isValid: true };
    }
}
/**
 * Best-effort teardown of a PDF.js loading task (and the document it owns).
 * A failing teardown is ignored so it can never mask the result or error
 * of the operation that used the document.
 */
async function destroyPdfLoadingTask(loadingTask) {
    if (!loadingTask || typeof loadingTask.destroy !== 'function') {
        return;
    }
    try {
        await loadingTask.destroy();
    }
    catch {
        // Cleanup only; nothing useful can be done with this failure.
    }
}
358
+
359
/**
 * Data transformation helpers that produce the "filtered" TMA output:
 * unnecessary fields and metadata are stripped from extracted data.
 *
 * All methods are static and reference the class by name, so they keep
 * working when detached (e.g. `const { cleanTalentData } = DataTransformer`).
 */
class DataTransformer {
    /**
     * Removes the 'sign' field from talents, keeping only `number` and `tema`.
     *
     * @param {Array<{number: number, tema: string}>} talents
     * @returns {Array<{number: number, tema: string}>} New array; an empty
     *   array when `talents` is not an array.
     */
    static removeTalentSign(talents) {
        if (!Array.isArray(talents)) {
            return [];
        }
        return talents.map(({ number, tema }) => ({
            number,
            tema
        }));
    }
    /**
     * Removes SCM metadata while keeping activity data.
     * Drops `detectionConfidence` from every activity and omits every other
     * top-level field of `scmData` (such as `metadata`).
     *
     * @param {object|null|undefined} scmData
     * @returns {object|null|undefined} `{ activities }`, or `scmData` itself
     *   when it is falsy.
     */
    static removeScmMetadata(scmData) {
        if (!scmData) {
            return scmData;
        }
        if (!scmData.activities) {
            return {
                activities: []
            };
        }
        return {
            activities: scmData.activities.map((activity) => {
                const { detectionConfidence, ...cleanedActivity } = activity;
                return cleanedActivity;
            })
            // Note: metadata field is completely removed
        };
    }
    /**
     * Cleans talent data by removing unnecessary fields.
     *
     * @param {object} rawData - Extracted person data.
     * @returns {object} Object with name, talents (without `sign`), strength,
     *   typology, personalbranding and, only when present, a cleaned `scm`.
     */
    static cleanTalentData(rawData) {
        const cleanedData = {
            name: rawData.name,
            talents: DataTransformer.removeTalentSign(rawData.talents),
            strength: rawData.strength,
            typology: rawData.typology,
            personalbranding: rawData.personalbranding
        };
        // Only include SCM if it exists and clean it
        if (rawData.scm) {
            cleanedData.scm = DataTransformer.removeScmMetadata(rawData.scm);
        }
        return cleanedData;
    }
    /**
     * Transforms a single-person result to the filtered output format.
     *
     * @param {{ person: object }} result
     * @returns {{ person: object }} Result whose person has been cleaned.
     */
    static transformSinglePersonResult(result) {
        // Remove top14Talents, top7Talents, low14Talents from person object
        const { top14Talents, top7Talents, low14Talents, ...cleanedPerson } = result.person;
        return {
            person: DataTransformer.cleanTalentData(cleanedPerson)
        };
    }
    /**
     * Validates the structure of cleaned data.
     *
     * Malformed values (non-string name, null or non-object talent entries)
     * are reported as validation errors rather than thrown as TypeErrors.
     *
     * @param {object} data - Output of cleanTalentData.
     * @returns {{ isValid: boolean, errors: string[] }}
     */
    static validateCleanedData(data) {
        const errors = [];
        // Check required fields
        if (typeof data.name !== 'string' || data.name.trim().length === 0) {
            errors.push('Person name is required');
        }
        if (!data.talents || data.talents.length === 0) {
            errors.push('Talents array is required and cannot be empty');
        }
        if (!data.strength || data.strength.length === 0) {
            errors.push('Strength array is required and cannot be empty');
        }
        if (!data.typology || data.typology.length === 0) {
            errors.push('Typology array is required and cannot be empty');
        }
        if (!data.personalbranding || data.personalbranding.length === 0) {
            errors.push('Personal branding array is required and cannot be empty');
        }
        // Validate talent structure (should not have 'sign' field)
        if (data.talents) {
            // The `in` operator throws on primitives, so guard the operand.
            const talentsWithSign = data.talents.filter((talent) => talent !== null && typeof talent === 'object' && 'sign' in talent);
            if (talentsWithSign.length > 0) {
                errors.push(`Found ${talentsWithSign.length} talents with 'sign' field (should be removed)`);
            }
            // Check for required fields in talents
            const invalidTalents = data.talents.filter((t) => typeof t?.number !== 'number' || typeof t.tema !== 'string' || t.tema.trim().length === 0);
            if (invalidTalents.length > 0) {
                errors.push(`Found ${invalidTalents.length} talents with missing number or tema`);
            }
        }
        // Validate SCM structure if present
        if (data.scm) {
            if (data.scm.metadata) {
                errors.push('SCM metadata should be removed from cleaned data');
            }
            if (data.scm.activities) {
                const activitiesWithConfidence = data.scm.activities.filter((activity) => activity !== null && typeof activity === 'object' && 'detectionConfidence' in activity);
                if (activitiesWithConfidence.length > 0) {
                    errors.push(`Found ${activitiesWithConfidence.length} SCM activities with detectionConfidence (should be removed)`);
                }
            }
        }
        return { isValid: errors.length === 0, errors };
    }
    /**
     * Gets summary statistics of cleaned data.
     *
     * @param {object} data - Output of cleanTalentData.
     * @returns {object} Counts per section. `totalScmActivities` is null
     *   only when there is no SCM activity list at all; an empty list is
     *   reported as 0.
     */
    static getDataSummary(data) {
        return {
            totalTalents: data.talents?.length || 0,
            totalStrengths: data.strength?.length || 0,
            totalTypologies: data.typology?.length || 0,
            totalPersonalBranding: data.personalbranding?.length || 0,
            // `??` (not `||`) so that a real count of 0 is not turned into null.
            totalScmActivities: data.scm?.activities?.length ?? null,
            hasScmData: !!data.scm
        };
    }
}
483
+
484
/**
 * Tells whether a talent number lies inside TALENT_RANGE (inclusive).
 *
 * @param {number} number - Candidate talent number.
 * @returns {boolean}
 */
function isValidTalentNumber(number) {
    const { min, max } = TALENT_RANGE;
    return min <= number && max >= number;
}
490
/**
 * Extracts the talent order from 49-page format text, where each entry is
 * written as "<THEME> <number>.".
 *
 * @param {string} text - Text of the talent-order page.
 * @returns {Array<{number: number, tema: string, sign: string}>} Entries with
 *   a valid talent number, sorted by number ascending.
 */
function extractTalentOrderFormat49(text) {
    const talents = [];
    for (const match of text.matchAll(TALENT_REGEX_PATTERNS.format49)) {
        const number = Number.parseInt(match[2], 10);
        if (!isValidTalentNumber(number)) {
            continue;
        }
        // The theme may carry a trailing dot captured by the pattern.
        const tema = match[1].replace(/\.$/, '');
        talents.push({ number, tema, sign: 'false' });
    }
    return talents.sort((left, right) => left.number - right.number);
}
504
/**
 * Extracts the talent order from non-49-page format text, where each entry
 * is written as "<number> <THEME>" (or "<number><THEME>" for pdf-parse text).
 *
 * @param {string} text - Text of the talent-order page.
 * @param {boolean} [isPdfParse=false] - Use the pattern without whitespace
 *   between number and theme.
 * @returns {Array<{number: number, tema: string, sign: string}>} Entries with
 *   a valid talent number, sorted by number ascending.
 */
function extractTalentOrderDefault(text, isPdfParse = false) {
    const patternKey = isPdfParse ? 'pdfParse' : 'default';
    const talents = [];
    for (const match of text.matchAll(TALENT_REGEX_PATTERNS[patternKey])) {
        const number = Number.parseInt(match[1], 10);
        if (isValidTalentNumber(number)) {
            talents.push({ number, tema: match[2], sign: 'false' });
        }
    }
    talents.sort((left, right) => left.number - right.number);
    return talents;
}
520
/**
 * Extracts the talent order from text, choosing the parser by TMA format.
 *
 * @param {string} text - Text of the talent-order page.
 * @param {boolean} [isPdfParse=false] - Forwarded to the default parser.
 * @param {number} [pageCount=0] - Page count used to detect the format.
 * @returns {Array<{number: number, tema: string, sign: string}>} Empty when
 *   the format is unknown.
 */
function extractTalentOrder(text, isPdfParse = false, pageCount = 0) {
    switch (detectTmaFormat(pageCount)) {
        case 'unknown':
            // No extraction is attempted for unrecognized documents.
            return [];
        case '49-page':
            return extractTalentOrderFormat49(text);
        default:
            return extractTalentOrderDefault(text, isPdfParse);
    }
}
534
/**
 * Extracts the person's name that follows the "URUTAN BAKAT" heading.
 *
 * The end of the name depends on the format: a newline followed by a digit
 * (54-page), a newline followed by an upper-case letter (49-page), or the
 * first digit (all other formats).
 *
 * @param {string} text - Text of the talent-order page.
 * @param {number} pageCount - Page count used to detect the format.
 * @returns {string} Trimmed name, or '' when the heading is not found.
 */
function extractName(text, pageCount) {
    const format = detectTmaFormat(pageCount);
    const pattern = format === '54-page'
        ? /URUTAN BAKAT\s+([A-Z\s.''-]+?)(?=\n\d)/
        : format === '49-page'
            ? /URUTAN BAKAT\s+([A-Z\s.']+?)(?=\n[A-Z])/
            : /URUTAN BAKAT\s+([A-Z\s.']+?)(?=\d)/;
    const nameMatch = text.match(pattern);
    if (!nameMatch) {
        return '';
    }
    return nameMatch[1]?.trim();
}
549
+
550
/**
 * Extracts strength entries from 49-page format text ("NAME <number>").
 *
 * @param {string} text - Text of the strength page.
 * @returns {Array<{number: number, name: string}>} Entries in match order;
 *   empty when nothing matches.
 */
function extractStrengthFormat49(text) {
    const found = text.match(STRENGTH_REGEX_PATTERNS.format49);
    if (!found) {
        return [];
    }
    return found.map((entry) => {
        const parts = entry.trim().split(/\s+/);
        return {
            number: parseInt(parts[1]),
            name: parts[0].trim()
        };
    });
}
563
/**
 * Extracts strength entries from non-49-page format text ("<number>. name").
 *
 * @param {string} text - Text of the strength page.
 * @returns {Array<{number: number, name: string}>} Entries in match order;
 *   empty when nothing matches.
 */
function extractStrengthDefault(text) {
    const found = text.match(STRENGTH_REGEX_PATTERNS.default);
    if (!found) {
        return [];
    }
    return found.map((entry) => {
        // The number sits before the first dot, the name after it.
        const parts = entry.split('.');
        return {
            number: parseInt(parts[0]),
            name: parts[1].trim()
        };
    });
}
576
/**
 * Extracts strength data from text, choosing the parser by TMA format.
 *
 * @param {string} text - Text of the strength page.
 * @param {number} [pageCount=0] - Page count used to detect the format.
 * @returns {Array<{number: number, name: string}>}
 */
function extractStrength(text, pageCount = 0) {
    const isFormat49 = detectTmaFormat(pageCount) === '49-page';
    return isFormat49 ? extractStrengthFormat49(text) : extractStrengthDefault(text);
}
586
+
587
/**
 * Fixed list of the 30 typologies (ids 1-30) with their categories.
 *
 * `name` is the exact text searched for in the PDF by extractTypologyScore,
 * so the spelling here must match what the report prints.
 *
 * NOTE(review): id 6 is spelled 'AMBASADOR' here, while
 * SCM_ACTIVITIES_DEFINITIONS uses 'AMBASSADOR' for the same typology.
 * Confirm against a real report which spelling the PDF uses before changing
 * either one.
 */
const TYPOLOGY_DEFINITIONS = [
    { id: 1, name: 'ARRANGER', category: 'HEADMAN' },
    { id: 2, name: 'SELLER', category: 'HEADMAN' },
    { id: 3, name: 'COMMANDER', category: 'HEADMAN' },
    { id: 4, name: 'MEDIATOR', category: 'HEADMAN' },
    { id: 5, name: 'SELECTOR', category: 'HEADMAN' },
    { id: 6, name: 'AMBASADOR', category: 'NETWORKING' },
    { id: 7, name: 'COMMUNICATOR', category: 'NETWORKING' },
    { id: 8, name: 'EDUCATOR', category: 'NETWORKING' },
    { id: 9, name: 'MOTIVATOR', category: 'NETWORKING' },
    { id: 10, name: 'CARETAKER', category: 'SERVICING' },
    { id: 11, name: 'SERVER', category: 'SERVICING' },
    { id: 12, name: 'ANALYST', category: 'THINKING' },
    { id: 13, name: 'TREASURER', category: 'THINKING' },
    { id: 14, name: 'RESTORER', category: 'REASONING' },
    { id: 15, name: 'EVALUATOR', category: 'REASONING' },
    { id: 16, name: 'EXPLORER', category: 'REASONING' },
    { id: 17, name: 'DESIGNER', category: 'GENERATING IDEA' },
    { id: 18, name: 'CREATOR', category: 'GENERATING IDEA' },
    { id: 19, name: 'SYNTHESIZER', category: 'GENERATING IDEA' },
    { id: 20, name: 'MARKETER', category: 'GENERATING IDEA' },
    { id: 21, name: 'STRATEGIST', category: 'GENERATING IDEA' },
    { id: 22, name: 'VISIONARY', category: 'GENERATING IDEA' },
    { id: 23, name: 'JOURNALIST', category: 'ELEMENTARY' },
    { id: 24, name: 'INTERPRETER', category: 'ELEMENTARY' },
    { id: 25, name: 'ADMINISTRATOR', category: 'ELEMENTARY' },
    { id: 26, name: 'SAFEKEEPER', category: 'TECHNICAL' },
    { id: 27, name: 'PRODUCER', category: 'TECHNICAL' },
    { id: 28, name: 'QUALITY CONTROLLER', category: 'TECHNICAL' },
    { id: 29, name: 'DISTRIBUTOR', category: 'TECHNICAL' },
    { id: 30, name: 'OPERATOR', category: 'TECHNICAL' }
];
623
+
624
/**
 * Builds a lower-case, dash-separated label from a typology name
 * (e.g. "QUALITY CONTROLLER" -> "quality-controller").
 *
 * @param {string} name - Typology name.
 * @returns {string} Label with every whitespace run replaced by one dash.
 */
function createTypologyLabel(name) {
    const words = name.toLowerCase().split(/\s+/);
    return words.join('-');
}
630
/**
 * Extracts the score that follows a typology name in the text.
 *
 * Matching is case-insensitive. The name must not be preceded by another
 * letter, so "SERVER" is not found inside "OBSERVER". Any whitespace run
 * inside a multi-word name ("QUALITY CONTROLLER") matches any whitespace run
 * in the text. A name directly preceded by a digit or punctuation still
 * matches.
 *
 * @param {string} text - Text to search.
 * @param {string} typologyName - Typology name to look for.
 * @returns {number|null} The first score found after the name, or null.
 */
function extractTypologyScore(text, typologyName) {
    // Escape special characters for regex, then relax inner whitespace.
    const escapedName = typologyName
        .replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
        .replace(/\s+/g, '\\s+');
    // Name (not glued to a preceding letter) followed by a signed decimal score.
    const pattern = new RegExp(`(?<![A-Za-z])${escapedName}\\s+(-?\\d+(?:\\.\\d+)?)`, 'i');
    const match = text.match(pattern);
    if (match && match[1]) {
        const score = parseFloat(match[1]);
        return Number.isNaN(score) ? null : score;
    }
    return null;
}
645
/**
 * Extracts every predefined typology whose score can be found in the text.
 *
 * @param {string} text - Text of the typology page.
 * @returns {Array<{id: number, name: string, label: string, category: string, score: number}>}
 *   Typologies with a score, sorted by id ascending; typologies without a
 *   score in the text are omitted.
 */
function extractTypology(text) {
    return TYPOLOGY_DEFINITIONS
        .map(({ id, name, category }) => ({
            id,
            name,
            label: createTypologyLabel(name),
            category,
            score: extractTypologyScore(text, name)
        }))
        .filter((entry) => entry.score !== null)
        .sort((left, right) => left.id - right.id);
}
667
+
668
/**
 * Extracts personal branding items from 49-page format text.
 *
 * Every "LABEL score" pair is collected; for a repeated label only the
 * highest score is kept. Items with a score >= 0 come first (highest score
 * first), followed by negative ones (lowest score first), and the list is
 * cut at PERSONAL_BRANDING_LIMIT.
 *
 * @param {string} text - Text of the typology/branding page.
 * @returns {Array<{id: string}>}
 */
function extractPersonalBrandingFormat49(text) {
    // Label -> best score; Map keeps first-appearance order of labels.
    const bestScoreById = new Map();
    for (const match of text.matchAll(PERSONAL_BRANDING_REGEX_PATTERNS.format49)) {
        const id = match[1].replace(/\s+/g, ' ').trim();
        const score = parseFloat(match[2]);
        if (id.length <= 2 || isNaN(score)) {
            continue;
        }
        if (!bestScoreById.has(id) || bestScoreById.get(id) < score) {
            bestScoreById.set(id, score);
        }
    }
    const positives = [];
    const negatives = [];
    for (const [id, score] of bestScoreById) {
        (score >= 0 ? positives : negatives).push({ id, score });
    }
    positives.sort((left, right) => right.score - left.score);
    negatives.sort((left, right) => left.score - right.score);
    return positives
        .concat(negatives)
        .slice(0, PERSONAL_BRANDING_LIMIT)
        .map(({ id }) => ({ id }));
}
696
/**
 * Extracts personal branding items from non-49-page format text, where each
 * item is an upper-case phrase followed by the word "Anda".
 *
 * @param {string} text - Text of the typology/branding page.
 * @returns {Array<{id: string}>} At most PERSONAL_BRANDING_LIMIT items, in
 *   order of appearance.
 */
function extractPersonalBrandingDefault(text) {
    const hits = text.match(PERSONAL_BRANDING_REGEX_PATTERNS.default);
    if (!hits) {
        return [];
    }
    const labels = [];
    for (const hit of hits) {
        // Drop the trailing "Anda" and a leading section heading, if any.
        const value = hit
            .replace(/\s+Anda$/, '')
            .replace(/PERSONAL BRANDING\s+/, '');
        if (value && value !== 'PERSONAL BRANDING') {
            labels.push(value);
        }
    }
    return labels
        .slice(0, PERSONAL_BRANDING_LIMIT)
        .map((value) => ({ id: value.trim() }));
}
710
/**
 * Extracts personal branding data, choosing the parser by TMA format.
 *
 * @param {string} text - Text of the typology/branding page.
 * @param {number} [pageCount=0] - Page count used to detect the format.
 * @returns {Array<{id: string}>}
 */
function extractPersonalBranding(text, pageCount = 0) {
    const isFormat49 = detectTmaFormat(pageCount) === '49-page';
    return isFormat49
        ? extractPersonalBrandingFormat49(text)
        : extractPersonalBrandingDefault(text);
}
720
+
721
+ /**
722
+ * SCM (Strength Cluster Map) Activities Definitions
723
+ * Pre-defined list of all 114 activities with their typology and cluster information
724
+ * Framework-independent constants for tma-extractor package
725
+ */
726
+ const SCM_ACTIVITIES_DEFINITIONS = [
727
+ // TOP AREA (24 activities, dari kiri ke kanan)
728
+ { id: 'RELATING', typology: 'AMBASSADOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 0 },
729
+ { id: 'REPRESENTING', typology: 'AMBASSADOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 1 },
730
+ { id: 'COMMUNICATING', typology: 'COMMUNICATOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 2 },
731
+ { id: 'CORRESPONDING', typology: 'COMMUNICATOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 3 },
732
+ { id: 'ENTERTAINING', typology: 'COMMUNICATOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 4 },
733
+ { id: 'PRESENTING', typology: 'COMMUNICATOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 5 },
734
+ { id: 'COOPERATING', typology: 'ARRANGER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 6 },
735
+ { id: 'COORDINATING', typology: 'ARRANGER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 7 },
736
+ { id: 'DISPATCHING', typology: 'ARRANGER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 8 },
737
+ { id: 'MEDIATING', typology: 'MEDIATOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 9 },
738
+ { id: 'NEGOTIATING', typology: 'MEDIATOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 10 },
739
+ { id: 'PURCHASING', typology: 'MEDIATOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 11 },
740
+ { id: 'COLLECTING', typology: 'COMMANDER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 12 },
741
+ { id: 'CONTROLLING', typology: 'COMMANDER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 13 },
742
+ { id: 'INTERROGATING', typology: 'COMMANDER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 14 },
743
+ { id: 'BROKERING', typology: 'SELLER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 15 },
744
+ { id: 'INFLUENCING', typology: 'SELLER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 16 },
745
+ { id: 'SELLING', typology: 'SELLER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 17 },
746
+ { id: 'RECRUITING', typology: 'SELECTOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 18 },
747
+ { id: 'INTERVIEWING', typology: 'SELECTOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 19 },
748
+ { id: 'CARING', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'top', position: 20 },
749
+ { id: 'COUNSELING', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'top', position: 21 },
750
+ { id: 'SPIRITUALIZING', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'top', position: 22 },
751
+ { id: 'THERAPIES', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'top', position: 23 },
752
+ // LEFT AREA (33 activities, dari atas ke bawah) - pattern: NAME-PSS-PSP
753
+ { id: 'LIAISING', typology: 'AMBASSADOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 0 },
754
+ { id: 'GUIDING', typology: 'MOTIVATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 1 },
755
+ { id: 'MOTIVATING', typology: 'MOTIVATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 2 },
756
+ { id: 'SUPPORTING', typology: 'MOTIVATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 3 },
757
+ { id: 'ADVISING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 4 },
758
+ { id: 'COACHING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 5 },
759
+ { id: 'CONSULTING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 6 },
760
+ { id: 'MENTORING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 7 },
761
+ { id: 'TEACHING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 8 },
762
+ { id: 'TRAINING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 9 },
763
+ { id: 'ANALYSING', typology: 'ANALYST', cluster: 'THINKING', hasPsp: true, area: 'left', position: 10 },
764
+ { id: 'BOOKEEPING', typology: 'ANALYST', cluster: 'THINKING', hasPsp: true, area: 'left', position: 11 },
765
+ { id: 'PROGRAMMING', typology: 'ANALYST', cluster: 'THINKING', hasPsp: true, area: 'left', position: 12 },
766
+ { id: 'BUDGETING', typology: 'TREASURER', cluster: 'THINKING', hasPsp: true, area: 'left', position: 13 },
767
+ { id: 'CASHIERING', typology: 'TREASURER', cluster: 'THINKING', hasPsp: true, area: 'left', position: 14 },
768
+ { id: 'COSTING', typology: 'TREASURER', cluster: 'THINKING', hasPsp: true, area: 'left', position: 15 },
769
+ { id: 'ESTIMATING', typology: 'TREASURER', cluster: 'THINKING', hasPsp: true, area: 'left', position: 16 },
770
+ { id: 'AUDITING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 17 },
771
+ { id: 'EVALUATING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 18 },
772
+ { id: 'INSPECTING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 19 },
773
+ { id: 'INVESTIGATING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 20 },
774
+ { id: 'REVIEWING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 21 },
775
+ { id: 'VERIFYING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 22 },
776
+ { id: 'DIAGNOSING', typology: 'RESTORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 23 },
777
+ { id: 'IDENTIFYING', typology: 'RESTORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 24 },
778
+ { id: 'RESTORING', typology: 'RESTORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 25 },
779
+ { id: 'APPRAISING', typology: 'EXPLORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 26 },
780
+ { id: 'OBSERVING', typology: 'EXPLORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 27 },
781
+ { id: 'RESEARCHING', typology: 'EXPLORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 28 },
782
+ { id: 'SURVEYING', typology: 'EXPLORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 29 },
783
+ { id: 'CONCEPTUALIZING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'left', position: 30 },
784
+ { id: 'EDITING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'left', position: 31 },
785
+ { id: 'REDACTING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'left', position: 32 },
786
+ // RIGHT AREA (33 activities, dari atas ke bawah) - pattern: PSS-PSP-NAME
787
+ { id: 'VOLUNTEERING', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 0 },
788
+ { id: 'ASSISTING', typology: 'SERVER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 1 },
789
+ { id: 'GREETING', typology: 'SERVER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 2 },
790
+ { id: 'INFORMING', typology: 'SERVER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 3 },
791
+ { id: 'SERVING', typology: 'SERVER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 4 },
792
+ { id: 'DESIGNING', typology: 'DESIGNER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 5 },
793
+ { id: 'DRAFTING', typology: 'DESIGNER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 6 },
794
+ { id: 'ANIMATING', typology: 'CREATOR', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 7 },
795
+ { id: 'CREATING', typology: 'CREATOR', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 8 },
796
+ { id: 'IDEATING', typology: 'CREATOR', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 9 },
797
+ { id: 'SYNTHESIZING', typology: 'SYNTHESIZER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 10 },
798
+ { id: 'ADVERTISING', typology: 'MARKETER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 11 },
799
+ { id: 'DEVELOPING', typology: 'MARKETER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 12 },
800
+ { id: 'MARKETING', typology: 'MARKETER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 13 },
801
+ { id: 'PUBLICIZING', typology: 'MARKETER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 14 },
802
+ { id: 'PLANNING', typology: 'STRATEGIST', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 15 },
803
+ { id: 'STRATEGIZING', typology: 'STRATEGIST', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 16 },
804
+ { id: 'VISIONING', typology: 'VISIONARY', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 17 },
805
+ // 15 activities without PSP (hasPsp: false)
806
+ { id: 'ACTING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 18 },
807
+ { id: 'BEAUTIFYING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 19 },
808
+ { id: 'CONSERVING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 20 },
809
+ { id: 'COOKING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 21 },
810
+ { id: 'DANCING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 22 },
811
+ { id: 'DRAMATIZING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 23 },
812
+ { id: 'MODELLING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 24 },
813
+ { id: 'MUSICAL ART', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 25 },
814
+ { id: 'SINGING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 26 },
815
+ { id: 'VISUAL ART', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 27 },
816
+ { id: 'MANUAL SKILL', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 28 },
817
+ { id: 'PHYSICAL SKILL', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 29 },
818
+ { id: 'PLANTING', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 30 },
819
+ { id: 'SPORT', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 31 },
820
+ { id: 'TENDING ANIMAL', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 32 },
821
+ // BOTTOM AREA (24 activities, dari kiri ke kanan) - pattern: PSS-PSP-NAME
822
+ { id: 'REPORTING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 0 },
823
+ { id: 'WRITING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 1 },
824
+ { id: 'INTERPRETING', typology: 'INTERPRETER', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 2 },
825
+ { id: 'TRANSCRIBING', typology: 'INTERPRETER', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 3 },
826
+ { id: 'TRANSLATING', typology: 'INTERPRETER', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 4 },
827
+ { id: 'COMPLIANCING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 5 },
828
+ { id: 'FILING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 6 },
829
+ { id: 'HOUSEKEEPING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 7 },
830
+ { id: 'ORGANISING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 8 },
831
+ { id: 'SCHEDULING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 9 },
832
+ { id: 'TYPEWRITING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 10 },
833
+ { id: 'ASSEMBLING', typology: 'PRODUCER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 11 },
834
+ { id: 'BUILDING', typology: 'PRODUCER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 12 },
835
+ { id: 'INSTALLING', typology: 'PRODUCER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 13 },
836
+ { id: 'PRODUCING', typology: 'PRODUCER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 14 },
837
+ { id: 'MONITORING', typology: 'SAFEKEEPER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 15 },
838
+ { id: 'SAFEKEEPING', typology: 'SAFEKEEPER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 16 },
839
+ { id: 'SECURING', typology: 'SAFEKEEPER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 17 },
840
+ { id: 'FINISHING', typology: 'QUALITY-CONTROLLER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 18 },
841
+ { id: 'TESTING', typology: 'QUALITY-CONTROLLER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 19 },
842
+ { id: 'DELIVERING', typology: 'DISTRIBUTOR', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 20 },
843
+ { id: 'DISTRIBUTING', typology: 'DISTRIBUTOR', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 21 },
844
+ { id: 'MAINTAINING', typology: 'OPERATOR', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 22 },
845
+ { id: 'OPERATING', typology: 'OPERATOR', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 23 },
846
+ ];
847
// Index of activity definitions keyed by activity id, for constant-time lookup.
// Built in definition order; if an id were ever repeated, the later definition
// would replace the earlier one's value (same as constructing the Map from entries).
const SCM_ACTIVITIES_MAP = new Map();
for (const definition of SCM_ACTIVITIES_DEFINITIONS) {
    SCM_ACTIVITIES_MAP.set(definition.id, definition);
}
849
/**
 * Returns the activity definition that occupies a given slot of the SCM chart.
 *
 * @param {string} area - Chart area to search ('top', 'left', 'right' or 'bottom').
 * @param {number} position - Zero-based position within that area.
 * @returns {object|undefined} The first definition whose `area` and `position`
 *   both match, or `undefined` when no definition matches.
 */
function getActivityByPosition(area, position) {
    for (const definition of SCM_ACTIVITIES_DEFINITIONS) {
        if (definition.area === area && definition.position === position) {
            return definition;
        }
    }
    return undefined;
}
852
// Expected shape of a complete SCM chart, used to sanity-check extraction results.
const SCM_VALIDATION = {
    // Number of activities on the chart (sum of the four per-area counts below).
    TOTAL_ACTIVITIES: 114,
    // Activities that carry a PSP marker in addition to PSS.
    ACTIVITIES_WITH_PSP: 99,
    // Activities that only carry a PSS marker.
    ACTIVITIES_WITHOUT_PSP: 15,
    // Number of activity slots per chart area.
    EXPECTED_COUNTS: {
        left: 33,
        right: 33,
        top: 24,
        bottom: 24,
    },
};
864
+
865
/**
 * SCM (Strength Cluster Map) Extractor
 *
 * Reads the 114 activities of an SCM chart page together with the colour of
 * their PSS/PSP markers. The page is rendered through the PDF.js page object
 * handed in by the caller, activity names are read with Tesseract.js OCR and
 * marker colours are sampled from the rendered canvas.
 *
 * A single Tesseract worker is shared by all areas. Worker creation is
 * de-duplicated and OCR jobs are serialised, because the four chart areas are
 * extracted concurrently and each area needs its own page-segmentation mode.
 */
class ScmExtractor {
    constructor() {
        // Shared Tesseract worker; assigned once initialisation has finished.
        this.tesseractWorker = null;
        // Pending/finished worker initialisation shared by concurrent callers.
        this.tesseractWorkerPromise = null;
        // Tail of the OCR job chain; keeps setParameters + recognize atomic.
        this.ocrQueue = Promise.resolve();
    }
    /**
     * Extract SCM data from a PDF page.
     *
     * @param {object} pdfPage - PDF.js page proxy of the SCM chart page.
     * @returns {Promise<object|null>} `{ activities, metadata }`, or `null`
     *   when extraction fails (the error is logged, not thrown).
     */
    async extractScmData(pdfPage) {
        const startTime = Date.now();
        try {
            // 1. Detect ID presence using PDF.js text extraction
            const scmHasId = await this.detectScmHasId(pdfPage);
            const scmAreaConfigs = this.getScmAreaConfigs(scmHasId);
            console.log(`🆔 SCM ID Detection: ${scmHasId ? 'ID Present' : 'No ID'}`);
            // 2. Render page to canvas
            const canvas = await this.renderPageToCanvas(pdfPage);
            // 3. Extract from all 4 areas
            const [leftActivities, rightActivities, topActivities, bottomActivities] = await Promise.all([
                this.extractAreaActivities(canvas, 'left', scmAreaConfigs),
                this.extractAreaActivities(canvas, 'right', scmAreaConfigs),
                this.extractAreaActivities(canvas, 'top', scmAreaConfigs),
                this.extractAreaActivities(canvas, 'bottom', scmAreaConfigs)
            ]);
            const allDetected = [
                ...leftActivities,
                ...rightActivities,
                ...topActivities,
                ...bottomActivities
            ];
            // 4. Map to complete definitions
            const mappedActivities = this.mapDetectedToDefinitions(allDetected);
            // 5. Calculate statistics
            const detectedCount = allDetected.length;
            const unmappedActivities = allDetected
                .filter((d) => !SCM_ACTIVITIES_MAP.has(d.activityName))
                .map((d) => d.activityName);
            const totalConfidence = mappedActivities.reduce((sum, activity) => sum + activity.detectionConfidence, 0);
            const averageConfidence = mappedActivities.length > 0 ? totalConfidence / mappedActivities.length : 0;
            const extractionTime = Date.now() - startTime;
            return {
                activities: mappedActivities,
                metadata: {
                    totalActivities: SCM_VALIDATION.TOTAL_ACTIVITIES,
                    detectedActivities: detectedCount,
                    unmappedActivities,
                    averageConfidence,
                    extractionTime
                }
            };
        }
        catch (error) {
            console.error('SCM extraction failed:', error);
            return null;
        }
    }
    /**
     * Detect whether the SCM page shows a numeric ID, using PDF.js text
     * extraction. Only text items whose vertical transform component lies in
     * the band `pageHeight * 0.15` below the page height are considered, and
     * an item counts as an ID when it is made of at least three digits.
     *
     * @param {object} pdfPage - PDF.js page proxy.
     * @returns {Promise<boolean>} `true` when an ID was found; `false` when
     *   none was found or text extraction failed.
     */
    async detectScmHasId(pdfPage) {
        try {
            const textContent = await pdfPage.getTextContent();
            const textItems = textContent.items;
            const viewport = pdfPage.getViewport({ scale: 1.0 });
            const pageHeight = viewport.height;
            const idSearchArea = pageHeight * 0.15; // Search in upper 15% of page
            for (const item of textItems) {
                if (item.transform && item.transform[5] > pageHeight - idSearchArea) {
                    const text = item.str?.trim();
                    if (text && /^\d+$/.test(text) && text.length >= 3) {
                        console.log(`🔍 ID Detection: Found ID "${text}" at position (${Math.round(item.transform[4])}, ${Math.round(item.transform[5])})`);
                        return true;
                    }
                }
            }
            console.log('🔍 ID Detection: No numeric ID found in upper area');
            return false;
        }
        catch (error) {
            console.warn('PDF.js ID detection failed:', error);
            return false;
        }
    }
    /**
     * Get the area configurations (as fractions of the canvas size) for the
     * four chart areas. When an ID is present the vertical bounds are shifted
     * down by 0.012 of the page height.
     *
     * @param {boolean} scmHasId - Result of {@link ScmExtractor#detectScmHasId}.
     * @returns {object} Config objects keyed by 'left', 'right', 'top', 'bottom'.
     */
    getScmAreaConfigs(scmHasId) {
        const topStartY = scmHasId ? 0.16 : 0.148;
        const topEndY = scmHasId ? 0.258 : 0.246;
        const bottomStartY = scmHasId ? 0.66 : 0.652;
        const bottomEndY = scmHasId ? 0.758 : 0.746;
        const leftRightStartY = scmHasId ? 0.256 : 0.244;
        const leftRightEndY = scmHasId ? 0.662 : 0.65;
        return {
            left: {
                startX: 0.065,
                endX: 0.278,
                startY: leftRightStartY,
                endY: leftRightEndY,
                expectedCount: 33,
                pattern: 'NAME-PSS-PSP',
                orientation: 'horizontal'
            },
            right: {
                startX: 0.76,
                endX: 0.92,
                startY: leftRightStartY,
                endY: leftRightEndY,
                expectedCount: 33,
                pattern: 'PSS-PSP-NAME',
                orientation: 'horizontal'
            },
            top: {
                startX: 0.255,
                endX: 0.785,
                startY: topStartY,
                endY: topEndY,
                expectedCount: 24,
                pattern: 'NAME-PSS-PSP',
                orientation: 'vertical'
            },
            bottom: {
                startX: 0.255,
                endX: 0.785,
                startY: bottomStartY,
                endY: bottomEndY,
                expectedCount: 24,
                pattern: 'PSS-PSP-NAME',
                orientation: 'vertical'
            }
        };
    }
    /**
     * Render a PDF page to a canvas at 2x scale (higher resolution for OCR).
     * Requires a DOM (`document.createElement`).
     *
     * @param {object} pdfPage - PDF.js page proxy.
     * @returns {Promise<HTMLCanvasElement>} The rendered canvas.
     * @throws {Error} When a 2D context cannot be obtained.
     */
    async renderPageToCanvas(pdfPage) {
        const scale = 2.0; // Higher resolution for better OCR
        const viewport = pdfPage.getViewport({ scale });
        const canvas = document.createElement('canvas');
        canvas.width = viewport.width;
        canvas.height = viewport.height;
        const context = canvas.getContext('2d');
        if (!context) {
            throw new Error('Could not get canvas context');
        }
        const renderTask = pdfPage.render({
            canvasContext: context,
            viewport: viewport
        });
        await renderTask.promise;
        return canvas;
    }
    /**
     * Detect the SCM colour around a point by averaging a 10x10 pixel window.
     *
     * The window is clipped to the canvas and the average is taken over the
     * pixels actually read, so pixels outside the canvas (which the Canvas API
     * reports as transparent black) cannot darken the result.
     *
     * @param {HTMLCanvasElement} canvas - Rendered page.
     * @param {{x: number, y: number}} point - Sample centre in canvas pixels.
     * @returns {string} One of 'black', 'gray', 'white', 'yellow', 'red';
     *   'white' when no context is available or the window is off-canvas.
     */
    detectColorAt(canvas, point) {
        const ctx = canvas.getContext('2d');
        if (!ctx)
            return 'white';
        const sampleSize = 10;
        const left = Math.max(0, Math.trunc(point.x - sampleSize / 2));
        const top = Math.max(0, Math.trunc(point.y - sampleSize / 2));
        const width = Math.min(sampleSize, Math.trunc(canvas.width) - left);
        const height = Math.min(sampleSize, Math.trunc(canvas.height) - top);
        // Written as !(x > 0) so that NaN coordinates are rejected as well.
        if (!(width > 0) || !(height > 0))
            return 'white';
        const { data } = ctx.getImageData(left, top, width, height);
        const pixelCount = data.length / 4;
        let totalR = 0;
        let totalG = 0;
        let totalB = 0;
        for (let i = 0; i < data.length; i += 4) {
            totalR += data[i];
            totalG += data[i + 1];
            totalB += data[i + 2];
        }
        return this.mapRgbToScmColor(Math.round(totalR / pixelCount), Math.round(totalG / pixelCount), Math.round(totalB / pixelCount));
    }
    /**
     * Map an RGB value to the nearest SCM palette colour (Euclidean distance
     * in RGB space). Ties keep the earlier palette entry; non-numeric input
     * yields 'white'.
     *
     * @param {number} r
     * @param {number} g
     * @param {number} b
     * @returns {string} 'black', 'gray', 'white', 'yellow' or 'red'.
     */
    mapRgbToScmColor(r, g, b) {
        const palette = [
            ['black', 0, 0, 0],
            ['gray', 125, 125, 125],
            ['white', 255, 255, 255],
            ['yellow', 255, 255, 51],
            ['red', 255, 0, 0]
        ];
        let closestColor = 'white';
        let minDistance = Infinity;
        for (const [colorName, pr, pg, pb] of palette) {
            const distance = Math.hypot(r - pr, g - pg, b - pb);
            if (distance < minDistance) {
                minDistance = distance;
                closestColor = colorName;
            }
        }
        return closestColor;
    }
    /**
     * Extract the activities of one chart area using the pre-defined
     * positions. The reported `activityName` is always the expected id for
     * the slot; the OCR result only feeds the confidence score and a log line
     * when it differs. Slots that fail are logged and skipped.
     *
     * @param {HTMLCanvasElement} canvas - Rendered page.
     * @param {string} area - 'left', 'right', 'top' or 'bottom'.
     * @param {object} scmAreaConfigs - Result of `getScmAreaConfigs`.
     * @returns {Promise<object[]>} Detected activities for the area.
     */
    async extractAreaActivities(canvas, area, scmAreaConfigs) {
        const config = scmAreaConfigs[area];
        const activities = [];
        for (let i = 0; i < config.expectedCount; i++) {
            try {
                const expectedActivity = getActivityByPosition(area, i);
                if (!expectedActivity) {
                    console.warn(`No activity defined for ${area} area position ${i}`);
                    continue;
                }
                const coords = this.calculateActivityCoordinates(canvas, area, i, scmAreaConfigs);
                // Extract activity name using Tesseract OCR
                const detectedName = await this.extractActivityName(canvas, coords.name, area);
                if (expectedActivity.id !== detectedName) {
                    console.log(`🚀 ~ ${area}[${i}] Expected: ${expectedActivity.id}, Detected: "${detectedName}"`);
                }
                // Detect colors
                const pssColor = this.detectColorAt(canvas, coords.pss);
                const pspColor = expectedActivity.hasPsp ? this.detectColorAt(canvas, coords.psp) : null;
                // Calculate confidence
                const confidence = this.calculateDetectionConfidence(detectedName, expectedActivity.id, pssColor, pspColor);
                activities.push({
                    activityName: expectedActivity.id,
                    psp: pspColor,
                    pss: pssColor,
                    area,
                    position: i,
                    confidence
                });
            }
            catch (error) {
                console.warn(`Failed to extract activity at ${area} position ${i}:`, error);
            }
        }
        return activities;
    }
    /**
     * Calculate, in canvas pixels, where the name box and the PSS/PSP sample
     * points of one activity slot are. Left/right areas are split into rows,
     * top/bottom areas into columns.
     *
     * @param {HTMLCanvasElement} canvas - Rendered page (only its size is read).
     * @param {string} area - 'left', 'right', 'top' or 'bottom'.
     * @param {number} index - Zero-based slot index within the area.
     * @param {object} scmAreaConfigs - Result of `getScmAreaConfigs`.
     * @returns {{name: {x: number, y: number, width: number, height: number},
     *   pss: {x: number, y: number}, psp: {x: number, y: number}}}
     */
    calculateActivityCoordinates(canvas, area, index, scmAreaConfigs) {
        const config = scmAreaConfigs[area];
        const canvasWidth = canvas.width;
        const canvasHeight = canvas.height;
        const regionX = config.startX * canvasWidth;
        const regionY = config.startY * canvasHeight;
        const regionWidth = (config.endX - config.startX) * canvasWidth;
        const regionHeight = (config.endY - config.startY) * canvasHeight;
        if (area === 'left' || area === 'right') {
            const itemHeight = regionHeight / config.expectedCount;
            const rowY = regionY + index * itemHeight;
            const centerY = rowY + itemHeight / 2;
            if (area === 'left') {
                const nameWidth = regionWidth * 0.5;
                return {
                    name: {
                        x: regionX + regionWidth * 0.28,
                        y: centerY - 8,
                        width: nameWidth,
                        height: itemHeight * 0.9
                    },
                    pss: { x: regionX + regionWidth * 0.84, y: centerY },
                    psp: { x: regionX + regionWidth * 0.94, y: centerY }
                };
            }
            else {
                const nameWidth = regionWidth * 0.7;
                return {
                    pss: { x: regionX + regionWidth * 0.09, y: centerY },
                    psp: { x: regionX + regionWidth * 0.22, y: centerY },
                    name: {
                        x: regionX + regionWidth * 0.28,
                        y: rowY + itemHeight * 0.28,
                        width: nameWidth,
                        height: itemHeight * 0.8
                    }
                };
            }
        }
        else {
            const itemWidth = regionWidth / config.expectedCount;
            const colX = regionX + index * itemWidth;
            const centerX = colX + itemWidth / 2;
            if (area === 'top') {
                return {
                    name: {
                        x: colX + itemWidth * 0.1,
                        y: regionY + regionHeight * 0.03,
                        width: itemWidth * 0.8,
                        height: regionHeight * 0.75
                    },
                    pss: { x: centerX, y: regionY + regionHeight * 0.82 },
                    psp: { x: centerX + itemWidth * 0.05, y: regionY + regionHeight * 0.95 }
                };
            }
            else {
                return {
                    psp: { x: centerX, y: regionY + regionHeight * 0.075 },
                    pss: { x: centerX, y: regionY + regionHeight * 0.2 },
                    name: {
                        x: colX + itemWidth * 0.1,
                        y: regionY + regionHeight * 0.24,
                        width: itemWidth * 0.8,
                        height: regionHeight * 0.8
                    }
                };
            }
        }
    }
    /**
     * Extract an activity name from a canvas region using Tesseract OCR.
     * Regions from the top/bottom areas are rotated by 90 degrees first
     * because their labels are written vertically.
     *
     * The parameter change and the recognition are queued as one job, so
     * concurrent callers cannot overwrite each other's page-segmentation mode
     * on the shared worker.
     *
     * @param {HTMLCanvasElement} canvas - Rendered page.
     * @param {{x: number, y: number, width: number, height: number}} region
     * @param {string} area - 'left', 'right', 'top' or 'bottom'.
     * @returns {Promise<string>} Matched activity id, else the normalised OCR
     *   text, else '' when OCR fails (the error is logged).
     */
    async extractActivityName(canvas, region, area) {
        try {
            const worker = await this.initializeTesseract();
            const isVertical = area === 'top' || area === 'bottom';
            const tempCanvas = document.createElement('canvas');
            tempCanvas.width = region.width;
            tempCanvas.height = region.height;
            const tempCtx = tempCanvas.getContext('2d');
            if (!tempCtx)
                return '';
            tempCtx.drawImage(canvas, region.x, region.y, region.width, region.height, 0, 0, region.width, region.height);
            // Handle vertical text rotation for top/bottom areas
            if (isVertical) {
                const rotatedCanvas = document.createElement('canvas');
                const rotatedCtx = rotatedCanvas.getContext('2d');
                if (!rotatedCtx)
                    return '';
                rotatedCanvas.width = region.height;
                rotatedCanvas.height = region.width;
                rotatedCtx.translate(region.height / 2, region.width / 2);
                rotatedCtx.rotate(Math.PI / 2);
                rotatedCtx.drawImage(tempCanvas, -region.width / 2, -region.height / 2, region.width, region.height);
                tempCanvas.width = rotatedCanvas.width;
                tempCanvas.height = rotatedCanvas.height;
                tempCtx.clearRect(0, 0, tempCanvas.width, tempCanvas.height);
                tempCtx.drawImage(rotatedCanvas, 0, 0);
            }
            // Enhance image for OCR
            tempCtx.filter = 'contrast(150%) brightness(100%)';
            tempCtx.drawImage(tempCanvas, 0, 0);
            // Configure OCR
            const Tesseract = await this.loadTesseract();
            const pageSegMode = isVertical
                ? Tesseract.PSM.SINGLE_LINE
                : Tesseract.PSM.SINGLE_BLOCK;
            // Run configure + recognise as a single queued job (see method doc).
            const job = this.ocrQueue.then(async () => {
                await worker.setParameters({
                    tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ',
                    tessedit_pageseg_mode: pageSegMode,
                    preserve_interword_spaces: '1'
                });
                return worker.recognize(tempCanvas);
            });
            // A failed job must not block the jobs queued behind it.
            this.ocrQueue = job.catch(() => undefined);
            const { data: { text } } = await job;
            const cleanedText = this.normalizeActivityName(text);
            const bestMatch = this.findBestActivityMatch(cleanedText);
            return bestMatch || cleanedText;
        }
        catch (error) {
            console.warn('OCR extraction failed:', error);
            return '';
        }
    }
    /**
     * Load the tesseract.js module. Kept in one place so worker creation and
     * OCR configuration resolve the module the same way.
     *
     * @returns {Promise<object>} The tesseract.js module namespace.
     */
    loadTesseract() {
        return import('tesseract.js');
    }
    /**
     * Initialize the shared Tesseract worker for OCR.
     *
     * Concurrent callers share one initialisation, so only a single worker is
     * ever created. A failed initialisation is forgotten so that a later call
     * can retry.
     *
     * @returns {Promise<object>} The ready Tesseract worker.
     */
    async initializeTesseract() {
        if (this.tesseractWorker) {
            return this.tesseractWorker;
        }
        if (!this.tesseractWorkerPromise) {
            const pending = (async () => {
                const Tesseract = await this.loadTesseract();
                const worker = await Tesseract.createWorker('eng');
                await worker.setParameters({
                    tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ',
                    tessedit_pageseg_mode: Tesseract.PSM.SINGLE_BLOCK,
                    preserve_interword_spaces: '1'
                });
                this.tesseractWorker = worker;
                return worker;
            })();
            this.tesseractWorkerPromise = pending;
            pending.catch(() => {
                // Allow a retry, unless cleanup()/a newer init already replaced it.
                if (this.tesseractWorkerPromise === pending) {
                    this.tesseractWorkerPromise = null;
                }
            });
        }
        return this.tesseractWorkerPromise;
    }
    /**
     * Normalize an activity name for matching: upper-case it, drop everything
     * that is not A-Z or whitespace, collapse whitespace runs to one space and
     * trim both ends. Trimming happens last so that stripped leading or
     * trailing junk characters cannot leave a stray space behind.
     *
     * @param {string} text - Raw text.
     * @returns {string} Normalised text (possibly empty).
     */
    normalizeActivityName(text) {
        return text
            .toUpperCase()
            .replace(/[^A-Z\s]/g, '')
            .replace(/\s+/g, ' ')
            .trim();
    }
    /**
     * Find the best matching activity id from the pre-defined list: an exact
     * match on normalised names is preferred; otherwise the first definition
     * whose normalised name contains, or is contained in, the detected text.
     *
     * @param {string} detectedName - OCR text.
     * @returns {string|null} Matching activity id, or `null`.
     */
    findBestActivityMatch(detectedName) {
        const normalized = this.normalizeActivityName(detectedName);
        if (!normalized)
            return null;
        // Exact match first
        for (const [activityId] of SCM_ACTIVITIES_MAP) {
            if (this.normalizeActivityName(activityId) === normalized) {
                return activityId;
            }
        }
        // Fuzzy matching for partial matches
        for (const [activityId] of SCM_ACTIVITIES_MAP) {
            const activityNormalized = this.normalizeActivityName(activityId);
            if (activityNormalized.includes(normalized) || normalized.includes(activityNormalized)) {
                return activityId;
            }
        }
        return null;
    }
    /**
     * Calculate the detection confidence score for one slot.
     *
     * Starts at 0.3; adds 0.5 for an exact name match, 0.3 for a containment
     * match, 0.2 / 0.1 for longer OCR text with / without a shared word; adds
     * 0.15 for each non-white marker. The result is capped at 1.0.
     *
     * @param {string} detected - OCR result for the slot.
     * @param {string} matched - Expected activity id for the slot.
     * @param {string} pss - Detected PSS colour.
     * @param {string|null} psp - Detected PSP colour, or `null` when absent.
     * @returns {number} Confidence in the range 0.3 to 1.0.
     */
    calculateDetectionConfidence(detected, matched, pss, psp) {
        let confidence = 0.3; // Base confidence
        // Text matching confidence
        const detectedNorm = this.normalizeActivityName(detected);
        const matchedNorm = this.normalizeActivityName(matched);
        if (detectedNorm === matchedNorm) {
            confidence += 0.5;
        }
        else if (detectedNorm.length > 0) {
            if (detectedNorm.includes(matchedNorm) || matchedNorm.includes(detectedNorm)) {
                confidence += 0.3;
            }
            else if (detectedNorm.length > 3) {
                const words1 = detectedNorm.split(' ').filter((w) => w.length > 2);
                const words2 = matchedNorm.split(' ').filter((w) => w.length > 2);
                const commonWords = words1.filter((w) => words2.includes(w));
                if (commonWords.length > 0) {
                    confidence += 0.2;
                }
                else {
                    confidence += 0.1;
                }
            }
        }
        // Color detection confidence
        if (pss !== 'white')
            confidence += 0.15;
        if (psp && psp !== 'white')
            confidence += 0.15;
        return Math.min(1.0, confidence);
    }
    /**
     * Map detected activities onto the full definition list. Every definition
     * yields one entry; definitions without a detection get white markers and
     * a confidence of 0. When an activity was detected more than once, the
     * first detection is used.
     *
     * @param {object[]} detected - Results of `extractAreaActivities`.
     * @returns {object[]} One entry per activity definition.
     */
    mapDetectedToDefinitions(detected) {
        // Index detections once instead of scanning the list per definition.
        const detectedByName = new Map();
        for (const detection of detected) {
            if (!detectedByName.has(detection.activityName)) {
                detectedByName.set(detection.activityName, detection);
            }
        }
        const mapped = [];
        for (const [activityId, definition] of SCM_ACTIVITIES_MAP) {
            const detectedMatch = detectedByName.get(activityId);
            if (detectedMatch) {
                mapped.push({
                    id: definition.id,
                    typology: definition.typology,
                    cluster: definition.cluster,
                    psp: definition.hasPsp ? detectedMatch.psp : null,
                    pss: detectedMatch.pss,
                    detectionConfidence: detectedMatch.confidence
                });
            }
            else {
                mapped.push({
                    id: definition.id,
                    typology: definition.typology,
                    cluster: definition.cluster,
                    psp: definition.hasPsp ? 'white' : null,
                    pss: 'white',
                    detectionConfidence: 0
                });
            }
        }
        return mapped;
    }
    /**
     * Terminate the Tesseract worker, if any. An initialisation that is still
     * in flight is awaited first so that its worker is terminated too.
     *
     * @returns {Promise<void>}
     */
    async cleanup() {
        const pending = this.tesseractWorkerPromise;
        this.tesseractWorkerPromise = null;
        if (pending) {
            try {
                await pending;
            }
            catch (error) {
                // Initialisation failed, so there is no worker to terminate.
            }
        }
        const worker = this.tesseractWorker;
        this.tesseractWorker = null;
        if (worker) {
            await worker.terminate();
        }
    }
}
1364
+
1365
/**
 * Default extractor configuration. Caller-supplied options are spread over
 * these values, so any key given by the caller wins.
 */
const DEFAULT_CONFIG = {
    // SCM extraction is on by default (Phase 2 requirement).
    includeSCM: true,
    workerSrc: 'auto',
    tesseractWorkerSrc: 'auto',
    // When true, progress messages are written through console.log.
    debug: false,
    timeoutMs: 30000
};
1376
+
1377
/**
 * Main TMA extractor class that orchestrates the extraction process:
 * file validation, page-text extraction, talent/strength/typology/branding
 * parsing and, optionally, SCM chart extraction.
 */
class TmaExtractor {
    /**
     * @param {object} [config] - Options merged over `DEFAULT_CONFIG`.
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_CONFIG, ...config };
        this.pdfProcessor = new PdfProcessor(this.config);
        this.scmExtractor = new ScmExtractor();
    }
    /**
     * Validates the input file.
     *
     * @param {*} file - File handed to `extractFromPdf`.
     * @throws An "INVALID_FILE_TYPE" error (built by `createError`) when the
     *   PDF processor rejects the file.
     */
    validateFile(file) {
        const validation = this.pdfProcessor.validatePdfFile(file);
        if (!validation.isValid) {
            throw createError("INVALID_FILE_TYPE", validation.error || "Invalid file");
        }
    }
    /**
     * Extracts all talent data from page texts.
     *
     * @param {object} pageTexts - Texts keyed by `talentOrder`, `strength`
     *   and `typologyAndBranding`.
     * @param {number} pageCount - Page count of the source PDF.
     * @returns {object} `{ name, talents, strength, typology, personalbranding }`.
     * @throws A "TALENT_EXTRACTION_ERROR" (built by `wrapError`) when any of
     *   the individual extractors throws.
     */
    extractTalentData(pageTexts, pageCount) {
        try {
            const name = extractName(pageTexts.talentOrder, pageCount);
            const talents = extractTalentOrder(pageTexts.talentOrder, false, pageCount);
            const strength = extractStrength(pageTexts.strength, pageCount);
            const typology = extractTypology(pageTexts.typologyAndBranding);
            const personalbranding = extractPersonalBranding(pageTexts.typologyAndBranding, pageCount);
            return {
                name,
                talents,
                strength,
                typology,
                personalbranding,
            };
        }
        catch (error) {
            throw wrapError(error, "TALENT_EXTRACTION_ERROR", "Failed to extract talent data");
        }
    }
    /**
     * Processes a single PDF file and extracts TMA data.
     *
     * SCM extraction is best-effort: any failure there is only logged in
     * debug mode and the result is returned without `scm`. The OCR worker
     * used by the SCM step is released before returning, whether or not the
     * step succeeded, so repeated extractions do not accumulate workers.
     *
     * @param {*} file - PDF file to process.
     * @returns {Promise<object>} Result of
     *   `DataTransformer.transformSinglePersonResult`.
     * @throws `TmaExtractionError` instances are rethrown unchanged; any other
     *   error is wrapped as "PDF_PROCESSING_ERROR".
     */
    async extractFromPdf(file) {
        // Validate input file
        this.validateFile(file);
        try {
            // Get page count and validate format
            const pageCount = await this.pdfProcessor.getPageCount(file);
            const formatValidation = validateAndGetFormat(pageCount);
            if (!formatValidation.isValid) {
                throw createError("UNSUPPORTED_FORMAT", formatValidation.error);
            }
            // Extract page texts
            const pageNumbers = formatValidation.pages;
            const pageTexts = await this.pdfProcessor.extractPageTexts(file, pageNumbers);
            // Extract talent data
            const talentData = this.extractTalentData(pageTexts, pageCount);
            // Add SCM extraction if requested (Phase 2 feature)
            if (this.config.includeSCM) {
                try {
                    this.logDebug("Starting SCM extraction...");
                    // Get SCM page number based on TMA format
                    const scmPageNumber = this.pdfProcessor.getScmPageNumber(pageCount);
                    this.logDebug(`SCM page number for ${pageCount}-page TMA: ${scmPageNumber}`);
                    const scmPage = await this.pdfProcessor.getPage(file, scmPageNumber);
                    if (scmPage) {
                        const scmData = await this.scmExtractor.extractScmData(scmPage);
                        if (scmData) {
                            talentData.scm = scmData;
                            this.logDebug(`SCM extraction successful: ${scmData.activities.length} activities extracted`);
                        }
                        else {
                            this.logDebug("SCM extraction returned null, continuing without SCM data");
                        }
                    }
                    else {
                        this.logDebug("No SCM page found, continuing without SCM data");
                    }
                }
                catch (error) {
                    // Continue without SCM data - don't break the main extraction
                    this.logDebug(`SCM extraction failed, continuing without SCM data: ${error}`);
                }
                finally {
                    // Release the OCR worker; a cleanup failure must not
                    // discard the data that was already extracted.
                    try {
                        await this.scmExtractor.cleanup();
                    }
                    catch (cleanupError) {
                        this.logDebug(`SCM cleanup failed: ${cleanupError}`);
                    }
                }
            }
            // Transform to cleaned format
            const singlePersonResult = {
                person: {
                    ...talentData,
                    top14Talents: talentData.talents.slice(0, 14).map((t) => t.tema),
                    top7Talents: talentData.talents.slice(0, 7).map((t) => t.tema),
                    low14Talents: talentData.talents.slice(-14).map((t) => t.tema),
                },
            };
            return DataTransformer.transformSinglePersonResult(singlePersonResult);
        }
        catch (error) {
            if (error instanceof TmaExtractionError) {
                throw error;
            }
            throw wrapError(error, "PDF_PROCESSING_ERROR", "Failed to process PDF file");
        }
    }
    /**
     * Logs debug messages if debug mode is enabled.
     *
     * @param {string} message - Message to log.
     */
    logDebug(message) {
        if (this.config.debug) {
            console.log(`[TMA-Extractor] ${message}`);
        }
    }
}
1489
/**
 * Package entry point: builds a one-off `TmaExtractor` from `config` and runs
 * it on `file`.
 *
 * @param {*} file - PDF file to extract from.
 * @param {object} [config] - Extractor options; defaults to `{}`.
 * @returns {Promise<object>} Whatever `TmaExtractor#extractFromPdf` resolves to.
 */
async function tmaExtractor(file, config = {}) {
    const result = await new TmaExtractor(config).extractFromPdf(file);
    return result;
}
1497
+
1498
+ exports.DEFAULT_CONFIG = DEFAULT_CONFIG;
1499
+ exports.TmaExtractionError = TmaExtractionError;
1500
+ exports.TmaExtractor = TmaExtractor;
1501
+ exports.default = tmaExtractor;
1502
+ //# sourceMappingURL=index.cjs.map