tm-extractor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/README.md +64 -0
  2. package/dist/constants/scm-activities.d.ts +35 -0
  3. package/dist/constants/scm-activities.d.ts.map +1 -0
  4. package/dist/constants/tma-formats.d.ts +77 -0
  5. package/dist/constants/tma-formats.d.ts.map +1 -0
  6. package/dist/constants/typology-definitions.d.ts +519 -0
  7. package/dist/constants/typology-definitions.d.ts.map +1 -0
  8. package/dist/core/data-transformer.d.ts +44 -0
  9. package/dist/core/data-transformer.d.ts.map +1 -0
  10. package/dist/core/pdf-processor.d.ts +48 -0
  11. package/dist/core/pdf-processor.d.ts.map +1 -0
  12. package/dist/extractors/branding-extractor.d.ts +21 -0
  13. package/dist/extractors/branding-extractor.d.ts.map +1 -0
  14. package/dist/extractors/scm-extractor.d.ts +96 -0
  15. package/dist/extractors/scm-extractor.d.ts.map +1 -0
  16. package/dist/extractors/strength-extractor.d.ts +21 -0
  17. package/dist/extractors/strength-extractor.d.ts.map +1 -0
  18. package/dist/extractors/talent-extractor.d.ts +25 -0
  19. package/dist/extractors/talent-extractor.d.ts.map +1 -0
  20. package/dist/extractors/typology-extractor.d.ts +25 -0
  21. package/dist/extractors/typology-extractor.d.ts.map +1 -0
  22. package/dist/index.cjs +1502 -0
  23. package/dist/index.cjs.map +1 -0
  24. package/dist/index.d.ts +37 -0
  25. package/dist/index.d.ts.map +1 -0
  26. package/dist/index.js +1475 -0
  27. package/dist/index.js.map +1 -0
  28. package/dist/types/tma-types.d.ts +133 -0
  29. package/dist/types/tma-types.d.ts.map +1 -0
  30. package/dist/utils/error-handling.d.ts +46 -0
  31. package/dist/utils/error-handling.d.ts.map +1 -0
  32. package/dist/utils/format-detection.d.ts +48 -0
  33. package/dist/utils/format-detection.d.ts.map +1 -0
  34. package/package.json +67 -0
package/dist/index.js ADDED
@@ -0,0 +1,1475 @@
1
+ import * as pdfjs from 'pdfjs-dist';
2
+
3
/**
 * Page layouts of the supported TMA report variants, keyed by the PDF's
 * total page count (as a string).
 *
 * Each entry holds:
 *  - `pages`:   three page numbers, consumed in order as the talent,
 *               strength and typology/branding pages,
 *  - `scmPage`: the page number of the SCM page,
 *  - `format`:  the format label, always "<pageCount>-page".
 */
const TMA_FORMATS = Object.fromEntries(
    [
        // [pageCount, pages, scmPage]
        [49, [9, 11, 13], 12],
        [54, [11, 13, 15], 14],
        [46, [12, 14, 16], 15],
        [6, [2, 4, 6], 5],
        [22, [2, 4, 6], 5],
    ].map(([pageCount, pages, scmPage]) => [
        String(pageCount),
        { pages, scmPage, format: `${pageCount}-page` },
    ])
);
33
+ /**
34
+ * Global regexes that capture talent name/number pairs from the talent-order text: format49 captures (name, number); default and pdfParse capture (number, name). The commented-out entries restrict the number to 1-34 inside the pattern; the active ones accept any digits.
35
+ */
36
+ const TALENT_REGEX_PATTERNS = {
37
+ // format49: /([A-Z-]+\.?)\s+(\d+)\./g,
38
+ // default: /^([1-9]|[12][0-9]|3[0-4])\s+([A-Z-]+)/gm,
39
+ // pdfParse: /([1-9]|[12][0-9]|3[0-4])([A-Z]+)/g
40
+ format49: /([A-Z-]+\.?)\s+(\d+)\./g,
41
+ default: /(\d+)\s+([A-Z-]+)/g,
42
+ pdfParse: /(\d+)([A-Z]+)/g,
43
+ };
44
+ /**
45
+ * Global regexes for the strength text: format49 captures (uppercase name, number) followed by whitespace or end of input; default matches a number, a dot and a name, capturing only the name.
46
+ */
47
+ const STRENGTH_REGEX_PATTERNS = {
48
+ format49: /([A-Z]+)\s+(\d+)(?:\s+|$)/g,
49
+ default: /\d+\.\s*([\w-]+)/g,
50
+ };
51
+ /**
52
+ * Global regexes for the personal branding text: format49 captures (label, numeric score) with an optional hyphen between them; default captures the run of uppercase words that precedes the literal word "Anda". NOTE(review): in format49 the optional separator hyphen is tried before the optional minus sign of the score, so a single hyphen in front of the digits is treated as a separator - confirm this is intended.
53
+ */
54
+ const PERSONAL_BRANDING_REGEX_PATTERNS = {
55
+ format49: /(QUALITY\s+CONTROLLER|[A-Z]{3,})\s*[‐-]?\s*(-?\d+(?:\.\d+)?)/g,
56
+ default: /([A-Z]+(?:\s+[A-Z]+)*)\s+Anda/g,
57
+ };
58
+ /**
59
+ * Lower and upper bound (1 and 34) that a parsed talent number is checked against
60
+ */
61
+ const TALENT_RANGE = { min: 1, max: 34 };
62
+ /**
63
+ * Maximum number of personal branding items returned by the branding extractors
64
+ */
65
+ const PERSONAL_BRANDING_LIMIT = 5;
66
+
67
/**
 * Maps a PDF page count to its TMA format label.
 *
 * @param {number} pageCount - Total number of pages in the PDF.
 * @returns {string} The `format` label of the matching TMA_FORMATS entry, or
 *   'unknown' when pageCount is not a positive number or has no entry.
 */
function detectTmaFormat(pageCount) {
    const isPositiveNumber = typeof pageCount === 'number' && pageCount > 0;
    if (!isPositiveNumber) {
        return 'unknown';
    }
    const entry = TMA_FORMATS[String(pageCount)];
    if (entry && entry.format) {
        return entry.format;
    }
    return 'unknown';
}
78
/**
 * Lists every page count that has an entry in TMA_FORMATS.
 *
 * @returns {number[]} The TMA_FORMATS keys parsed as base-10 integers, in
 *   the key order reported by Object.keys.
 */
function getSupportedPageCounts() {
    // Explicit radix: never leave the base of the key string up to parseInt.
    return Object.keys(TMA_FORMATS).map((key) => Number.parseInt(key, 10));
}
84
/**
 * Looks up the TMA_FORMATS entry for a page count.
 *
 * @param {number|string} pageCount - Total number of pages in the PDF.
 * @returns {object|null} The matching TMA_FORMATS entry, or null when there
 *   is none (including for null/undefined input).
 */
function getFormatInfo(pageCount) {
    // String() instead of .toString(): null/undefined must yield "no format",
    // not a TypeError.
    const formatKey = String(pageCount);
    // Own-property check so that keys such as "constructor" cannot resolve to
    // members inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(TMA_FORMATS, formatKey)) {
        return null;
    }
    return TMA_FORMATS[formatKey] || null;
}
91
/**
 * Checks whether a page count belongs to a supported TMA format.
 *
 * @param {number} pageCount - Total number of pages in the PDF.
 * @returns {object} On success `{ isValid: true, format, pages, scmPage }`
 *   copied from the format entry; otherwise
 *   `{ isValid: false, format: 'unknown', error }`. A page count of exactly 5
 *   gets its own dedicated error message.
 */
function validateAndGetFormat(pageCount) {
    const reject = (error) => ({ isValid: false, format: 'unknown', error });
    if (pageCount === 5) {
        return reject('Page count 5 is explicitly not supported');
    }
    const info = getFormatInfo(pageCount);
    if (info) {
        const { format, pages, scmPage } = info;
        return { isValid: true, format, pages, scmPage };
    }
    return reject(`Unsupported page count: ${pageCount}. Supported formats: ${getSupportedPageCounts().join(', ')}`);
}
117
+
118
/**
 * Error type thrown by the TMA extraction code.
 *
 * Instances carry, next to the usual message and stack:
 *  - `code`: the error code passed to the constructor,
 *  - `originalError`: the underlying error or value, if one was passed.
 */
class TmaExtractionError extends Error {
    /**
     * @param {string} message - Human-readable description.
     * @param {string} code - Error code stored on the instance.
     * @param {*} [originalError] - Underlying error being wrapped.
     */
    constructor(message, code, originalError) {
        super(message);
        Object.assign(this, { code, originalError, name: 'TmaExtractionError' });
        // captureStackTrace exists only on V8; when present it drops this
        // constructor from the recorded stack.
        if (Error.captureStackTrace) {
            Error.captureStackTrace(this, TmaExtractionError);
        }
    }
}
133
/**
 * Error codes used throughout the package. Every code maps to itself, so
 * `ERROR_CODES.NO_FILES === 'NO_FILES'`. ERROR_CODES and ERROR_CODES_BASE are
 * the same frozen object.
 */
const ERROR_CODES_BASE = Object.fromEntries([
    // File validation errors
    'INVALID_FILE_TYPE',
    'NO_FILES',
    'TOO_MANY_FILES',
    'FILE_TOO_LARGE',
    // PDF processing errors
    'PDF_PROCESSING_ERROR',
    'PAGE_TEXT_EXTRACTION_ERROR',
    'INVALID_PAGE_NUMBERS',
    'INVALID_PAGE_NUMBER',
    // Format errors
    'UNSUPPORTED_FORMAT',
    // Extraction errors
    'TALENT_EXTRACTION_ERROR',
    'STRENGTH_EXTRACTION_ERROR',
    'TYPOLOGY_EXTRACTION_ERROR',
    'BRANDING_EXTRACTION_ERROR',
    // Validation errors
    'VALIDATION_ERROR',
    'NO_VALID_RESULTS',
    'INSUFFICIENT_VALID_FILES',
    // Configuration errors
    'WORKER_CONFIGURATION_ERROR',
    'TIMEOUT_ERROR',
    // Server side error
    'SERVER_SIDE_ERROR',
].map((code) => [code, code]));
const ERROR_CODES = Object.freeze(ERROR_CODES_BASE);
165
/**
 * Builds a TmaExtractionError whose code is looked up in ERROR_CODES.
 *
 * @param {string} code - Key into ERROR_CODES; an unknown key leaves the
 *   resulting error's `code` undefined.
 * @param {string} message - Error message.
 * @param {*} [originalError] - Underlying error to attach.
 * @returns {TmaExtractionError} The new error instance.
 */
function createError(code, message, originalError) {
    const resolvedCode = ERROR_CODES[code];
    return new TmaExtractionError(message, resolvedCode, originalError);
}
171
/**
 * Type guard for TmaExtractionError.
 *
 * @param {*} error - Any value, typically something caught in a catch block.
 * @returns {boolean} True when the value is a TmaExtractionError instance.
 */
function isTmaExtractionError(error) {
    const isOwnErrorType = error instanceof TmaExtractionError;
    return isOwnErrorType;
}
177
/**
 * Normalizes any thrown value into a TmaExtractionError.
 *
 * @param {*} error - The caught value.
 * @param {string} [defaultCode='PDF_PROCESSING_ERROR'] - ERROR_CODES key used
 *   when the value is not already a TmaExtractionError.
 * @param {string} [defaultMessage='An unexpected error occurred'] - Message
 *   used when the value is not an Error and so has no message of its own.
 * @returns {TmaExtractionError} The value itself when it already is a
 *   TmaExtractionError, otherwise a new error wrapping it.
 */
function wrapError(error, defaultCode = 'PDF_PROCESSING_ERROR', defaultMessage = 'An unexpected error occurred') {
    if (isTmaExtractionError(error)) {
        return error;
    }
    if (error instanceof Error) {
        return createError(defaultCode, error.message, error);
    }
    // Non-Error throwables (strings, plain objects, ...) are kept as
    // originalError so the thrown value is not lost.
    return createError(defaultCode, defaultMessage, error);
}
189
+
190
/**
 * PDF processing class that wraps PDF.js functionality for TMA extraction.
 *
 * Every method that opens a PDF loads the document from `file.arrayBuffer()`.
 * Documents opened by getPageCount and extractPageTexts are destroyed before
 * the method settles; getPage leaves the document open on success because it
 * hands a page object back to the caller.
 */
class PdfProcessor {
    /**
     * @param {{ workerSrc?: string }} [config] - Optional settings; `workerSrc`
     *   overrides the PDF.js worker location ("auto" means: use the default).
     */
    constructor(config) {
        this.workerSrc = config?.workerSrc;
        this.configurePdfWorker();
    }
    /**
     * Configures the PDF.js worker source: an explicit workerSrc wins;
     * otherwise, in a browser (`window` defined), the bundled worker file is
     * resolved relative to this module. Outside a browser nothing is set.
     */
    configurePdfWorker() {
        if (this.workerSrc && this.workerSrc !== "auto") {
            pdfjs.GlobalWorkerOptions.workerSrc = this.workerSrc;
        }
        else if (typeof window !== "undefined") {
            // Browser environment - use default worker
            pdfjs.GlobalWorkerOptions.workerSrc = new URL("pdfjs-dist/build/pdf.worker.min.mjs", import.meta.url).toString();
        }
    }
    /**
     * Best-effort release of a PDF.js loading task and the document it owns.
     * Never throws: a cleanup failure must not mask the real result or error.
     *
     * @param {object} [loadingTask] - Value returned by pdfjs.getDocument.
     * @returns {Promise<void>}
     */
    async releaseLoadingTask(loadingTask) {
        if (!loadingTask || typeof loadingTask.destroy !== "function") {
            return;
        }
        try {
            await loadingTask.destroy();
        }
        catch {
            // Intentionally ignored: nothing useful can be done if cleanup fails.
        }
    }
    /**
     * Gets the total number of pages in a PDF file.
     *
     * @param {{ arrayBuffer(): Promise<ArrayBuffer> }} file - PDF file.
     * @returns {Promise<number>} Number of pages.
     * @throws {TmaExtractionError} Code PDF_PROCESSING_ERROR on any failure.
     */
    async getPageCount(file) {
        let loadingTask;
        try {
            const arrayBuffer = await file.arrayBuffer();
            loadingTask = pdfjs.getDocument(arrayBuffer);
            const pdfDoc = await loadingTask.promise;
            return pdfDoc.numPages;
        }
        catch (error) {
            throw new TmaExtractionError("Failed to get PDF page count", "PDF_PROCESSING_ERROR", error);
        }
        finally {
            // The page count is already read; do not keep the document alive.
            await this.releaseLoadingTask(loadingTask);
        }
    }
    /**
     * Detects TMA format based on page count.
     *
     * @param {number} pageCount - Total number of pages.
     * @returns {string} Format label as returned by detectTmaFormat.
     */
    detectFormat(pageCount) {
        return detectTmaFormat(pageCount);
    }
    /**
     * Gets the page numbers to extract for a given page count.
     *
     * @param {number} pageCount - Total number of pages.
     * @returns {number[]} The `pages` list of the matching format.
     * @throws {TmaExtractionError} Code UNSUPPORTED_FORMAT when no format
     *   matches (including null/undefined input).
     */
    getPageNumbers(pageCount) {
        // String() rather than .toString() so null/undefined reach the
        // UNSUPPORTED_FORMAT branch instead of raising a TypeError.
        const format = TMA_FORMATS[String(pageCount)];
        if (!format) {
            throw new TmaExtractionError(`Unsupported TMA format: ${pageCount} pages`, "UNSUPPORTED_FORMAT");
        }
        return format.pages;
    }
    /**
     * Gets the SCM page number for a given page count.
     *
     * @param {number} pageCount - Total number of pages.
     * @returns {number} The `scmPage` of the matching format.
     * @throws {TmaExtractionError} Code UNSUPPORTED_FORMAT when no format
     *   matches (including null/undefined input).
     */
    getScmPageNumber(pageCount) {
        const format = TMA_FORMATS[String(pageCount)];
        if (!format) {
            throw new TmaExtractionError(`Unsupported TMA format for SCM: ${pageCount} pages`, "UNSUPPORTED_FORMAT");
        }
        return format.scmPage;
    }
    /**
     * Extracts text content from a single PDF page.
     *
     * @param {object} page - PDF.js page object.
     * @returns {Promise<string>} The `str` of every text item, joined with a
     *   single space.
     * @throws {TmaExtractionError} Code PAGE_TEXT_EXTRACTION_ERROR.
     */
    async getPageText(page) {
        try {
            const content = await page.getTextContent();
            return content.items.map((item) => item.str).join(" ");
        }
        catch (error) {
            throw new TmaExtractionError("Failed to extract text from PDF page", "PAGE_TEXT_EXTRACTION_ERROR", error);
        }
    }
    /**
     * Extracts text from the talent, strength and typology pages.
     *
     * @param {{ arrayBuffer(): Promise<ArrayBuffer> }} file - PDF file.
     * @param {number[]} pageNumbers - Page numbers; the first three are used
     *   as the talent, strength and typology/branding page, in that order.
     * @returns {Promise<{talentOrder: string, strength: string, typologyAndBranding: string}>}
     * @throws {TmaExtractionError} INVALID_PAGE_NUMBERS when a number is
     *   below 1 or above the page count; PAGE_TEXT_EXTRACTION_ERROR when
     *   reading a page's text fails; PDF_PROCESSING_ERROR otherwise.
     */
    async extractPageTexts(file, pageNumbers) {
        let loadingTask;
        try {
            const arrayBuffer = await file.arrayBuffer();
            loadingTask = pdfjs.getDocument(arrayBuffer);
            const pdfDoc = await loadingTask.promise;
            // Validate page numbers
            const maxPage = pdfDoc.numPages;
            const invalidPages = pageNumbers.filter((num) => num > maxPage || num < 1);
            if (invalidPages.length > 0) {
                throw new TmaExtractionError(`Invalid page numbers: ${invalidPages.join(", ")} (PDF has ${maxPage} pages)`, "INVALID_PAGE_NUMBERS");
            }
            // Get specific pages
            const pages = await Promise.all(pageNumbers.map((pageNum) => pdfDoc.getPage(pageNum)));
            const [talentPage, strengthPage, typologyPage] = pages;
            // The three reads are independent, so run them concurrently.
            const [talentOrder, strength, typologyAndBranding] = await Promise.all([
                this.getPageText(talentPage),
                this.getPageText(strengthPage),
                this.getPageText(typologyPage),
            ]);
            return { talentOrder, strength, typologyAndBranding };
        }
        catch (error) {
            if (error instanceof TmaExtractionError) {
                throw error;
            }
            throw new TmaExtractionError("Failed to extract page texts from PDF", "PDF_PROCESSING_ERROR", error);
        }
        finally {
            // Only plain strings leave this method, so the document can go.
            await this.releaseLoadingTask(loadingTask);
        }
    }
    /**
     * Gets a specific PDF page for external processing (e.g., SCM).
     *
     * On success the document is deliberately left open, because the page
     * object is returned to the caller; it is released only on failure.
     *
     * @param {{ arrayBuffer(): Promise<ArrayBuffer> }} file - PDF file.
     * @param {number} pageNumber - Page to load (1 up to the page count).
     * @returns {Promise<object>} The PDF.js page object.
     * @throws {TmaExtractionError} INVALID_PAGE_NUMBER when out of range;
     *   PDF_PROCESSING_ERROR otherwise.
     */
    async getPage(file, pageNumber) {
        let loadingTask;
        try {
            const arrayBuffer = await file.arrayBuffer();
            loadingTask = pdfjs.getDocument(arrayBuffer);
            const pdfDoc = await loadingTask.promise;
            if (pageNumber > pdfDoc.numPages || pageNumber < 1) {
                throw new TmaExtractionError(`Invalid page number: ${pageNumber} (PDF has ${pdfDoc.numPages} pages)`, "INVALID_PAGE_NUMBER");
            }
            return await pdfDoc.getPage(pageNumber);
        }
        catch (error) {
            // No page is handed out on this path, so nothing needs the document.
            await this.releaseLoadingTask(loadingTask);
            if (error instanceof TmaExtractionError) {
                throw error;
            }
            throw new TmaExtractionError(`Failed to get PDF page ${pageNumber}`, "PDF_PROCESSING_ERROR", error);
        }
    }
    /**
     * Validates if a file is a valid PDF by inspecting its metadata only
     * (`type` and `size`); the content is not read.
     *
     * @param {{ type: string, size: number }} file - File to check.
     * @returns {{ isValid: boolean, error?: string }} Validation outcome.
     */
    validatePdfFile(file) {
        if (!file) {
            return { isValid: false, error: "No file provided" };
        }
        if (file.type !== "application/pdf") {
            return { isValid: false, error: "File must be a PDF" };
        }
        if (file.size === 0) {
            return { isValid: false, error: "File is empty" };
        }
        // Basic size check (max 50MB)
        if (file.size > 50 * 1024 * 1024) {
            return { isValid: false, error: "File size exceeds 50MB limit" };
        }
        return { isValid: true };
    }
}
334
+
335
/**
 * Static helpers that reduce raw extracted TMA data to the filtered output
 * shape: talents lose their `sign` field, SCM data loses `metadata` and each
 * activity's `detectionConfidence`, and the top/low talent lists are dropped.
 */
class DataTransformer {
    /**
     * Keeps only `number` and `tema` of every talent.
     *
     * @param {Array<object>} talents - Raw talents.
     * @returns {Array<{number: *, tema: *}>} New array; empty when `talents`
     *   is not an array, so that validateCleanedData can report the gap
     *   instead of this method throwing.
     */
    static removeTalentSign(talents) {
        if (!Array.isArray(talents)) {
            return [];
        }
        return talents.map(({ number, tema }) => ({
            number,
            tema
        }));
    }
    /**
     * Removes SCM metadata while keeping activity data.
     *
     * @param {object} [scmData] - Raw SCM data.
     * @returns {object} The input itself when falsy; otherwise a new object
     *   holding only `activities`, each without `detectionConfidence`
     *   (an empty list when the input has no activities).
     */
    static removeScmMetadata(scmData) {
        if (!scmData) {
            return scmData;
        }
        if (!scmData.activities) {
            return {
                activities: []
            };
        }
        return {
            activities: scmData.activities.map((activity) => {
                const { detectionConfidence, ...cleanedActivity } = activity;
                return cleanedActivity;
            })
            // Note: metadata field is completely removed
        };
    }
    /**
     * Builds the cleaned person record.
     *
     * @param {object} rawData - Raw extracted person data.
     * @returns {object} `{ name, talents, strength, typology, personalbranding }`
     *   plus `scm` when the input has truthy SCM data.
     */
    static cleanTalentData(rawData) {
        const cleanedData = {
            name: rawData.name,
            talents: this.removeTalentSign(rawData.talents),
            strength: rawData.strength,
            typology: rawData.typology,
            personalbranding: rawData.personalbranding
        };
        // Only include SCM if it exists and clean it
        if (rawData.scm) {
            cleanedData.scm = this.removeScmMetadata(rawData.scm);
        }
        return cleanedData;
    }
    /**
     * Transforms a single-person result to the filtered output format.
     *
     * @param {{ person: object }} result - Raw single-person result.
     * @returns {{ person: object }} Result whose person was cleaned.
     */
    static transformSinglePersonResult(result) {
        // Destructured only to leave top14Talents, top7Talents and
        // low14Talents out of cleanedPerson.
        const { top14Talents, top7Talents, low14Talents, ...cleanedPerson } = result.person;
        return {
            person: this.cleanTalentData(cleanedPerson)
        };
    }
    /**
     * Validates the structure of cleaned data.
     *
     * @param {object} data - Cleaned person data.
     * @returns {{ isValid: boolean, errors: string[] }} Every problem found;
     *   `isValid` is true only when `errors` is empty.
     */
    static validateCleanedData(data) {
        const errors = [];
        // Check required fields. The typeof guard makes a non-string name a
        // validation error rather than a TypeError from .trim().
        if (typeof data.name !== 'string' || data.name.trim().length === 0) {
            errors.push('Person name is required');
        }
        if (!data.talents || data.talents.length === 0) {
            errors.push('Talents array is required and cannot be empty');
        }
        if (!data.strength || data.strength.length === 0) {
            errors.push('Strength array is required and cannot be empty');
        }
        if (!data.typology || data.typology.length === 0) {
            errors.push('Typology array is required and cannot be empty');
        }
        if (!data.personalbranding || data.personalbranding.length === 0) {
            errors.push('Personal branding array is required and cannot be empty');
        }
        // Validate talent structure (should not have 'sign' field)
        if (data.talents) {
            const talentsWithSign = data.talents.filter((talent) => 'sign' in talent);
            if (talentsWithSign.length > 0) {
                errors.push(`Found ${talentsWithSign.length} talents with 'sign' field (should be removed)`);
            }
            // Check for required fields in talents
            const invalidTalents = data.talents.filter((t) => typeof t.number !== 'number' ||
                typeof t.tema !== 'string' ||
                t.tema.trim().length === 0);
            if (invalidTalents.length > 0) {
                errors.push(`Found ${invalidTalents.length} talents with missing number or tema`);
            }
        }
        // Validate SCM structure if present
        if (data.scm) {
            if (data.scm.metadata) {
                errors.push('SCM metadata should be removed from cleaned data');
            }
            if (data.scm.activities) {
                const activitiesWithConfidence = data.scm.activities.filter((activity) => 'detectionConfidence' in activity);
                if (activitiesWithConfidence.length > 0) {
                    errors.push(`Found ${activitiesWithConfidence.length} SCM activities with detectionConfidence (should be removed)`);
                }
            }
        }
        return { isValid: errors.length === 0, errors };
    }
    /**
     * Gets summary statistics of cleaned data.
     *
     * @param {object} data - Cleaned person data.
     * @returns {object} Item counts (0 when a list is missing),
     *   `totalScmActivities` (null only when there is no activity list) and
     *   `hasScmData`.
     */
    static getDataSummary(data) {
        return {
            totalTalents: data.talents?.length || 0,
            totalStrengths: data.strength?.length || 0,
            totalTypologies: data.typology?.length || 0,
            totalPersonalBranding: data.personalbranding?.length || 0,
            // `??` rather than `||`: an empty activity list is 0, not null.
            totalScmActivities: data.scm?.activities?.length ?? null,
            hasScmData: !!data.scm
        };
    }
}
459
+
460
/**
 * Tells whether a talent number lies within TALENT_RANGE, bounds included.
 *
 * @param {number} number - Candidate talent number.
 * @returns {boolean} True when min <= number <= max; false for NaN.
 */
function isValidTalentNumber(number) {
    const { min, max } = TALENT_RANGE;
    return min <= number && number <= max;
}
466
/**
 * Extracts the talent order from 49-page format text, where each entry is a
 * talent name followed by its number and a dot.
 *
 * @param {string} text - Text of the talent-order page.
 * @returns {Array<{number: number, tema: string, sign: string}>} Talents
 *   whose number passes isValidTalentNumber, sorted by number ascending.
 *   `tema` has a trailing dot removed; `sign` is always the string 'false'.
 */
function extractTalentOrderFormat49(text) {
    const talents = [];
    for (const [, rawName, rawNumber] of text.matchAll(TALENT_REGEX_PATTERNS.format49)) {
        // Explicit radix: the digits are always decimal.
        const number = Number.parseInt(rawNumber, 10);
        if (isValidTalentNumber(number)) {
            talents.push({ number, tema: rawName.replace(/\.$/, ''), sign: 'false' });
        }
    }
    return talents.sort((a, b) => a.number - b.number);
}
480
/**
 * Extracts the talent order from text in which each entry is a number
 * followed by an uppercase talent name.
 *
 * @param {string} text - Text of the talent-order page.
 * @param {boolean} [isPdfParse=false] - Use the `pdfParse` pattern (number
 *   and name directly adjacent) instead of the `default` pattern (number and
 *   name separated by whitespace).
 * @returns {Array<{number: number, tema: string, sign: string}>} Talents
 *   whose number passes isValidTalentNumber, sorted by number ascending;
 *   `sign` is always the string 'false'.
 */
function extractTalentOrderDefault(text, isPdfParse = false) {
    const pattern = isPdfParse ? TALENT_REGEX_PATTERNS.pdfParse : TALENT_REGEX_PATTERNS.default;
    const talents = [];
    for (const [, rawNumber, tema] of text.matchAll(pattern)) {
        // Explicit radix: the digits are always decimal.
        const number = Number.parseInt(rawNumber, 10);
        if (isValidTalentNumber(number)) {
            talents.push({ number, tema, sign: 'false' });
        }
    }
    return talents.sort((a, b) => a.number - b.number);
}
496
/**
 * Extracts the talent order using the extractor that matches the TMA format.
 *
 * @param {string} text - Text of the talent-order page.
 * @param {boolean} [isPdfParse=false] - Forwarded to the default extractor.
 * @param {number} [pageCount=0] - Page count used to detect the format.
 * @returns {Array<object>} Extracted talents; always empty for an unknown
 *   format, in which case no extraction is attempted.
 */
function extractTalentOrder(text, isPdfParse = false, pageCount = 0) {
    switch (detectTmaFormat(pageCount)) {
        case 'unknown':
            return [];
        case '49-page':
            return extractTalentOrderFormat49(text);
        default:
            return extractTalentOrderDefault(text, isPdfParse);
    }
}
510
/**
 * Extracts the person's name, i.e. the text that follows the heading
 * "URUTAN BAKAT" on the talent-order page.
 *
 * Where the name ends depends on the format: before a newline plus digit
 * (54-page), before a newline plus uppercase letter (49-page), or before the
 * first digit (every other format).
 *
 * @param {string} text - Text of the talent-order page.
 * @param {number} pageCount - Page count used to detect the format.
 * @returns {string} The trimmed name, or '' when the pattern does not match.
 */
function extractName(text, pageCount) {
    const patternsByFormat = {
        '54-page': /URUTAN BAKAT\s+([A-Z\s.''-]+?)(?=\n\d)/,
        '49-page': /URUTAN BAKAT\s+([A-Z\s.']+?)(?=\n[A-Z])/,
    };
    const fallbackPattern = /URUTAN BAKAT\s+([A-Z\s.']+?)(?=\d)/;
    const pattern = patternsByFormat[detectTmaFormat(pageCount)] ?? fallbackPattern;
    const found = text.match(pattern);
    if (!found) {
        return '';
    }
    return found[1]?.trim();
}
525
+
526
/**
 * Extracts strength entries from 49-page format text, where each entry is an
 * uppercase name followed by its number.
 *
 * @param {string} text - Text of the strength page.
 * @returns {Array<{number: number, name: string}>} Entries in order of
 *   appearance; empty when nothing matches.
 */
function extractStrengthFormat49(text) {
    // Read name and number from the capture groups rather than re-splitting
    // the matched string; parse the number with an explicit radix.
    return Array.from(text.matchAll(STRENGTH_REGEX_PATTERNS.format49), ([, name, number]) => ({
        number: Number.parseInt(number, 10),
        name,
    }));
}
539
/**
 * Extracts strength entries from text in which each entry is a number, a dot
 * and a name made of word characters and hyphens.
 *
 * @param {string} text - Text of the strength page.
 * @returns {Array<{number: number, name: string}>} Entries in order of
 *   appearance; empty when nothing matches.
 */
function extractStrengthDefault(text) {
    return Array.from(text.matchAll(STRENGTH_REGEX_PATTERNS.default), ([entry, name]) => ({
        // The number is everything before the dot; parse it with an explicit
        // radix.
        number: Number.parseInt(entry.slice(0, entry.indexOf('.')), 10),
        // The pattern's capture group already excludes surrounding whitespace.
        name,
    }));
}
552
/**
 * Extracts strength data using the extractor that matches the TMA format.
 *
 * @param {string} text - Text of the strength page.
 * @param {number} [pageCount=0] - Page count used to detect the format.
 * @returns {Array<object>} Result of the 49-page extractor for the 49-page
 *   format, of the default extractor for everything else (unknown formats
 *   included).
 */
function extractStrength(text, pageCount = 0) {
    const isFormat49 = detectTmaFormat(pageCount) === '49-page';
    return isFormat49 ? extractStrengthFormat49(text) : extractStrengthDefault(text);
}
562
+
563
+ /**
564
+ * Fixed table of the 30 typologies: numeric id (1-30), uppercase name and category; the name is the string that extractTypology searches for in the page text
565
+ * NOTE(review): id 6 is spelled 'AMBASADOR' (one S) here but 'AMBASSADOR' in SCM_ACTIVITIES_DEFINITIONS - confirm which spelling the reports actually use
566
+ */
567
+ const TYPOLOGY_DEFINITIONS = [
568
+ { id: 1, name: 'ARRANGER', category: 'HEADMAN' },
569
+ { id: 2, name: 'SELLER', category: 'HEADMAN' },
570
+ { id: 3, name: 'COMMANDER', category: 'HEADMAN' },
571
+ { id: 4, name: 'MEDIATOR', category: 'HEADMAN' },
572
+ { id: 5, name: 'SELECTOR', category: 'HEADMAN' },
573
+ { id: 6, name: 'AMBASADOR', category: 'NETWORKING' },
574
+ { id: 7, name: 'COMMUNICATOR', category: 'NETWORKING' },
575
+ { id: 8, name: 'EDUCATOR', category: 'NETWORKING' },
576
+ { id: 9, name: 'MOTIVATOR', category: 'NETWORKING' },
577
+ { id: 10, name: 'CARETAKER', category: 'SERVICING' },
578
+ { id: 11, name: 'SERVER', category: 'SERVICING' },
579
+ { id: 12, name: 'ANALYST', category: 'THINKING' },
580
+ { id: 13, name: 'TREASURER', category: 'THINKING' },
581
+ { id: 14, name: 'RESTORER', category: 'REASONING' },
582
+ { id: 15, name: 'EVALUATOR', category: 'REASONING' },
583
+ { id: 16, name: 'EXPLORER', category: 'REASONING' },
584
+ { id: 17, name: 'DESIGNER', category: 'GENERATING IDEA' },
585
+ { id: 18, name: 'CREATOR', category: 'GENERATING IDEA' },
586
+ { id: 19, name: 'SYNTHESIZER', category: 'GENERATING IDEA' },
587
+ { id: 20, name: 'MARKETER', category: 'GENERATING IDEA' },
588
+ { id: 21, name: 'STRATEGIST', category: 'GENERATING IDEA' },
589
+ { id: 22, name: 'VISIONARY', category: 'GENERATING IDEA' },
590
+ { id: 23, name: 'JOURNALIST', category: 'ELEMENTARY' },
591
+ { id: 24, name: 'INTERPRETER', category: 'ELEMENTARY' },
592
+ { id: 25, name: 'ADMINISTRATOR', category: 'ELEMENTARY' },
593
+ { id: 26, name: 'SAFEKEEPER', category: 'TECHNICAL' },
594
+ { id: 27, name: 'PRODUCER', category: 'TECHNICAL' },
595
+ { id: 28, name: 'QUALITY CONTROLLER', category: 'TECHNICAL' },
596
+ { id: 29, name: 'DISTRIBUTOR', category: 'TECHNICAL' },
597
+ { id: 30, name: 'OPERATOR', category: 'TECHNICAL' }
598
+ ];
599
+
600
/**
 * Turns a typology name into its label: lower-cased, with every run of
 * whitespace replaced by a single dash.
 *
 * @param {string} name - Typology name, e.g. 'QUALITY CONTROLLER'.
 * @returns {string} The label, e.g. 'quality-controller'.
 */
function createTypologyLabel(name) {
    const lowered = name.toLowerCase();
    return lowered.split(/\s+/).join('-');
}
606
/**
 * Finds the score that follows a typology name in the text.
 *
 * The name is matched literally and case-insensitively, and must be followed
 * by whitespace and an optionally negative integer or decimal number. Only
 * the first occurrence is considered.
 *
 * @param {string} text - Text to search.
 * @param {string} typologyName - Typology name to look for.
 * @returns {number|null} The parsed score, or null when none is found.
 */
function extractTypologyScore(text, typologyName) {
    // Neutralize regex metacharacters so the name is matched as plain text.
    const literalName = typologyName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const scoreAfterName = new RegExp(`${literalName}\\s+(-?\\d+(?:\\.\\d+)?)`, 'i');
    const found = text.match(scoreAfterName);
    if (!found || !found[1]) {
        return null;
    }
    const score = parseFloat(found[1]);
    if (isNaN(score)) {
        return null;
    }
    return score;
}
621
/**
 * Collects every predefined typology for which a score is found in the text.
 *
 * @param {string} text - Text of the typology page.
 * @returns {Array<{id: number, name: string, label: string, category: string, score: number}>}
 *   One entry per TYPOLOGY_DEFINITIONS item with a score, sorted by id;
 *   typologies without a score are left out.
 */
function extractTypology(text) {
    return TYPOLOGY_DEFINITIONS
        .flatMap(({ id, name, category }) => {
            const score = extractTypologyScore(text, name);
            // An empty array makes flatMap drop typologies without a score.
            return score === null
                ? []
                : [{ id, name, label: createTypologyLabel(name), category, score }];
        })
        .sort((first, second) => first.id - second.id);
}
643
+
644
/**
 * Extracts personal branding labels from 49-page format text.
 *
 * Every label/score pair matched by the format49 pattern is collected; per
 * label only the highest score is kept. Labels with a score >= 0 come first
 * (highest score first), followed by negative ones (lowest score first), and
 * the list is cut to PERSONAL_BRANDING_LIMIT entries.
 *
 * @param {string} text - Text of the typology/branding page.
 * @returns {Array<{id: string}>} Selected labels, scores omitted.
 */
function extractPersonalBrandingFormat49(text) {
    const bestScoreById = new Map();
    // matchAll works on a copy of the shared global regex, so no lastIndex
    // state is left behind on the constant (unlike a manual exec loop).
    for (const [, rawId, rawScore] of text.matchAll(PERSONAL_BRANDING_REGEX_PATTERNS.format49)) {
        // Collapse internal whitespace, e.g. "QUALITY  CONTROLLER".
        const id = rawId.replace(/\s+/g, ' ').trim();
        const score = Number.parseFloat(rawScore);
        if (id.length <= 2 || Number.isNaN(score)) {
            continue;
        }
        // Keep unique entries with highest score
        const known = bestScoreById.get(id);
        if (known === undefined || known < score) {
            bestScoreById.set(id, score);
        }
    }
    // Sort positives and negatives separately
    const ranked = Array.from(bestScoreById, ([id, score]) => ({ id, score }));
    const positives = ranked.filter((item) => item.score >= 0).sort((a, b) => b.score - a.score);
    const negatives = ranked.filter((item) => item.score < 0).sort((a, b) => a.score - b.score);
    return [...positives, ...negatives]
        .slice(0, PERSONAL_BRANDING_LIMIT)
        .map(({ id }) => ({ id }));
}
672
/**
 * Extracts personal branding labels from text in which each label is a run
 * of uppercase words directly followed by the word "Anda".
 *
 * A leading "PERSONAL BRANDING " heading is stripped from a label; labels
 * that end up empty or equal to the bare heading are skipped.
 *
 * @param {string} text - Text of the typology/branding page.
 * @returns {Array<{id: string}>} At most PERSONAL_BRANDING_LIMIT labels in
 *   order of appearance; empty when nothing matches.
 */
function extractPersonalBrandingDefault(text) {
    const hits = text.match(PERSONAL_BRANDING_REGEX_PATTERNS.default);
    if (!hits) {
        return [];
    }
    const labels = [];
    for (const hit of hits) {
        const label = hit
            .replace(/\s+Anda$/, '')
            .replace(/PERSONAL BRANDING\s+/, '');
        if (label && label !== 'PERSONAL BRANDING') {
            labels.push(label);
        }
    }
    return labels
        .slice(0, PERSONAL_BRANDING_LIMIT)
        .map((label) => ({ id: label.trim() }));
}
686
/**
 * Extracts personal branding data using the extractor that matches the TMA
 * format.
 *
 * @param {string} text - Text of the typology/branding page.
 * @param {number} [pageCount=0] - Page count used to detect the format.
 * @returns {Array<{id: string}>} Result of the 49-page extractor for the
 *   49-page format, of the default extractor for everything else.
 */
function extractPersonalBranding(text, pageCount = 0) {
    const isFormat49 = detectTmaFormat(pageCount) === '49-page';
    return isFormat49
        ? extractPersonalBrandingFormat49(text)
        : extractPersonalBrandingDefault(text);
}
696
+
697
+ /**
698
+ * SCM (Strength Cluster Map) Activities Definitions
699
+ * Pre-defined list of all 114 activities with their typology and cluster information
700
+ * Framework-independent constants for tma-extractor package
701
+ */
702
+ const SCM_ACTIVITIES_DEFINITIONS = [
703
+ // TOP AREA (24 activities, dari kiri ke kanan)
704
+ { id: 'RELATING', typology: 'AMBASSADOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 0 },
705
+ { id: 'REPRESENTING', typology: 'AMBASSADOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 1 },
706
+ { id: 'COMMUNICATING', typology: 'COMMUNICATOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 2 },
707
+ { id: 'CORRESPONDING', typology: 'COMMUNICATOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 3 },
708
+ { id: 'ENTERTAINING', typology: 'COMMUNICATOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 4 },
709
+ { id: 'PRESENTING', typology: 'COMMUNICATOR', cluster: 'NETWORKING', hasPsp: true, area: 'top', position: 5 },
710
+ { id: 'COOPERATING', typology: 'ARRANGER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 6 },
711
+ { id: 'COORDINATING', typology: 'ARRANGER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 7 },
712
+ { id: 'DISPATCHING', typology: 'ARRANGER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 8 },
713
+ { id: 'MEDIATING', typology: 'MEDIATOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 9 },
714
+ { id: 'NEGOTIATING', typology: 'MEDIATOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 10 },
715
+ { id: 'PURCHASING', typology: 'MEDIATOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 11 },
716
+ { id: 'COLLECTING', typology: 'COMMANDER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 12 },
717
+ { id: 'CONTROLLING', typology: 'COMMANDER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 13 },
718
+ { id: 'INTERROGATING', typology: 'COMMANDER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 14 },
719
+ { id: 'BROKERING', typology: 'SELLER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 15 },
720
+ { id: 'INFLUENCING', typology: 'SELLER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 16 },
721
+ { id: 'SELLING', typology: 'SELLER', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 17 },
722
+ { id: 'RECRUITING', typology: 'SELECTOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 18 },
723
+ { id: 'INTERVIEWING', typology: 'SELECTOR', cluster: 'HEADMAN', hasPsp: true, area: 'top', position: 19 },
724
+ { id: 'CARING', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'top', position: 20 },
725
+ { id: 'COUNSELING', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'top', position: 21 },
726
+ { id: 'SPIRITUALIZING', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'top', position: 22 },
727
+ { id: 'THERAPIES', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'top', position: 23 },
728
+ // LEFT AREA (33 activities, dari atas ke bawah) - pattern: NAME-PSS-PSP
729
+ { id: 'LIAISING', typology: 'AMBASSADOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 0 },
730
+ { id: 'GUIDING', typology: 'MOTIVATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 1 },
731
+ { id: 'MOTIVATING', typology: 'MOTIVATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 2 },
732
+ { id: 'SUPPORTING', typology: 'MOTIVATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 3 },
733
+ { id: 'ADVISING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 4 },
734
+ { id: 'COACHING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 5 },
735
+ { id: 'CONSULTING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 6 },
736
+ { id: 'MENTORING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 7 },
737
+ { id: 'TEACHING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 8 },
738
+ { id: 'TRAINING', typology: 'EDUCATOR', cluster: 'NETWORKING', hasPsp: true, area: 'left', position: 9 },
739
+ { id: 'ANALYSING', typology: 'ANALYST', cluster: 'THINKING', hasPsp: true, area: 'left', position: 10 },
740
+ { id: 'BOOKEEPING', typology: 'ANALYST', cluster: 'THINKING', hasPsp: true, area: 'left', position: 11 },
741
+ { id: 'PROGRAMMING', typology: 'ANALYST', cluster: 'THINKING', hasPsp: true, area: 'left', position: 12 },
742
+ { id: 'BUDGETING', typology: 'TREASURER', cluster: 'THINKING', hasPsp: true, area: 'left', position: 13 },
743
+ { id: 'CASHIERING', typology: 'TREASURER', cluster: 'THINKING', hasPsp: true, area: 'left', position: 14 },
744
+ { id: 'COSTING', typology: 'TREASURER', cluster: 'THINKING', hasPsp: true, area: 'left', position: 15 },
745
+ { id: 'ESTIMATING', typology: 'TREASURER', cluster: 'THINKING', hasPsp: true, area: 'left', position: 16 },
746
+ { id: 'AUDITING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 17 },
747
+ { id: 'EVALUATING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 18 },
748
+ { id: 'INSPECTING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 19 },
749
+ { id: 'INVESTIGATING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 20 },
750
+ { id: 'REVIEWING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 21 },
751
+ { id: 'VERIFYING', typology: 'EVALUATOR', cluster: 'REASONING', hasPsp: true, area: 'left', position: 22 },
752
+ { id: 'DIAGNOSING', typology: 'RESTORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 23 },
753
+ { id: 'IDENTIFYING', typology: 'RESTORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 24 },
754
+ { id: 'RESTORING', typology: 'RESTORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 25 },
755
+ { id: 'APPRAISING', typology: 'EXPLORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 26 },
756
+ { id: 'OBSERVING', typology: 'EXPLORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 27 },
757
+ { id: 'RESEARCHING', typology: 'EXPLORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 28 },
758
+ { id: 'SURVEYING', typology: 'EXPLORER', cluster: 'REASONING', hasPsp: true, area: 'left', position: 29 },
759
+ { id: 'CONCEPTUALIZING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'left', position: 30 },
760
+ { id: 'EDITING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'left', position: 31 },
761
+ { id: 'REDACTING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'left', position: 32 },
762
+ // RIGHT AREA (33 activities, top to bottom) - pattern: PSS-PSP-NAME
763
+ { id: 'VOLUNTEERING', typology: 'CARETAKER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 0 },
764
+ { id: 'ASSISTING', typology: 'SERVER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 1 },
765
+ { id: 'GREETING', typology: 'SERVER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 2 },
766
+ { id: 'INFORMING', typology: 'SERVER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 3 },
767
+ { id: 'SERVING', typology: 'SERVER', cluster: 'SERVICING', hasPsp: true, area: 'right', position: 4 },
768
+ { id: 'DESIGNING', typology: 'DESIGNER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 5 },
769
+ { id: 'DRAFTING', typology: 'DESIGNER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 6 },
770
+ { id: 'ANIMATING', typology: 'CREATOR', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 7 },
771
+ { id: 'CREATING', typology: 'CREATOR', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 8 },
772
+ { id: 'IDEATING', typology: 'CREATOR', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 9 },
773
+ { id: 'SYNTHESIZING', typology: 'SYNTHESIZER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 10 },
774
+ { id: 'ADVERTISING', typology: 'MARKETER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 11 },
775
+ { id: 'DEVELOPING', typology: 'MARKETER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 12 },
776
+ { id: 'MARKETING', typology: 'MARKETER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 13 },
777
+ { id: 'PUBLICIZING', typology: 'MARKETER', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 14 },
778
+ { id: 'PLANNING', typology: 'STRATEGIST', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 15 },
779
+ { id: 'STRATEGIZING', typology: 'STRATEGIST', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 16 },
780
+ { id: 'VISIONING', typology: 'VISIONARY', cluster: 'GENERATING IDEA', hasPsp: true, area: 'right', position: 17 },
781
+ // 15 activities without PSP (hasPsp: false)
782
+ { id: 'ACTING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 18 },
783
+ { id: 'BEAUTIFYING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 19 },
784
+ { id: 'CONSERVING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 20 },
785
+ { id: 'COOKING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 21 },
786
+ { id: 'DANCING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 22 },
787
+ { id: 'DRAMATIZING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 23 },
788
+ { id: 'MODELLING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 24 },
789
+ { id: 'MUSICAL ART', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 25 },
790
+ { id: 'SINGING', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 26 },
791
+ { id: 'VISUAL ART', typology: '-', cluster: 'GENERATING IDEA', hasPsp: false, area: 'right', position: 27 },
792
+ { id: 'MANUAL SKILL', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 28 },
793
+ { id: 'PHYSICAL SKILL', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 29 },
794
+ { id: 'PLANTING', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 30 },
795
+ { id: 'SPORT', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 31 },
796
+ { id: 'TENDING ANIMAL', typology: '-', cluster: 'TECHNICAL', hasPsp: false, area: 'right', position: 32 },
797
+ // BOTTOM AREA (24 activities, left to right) - pattern: PSS-PSP-NAME
798
+ { id: 'REPORTING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 0 },
799
+ { id: 'WRITING', typology: 'JOURNALIST', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 1 },
800
+ { id: 'INTERPRETING', typology: 'INTERPRETER', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 2 },
801
+ { id: 'TRANSCRIBING', typology: 'INTERPRETER', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 3 },
802
+ { id: 'TRANSLATING', typology: 'INTERPRETER', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 4 },
803
+ { id: 'COMPLIANCING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 5 },
804
+ { id: 'FILING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 6 },
805
+ { id: 'HOUSEKEEPING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 7 },
806
+ { id: 'ORGANISING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 8 },
807
+ { id: 'SCHEDULING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 9 },
808
+ { id: 'TYPEWRITING', typology: 'ADMINISTRATOR', cluster: 'ELEMENTARY', hasPsp: true, area: 'bottom', position: 10 },
809
+ { id: 'ASSEMBLING', typology: 'PRODUCER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 11 },
810
+ { id: 'BUILDING', typology: 'PRODUCER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 12 },
811
+ { id: 'INSTALLING', typology: 'PRODUCER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 13 },
812
+ { id: 'PRODUCING', typology: 'PRODUCER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 14 },
813
+ { id: 'MONITORING', typology: 'SAFEKEEPER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 15 },
814
+ { id: 'SAFEKEEPING', typology: 'SAFEKEEPER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 16 },
815
+ { id: 'SECURING', typology: 'SAFEKEEPER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 17 },
816
+ { id: 'FINISHING', typology: 'QUALITY-CONTROLLER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 18 },
817
+ { id: 'TESTING', typology: 'QUALITY-CONTROLLER', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 19 },
818
+ { id: 'DELIVERING', typology: 'DISTRIBUTOR', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 20 },
819
+ { id: 'DISTRIBUTING', typology: 'DISTRIBUTOR', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 21 },
820
+ { id: 'MAINTAINING', typology: 'OPERATOR', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 22 },
821
+ { id: 'OPERATING', typology: 'OPERATOR', cluster: 'TECHNICAL', hasPsp: true, area: 'bottom', position: 23 },
822
+ ];
823
// Lookup table: activity id -> full activity definition.
// Built once at module load so id lookups do not have to scan the definitions array.
const SCM_ACTIVITIES_MAP = new Map();
for (const activity of SCM_ACTIVITIES_DEFINITIONS) {
    SCM_ACTIVITIES_MAP.set(activity.id, activity);
}
825
/**
 * Look up the activity definition that occupies a given slot.
 *
 * @param {string} area - Area name as stored on the definitions ('left', 'right', 'top', 'bottom').
 * @param {number} position - Zero-based slot index inside that area.
 * @returns {object|undefined} The first definition whose `area` and `position` both match
 *   (strict equality), or `undefined` when no definition matches.
 */
function getActivityByPosition(area, position) {
    for (const activity of SCM_ACTIVITIES_DEFINITIONS) {
        if (activity.area === area && activity.position === position) {
            return activity;
        }
    }
    return undefined;
}
828
// Validation constants for a complete SCM page.
// The totals are derived from the per-area slot counts so the numbers cannot drift apart:
// 33 + 33 + 24 + 24 = 114 activities, 15 of which carry no PSP marker.
const SCM_VALIDATION = (() => {
    const EXPECTED_COUNTS = {
        left: 33,
        right: 33,
        top: 24,
        bottom: 24
    };
    const ACTIVITIES_WITHOUT_PSP = 15;
    const TOTAL_ACTIVITIES = Object.values(EXPECTED_COUNTS).reduce((sum, count) => sum + count, 0);
    return {
        TOTAL_ACTIVITIES,
        ACTIVITIES_WITH_PSP: TOTAL_ACTIVITIES - ACTIVITIES_WITHOUT_PSP,
        ACTIVITIES_WITHOUT_PSP,
        EXPECTED_COUNTS
    };
})();
840
+
841
/**
 * SCM (Strength Cluster Map) extractor.
 *
 * Renders one PDF page to a canvas and, for every pre-defined activity slot in the
 * four areas (left, right, top, bottom), samples the PSS/PSP marker colours and OCRs
 * the activity label with Tesseract.js. Slot geometry is expressed as fractions of the
 * rendered canvas, so the extractor relies on a browser-like `document` with canvas support.
 *
 * A single Tesseract worker is shared by all OCR calls. Worker creation is memoised and
 * OCR jobs are serialised, because the four areas are processed concurrently.
 */
class ScmExtractor {
    constructor() {
        // Shared Tesseract worker; created lazily by initializeTesseract().
        this.tesseractWorker = null;
        // In-flight worker creation, so concurrent callers share one worker instead of each spawning their own.
        this.tesseractWorkerPromise = null;
        // Tail of the OCR job chain; every job waits for the previous one (see runOcrJob()).
        this.ocrQueue = Promise.resolve();
    }
    /**
     * Extract SCM data from a PDF page.
     *
     * @param {object} pdfPage - PDF.js page proxy (must provide getTextContent, getViewport and render).
     * @returns {Promise<object|null>} `{ activities, metadata }`, or `null` when extraction fails
     *   (the error is logged, not thrown).
     */
    async extractScmData(pdfPage) {
        const startTime = Date.now();
        try {
            // 1. An ID line under the name shifts the whole layout down, so detect it first.
            const scmHasId = await this.detectScmHasId(pdfPage);
            const scmAreaConfigs = this.getScmAreaConfigs(scmHasId);
            console.log(`🆔 SCM ID Detection: ${scmHasId ? 'ID Present' : 'No ID'}`);
            // 2. Render page to canvas
            const canvas = await this.renderPageToCanvas(pdfPage);
            // 3. Extract from all 4 areas (colour sampling overlaps; OCR itself is serialised)
            const perArea = await Promise.all(['left', 'right', 'top', 'bottom'].map((area) => this.extractAreaActivities(canvas, area, scmAreaConfigs)));
            const allDetected = perArea.flat();
            // 4. Map to complete definitions
            const mappedActivities = this.mapDetectedToDefinitions(allDetected);
            // 5. Calculate statistics
            const unmappedActivities = allDetected
                .filter((d) => !SCM_ACTIVITIES_MAP.has(d.activityName))
                .map((d) => d.activityName);
            const totalConfidence = mappedActivities.reduce((sum, activity) => sum + activity.detectionConfidence, 0);
            const averageConfidence = mappedActivities.length > 0 ? totalConfidence / mappedActivities.length : 0;
            return {
                activities: mappedActivities,
                metadata: {
                    totalActivities: SCM_VALIDATION.TOTAL_ACTIVITIES,
                    detectedActivities: allDetected.length,
                    unmappedActivities,
                    averageConfidence,
                    extractionTime: Date.now() - startTime
                }
            };
        }
        catch (error) {
            console.error('SCM extraction failed:', error);
            return null;
        }
    }
    /**
     * Detect whether the page shows a numeric ID in its upper 15 %, using PDF.js text extraction.
     *
     * @param {object} pdfPage - PDF.js page proxy.
     * @returns {Promise<boolean>} `true` when a text item consisting of 3+ digits is found in the
     *   upper area; `false` otherwise or when text extraction fails.
     */
    async detectScmHasId(pdfPage) {
        try {
            const textContent = await pdfPage.getTextContent();
            const pageHeight = pdfPage.getViewport({ scale: 1.0 }).height;
            // transform[5] is compared against the top 15 % band (larger y = higher on the page here).
            const minY = pageHeight - pageHeight * 0.15;
            for (const item of textContent.items) {
                if (!item.transform || !(item.transform[5] > minY)) {
                    continue;
                }
                const text = item.str?.trim();
                if (text && /^\d{3,}$/.test(text)) {
                    console.log(`🔍 ID Detection: Found ID "${text}" at position (${Math.round(item.transform[4])}, ${Math.round(item.transform[5])})`);
                    return true;
                }
            }
            console.log('🔍 ID Detection: No numeric ID found in upper area');
            return false;
        }
        catch (error) {
            console.warn('PDF.js ID detection failed:', error);
            return false;
        }
    }
    /**
     * Get the area rectangles (as fractions of the canvas) for the given layout.
     *
     * @param {boolean} scmHasId - Whether the page has the extra ID line (shifts every area down).
     * @returns {object} Config per area: `startX/endX/startY/endY`, `expectedCount`, `pattern`, `orientation`.
     */
    getScmAreaConfigs(scmHasId) {
        const topStartY = scmHasId ? 0.16 : 0.148;
        const topEndY = scmHasId ? 0.258 : 0.246;
        const bottomStartY = scmHasId ? 0.66 : 0.652;
        const bottomEndY = scmHasId ? 0.758 : 0.746;
        const leftRightStartY = scmHasId ? 0.256 : 0.244;
        const leftRightEndY = scmHasId ? 0.662 : 0.65;
        return {
            left: {
                startX: 0.065,
                endX: 0.278,
                startY: leftRightStartY,
                endY: leftRightEndY,
                expectedCount: 33,
                pattern: 'NAME-PSS-PSP',
                orientation: 'horizontal'
            },
            right: {
                startX: 0.76,
                endX: 0.92,
                startY: leftRightStartY,
                endY: leftRightEndY,
                expectedCount: 33,
                pattern: 'PSS-PSP-NAME',
                orientation: 'horizontal'
            },
            top: {
                startX: 0.255,
                endX: 0.785,
                startY: topStartY,
                endY: topEndY,
                expectedCount: 24,
                pattern: 'NAME-PSS-PSP',
                orientation: 'vertical'
            },
            bottom: {
                startX: 0.255,
                endX: 0.785,
                startY: bottomStartY,
                endY: bottomEndY,
                expectedCount: 24,
                pattern: 'PSS-PSP-NAME',
                orientation: 'vertical'
            }
        };
    }
    /**
     * Render a PDF page to a canvas at 2x scale (higher resolution helps OCR).
     *
     * @param {object} pdfPage - PDF.js page proxy.
     * @returns {Promise<HTMLCanvasElement>} The rendered canvas.
     * @throws {Error} When a 2D context cannot be obtained.
     */
    async renderPageToCanvas(pdfPage) {
        const viewport = pdfPage.getViewport({ scale: 2.0 });
        const canvas = document.createElement('canvas');
        canvas.width = viewport.width;
        canvas.height = viewport.height;
        const context = canvas.getContext('2d');
        if (!context) {
            throw new Error('Could not get canvas context');
        }
        await pdfPage.render({ canvasContext: context, viewport }).promise;
        return canvas;
    }
    /**
     * Classify the colour around a point by averaging a 10x10 pixel sample.
     *
     * @param {HTMLCanvasElement} canvas - Rendered page.
     * @param {{x: number, y: number}} point - Sample centre in canvas pixels.
     * @returns {string} One of 'black', 'gray', 'white', 'yellow', 'red'
     *   ('white' when no 2D context or no pixel data is available).
     */
    detectColorAt(canvas, point) {
        const ctx = canvas.getContext('2d');
        if (!ctx) {
            return 'white';
        }
        const sampleSize = 10;
        const half = sampleSize / 2;
        const { data } = ctx.getImageData(Math.max(0, point.x - half), Math.max(0, point.y - half), sampleSize, sampleSize);
        // Average over the pixels actually returned rather than an assumed count.
        const pixelCount = data.length / 4;
        if (pixelCount === 0) {
            return 'white';
        }
        let totalR = 0;
        let totalG = 0;
        let totalB = 0;
        for (let i = 0; i < data.length; i += 4) {
            totalR += data[i];
            totalG += data[i + 1];
            totalB += data[i + 2];
        }
        return this.mapRgbToScmColor(Math.round(totalR / pixelCount), Math.round(totalG / pixelCount), Math.round(totalB / pixelCount));
    }
    /**
     * Map an RGB triple to the nearest SCM palette colour (Euclidean distance in RGB space).
     *
     * @param {number} r - Red, 0-255.
     * @param {number} g - Green, 0-255.
     * @param {number} b - Blue, 0-255.
     * @returns {string} Name of the closest palette colour; ties keep the earlier palette entry,
     *   and 'white' is returned when no distance is comparable (e.g. NaN input).
     */
    mapRgbToScmColor(r, g, b) {
        const palette = [
            ['black', 0, 0, 0],
            ['gray', 125, 125, 125],
            ['white', 255, 255, 255],
            ['yellow', 255, 255, 51],
            ['red', 255, 0, 0]
        ];
        let closestColor = 'white';
        let minDistance = Infinity;
        // A single nearest-neighbour pass: a colour inside any tolerance band is by
        // definition also the overall nearest one, so no separate tolerance pass is needed.
        for (const [colorName, pr, pg, pb] of palette) {
            const distance = Math.sqrt((r - pr) ** 2 + (g - pg) ** 2 + (b - pb) ** 2);
            if (distance < minDistance) {
                minDistance = distance;
                closestColor = colorName;
            }
        }
        return closestColor;
    }
    /**
     * Extract every activity slot of one area using the pre-defined positions.
     *
     * A slot that fails is logged and skipped; it does not abort the area.
     *
     * @param {HTMLCanvasElement} canvas - Rendered page.
     * @param {string} area - 'left', 'right', 'top' or 'bottom'.
     * @param {object} scmAreaConfigs - Result of getScmAreaConfigs().
     * @returns {Promise<object[]>} Entries `{ activityName, psp, pss, area, position, confidence }`.
     *   `activityName` is always the expected id for the slot; the OCR text only feeds `confidence`.
     */
    async extractAreaActivities(canvas, area, scmAreaConfigs) {
        const config = scmAreaConfigs[area];
        const activities = [];
        for (let i = 0; i < config.expectedCount; i++) {
            try {
                const expectedActivity = getActivityByPosition(area, i);
                if (!expectedActivity) {
                    console.warn(`No activity defined for ${area} area position ${i}`);
                    continue;
                }
                const coords = this.calculateActivityCoordinates(canvas, area, i, scmAreaConfigs);
                // Extract activity name using Tesseract OCR
                const detectedName = await this.extractActivityName(canvas, coords.name, area);
                if (expectedActivity.id !== detectedName) {
                    console.log(`🚀 ~ ${area}[${i}] Expected: ${expectedActivity.id}, Detected: "${detectedName}"`);
                }
                // Detect colors (activities without a PSP marker get psp = null)
                const pssColor = this.detectColorAt(canvas, coords.pss);
                const pspColor = expectedActivity.hasPsp ? this.detectColorAt(canvas, coords.psp) : null;
                const confidence = this.calculateDetectionConfidence(detectedName, expectedActivity.id, pssColor, pspColor);
                activities.push({
                    activityName: expectedActivity.id,
                    psp: pspColor,
                    pss: pssColor,
                    area,
                    position: i,
                    confidence
                });
            }
            catch (error) {
                console.warn(`Failed to extract activity at ${area} position ${i}:`, error);
            }
        }
        return activities;
    }
    /**
     * Compute the pixel coordinates of one slot: the label rectangle and the PSS/PSP sample points.
     *
     * Left/right areas are stacks of rows; top/bottom areas are rows of columns.
     *
     * @param {{width: number, height: number}} canvas - Rendered page (only its size is read).
     * @param {string} area - 'left', 'right', 'top' or 'bottom'.
     * @param {number} index - Zero-based slot index within the area.
     * @param {object} scmAreaConfigs - Result of getScmAreaConfigs().
     * @returns {{name: {x: number, y: number, width: number, height: number},
     *            pss: {x: number, y: number}, psp: {x: number, y: number}}}
     */
    calculateActivityCoordinates(canvas, area, index, scmAreaConfigs) {
        const config = scmAreaConfigs[area];
        const regionX = config.startX * canvas.width;
        const regionY = config.startY * canvas.height;
        const regionWidth = (config.endX - config.startX) * canvas.width;
        const regionHeight = (config.endY - config.startY) * canvas.height;
        if (area === 'left' || area === 'right') {
            const itemHeight = regionHeight / config.expectedCount;
            const rowY = regionY + index * itemHeight;
            const centerY = rowY + itemHeight / 2;
            if (area === 'left') {
                return {
                    name: {
                        x: regionX + regionWidth * 0.28,
                        y: centerY - 8,
                        width: regionWidth * 0.5,
                        height: itemHeight * 0.9
                    },
                    pss: { x: regionX + regionWidth * 0.84, y: centerY },
                    psp: { x: regionX + regionWidth * 0.94, y: centerY }
                };
            }
            return {
                pss: { x: regionX + regionWidth * 0.09, y: centerY },
                psp: { x: regionX + regionWidth * 0.22, y: centerY },
                name: {
                    x: regionX + regionWidth * 0.28,
                    y: rowY + itemHeight * 0.28,
                    width: regionWidth * 0.7,
                    height: itemHeight * 0.8
                }
            };
        }
        const itemWidth = regionWidth / config.expectedCount;
        const colX = regionX + index * itemWidth;
        const centerX = colX + itemWidth / 2;
        if (area === 'top') {
            return {
                name: {
                    x: colX + itemWidth * 0.1,
                    y: regionY + regionHeight * 0.03,
                    width: itemWidth * 0.8,
                    height: regionHeight * 0.75
                },
                pss: { x: centerX, y: regionY + regionHeight * 0.82 },
                psp: { x: centerX + itemWidth * 0.05, y: regionY + regionHeight * 0.95 }
            };
        }
        return {
            psp: { x: centerX, y: regionY + regionHeight * 0.075 },
            pss: { x: centerX, y: regionY + regionHeight * 0.2 },
            name: {
                x: colX + itemWidth * 0.1,
                y: regionY + regionHeight * 0.24,
                width: itemWidth * 0.8,
                height: regionHeight * 0.8
            }
        };
    }
    /**
     * OCR the activity label inside a canvas region.
     *
     * Labels in the top/bottom areas are vertical, so their crop is rotated by 90 degrees first.
     *
     * @param {HTMLCanvasElement} canvas - Rendered page.
     * @param {{x: number, y: number, width: number, height: number}} region - Label rectangle.
     * @param {string} area - 'left', 'right', 'top' or 'bottom'.
     * @returns {Promise<string>} The matched activity id, else the normalised OCR text,
     *   else `''` when OCR fails (the failure is logged).
     */
    async extractActivityName(canvas, region, area) {
        try {
            const worker = await this.initializeTesseract();
            const isVertical = area === 'top' || area === 'bottom';
            const tempCanvas = document.createElement('canvas');
            tempCanvas.width = region.width;
            tempCanvas.height = region.height;
            const tempCtx = tempCanvas.getContext('2d');
            if (!tempCtx) {
                return '';
            }
            tempCtx.drawImage(canvas, region.x, region.y, region.width, region.height, 0, 0, region.width, region.height);
            // Handle vertical text rotation for top/bottom areas
            if (isVertical) {
                const rotatedCanvas = document.createElement('canvas');
                const rotatedCtx = rotatedCanvas.getContext('2d');
                if (!rotatedCtx) {
                    return '';
                }
                rotatedCanvas.width = region.height;
                rotatedCanvas.height = region.width;
                rotatedCtx.translate(region.height / 2, region.width / 2);
                rotatedCtx.rotate(Math.PI / 2);
                rotatedCtx.drawImage(tempCanvas, -region.width / 2, -region.height / 2, region.width, region.height);
                tempCanvas.width = rotatedCanvas.width;
                tempCanvas.height = rotatedCanvas.height;
                tempCtx.clearRect(0, 0, tempCanvas.width, tempCanvas.height);
                tempCtx.drawImage(rotatedCanvas, 0, 0);
            }
            // Enhance image for OCR
            tempCtx.filter = 'contrast(150%) brightness(100%)';
            tempCtx.drawImage(tempCanvas, 0, 0);
            // Configure OCR
            const Tesseract = await import('tesseract.js');
            const parameters = {
                tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ',
                tessedit_pageseg_mode: isVertical
                    ? Tesseract.PSM.SINGLE_LINE
                    : Tesseract.PSM.SINGLE_BLOCK,
                preserve_interword_spaces: '1'
            };
            // setParameters + recognize must run as one unit: the worker is shared by the four
            // concurrently processed areas, which need different page-segmentation modes.
            const { data: { text } } = await this.runOcrJob(async () => {
                await worker.setParameters(parameters);
                return worker.recognize(tempCanvas);
            });
            const cleanedText = this.normalizeActivityName(text);
            const bestMatch = this.findBestActivityMatch(cleanedText);
            return bestMatch || cleanedText;
        }
        catch (error) {
            console.warn('OCR extraction failed:', error);
            return '';
        }
    }
    /**
     * Run an OCR job after every previously queued job has settled.
     *
     * @param {function(): Promise<*>} job - Async job using the shared worker.
     * @returns {Promise<*>} Settles with the job's own result or rejection.
     */
    runOcrJob(job) {
        const result = this.ocrQueue.then(job);
        // The queue tail must never reject, otherwise one failed job would block all later ones.
        this.ocrQueue = result.then(() => undefined, () => undefined);
        return result;
    }
    /**
     * Get the shared Tesseract worker, creating it on first use.
     *
     * Concurrent callers share the same in-flight creation, so exactly one worker exists
     * and cleanup() can terminate it.
     *
     * @returns {Promise<object>} The Tesseract worker.
     */
    async initializeTesseract() {
        if (this.tesseractWorker) {
            return this.tesseractWorker;
        }
        if (!this.tesseractWorkerPromise) {
            const creation = (async () => {
                const Tesseract = await import('tesseract.js');
                const worker = await Tesseract.createWorker('eng');
                try {
                    await worker.setParameters({
                        tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ',
                        tessedit_pageseg_mode: Tesseract.PSM.SINGLE_BLOCK,
                        preserve_interword_spaces: '1'
                    });
                }
                catch (error) {
                    // Do not leave a half-configured worker running.
                    await worker.terminate();
                    throw error;
                }
                this.tesseractWorker = worker;
                return worker;
            })();
            this.tesseractWorkerPromise = creation;
            // Allow a later call to retry after a failed creation.
            creation.catch(() => {
                if (this.tesseractWorkerPromise === creation) {
                    this.tesseractWorkerPromise = null;
                }
            });
        }
        return this.tesseractWorkerPromise;
    }
    /**
     * Normalise text for matching: upper-case, keep only A-Z and whitespace,
     * collapse whitespace runs to one space and trim both ends.
     *
     * @param {string|null|undefined} text - Raw text (nullish is treated as empty).
     * @returns {string} Normalised text, possibly empty.
     */
    normalizeActivityName(text) {
        return String(text ?? '')
            .toUpperCase()
            .replace(/[^A-Z\s]/g, '')
            .replace(/\s+/g, ' ')
            // Trim last: stripping non-letters can expose new leading/trailing spaces.
            .trim();
    }
    /**
     * Find the pre-defined activity that best matches an OCR result.
     *
     * An exact (normalised) match wins. Otherwise, for inputs of at least three letters,
     * the substring match whose length is closest to the input is chosen, so "ACTIN"
     * resolves to ACTING rather than to whichever containing id happens to be defined first.
     *
     * @param {string} detectedName - OCR text.
     * @returns {string|null} Matching activity id, or `null` when nothing matches.
     */
    findBestActivityMatch(detectedName) {
        const normalized = this.normalizeActivityName(detectedName);
        if (!normalized) {
            return null;
        }
        // Shorter fragments match almost every id and carry no information.
        const minPartialLength = 3;
        let bestPartial = null;
        let bestLengthGap = Infinity;
        for (const activityId of SCM_ACTIVITIES_MAP.keys()) {
            const candidate = this.normalizeActivityName(activityId);
            if (candidate === normalized) {
                return activityId;
            }
            if (normalized.length < minPartialLength) {
                continue;
            }
            if (candidate.includes(normalized) || normalized.includes(candidate)) {
                const lengthGap = Math.abs(candidate.length - normalized.length);
                if (lengthGap < bestLengthGap) {
                    bestLengthGap = lengthGap;
                    bestPartial = activityId;
                }
            }
        }
        return bestPartial;
    }
    /**
     * Score a slot detection between 0.3 and 1.0.
     *
     * Base 0.3; text adds 0.5 (exact), 0.3 (one contains the other), 0.2 (shares a word of 3+
     * letters) or 0.1 (unrelated text longer than 3 characters); each non-white marker adds 0.15.
     *
     * @param {string} detected - OCR result for the slot.
     * @param {string} matched - Expected activity id for the slot.
     * @param {string} pss - Detected PSS colour.
     * @param {string|null} psp - Detected PSP colour, or null when the activity has none.
     * @returns {number} Confidence, capped at 1.0.
     */
    calculateDetectionConfidence(detected, matched, pss, psp) {
        let confidence = 0.3; // Base confidence
        const detectedNorm = this.normalizeActivityName(detected);
        const matchedNorm = this.normalizeActivityName(matched);
        if (detectedNorm === matchedNorm) {
            confidence += 0.5;
        }
        else if (detectedNorm.length > 0) {
            if (detectedNorm.includes(matchedNorm) || matchedNorm.includes(detectedNorm)) {
                confidence += 0.3;
            }
            else if (detectedNorm.length > 3) {
                const expectedWords = matchedNorm.split(' ').filter((w) => w.length > 2);
                const sharesWord = detectedNorm
                    .split(' ')
                    .some((w) => w.length > 2 && expectedWords.includes(w));
                confidence += sharesWord ? 0.2 : 0.1;
            }
        }
        // Color detection confidence
        if (pss !== 'white') {
            confidence += 0.15;
        }
        if (psp && psp !== 'white') {
            confidence += 0.15;
        }
        return Math.min(1.0, confidence);
    }
    /**
     * Produce one entry per defined activity, filled from the detections.
     *
     * Activities without a detection get white markers and confidence 0; activities defined
     * without PSP always get `psp: null`.
     *
     * @param {object[]} detected - Entries produced by extractAreaActivities().
     * @returns {object[]} `{ id, typology, cluster, psp, pss, detectionConfidence }` in definition order.
     */
    mapDetectedToDefinitions(detected) {
        // Index detections once; the first detection for a name wins.
        const firstByName = new Map();
        for (const entry of detected) {
            if (!firstByName.has(entry.activityName)) {
                firstByName.set(entry.activityName, entry);
            }
        }
        const mapped = [];
        for (const [activityId, definition] of SCM_ACTIVITIES_MAP) {
            const detectedMatch = firstByName.get(activityId);
            if (detectedMatch) {
                mapped.push({
                    id: definition.id,
                    typology: definition.typology,
                    cluster: definition.cluster,
                    psp: definition.hasPsp ? detectedMatch.psp : null,
                    pss: detectedMatch.pss,
                    detectionConfidence: detectedMatch.confidence
                });
            }
            else {
                mapped.push({
                    id: definition.id,
                    typology: definition.typology,
                    cluster: definition.cluster,
                    psp: definition.hasPsp ? 'white' : null,
                    pss: 'white',
                    detectionConfidence: 0
                });
            }
        }
        return mapped;
    }
    /**
     * Terminate the shared Tesseract worker, if any. Safe to call repeatedly.
     *
     * @returns {Promise<void>}
     */
    async cleanup() {
        const pending = this.tesseractWorkerPromise;
        this.tesseractWorkerPromise = null;
        if (pending) {
            try {
                // Let an in-flight creation finish so the worker it produces is terminated below.
                await pending;
            }
            catch {
                // Creation failed: nothing to terminate, and the failure was already
                // delivered to the caller that requested the worker.
            }
        }
        const worker = this.tesseractWorker;
        if (worker) {
            this.tesseractWorker = null;
            await worker.terminate();
        }
    }
}
1340
+
1341
+ /**
1342
+ * TMA data type definitions for the extractor package
1343
+ * Based on the original useExtractTMA.ts interfaces
1344
+ */
1345
// Defaults merged under any user-supplied config by TmaExtractor.
const DEFAULT_CONFIG = {
    // SCM extraction is on by default (Phase 2 requirement).
    includeSCM: true,
    workerSrc: 'auto',
    tesseractWorkerSrc: 'auto',
    debug: false,
    // 30 seconds, in milliseconds.
    timeoutMs: 30 * 1000,
};
1352
+
1353
/**
 * Orchestrates a TMA extraction: file validation, page-text extraction,
 * talent parsing, optional SCM extraction and the final result transform.
 */
class TmaExtractor {
    /**
     * @param {object} [config] - Overrides merged on top of DEFAULT_CONFIG.
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_CONFIG, ...config };
        this.pdfProcessor = new PdfProcessor(this.config);
        this.scmExtractor = new ScmExtractor();
    }
    /**
     * Reject files the PDF processor considers invalid.
     *
     * @param {*} file - Candidate input file.
     * @throws An "INVALID_FILE_TYPE" error built by createError when validation fails.
     */
    validateFile(file) {
        const { isValid, error } = this.pdfProcessor.validatePdfFile(file);
        if (isValid) {
            return;
        }
        throw createError("INVALID_FILE_TYPE", error || "Invalid file");
    }
    /**
     * Run every text-based extractor over the page texts.
     *
     * @param {object} pageTexts - Texts keyed by `talentOrder`, `strength`, `typologyAndBranding`.
     * @param {number} pageCount - Page count of the source PDF.
     * @returns {object} `{ name, talents, strength, typology, personalbranding }`.
     * @throws A "TALENT_EXTRACTION_ERROR" wrapping any failure of the individual extractors.
     */
    extractTalentData(pageTexts, pageCount) {
        try {
            const { talentOrder, strength, typologyAndBranding } = pageTexts;
            return {
                name: extractName(talentOrder, pageCount),
                talents: extractTalentOrder(talentOrder, false, pageCount),
                strength: extractStrength(strength, pageCount),
                typology: extractTypology(typologyAndBranding),
                personalbranding: extractPersonalBranding(typologyAndBranding, pageCount),
            };
        }
        catch (error) {
            throw wrapError(error, "TALENT_EXTRACTION_ERROR", "Failed to extract talent data");
        }
    }
    /**
     * Process a single PDF file and extract its TMA data.
     *
     * @param {*} file - PDF input accepted by the PDF processor.
     * @returns {Promise<*>} Result of DataTransformer.transformSinglePersonResult.
     * @throws TmaExtractionError as raised, or any other failure wrapped as "PDF_PROCESSING_ERROR".
     */
    async extractFromPdf(file) {
        this.validateFile(file);
        try {
            // The page count decides whether the format is supported and which pages to read.
            const pageCount = await this.pdfProcessor.getPageCount(file);
            const formatValidation = validateAndGetFormat(pageCount);
            if (!formatValidation.isValid) {
                throw createError("UNSUPPORTED_FORMAT", formatValidation.error);
            }
            const pageTexts = await this.pdfProcessor.extractPageTexts(file, formatValidation.pages);
            const talentData = this.extractTalentData(pageTexts, pageCount);
            if (this.config.includeSCM) {
                await this.attachScmData(file, pageCount, talentData);
            }
            // Derived talent lists are taken from the talent list as extracted.
            const temaOf = (talent) => talent.tema;
            const { talents } = talentData;
            const person = {
                ...talentData,
                top14Talents: talents.slice(0, 14).map(temaOf),
                top7Talents: talents.slice(0, 7).map(temaOf),
                low14Talents: talents.slice(-14).map(temaOf),
            };
            return DataTransformer.transformSinglePersonResult({ person });
        }
        catch (error) {
            throw error instanceof TmaExtractionError
                ? error
                : wrapError(error, "PDF_PROCESSING_ERROR", "Failed to process PDF file");
        }
    }
    /**
     * Best-effort SCM step: sets `talentData.scm` when extraction succeeds.
     * Any failure is only logged in debug mode so the main extraction still completes.
     *
     * @param {*} file - PDF input.
     * @param {number} pageCount - Page count of the PDF (selects the SCM page).
     * @param {object} talentData - Mutated in place on success.
     * @returns {Promise<void>}
     */
    async attachScmData(file, pageCount, talentData) {
        try {
            this.logDebug("Starting SCM extraction...");
            const scmPageNumber = this.pdfProcessor.getScmPageNumber(pageCount);
            this.logDebug(`SCM page number for ${pageCount}-page TMA: ${scmPageNumber}`);
            const scmPage = await this.pdfProcessor.getPage(file, scmPageNumber);
            if (!scmPage) {
                this.logDebug("No SCM page found, continuing without SCM data");
                return;
            }
            const scmData = await this.scmExtractor.extractScmData(scmPage);
            if (!scmData) {
                this.logDebug("SCM extraction returned null, continuing without SCM data");
                return;
            }
            talentData.scm = scmData;
            this.logDebug(`SCM extraction successful: ${scmData.activities.length} activities extracted`);
        }
        catch (error) {
            this.logDebug("SCM extraction failed, continuing without SCM data: " + error);
        }
    }
    /**
     * Log a message to the console, only when `config.debug` is truthy.
     *
     * @param {string} message - Text to log.
     */
    logDebug(message) {
        if (!this.config.debug) {
            return;
        }
        console.log(`[TMA-Extractor] ${message}`);
    }
}
1465
/**
 * Main package export: one-shot extraction.
 *
 * Creates a throwaway TmaExtractor, processes the file and then releases the OCR worker
 * owned by the extractor's SCM component. Without that release every call would leave a
 * Tesseract worker running, because the extractor instance is discarded here.
 *
 * @param {*} file - PDF input accepted by TmaExtractor#extractFromPdf.
 * @param {object} [config] - Overrides merged on top of DEFAULT_CONFIG.
 * @returns {Promise<*>} The extraction result.
 * @throws Whatever TmaExtractor#extractFromPdf throws; cleanup failures are only logged.
 */
async function tmaExtractor(file, config = {}) {
    const extractor = new TmaExtractor(config);
    try {
        return await extractor.extractFromPdf(file);
    }
    finally {
        try {
            await extractor.scmExtractor.cleanup();
        }
        catch (cleanupError) {
            // A failed cleanup must not replace the extraction result or its error.
            console.warn('Failed to release OCR worker:', cleanupError);
        }
    }
}
1473
+
1474
+ export { DEFAULT_CONFIG, TmaExtractionError, TmaExtractor, tmaExtractor as default };
1475
+ //# sourceMappingURL=index.js.map