tm-extractor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -0
- package/dist/constants/scm-activities.d.ts +35 -0
- package/dist/constants/scm-activities.d.ts.map +1 -0
- package/dist/constants/tma-formats.d.ts +77 -0
- package/dist/constants/tma-formats.d.ts.map +1 -0
- package/dist/constants/typology-definitions.d.ts +519 -0
- package/dist/constants/typology-definitions.d.ts.map +1 -0
- package/dist/core/data-transformer.d.ts +44 -0
- package/dist/core/data-transformer.d.ts.map +1 -0
- package/dist/core/pdf-processor.d.ts +48 -0
- package/dist/core/pdf-processor.d.ts.map +1 -0
- package/dist/extractors/branding-extractor.d.ts +21 -0
- package/dist/extractors/branding-extractor.d.ts.map +1 -0
- package/dist/extractors/scm-extractor.d.ts +96 -0
- package/dist/extractors/scm-extractor.d.ts.map +1 -0
- package/dist/extractors/strength-extractor.d.ts +21 -0
- package/dist/extractors/strength-extractor.d.ts.map +1 -0
- package/dist/extractors/talent-extractor.d.ts +25 -0
- package/dist/extractors/talent-extractor.d.ts.map +1 -0
- package/dist/extractors/typology-extractor.d.ts +25 -0
- package/dist/extractors/typology-extractor.d.ts.map +1 -0
- package/dist/index.cjs +1502 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.ts +37 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1475 -0
- package/dist/index.js.map +1 -0
- package/dist/types/tma-types.d.ts +133 -0
- package/dist/types/tma-types.d.ts.map +1 -0
- package/dist/utils/error-handling.d.ts +46 -0
- package/dist/utils/error-handling.d.ts.map +1 -0
- package/dist/utils/format-detection.d.ts +48 -0
- package/dist/utils/format-detection.d.ts.map +1 -0
- package/package.json +67 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1475 @@
|
|
|
1
|
+
import * as pdfjs from 'pdfjs-dist';
|
|
2
|
+
|
|
3
|
+
/**
 * Page layout of every supported TMA report, keyed by the PDF's total page count.
 *
 * Each entry holds:
 *  - pages:   three 1-based page numbers; PdfProcessor.extractPageTexts reads them,
 *             in order, as the talent-order, strength and typology/branding pages
 *  - scmPage: the page number PdfProcessor.getScmPageNumber returns for this layout
 *  - format:  the label detectTmaFormat returns for this layout
 */
const TMA_FORMATS = {
    49: { pages: [9, 11, 13], scmPage: 12, format: "49-page" },
    54: { pages: [11, 13, 15], scmPage: 14, format: "54-page" },
    46: { pages: [12, 14, 16], scmPage: 15, format: "46-page" },
    // The 6-page and 22-page layouts use the same page positions.
    6: { pages: [2, 4, 6], scmPage: 5, format: "6-page" },
    22: { pages: [2, 4, 6], scmPage: 5, format: "22-page" },
};
|
|
33
|
+
/**
 * Global regular expressions that pull (talent name, number) pairs out of the
 * talent-order page text.
 *
 * None of the patterns restricts the numeric range; the talent extractors drop
 * out-of-range numbers afterwards through isValidTalentNumber.
 */
const TALENT_REGEX_PATTERNS = {
    // 49-page layout: NAME (optionally ending in "."), whitespace, then "<number>.".
    // Group 1 = name, group 2 = number.
    format49: /([A-Z-]+\.?)\s+(\d+)\./g,
    // Other layouts: "<number> NAME". Group 1 = number, group 2 = name.
    default: /(\d+)\s+([A-Z-]+)/g,
    // Variant chosen when the extractor is called with isPdfParse = true:
    // number and name are adjacent and the name may not contain a hyphen.
    pdfParse: /(\d+)([A-Z]+)/g,
};
|
|
44
|
+
/**
 * Global regular expressions used by the strength extractors.
 */
const STRENGTH_REGEX_PATTERNS = {
    // 49-page layout: "NAME <number>" followed by whitespace or the end of the text.
    // Group 1 = name, group 2 = number.
    format49: /([A-Z]+)\s+(\d+)(?:\s+|$)/g,
    // Other layouts: "<number>. name"; only the name is captured (group 1).
    default: /\d+\.\s*([\w-]+)/g,
};
|
|
51
|
+
/**
 * Global regular expressions used by the personal-branding extractors.
 */
const PERSONAL_BRANDING_REGEX_PATTERNS = {
    // 49-page layout: a label ("QUALITY CONTROLLER" or any run of 3+ capitals),
    // an optional separator hyphen, then a possibly fractional score.
    // Group 1 = label, group 2 = score.
    // The separator is matched before the score, so in "NAME -5" the "-" is
    // consumed as the separator and group 2 is "5".
    // NOTE(review): confirm whether negative scores written that way should survive.
    format49: /(QUALITY\s+CONTROLLER|[A-Z]{3,})\s*[‐-]?\s*(-?\d+(?:\.\d+)?)/g,
    // Other layouts: one or more capitalised words directly followed by the word
    // "Anda". Group 1 = the capitalised words.
    default: /([A-Z]+(?:\s+[A-Z]+)*)\s+Anda/g,
};
|
|
58
|
+
/**
 * Inclusive lower and upper bound that isValidTalentNumber checks against.
 */
const TALENT_RANGE = {
    min: 1,
    max: 34,
};
/**
 * Upper bound on the number of entries the personal-branding extractors return.
 */
const PERSONAL_BRANDING_LIMIT = 5;
|
|
66
|
+
|
|
67
|
+
/**
 * Maps a PDF page count to its TMA format label.
 *
 * @param {number} pageCount - Total number of pages in the PDF.
 * @returns {string} The `format` label of the matching TMA_FORMATS entry, or
 *   'unknown' when pageCount is not a positive number or has no entry.
 */
function detectTmaFormat(pageCount) {
    // NaN fails the "> 0" comparison, so it is rejected here as well.
    const isPositiveNumber = typeof pageCount === 'number' && pageCount > 0;
    if (!isPositiveNumber) {
        return 'unknown';
    }
    const entry = TMA_FORMATS[String(pageCount)];
    return (entry && entry.format) || 'unknown';
}
|
|
78
|
+
/**
 * Lists every page count that has an entry in TMA_FORMATS.
 *
 * @returns {number[]} The TMA_FORMATS keys parsed as base-10 integers, in the
 *   object's key enumeration order (ascending for integer-like keys).
 */
function getSupportedPageCounts() {
    // Explicit radix: the keys are decimal page counts.
    return Object.keys(TMA_FORMATS).map((key) => Number.parseInt(key, 10));
}
|
|
84
|
+
/**
 * Looks up the TMA_FORMATS entry for a page count.
 *
 * @param {number|string} pageCount - Page count; it is converted to a string key.
 * @returns {object|null} The matching TMA_FORMATS entry, or null when pageCount
 *   is null/undefined or has no entry of its own in the table.
 */
function getFormatInfo(pageCount) {
    // A missing page count is "no format", not a TypeError from calling toString on it.
    if (pageCount === null || pageCount === undefined) {
        return null;
    }
    const formatKey = String(pageCount);
    // Own-property check so keys such as "constructor" do not resolve to
    // members inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(TMA_FORMATS, formatKey)) {
        return null;
    }
    return TMA_FORMATS[formatKey] || null;
}
|
|
91
|
+
/**
 * Checks whether a page count belongs to a supported TMA format.
 *
 * @param {number} pageCount - Total number of pages in the PDF.
 * @returns {object} On success `{ isValid: true, format, pages, scmPage }` taken
 *   from the entry getFormatInfo returns; otherwise
 *   `{ isValid: false, format: 'unknown', error }` with a descriptive message.
 */
function validateAndGetFormat(pageCount) {
    const reject = (error) => ({ isValid: false, format: 'unknown', error });
    // A page count of exactly 5 gets its own dedicated rejection message.
    if (pageCount === 5) {
        return reject('Page count 5 is explicitly not supported');
    }
    const info = getFormatInfo(pageCount);
    if (info) {
        const { format, pages, scmPage } = info;
        return { isValid: true, format, pages, scmPage };
    }
    return reject(`Unsupported page count: ${pageCount}. Supported formats: ${getSupportedPageCounts().join(', ')}`);
}
|
|
117
|
+
|
|
118
|
+
/**
 * Error type thrown by the TMA extraction code.
 *
 * Instances carry, in addition to the message:
 *  - code:          the error code passed to the constructor
 *  - originalError: the underlying error or value passed to the constructor, if any
 *  - name:          always 'TmaExtractionError'
 */
class TmaExtractionError extends Error {
    /**
     * @param {string} message - Human-readable description.
     * @param {string} code - Error code stored on `code`.
     * @param {*} [originalError] - Underlying cause stored on `originalError`.
     */
    constructor(message, code, originalError) {
        super(message);
        Object.assign(this, { code, originalError, name: 'TmaExtractionError' });
        // captureStackTrace exists only on V8; it makes the trace start at the throw site.
        Error.captureStackTrace?.(this, TmaExtractionError);
    }
}
|
|
133
|
+
/**
 * Error codes used throughout the package. Every key maps to an identical
 * string value.
 */
const ERROR_CODES_BASE = Object.fromEntries([
    // File validation errors
    'INVALID_FILE_TYPE',
    'NO_FILES',
    'TOO_MANY_FILES',
    'FILE_TOO_LARGE',
    // PDF processing errors
    'PDF_PROCESSING_ERROR',
    'PAGE_TEXT_EXTRACTION_ERROR',
    'INVALID_PAGE_NUMBERS',
    'INVALID_PAGE_NUMBER',
    // Format errors
    'UNSUPPORTED_FORMAT',
    // Extraction errors
    'TALENT_EXTRACTION_ERROR',
    'STRENGTH_EXTRACTION_ERROR',
    'TYPOLOGY_EXTRACTION_ERROR',
    'BRANDING_EXTRACTION_ERROR',
    // Validation errors
    'VALIDATION_ERROR',
    'NO_VALID_RESULTS',
    'INSUFFICIENT_VALID_FILES',
    // Configuration errors
    'WORKER_CONFIGURATION_ERROR',
    'TIMEOUT_ERROR',
    // Server side error
    'SERVER_SIDE_ERROR',
].map((code) => [code, code]));
/**
 * Frozen view of the error codes. Object.freeze freezes in place, so this is
 * the same object as ERROR_CODES_BASE.
 */
const ERROR_CODES = Object.freeze(ERROR_CODES_BASE);
|
|
165
|
+
/**
 * Creates a TmaExtractionError for a given error code.
 *
 * @param {string} code - Normally a key of ERROR_CODES.
 * @param {string} message - Human-readable description.
 * @param {*} [originalError] - Underlying cause, stored on the error.
 * @returns {TmaExtractionError} Error whose `code` is the ERROR_CODES value for
 *   `code`, or `code` itself when it is not a key of ERROR_CODES.
 */
function createError(code, message, originalError) {
    // Own-property lookup: an unknown code keeps its value instead of becoming
    // undefined, and inherited names such as "toString" are not treated as codes.
    const isKnownCode = Object.prototype.hasOwnProperty.call(ERROR_CODES, code);
    const resolvedCode = isKnownCode ? ERROR_CODES[code] : code;
    return new TmaExtractionError(message, resolvedCode, originalError);
}
|
|
171
|
+
/**
 * Type check for errors produced by this package.
 *
 * @param {*} error - Any value.
 * @returns {boolean} True when `error` is an instance of TmaExtractionError.
 */
function isTmaExtractionError(error) {
    const isPackageError = error instanceof TmaExtractionError;
    return isPackageError;
}
|
|
177
|
+
/**
 * Normalises any thrown value into a TmaExtractionError.
 *
 * @param {*} error - The caught value.
 * @param {string} [defaultCode='PDF_PROCESSING_ERROR'] - Code used when `error`
 *   is not already a TmaExtractionError.
 * @param {string} [defaultMessage='An unexpected error occurred'] - Message used
 *   when `error` is not an Error instance.
 * @returns {TmaExtractionError} `error` itself when it already is one; otherwise
 *   a new error built with createError that keeps `error` as its originalError.
 */
function wrapError(error, defaultCode = 'PDF_PROCESSING_ERROR', defaultMessage = 'An unexpected error occurred') {
    if (isTmaExtractionError(error)) {
        return error;
    }
    if (error instanceof Error) {
        return createError(defaultCode, error.message, error);
    }
    // Non-Error throwables (strings, plain objects, ...) have no usable message,
    // but the value is still passed on so the original cause is not lost.
    return createError(defaultCode, defaultMessage, error);
}
|
|
189
|
+
|
|
190
|
+
/**
 * Destroys a PDF.js loading task, ignoring any failure of the cleanup itself.
 * Accepts undefined so it can be called even when no task was ever created.
 */
async function destroyLoadingTaskQuietly(loadingTask) {
    if (!loadingTask) {
        return;
    }
    try {
        await loadingTask.destroy();
    }
    catch {
        // A failed cleanup must not replace the result or error of the operation it follows.
    }
}
/**
 * PDF processing class that wraps PDF.js functionality for TMA extraction.
 *
 * The `file` arguments only need an async `arrayBuffer()` method (plus `type`
 * and `size` for validatePdfFile).
 */
class PdfProcessor {
    /**
     * @param {{ workerSrc?: string }} [config] - Optional PDF.js worker source;
     *   "auto" or a missing value selects the built-in behaviour.
     */
    constructor(config) {
        this.workerSrc = config?.workerSrc;
        this.configurePdfWorker();
    }
    /**
     * Configures the PDF.js worker source: an explicit workerSrc wins; otherwise,
     * when a `window` global exists, the bundled worker URL is used. Outside a
     * browser nothing is changed.
     */
    configurePdfWorker() {
        if (this.workerSrc && this.workerSrc !== "auto") {
            pdfjs.GlobalWorkerOptions.workerSrc = this.workerSrc;
        }
        else if (typeof window !== "undefined") {
            // Browser environment - use default worker
            pdfjs.GlobalWorkerOptions.workerSrc = new URL("pdfjs-dist/build/pdf.worker.min.mjs", import.meta.url).toString();
        }
    }
    /**
     * Gets the total number of pages in a PDF file.
     *
     * @returns {Promise<number>} The document's page count.
     * @throws {TmaExtractionError} PDF_PROCESSING_ERROR when the file cannot be read or parsed.
     */
    async getPageCount(file) {
        let loadingTask;
        try {
            const arrayBuffer = await file.arrayBuffer();
            loadingTask = pdfjs.getDocument(arrayBuffer);
            const pdfDoc = await loadingTask.promise;
            return pdfDoc.numPages;
        }
        catch (error) {
            throw new TmaExtractionError("Failed to get PDF page count", "PDF_PROCESSING_ERROR", error);
        }
        finally {
            // Only the page count is needed, so the document is released right away.
            await destroyLoadingTaskQuietly(loadingTask);
        }
    }
    /**
     * Detects the TMA format label for a page count (delegates to detectTmaFormat).
     */
    detectFormat(pageCount) {
        return detectTmaFormat(pageCount);
    }
    /**
     * Gets the three page numbers to extract for a page count.
     *
     * @throws {TmaExtractionError} UNSUPPORTED_FORMAT when TMA_FORMATS has no entry
     *   for pageCount (including null/undefined).
     */
    getPageNumbers(pageCount) {
        const formatKey = String(pageCount);
        // Own-property check keeps inherited names like "constructor" from passing as formats.
        const format = Object.prototype.hasOwnProperty.call(TMA_FORMATS, formatKey) ? TMA_FORMATS[formatKey] : undefined;
        if (!format) {
            throw new TmaExtractionError(`Unsupported TMA format: ${pageCount} pages`, "UNSUPPORTED_FORMAT");
        }
        return format.pages;
    }
    /**
     * Gets the SCM page number for a page count.
     *
     * @throws {TmaExtractionError} UNSUPPORTED_FORMAT when TMA_FORMATS has no entry
     *   for pageCount (including null/undefined).
     */
    getScmPageNumber(pageCount) {
        const formatKey = String(pageCount);
        const format = Object.prototype.hasOwnProperty.call(TMA_FORMATS, formatKey) ? TMA_FORMATS[formatKey] : undefined;
        if (!format) {
            throw new TmaExtractionError(`Unsupported TMA format for SCM: ${pageCount} pages`, "UNSUPPORTED_FORMAT");
        }
        return format.scmPage;
    }
    /**
     * Extracts the text of a single PDF page: the `str` of every text item,
     * joined with single spaces.
     *
     * @throws {TmaExtractionError} PAGE_TEXT_EXTRACTION_ERROR on any failure.
     */
    async getPageText(page) {
        try {
            const content = await page.getTextContent();
            return content.items.map((item) => item.str).join(" ");
        }
        catch (error) {
            throw new TmaExtractionError("Failed to extract text from PDF page", "PAGE_TEXT_EXTRACTION_ERROR", error);
        }
    }
    /**
     * Extracts the text of the talent-order, strength and typology/branding pages.
     *
     * @param {object} file - Source PDF.
     * @param {number[]} pageNumbers - 1-based page numbers; the first three are
     *   used, in that order.
     * @returns {Promise<{talentOrder: string, strength: string, typologyAndBranding: string}>}
     * @throws {TmaExtractionError} INVALID_PAGE_NUMBERS for numbers that are not
     *   integers within 1..numPages, PAGE_TEXT_EXTRACTION_ERROR when a page cannot
     *   be read, PDF_PROCESSING_ERROR for any other failure.
     */
    async extractPageTexts(file, pageNumbers) {
        let loadingTask;
        try {
            const arrayBuffer = await file.arrayBuffer();
            loadingTask = pdfjs.getDocument(arrayBuffer);
            const pdfDoc = await loadingTask.promise;
            // Validate page numbers (NaN and fractions are rejected too).
            const maxPage = pdfDoc.numPages;
            const invalidPages = pageNumbers.filter((num) => !Number.isInteger(num) || num > maxPage || num < 1);
            if (invalidPages.length > 0) {
                throw new TmaExtractionError(`Invalid page numbers: ${invalidPages.join(", ")} (PDF has ${maxPage} pages)`, "INVALID_PAGE_NUMBERS");
            }
            // Get specific pages
            const pages = await Promise.all(pageNumbers.map((pageNum) => pdfDoc.getPage(pageNum)));
            const [talentPage, strengthPage, typologyPage] = pages;
            // The three pages are independent, so their text is extracted concurrently.
            const [talentOrder, strength, typologyAndBranding] = await Promise.all([
                this.getPageText(talentPage),
                this.getPageText(strengthPage),
                this.getPageText(typologyPage),
            ]);
            return { talentOrder, strength, typologyAndBranding };
        }
        catch (error) {
            if (error instanceof TmaExtractionError) {
                throw error;
            }
            throw new TmaExtractionError("Failed to extract page texts from PDF", "PDF_PROCESSING_ERROR", error);
        }
        finally {
            // Only plain strings leave this method, so the document can be released.
            await destroyLoadingTaskQuietly(loadingTask);
        }
    }
    /**
     * Gets a specific PDF page for external processing (e.g., SCM).
     *
     * On success the document stays open, because the returned page belongs to it.
     * NOTE(review): nothing in this class releases that document later — confirm
     * that callers clean up once they are done with the page.
     *
     * @throws {TmaExtractionError} INVALID_PAGE_NUMBER when pageNumber is not an
     *   integer within 1..numPages, PDF_PROCESSING_ERROR for any other failure.
     */
    async getPage(file, pageNumber) {
        let loadingTask;
        try {
            const arrayBuffer = await file.arrayBuffer();
            loadingTask = pdfjs.getDocument(arrayBuffer);
            const pdfDoc = await loadingTask.promise;
            if (!Number.isInteger(pageNumber) || pageNumber > pdfDoc.numPages || pageNumber < 1) {
                throw new TmaExtractionError(`Invalid page number: ${pageNumber} (PDF has ${pdfDoc.numPages} pages)`, "INVALID_PAGE_NUMBER");
            }
            return await pdfDoc.getPage(pageNumber);
        }
        catch (error) {
            // No page is handed out on failure, so nothing needs the document any more.
            await destroyLoadingTaskQuietly(loadingTask);
            if (error instanceof TmaExtractionError) {
                throw error;
            }
            throw new TmaExtractionError(`Failed to get PDF page ${pageNumber}`, "PDF_PROCESSING_ERROR", error);
        }
    }
    /**
     * Validates that a file looks like a usable PDF: present, MIME type
     * "application/pdf", non-empty and at most 50MB.
     *
     * @returns {{isValid: boolean, error?: string}}
     */
    validatePdfFile(file) {
        if (!file) {
            return { isValid: false, error: "No file provided" };
        }
        if (file.type !== "application/pdf") {
            return { isValid: false, error: "File must be a PDF" };
        }
        if (file.size === 0) {
            return { isValid: false, error: "File is empty" };
        }
        // Basic size check (max 50MB)
        if (file.size > 50 * 1024 * 1024) {
            return { isValid: false, error: "File size exceeds 50MB limit" };
        }
        return { isValid: true };
    }
}
|
|
334
|
+
|
|
335
|
+
/**
 * Data transformation helpers that implement the filteredExtractedData logic:
 * they strip unnecessary fields and metadata from extracted TMA data.
 *
 * All methods are static and refer to the class by name rather than `this`, so
 * they keep working when passed around as plain callbacks.
 */
class DataTransformer {
    /**
     * Reduces every talent to `{ number, tema }`, dropping `sign` and any other field.
     *
     * @param {Array<object>} talents
     * @returns {Array<{number: *, tema: *}>}
     */
    static removeTalentSign(talents) {
        return talents.map(({ number, tema }) => ({ number, tema }));
    }
    /**
     * Removes SCM metadata while keeping activity data.
     *
     * @param {object|null|undefined} scmData
     * @returns {object|null|undefined} A falsy input is returned unchanged;
     *   otherwise `{ activities }` only, with `detectionConfidence` removed from
     *   each activity (an empty list when the input has no activities).
     */
    static removeScmMetadata(scmData) {
        if (!scmData) {
            return scmData;
        }
        if (!scmData.activities) {
            return { activities: [] };
        }
        // Only `activities` is copied over, so `metadata` is dropped implicitly.
        return {
            activities: scmData.activities.map(({ detectionConfidence, ...cleanedActivity }) => cleanedActivity),
        };
    }
    /**
     * Builds the cleaned record: name, talents (without sign), strength, typology,
     * personalbranding and, only when present, the cleaned scm block.
     *
     * @param {object} rawData
     * @returns {object}
     */
    static cleanTalentData(rawData) {
        const cleanedData = {
            name: rawData.name,
            talents: DataTransformer.removeTalentSign(rawData.talents),
            strength: rawData.strength,
            typology: rawData.typology,
            personalbranding: rawData.personalbranding,
        };
        // Only include SCM if it exists and clean it
        if (rawData.scm) {
            cleanedData.scm = DataTransformer.removeScmMetadata(rawData.scm);
        }
        return cleanedData;
    }
    /**
     * Transforms a single-person result into the filtered output format.
     *
     * @param {{person: object}} result
     * @returns {{person: object}}
     */
    static transformSinglePersonResult(result) {
        // Remove top14Talents, top7Talents, low14Talents from person object
        const { top14Talents, top7Talents, low14Talents, ...cleanedPerson } = result.person;
        return { person: DataTransformer.cleanTalentData(cleanedPerson) };
    }
    /**
     * Validates the structure of cleaned data. Malformed values are reported as
     * validation errors rather than thrown.
     *
     * @param {object} data - Cleaned record.
     * @returns {{isValid: boolean, errors: string[]}}
     */
    static validateCleanedData(data) {
        const errors = [];
        const isObject = (value) => value !== null && typeof value === 'object';
        // Check required fields
        if (typeof data.name !== 'string' || data.name.trim().length === 0) {
            errors.push('Person name is required');
        }
        if (!data.talents || data.talents.length === 0) {
            errors.push('Talents array is required and cannot be empty');
        }
        if (!data.strength || data.strength.length === 0) {
            errors.push('Strength array is required and cannot be empty');
        }
        if (!data.typology || data.typology.length === 0) {
            errors.push('Typology array is required and cannot be empty');
        }
        if (!data.personalbranding || data.personalbranding.length === 0) {
            errors.push('Personal branding array is required and cannot be empty');
        }
        // Validate talent structure (should not have 'sign' field)
        if (data.talents) {
            const talentsWithSign = data.talents.filter((talent) => isObject(talent) && 'sign' in talent);
            if (talentsWithSign.length > 0) {
                errors.push(`Found ${talentsWithSign.length} talents with 'sign' field (should be removed)`);
            }
            // Check for required fields in talents
            const invalidTalents = data.talents.filter((t) => !isObject(t)
                || typeof t.number !== 'number'
                || typeof t.tema !== 'string'
                || t.tema.trim().length === 0);
            if (invalidTalents.length > 0) {
                errors.push(`Found ${invalidTalents.length} talents with missing number or tema`);
            }
        }
        // Validate SCM structure if present
        if (data.scm) {
            if (data.scm.metadata) {
                errors.push('SCM metadata should be removed from cleaned data');
            }
            if (data.scm.activities) {
                const activitiesWithConfidence = data.scm.activities.filter((activity) => isObject(activity) && 'detectionConfidence' in activity);
                if (activitiesWithConfidence.length > 0) {
                    errors.push(`Found ${activitiesWithConfidence.length} SCM activities with detectionConfidence (should be removed)`);
                }
            }
        }
        return { isValid: errors.length === 0, errors };
    }
    /**
     * Gets summary statistics of cleaned data.
     *
     * @param {object} data - Cleaned record.
     * @returns {object} Counts of each list (0 when missing), `totalScmActivities`
     *   (null only when there is no activities list) and `hasScmData`.
     */
    static getDataSummary(data) {
        return {
            totalTalents: data.talents?.length || 0,
            totalStrengths: data.strength?.length || 0,
            totalTypologies: data.typology?.length || 0,
            totalPersonalBranding: data.personalbranding?.length || 0,
            // `??` so an empty activities list reports 0 instead of null.
            totalScmActivities: data.scm?.activities?.length ?? null,
            hasScmData: !!data.scm,
        };
    }
}
|
|
459
|
+
|
|
460
|
+
/**
 * Tells whether a talent number lies inside TALENT_RANGE (both ends inclusive).
 *
 * @param {number} number - Candidate talent number.
 * @returns {boolean}
 */
function isValidTalentNumber(number) {
    const { min, max } = TALENT_RANGE;
    return min <= number && number <= max;
}
|
|
466
|
+
/**
 * Extracts the talent order from 49-page-format text ("NAME <number>." pairs).
 *
 * @param {string} text - Text of the talent-order page.
 * @returns {Array<{number: number, tema: string, sign: string}>} Talents whose
 *   number passes isValidTalentNumber, sorted by ascending number. `tema` has a
 *   trailing "." removed and `sign` is always the string 'false'.
 */
function extractTalentOrderFormat49(text) {
    // Fresh copy of the shared /g regex: matchAll starts at the source regex's
    // lastIndex, so a stale value there must not skip leading matches.
    const pattern = new RegExp(TALENT_REGEX_PATTERNS.format49);
    const talents = [];
    for (const [, rawTema, rawNumber] of text.matchAll(pattern)) {
        const number = Number.parseInt(rawNumber, 10);
        if (isValidTalentNumber(number)) {
            talents.push({ number, tema: rawTema.replace(/\.$/, ''), sign: 'false' });
        }
    }
    return talents.sort((a, b) => a.number - b.number);
}
|
|
480
|
+
/**
 * Extracts the talent order from text in the non-49-page layouts
 * ("<number> NAME" pairs).
 *
 * @param {string} text - Text of the talent-order page.
 * @param {boolean} [isPdfParse=false] - Use the `pdfParse` pattern (number and
 *   name adjacent) instead of the `default` pattern.
 * @returns {Array<{number: number, tema: string, sign: string}>} Talents whose
 *   number passes isValidTalentNumber, sorted by ascending number; `sign` is
 *   always the string 'false'.
 */
function extractTalentOrderDefault(text, isPdfParse = false) {
    const sharedPattern = isPdfParse ? TALENT_REGEX_PATTERNS.pdfParse : TALENT_REGEX_PATTERNS.default;
    // Fresh copy of the shared /g regex: matchAll starts at the source regex's
    // lastIndex, so a stale value there must not skip leading matches.
    const pattern = new RegExp(sharedPattern);
    const talents = [];
    for (const [, rawNumber, tema] of text.matchAll(pattern)) {
        const number = Number.parseInt(rawNumber, 10);
        if (isValidTalentNumber(number)) {
            talents.push({ number, tema, sign: 'false' });
        }
    }
    return talents.sort((a, b) => a.number - b.number);
}
|
|
496
|
+
/**
 * Extracts the talent order with the extractor that matches the TMA format.
 *
 * @param {string} text - Text of the talent-order page.
 * @param {boolean} [isPdfParse=false] - Forwarded to extractTalentOrderDefault.
 * @param {number} [pageCount=0] - PDF page count used to detect the format.
 * @returns {Array<object>} Extracted talents; an empty array when the format is
 *   unknown, in which case no extraction is attempted.
 */
function extractTalentOrder(text, isPdfParse = false, pageCount = 0) {
    switch (detectTmaFormat(pageCount)) {
        case 'unknown':
            return [];
        case '49-page':
            return extractTalentOrderFormat49(text);
        default:
            return extractTalentOrderDefault(text, isPdfParse);
    }
}
|
|
510
|
+
/**
 * Extracts the person's name that follows the "URUTAN BAKAT" heading.
 *
 * The end of the name depends on the format: for '54-page' it is a newline
 * followed by a digit, for '49-page' a newline followed by a capital letter,
 * and for every other format simply the first digit.
 *
 * @param {string} text - Text of the talent-order page.
 * @param {number} pageCount - PDF page count used to detect the format.
 * @returns {string} The trimmed name, or '' when the heading pattern is not found.
 */
function extractName(text, pageCount) {
    let pattern;
    switch (detectTmaFormat(pageCount)) {
        case '54-page':
            pattern = /URUTAN BAKAT\s+([A-Z\s.''-]+?)(?=\n\d)/;
            break;
        case '49-page':
            pattern = /URUTAN BAKAT\s+([A-Z\s.']+?)(?=\n[A-Z])/;
            break;
        default:
            pattern = /URUTAN BAKAT\s+([A-Z\s.']+?)(?=\d)/;
    }
    const found = text.match(pattern);
    if (!found) {
        return '';
    }
    return found[1]?.trim();
}
|
|
525
|
+
|
|
526
|
+
/**
 * Extracts strength entries from 49-page-format text ("NAME <number>" pairs).
 *
 * @param {string} text - Text of the strength page.
 * @returns {Array<{number: number, name: string}>} One entry per match, in text
 *   order; empty when nothing matches.
 */
function extractStrengthFormat49(text) {
    // A fresh copy starts at index 0 like the original String#match call did and
    // leaves the shared /g regex untouched; the capture groups replace the
    // earlier re-splitting of the matched text.
    const pattern = new RegExp(STRENGTH_REGEX_PATTERNS.format49);
    return Array.from(text.matchAll(pattern), ([, name, rawNumber]) => ({
        number: Number.parseInt(rawNumber, 10),
        name,
    }));
}
|
|
539
|
+
/**
 * Extracts strength entries from text in the non-49-page layouts
 * ("<number>. name" items).
 *
 * @param {string} text - Text of the strength page.
 * @returns {Array<{number: number, name: string}>} One entry per match, in text
 *   order; empty when nothing matches.
 */
function extractStrengthDefault(text) {
    // A fresh copy starts at index 0 like the original String#match call did and
    // leaves the shared /g regex untouched.
    const pattern = new RegExp(STRENGTH_REGEX_PATTERNS.default);
    return Array.from(text.matchAll(pattern), ([item, name]) => ({
        // The item starts with the digits, so base-10 parsing stops at the ".".
        number: Number.parseInt(item, 10),
        name,
    }));
}
|
|
552
|
+
/**
 * Extracts strength data with the extractor that matches the TMA format.
 *
 * @param {string} text - Text of the strength page.
 * @param {number} [pageCount=0] - PDF page count used to detect the format.
 * @returns {Array<object>} Result of extractStrengthFormat49 for the '49-page'
 *   format, of extractStrengthDefault for everything else.
 */
function extractStrength(text, pageCount = 0) {
    const isFormat49 = detectTmaFormat(pageCount) === '49-page';
    return isFormat49 ? extractStrengthFormat49(text) : extractStrengthDefault(text);
}
|
|
562
|
+
|
|
563
|
+
/**
 * The 30 typologies with their categories, as `{ id, name, category }` records.
 *
 * Ids are assigned 1..30 in the order listed below, so the order of categories
 * and of names within each category must not change.
 *
 * `name` is also the text extractTypologyScore searches for, so spellings are
 * kept exactly as they are.
 * NOTE(review): 'AMBASADOR' differs from the 'AMBASSADOR' spelling used in
 * SCM_ACTIVITIES_DEFINITIONS — confirm which one the reports print.
 */
const TYPOLOGY_DEFINITIONS = [
    ['HEADMAN', ['ARRANGER', 'SELLER', 'COMMANDER', 'MEDIATOR', 'SELECTOR']],
    ['NETWORKING', ['AMBASADOR', 'COMMUNICATOR', 'EDUCATOR', 'MOTIVATOR']],
    ['SERVICING', ['CARETAKER', 'SERVER']],
    ['THINKING', ['ANALYST', 'TREASURER']],
    ['REASONING', ['RESTORER', 'EVALUATOR', 'EXPLORER']],
    ['GENERATING IDEA', ['DESIGNER', 'CREATOR', 'SYNTHESIZER', 'MARKETER', 'STRATEGIST', 'VISIONARY']],
    ['ELEMENTARY', ['JOURNALIST', 'INTERPRETER', 'ADMINISTRATOR']],
    ['TECHNICAL', ['SAFEKEEPER', 'PRODUCER', 'QUALITY CONTROLLER', 'DISTRIBUTOR', 'OPERATOR']],
]
    .flatMap(([category, names]) => names.map((name) => ({ name, category })))
    .map(({ name, category }, index) => ({ id: index + 1, name, category }));
|
|
599
|
+
|
|
600
|
+
/**
 * Builds the label form of a typology name: lower-cased, with every run of
 * whitespace replaced by a single dash.
 *
 * @param {string} name - Typology name, e.g. 'QUALITY CONTROLLER'.
 * @returns {string} The label, e.g. 'quality-controller'.
 */
function createTypologyLabel(name) {
    const words = name.toLowerCase().split(/\s+/);
    return words.join('-');
}
|
|
606
|
+
/**
 * Finds the score that follows a typology name in the text.
 *
 * The search is case-insensitive and returns the first occurrence of the name
 * followed by whitespace and a number (optionally negative and/or fractional).
 *
 * @param {string} text - Page text to search.
 * @param {string} typologyName - Name to look for, e.g. 'QUALITY CONTROLLER'.
 * @returns {number|null} The parsed score, or null when no such pair is found.
 */
function extractTypologyScore(text, typologyName) {
    // Escape each word for literal matching and let any whitespace run separate
    // the words, so a multi-word name still matches when the extracted text has
    // a line break or several spaces between its words.
    const namePattern = typologyName
        .split(/\s+/)
        .map((word) => word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
        .join('\\s+');
    const match = text.match(new RegExp(`${namePattern}\\s+(-?\\d+(?:\\.\\d+)?)`, 'i'));
    if (!match || !match[1]) {
        return null;
    }
    const score = Number.parseFloat(match[1]);
    return Number.isNaN(score) ? null : score;
}
|
|
621
|
+
/**
 * Extracts every predefined typology that has a score in the text.
 *
 * @param {string} text - Text of the typology page.
 * @returns {Array<{id: number, name: string, label: string, category: string, score: number}>}
 *   One record per TYPOLOGY_DEFINITIONS entry for which extractTypologyScore
 *   found a score, sorted by ascending id.
 */
function extractTypology(text) {
    const scored = TYPOLOGY_DEFINITIONS.flatMap(({ id, name, category }) => {
        const score = extractTypologyScore(text, name);
        // Typologies without a score in the text are left out entirely.
        return score === null
            ? []
            : [{ id, name, label: createTypologyLabel(name), category, score }];
    });
    scored.sort((left, right) => left.id - right.id);
    return scored;
}
|
|
643
|
+
|
|
644
|
+
/**
 * Extracts personal branding entries from 49-page-format text.
 *
 * Every "LABEL score" pair is collected; for a repeated label the highest score
 * wins. Entries with a score >= 0 come first (highest score first), followed by
 * negative ones (lowest score first), and the list is cut to
 * PERSONAL_BRANDING_LIMIT entries.
 *
 * @param {string} text - Text of the typology/branding page.
 * @returns {Array<{id: string}>} The selected labels, with internal whitespace
 *   collapsed to single spaces.
 */
function extractPersonalBrandingFormat49(text) {
    // Work on a fresh copy of the shared /g regex so the scan always starts at
    // index 0 and never reads or writes the shared instance's lastIndex.
    const pattern = new RegExp(PERSONAL_BRANDING_REGEX_PATTERNS.format49);
    // Keep unique entries with highest score (a Map keeps first-seen order).
    const bestScoreById = new Map();
    for (const [, rawId, rawScore] of text.matchAll(pattern)) {
        const id = rawId.replace(/\s+/g, ' ').trim();
        const score = Number.parseFloat(rawScore);
        if (id.length <= 2 || Number.isNaN(score)) {
            continue;
        }
        if (!bestScoreById.has(id) || bestScoreById.get(id) < score) {
            bestScoreById.set(id, score);
        }
    }
    // Sort positives and negatives separately
    const entries = Array.from(bestScoreById, ([id, score]) => ({ id, score }));
    const positives = entries.filter((item) => item.score >= 0).sort((a, b) => b.score - a.score);
    const negatives = entries.filter((item) => item.score < 0).sort((a, b) => a.score - b.score);
    return [...positives, ...negatives]
        .slice(0, PERSONAL_BRANDING_LIMIT)
        .map(({ id }) => ({ id }));
}
|
|
672
|
+
/**
 * Extracts personal branding entries from text in the non-49-page layouts.
 *
 * Each run of capitalised words directly followed by "Anda" is a candidate. The
 * trailing "Anda" and a "PERSONAL BRANDING " heading prefix are stripped, empty
 * or heading-only candidates are dropped, and at most PERSONAL_BRANDING_LIMIT
 * entries are returned.
 *
 * @param {string} text - Text of the typology/branding page.
 * @returns {Array<{id: string}>} The trimmed labels in text order.
 */
function extractPersonalBrandingDefault(text) {
    const candidates = text.match(PERSONAL_BRANDING_REGEX_PATTERNS.default);
    if (!candidates) {
        return [];
    }
    const entries = [];
    for (const candidate of candidates) {
        const label = candidate
            .replace(/\s+Anda$/, '')
            .replace(/PERSONAL BRANDING\s+/, '');
        if (label && label !== 'PERSONAL BRANDING') {
            entries.push({ id: label.trim() });
        }
    }
    return entries.slice(0, PERSONAL_BRANDING_LIMIT);
}
|
|
686
|
+
/**
 * Extracts personal branding data with the extractor that matches the TMA format.
 *
 * @param {string} text - Text of the typology/branding page.
 * @param {number} [pageCount=0] - PDF page count used to detect the format.
 * @returns {Array<{id: string}>} Result of extractPersonalBrandingFormat49 for
 *   the '49-page' format, of extractPersonalBrandingDefault for everything else.
 */
function extractPersonalBranding(text, pageCount = 0) {
    const isFormat49 = detectTmaFormat(pageCount) === '49-page';
    return isFormat49
        ? extractPersonalBrandingFormat49(text)
        : extractPersonalBrandingDefault(text);
}
|
|
696
|
+
|
|
697
|
+
/**
|
|
698
|
+
* SCM (Strength Cluster Map) Activities Definitions
|
|
699
|
+
* Pre-defined list of all 114 activities with their typology and cluster information
|
|
700
|
+
* Framework-independent constants for tma-extractor package
|
|
701
|
+
*/
|
|
702
|
+
const SCM_ACTIVITIES_DEFINITIONS = (() => {
    // Each group is [typology, cluster, activity ids, hasPsp (defaults to true)].
    // Groups are listed in reading order for their area; an activity's
    // `position` is its zero-based index within the area.
    const groupsByArea = {
        // TOP AREA (24 activities, left to right)
        top: [
            ['AMBASSADOR', 'NETWORKING', ['RELATING', 'REPRESENTING']],
            ['COMMUNICATOR', 'NETWORKING', ['COMMUNICATING', 'CORRESPONDING', 'ENTERTAINING', 'PRESENTING']],
            ['ARRANGER', 'HEADMAN', ['COOPERATING', 'COORDINATING', 'DISPATCHING']],
            ['MEDIATOR', 'HEADMAN', ['MEDIATING', 'NEGOTIATING', 'PURCHASING']],
            ['COMMANDER', 'HEADMAN', ['COLLECTING', 'CONTROLLING', 'INTERROGATING']],
            ['SELLER', 'HEADMAN', ['BROKERING', 'INFLUENCING', 'SELLING']],
            ['SELECTOR', 'HEADMAN', ['RECRUITING', 'INTERVIEWING']],
            ['CARETAKER', 'SERVICING', ['CARING', 'COUNSELING', 'SPIRITUALIZING', 'THERAPIES']]
        ],
        // LEFT AREA (33 activities, top to bottom) - pattern: NAME-PSS-PSP
        left: [
            ['AMBASSADOR', 'NETWORKING', ['LIAISING']],
            ['MOTIVATOR', 'NETWORKING', ['GUIDING', 'MOTIVATING', 'SUPPORTING']],
            ['EDUCATOR', 'NETWORKING', ['ADVISING', 'COACHING', 'CONSULTING', 'MENTORING', 'TEACHING', 'TRAINING']],
            ['ANALYST', 'THINKING', ['ANALYSING', 'BOOKEEPING', 'PROGRAMMING']],
            ['TREASURER', 'THINKING', ['BUDGETING', 'CASHIERING', 'COSTING', 'ESTIMATING']],
            ['EVALUATOR', 'REASONING', ['AUDITING', 'EVALUATING', 'INSPECTING', 'INVESTIGATING', 'REVIEWING', 'VERIFYING']],
            ['RESTORER', 'REASONING', ['DIAGNOSING', 'IDENTIFYING', 'RESTORING']],
            ['EXPLORER', 'REASONING', ['APPRAISING', 'OBSERVING', 'RESEARCHING', 'SURVEYING']],
            ['JOURNALIST', 'ELEMENTARY', ['CONCEPTUALIZING', 'EDITING', 'REDACTING']]
        ],
        // RIGHT AREA (33 activities, top to bottom) - pattern: PSS-PSP-NAME
        right: [
            ['CARETAKER', 'SERVICING', ['VOLUNTEERING']],
            ['SERVER', 'SERVICING', ['ASSISTING', 'GREETING', 'INFORMING', 'SERVING']],
            ['DESIGNER', 'GENERATING IDEA', ['DESIGNING', 'DRAFTING']],
            ['CREATOR', 'GENERATING IDEA', ['ANIMATING', 'CREATING', 'IDEATING']],
            ['SYNTHESIZER', 'GENERATING IDEA', ['SYNTHESIZING']],
            ['MARKETER', 'GENERATING IDEA', ['ADVERTISING', 'DEVELOPING', 'MARKETING', 'PUBLICIZING']],
            ['STRATEGIST', 'GENERATING IDEA', ['PLANNING', 'STRATEGIZING']],
            ['VISIONARY', 'GENERATING IDEA', ['VISIONING']],
            // 15 activities without PSP (hasPsp: false)
            ['-', 'GENERATING IDEA', ['ACTING', 'BEAUTIFYING', 'CONSERVING', 'COOKING', 'DANCING', 'DRAMATIZING', 'MODELLING', 'MUSICAL ART', 'SINGING', 'VISUAL ART'], false],
            ['-', 'TECHNICAL', ['MANUAL SKILL', 'PHYSICAL SKILL', 'PLANTING', 'SPORT', 'TENDING ANIMAL'], false]
        ],
        // BOTTOM AREA (24 activities, left to right) - pattern: PSS-PSP-NAME
        bottom: [
            ['JOURNALIST', 'ELEMENTARY', ['REPORTING', 'WRITING']],
            ['INTERPRETER', 'ELEMENTARY', ['INTERPRETING', 'TRANSCRIBING', 'TRANSLATING']],
            ['ADMINISTRATOR', 'ELEMENTARY', ['COMPLIANCING', 'FILING', 'HOUSEKEEPING', 'ORGANISING', 'SCHEDULING', 'TYPEWRITING']],
            ['PRODUCER', 'TECHNICAL', ['ASSEMBLING', 'BUILDING', 'INSTALLING', 'PRODUCING']],
            ['SAFEKEEPER', 'TECHNICAL', ['MONITORING', 'SAFEKEEPING', 'SECURING']],
            ['QUALITY-CONTROLLER', 'TECHNICAL', ['FINISHING', 'TESTING']],
            ['DISTRIBUTOR', 'TECHNICAL', ['DELIVERING', 'DISTRIBUTING']],
            ['OPERATOR', 'TECHNICAL', ['MAINTAINING', 'OPERATING']]
        ]
    };
    // Flatten to one record per activity, in top -> left -> right -> bottom order
    return Object.entries(groupsByArea).flatMap(([area, groups]) => {
        let position = 0;
        return groups.flatMap(([typology, cluster, ids, hasPsp = true]) => ids.map((id) => ({ id, typology, cluster, hasPsp, area, position: position++ })));
    });
})();
|
|
823
|
+
// Index of the activity definitions keyed by activity id, for direct lookup
const SCM_ACTIVITIES_MAP = new Map();
for (const definition of SCM_ACTIVITIES_DEFINITIONS) {
    SCM_ACTIVITIES_MAP.set(definition.id, definition);
}
|
|
825
|
+
/**
 * Looks up the activity definition at a given slot of an SCM area.
 *
 * @param {string} area - area name as stored in the definitions ('top', 'left', 'right', 'bottom')
 * @param {number} position - zero-based index within the area (compared strictly)
 * @returns {object|undefined} the first matching definition, or undefined when none matches
 */
function getActivityByPosition(area, position) {
    for (const definition of SCM_ACTIVITIES_DEFINITIONS) {
        if (definition.area === area && definition.position === position) {
            return definition;
        }
    }
    return undefined;
}
|
|
828
|
+
// Expected activity counts used to sanity-check an SCM extraction
const SCM_VALIDATION = (() => {
    const EXPECTED_COUNTS = {
        left: 33,
        right: 33,
        top: 24,
        bottom: 24
    };
    const ACTIVITIES_WITHOUT_PSP = 15;
    // 33 + 33 + 24 + 24 = 114
    const TOTAL_ACTIVITIES = Object.values(EXPECTED_COUNTS).reduce((sum, count) => sum + count, 0);
    return {
        TOTAL_ACTIVITIES,
        ACTIVITIES_WITH_PSP: TOTAL_ACTIVITIES - ACTIVITIES_WITHOUT_PSP,
        ACTIVITIES_WITHOUT_PSP,
        EXPECTED_COUNTS
    };
})();
|
|
840
|
+
|
|
841
|
+
/**
|
|
842
|
+
* SCM (Strength Cluster Map) Extractor
|
|
843
|
+
* Framework-independent extraction of 114 activities with PSP/PSS color detection
|
|
844
|
+
* Integrates PDF.js for rendering and Tesseract.js for OCR
|
|
845
|
+
*/
|
|
846
|
+
class ScmExtractor {
|
|
847
|
+
/**
 * Creates an extractor with no OCR worker yet; initializeTesseract()
 * fills in `tesseractWorker` the first time OCR is needed.
 */
constructor() {
    this.tesseractWorker = null; // shared Tesseract worker, created lazily
}
|
|
850
|
+
/**
|
|
851
|
+
* Extract SCM data from a PDF page
|
|
852
|
+
*/
|
|
853
|
+
async extractScmData(pdfPage) {
|
|
854
|
+
// eslint-disable-line @typescript-eslint/no-explicit-any
|
|
855
|
+
const startTime = Date.now();
|
|
856
|
+
try {
|
|
857
|
+
// 1. Detect ID presence using PDF.js text extraction
|
|
858
|
+
const scmHasId = await this.detectScmHasId(pdfPage);
|
|
859
|
+
const scmAreaConfigs = this.getScmAreaConfigs(scmHasId);
|
|
860
|
+
console.log(`🆔 SCM ID Detection: ${scmHasId ? 'ID Present' : 'No ID'}`);
|
|
861
|
+
// 2. Render page to canvas
|
|
862
|
+
const canvas = await this.renderPageToCanvas(pdfPage);
|
|
863
|
+
// 3. Extract from all 4 areas
|
|
864
|
+
const [leftActivities, rightActivities, topActivities, bottomActivities] = await Promise.all([
|
|
865
|
+
this.extractAreaActivities(canvas, 'left', scmAreaConfigs),
|
|
866
|
+
this.extractAreaActivities(canvas, 'right', scmAreaConfigs),
|
|
867
|
+
this.extractAreaActivities(canvas, 'top', scmAreaConfigs),
|
|
868
|
+
this.extractAreaActivities(canvas, 'bottom', scmAreaConfigs)
|
|
869
|
+
]);
|
|
870
|
+
const allDetected = [
|
|
871
|
+
...leftActivities,
|
|
872
|
+
...rightActivities,
|
|
873
|
+
...topActivities,
|
|
874
|
+
...bottomActivities
|
|
875
|
+
];
|
|
876
|
+
// 4. Map to complete definitions
|
|
877
|
+
const mappedActivities = this.mapDetectedToDefinitions(allDetected);
|
|
878
|
+
// 5. Calculate statistics
|
|
879
|
+
const detectedCount = allDetected.length;
|
|
880
|
+
const unmappedActivities = allDetected
|
|
881
|
+
.filter((d) => !SCM_ACTIVITIES_MAP.has(d.activityName))
|
|
882
|
+
.map((d) => d.activityName);
|
|
883
|
+
const totalConfidence = mappedActivities.reduce((sum, activity) => sum + activity.detectionConfidence, 0);
|
|
884
|
+
const averageConfidence = mappedActivities.length > 0 ? totalConfidence / mappedActivities.length : 0;
|
|
885
|
+
const extractionTime = Date.now() - startTime;
|
|
886
|
+
return {
|
|
887
|
+
activities: mappedActivities,
|
|
888
|
+
metadata: {
|
|
889
|
+
totalActivities: SCM_VALIDATION.TOTAL_ACTIVITIES,
|
|
890
|
+
detectedActivities: detectedCount,
|
|
891
|
+
unmappedActivities,
|
|
892
|
+
averageConfidence,
|
|
893
|
+
extractionTime
|
|
894
|
+
}
|
|
895
|
+
};
|
|
896
|
+
}
|
|
897
|
+
catch (error) {
|
|
898
|
+
console.error('SCM extraction failed:', error);
|
|
899
|
+
return null;
|
|
900
|
+
}
|
|
901
|
+
}
|
|
902
|
+
/**
|
|
903
|
+
* Detect if SCM page has ID below the name using PDF.js text extraction
|
|
904
|
+
*/
|
|
905
|
+
async detectScmHasId(pdfPage) {
|
|
906
|
+
// eslint-disable-line @typescript-eslint/no-explicit-any
|
|
907
|
+
try {
|
|
908
|
+
const textContent = await pdfPage.getTextContent();
|
|
909
|
+
const textItems = textContent.items;
|
|
910
|
+
const viewport = pdfPage.getViewport({ scale: 1.0 });
|
|
911
|
+
const pageHeight = viewport.height;
|
|
912
|
+
const idSearchArea = pageHeight * 0.15; // Search in upper 15% of page
|
|
913
|
+
for (const item of textItems) {
|
|
914
|
+
if (item.transform && item.transform[5] > pageHeight - idSearchArea) {
|
|
915
|
+
const text = item.str?.trim();
|
|
916
|
+
if (text && /^\d+$/.test(text) && text.length >= 3) {
|
|
917
|
+
console.log(`🔍 ID Detection: Found ID "${text}" at position (${Math.round(item.transform[4])}, ${Math.round(item.transform[5])})`);
|
|
918
|
+
return true;
|
|
919
|
+
}
|
|
920
|
+
}
|
|
921
|
+
}
|
|
922
|
+
console.log('🔍 ID Detection: No numeric ID found in upper area');
|
|
923
|
+
return false;
|
|
924
|
+
}
|
|
925
|
+
catch (error) {
|
|
926
|
+
console.warn('PDF.js ID detection failed:', error);
|
|
927
|
+
return false;
|
|
928
|
+
}
|
|
929
|
+
}
|
|
930
|
+
/**
|
|
931
|
+
* Get area configurations based on whether ID is present
|
|
932
|
+
*/
|
|
933
|
+
getScmAreaConfigs(scmHasId) {
|
|
934
|
+
const topStartY = scmHasId ? 0.16 : 0.148;
|
|
935
|
+
const topEndY = scmHasId ? 0.258 : 0.246;
|
|
936
|
+
const bottomStartY = scmHasId ? 0.66 : 0.652;
|
|
937
|
+
const bottomEndY = scmHasId ? 0.758 : 0.746;
|
|
938
|
+
const leftRightStartY = scmHasId ? 0.256 : 0.244;
|
|
939
|
+
const leftRightEndY = scmHasId ? 0.662 : 0.65;
|
|
940
|
+
return {
|
|
941
|
+
left: {
|
|
942
|
+
startX: 0.065,
|
|
943
|
+
endX: 0.278,
|
|
944
|
+
startY: leftRightStartY,
|
|
945
|
+
endY: leftRightEndY,
|
|
946
|
+
expectedCount: 33,
|
|
947
|
+
pattern: 'NAME-PSS-PSP',
|
|
948
|
+
orientation: 'horizontal'
|
|
949
|
+
},
|
|
950
|
+
right: {
|
|
951
|
+
startX: 0.76,
|
|
952
|
+
endX: 0.92,
|
|
953
|
+
startY: leftRightStartY,
|
|
954
|
+
endY: leftRightEndY,
|
|
955
|
+
expectedCount: 33,
|
|
956
|
+
pattern: 'PSS-PSP-NAME',
|
|
957
|
+
orientation: 'horizontal'
|
|
958
|
+
},
|
|
959
|
+
top: {
|
|
960
|
+
startX: 0.255,
|
|
961
|
+
endX: 0.785,
|
|
962
|
+
startY: topStartY,
|
|
963
|
+
endY: topEndY,
|
|
964
|
+
expectedCount: 24,
|
|
965
|
+
pattern: 'NAME-PSS-PSP',
|
|
966
|
+
orientation: 'vertical'
|
|
967
|
+
},
|
|
968
|
+
bottom: {
|
|
969
|
+
startX: 0.255,
|
|
970
|
+
endX: 0.785,
|
|
971
|
+
startY: bottomStartY,
|
|
972
|
+
endY: bottomEndY,
|
|
973
|
+
expectedCount: 24,
|
|
974
|
+
pattern: 'PSS-PSP-NAME',
|
|
975
|
+
orientation: 'vertical'
|
|
976
|
+
}
|
|
977
|
+
};
|
|
978
|
+
}
|
|
979
|
+
/**
|
|
980
|
+
* Render PDF page to high-resolution canvas
|
|
981
|
+
*/
|
|
982
|
+
async renderPageToCanvas(pdfPage) {
|
|
983
|
+
// eslint-disable-line @typescript-eslint/no-explicit-any
|
|
984
|
+
const scale = 2.0; // Higher resolution for better OCR
|
|
985
|
+
const viewport = pdfPage.getViewport({ scale });
|
|
986
|
+
const canvas = document.createElement('canvas');
|
|
987
|
+
canvas.width = viewport.width;
|
|
988
|
+
canvas.height = viewport.height;
|
|
989
|
+
const context = canvas.getContext('2d');
|
|
990
|
+
if (!context) {
|
|
991
|
+
throw new Error('Could not get canvas context');
|
|
992
|
+
}
|
|
993
|
+
const renderTask = pdfPage.render({
|
|
994
|
+
canvasContext: context,
|
|
995
|
+
viewport: viewport
|
|
996
|
+
});
|
|
997
|
+
await renderTask.promise;
|
|
998
|
+
return canvas;
|
|
999
|
+
}
|
|
1000
|
+
/**
|
|
1001
|
+
* Detect color at specific coordinates using Canvas API
|
|
1002
|
+
*/
|
|
1003
|
+
detectColorAt(canvas, point) {
|
|
1004
|
+
const ctx = canvas.getContext('2d');
|
|
1005
|
+
if (!ctx)
|
|
1006
|
+
return 'white';
|
|
1007
|
+
const sampleSize = 10;
|
|
1008
|
+
const imageData = ctx.getImageData(Math.max(0, point.x - sampleSize / 2), Math.max(0, point.y - sampleSize / 2), sampleSize, sampleSize);
|
|
1009
|
+
let totalR = 0, totalG = 0, totalB = 0;
|
|
1010
|
+
const pixelCount = sampleSize * sampleSize;
|
|
1011
|
+
for (let i = 0; i < imageData.data.length; i += 4) {
|
|
1012
|
+
totalR += imageData.data[i];
|
|
1013
|
+
totalG += imageData.data[i + 1];
|
|
1014
|
+
totalB += imageData.data[i + 2];
|
|
1015
|
+
}
|
|
1016
|
+
const avgR = Math.round(totalR / pixelCount);
|
|
1017
|
+
const avgG = Math.round(totalG / pixelCount);
|
|
1018
|
+
const avgB = Math.round(totalB / pixelCount);
|
|
1019
|
+
return this.mapRgbToScmColor(avgR, avgG, avgB);
|
|
1020
|
+
}
|
|
1021
|
+
/**
|
|
1022
|
+
* Map RGB values to SCM color using exact hex color matching
|
|
1023
|
+
*/
|
|
1024
|
+
mapRgbToScmColor(r, g, b) {
|
|
1025
|
+
const colors = {
|
|
1026
|
+
black: { r: 0, g: 0, b: 0 },
|
|
1027
|
+
gray: { r: 125, g: 125, b: 125 },
|
|
1028
|
+
white: { r: 255, g: 255, b: 255 },
|
|
1029
|
+
yellow: { r: 255, g: 255, b: 51 },
|
|
1030
|
+
red: { r: 255, g: 0, b: 0 }
|
|
1031
|
+
};
|
|
1032
|
+
let minDistance = Infinity;
|
|
1033
|
+
let closestColor = 'white';
|
|
1034
|
+
const tolerance = 30;
|
|
1035
|
+
for (const [colorName, colorRgb] of Object.entries(colors)) {
|
|
1036
|
+
const distance = Math.sqrt(Math.pow(r - colorRgb.r, 2) + Math.pow(g - colorRgb.g, 2) + Math.pow(b - colorRgb.b, 2));
|
|
1037
|
+
if (distance <= tolerance && distance < minDistance) {
|
|
1038
|
+
minDistance = distance;
|
|
1039
|
+
closestColor = colorName;
|
|
1040
|
+
}
|
|
1041
|
+
}
|
|
1042
|
+
if (minDistance > tolerance) {
|
|
1043
|
+
minDistance = Infinity;
|
|
1044
|
+
for (const [colorName, colorRgb] of Object.entries(colors)) {
|
|
1045
|
+
const distance = Math.sqrt(Math.pow(r - colorRgb.r, 2) + Math.pow(g - colorRgb.g, 2) + Math.pow(b - colorRgb.b, 2));
|
|
1046
|
+
if (distance < minDistance) {
|
|
1047
|
+
minDistance = distance;
|
|
1048
|
+
closestColor = colorName;
|
|
1049
|
+
}
|
|
1050
|
+
}
|
|
1051
|
+
}
|
|
1052
|
+
return closestColor;
|
|
1053
|
+
}
|
|
1054
|
+
/**
|
|
1055
|
+
* Extract activities from a specific area using pre-defined positions
|
|
1056
|
+
*/
|
|
1057
|
+
async extractAreaActivities(canvas, area, scmAreaConfigs // eslint-disable-line @typescript-eslint/no-explicit-any
|
|
1058
|
+
) {
|
|
1059
|
+
const config = scmAreaConfigs[area];
|
|
1060
|
+
const activities = [];
|
|
1061
|
+
for (let i = 0; i < config.expectedCount; i++) {
|
|
1062
|
+
try {
|
|
1063
|
+
const expectedActivity = getActivityByPosition(area, i);
|
|
1064
|
+
if (!expectedActivity) {
|
|
1065
|
+
console.warn(`No activity defined for ${area} area position ${i}`);
|
|
1066
|
+
continue;
|
|
1067
|
+
}
|
|
1068
|
+
const coords = this.calculateActivityCoordinates(canvas, area, i, scmAreaConfigs);
|
|
1069
|
+
// Extract activity name using Tesseract OCR
|
|
1070
|
+
const detectedName = await this.extractActivityName(canvas, coords.name, area);
|
|
1071
|
+
if (expectedActivity.id !== detectedName) {
|
|
1072
|
+
console.log(`🚀 ~ ${area}[${i}] Expected: ${expectedActivity.id}, Detected: "${detectedName}"`);
|
|
1073
|
+
}
|
|
1074
|
+
// Detect colors
|
|
1075
|
+
const pssColor = this.detectColorAt(canvas, coords.pss);
|
|
1076
|
+
const pspColor = expectedActivity.hasPsp ? this.detectColorAt(canvas, coords.psp) : null;
|
|
1077
|
+
// Calculate confidence
|
|
1078
|
+
const confidence = this.calculateDetectionConfidence(detectedName, expectedActivity.id, pssColor, pspColor);
|
|
1079
|
+
activities.push({
|
|
1080
|
+
activityName: expectedActivity.id,
|
|
1081
|
+
psp: pspColor,
|
|
1082
|
+
pss: pssColor,
|
|
1083
|
+
area,
|
|
1084
|
+
position: i,
|
|
1085
|
+
confidence
|
|
1086
|
+
});
|
|
1087
|
+
}
|
|
1088
|
+
catch (error) {
|
|
1089
|
+
console.warn(`Failed to extract activity at ${area} position ${i}:`, error);
|
|
1090
|
+
}
|
|
1091
|
+
}
|
|
1092
|
+
return activities;
|
|
1093
|
+
}
|
|
1094
|
+
/**
|
|
1095
|
+
* Calculate activity coordinates based on area and position
|
|
1096
|
+
*/
|
|
1097
|
+
calculateActivityCoordinates(canvas, area, index, scmAreaConfigs // eslint-disable-line @typescript-eslint/no-explicit-any
|
|
1098
|
+
) {
|
|
1099
|
+
const config = scmAreaConfigs[area];
|
|
1100
|
+
const canvasWidth = canvas.width;
|
|
1101
|
+
const canvasHeight = canvas.height;
|
|
1102
|
+
const regionX = config.startX * canvasWidth;
|
|
1103
|
+
const regionY = config.startY * canvasHeight;
|
|
1104
|
+
const regionWidth = (config.endX - config.startX) * canvasWidth;
|
|
1105
|
+
const regionHeight = (config.endY - config.startY) * canvasHeight;
|
|
1106
|
+
if (area === 'left' || area === 'right') {
|
|
1107
|
+
const itemHeight = regionHeight / config.expectedCount;
|
|
1108
|
+
const rowY = regionY + index * itemHeight;
|
|
1109
|
+
const centerY = rowY + itemHeight / 2;
|
|
1110
|
+
if (area === 'left') {
|
|
1111
|
+
const nameWidth = regionWidth * 0.5;
|
|
1112
|
+
return {
|
|
1113
|
+
name: {
|
|
1114
|
+
x: regionX + regionWidth * 0.28,
|
|
1115
|
+
y: centerY - 8,
|
|
1116
|
+
width: nameWidth,
|
|
1117
|
+
height: itemHeight * 0.9
|
|
1118
|
+
},
|
|
1119
|
+
pss: { x: regionX + regionWidth * 0.84, y: centerY },
|
|
1120
|
+
psp: { x: regionX + regionWidth * 0.94, y: centerY }
|
|
1121
|
+
};
|
|
1122
|
+
}
|
|
1123
|
+
else {
|
|
1124
|
+
const nameWidth = regionWidth * 0.7;
|
|
1125
|
+
return {
|
|
1126
|
+
pss: { x: regionX + regionWidth * 0.09, y: centerY },
|
|
1127
|
+
psp: { x: regionX + regionWidth * 0.22, y: centerY },
|
|
1128
|
+
name: {
|
|
1129
|
+
x: regionX + regionWidth * 0.28,
|
|
1130
|
+
y: rowY + itemHeight * 0.28,
|
|
1131
|
+
width: nameWidth,
|
|
1132
|
+
height: itemHeight * 0.8
|
|
1133
|
+
}
|
|
1134
|
+
};
|
|
1135
|
+
}
|
|
1136
|
+
}
|
|
1137
|
+
else {
|
|
1138
|
+
const itemWidth = regionWidth / config.expectedCount;
|
|
1139
|
+
const colX = regionX + index * itemWidth;
|
|
1140
|
+
const centerX = colX + itemWidth / 2;
|
|
1141
|
+
if (area === 'top') {
|
|
1142
|
+
return {
|
|
1143
|
+
name: {
|
|
1144
|
+
x: colX + itemWidth * 0.1,
|
|
1145
|
+
y: regionY + regionHeight * 0.03,
|
|
1146
|
+
width: itemWidth * 0.8,
|
|
1147
|
+
height: regionHeight * 0.75
|
|
1148
|
+
},
|
|
1149
|
+
pss: { x: centerX, y: regionY + regionHeight * 0.82 },
|
|
1150
|
+
psp: { x: centerX + itemWidth * 0.05, y: regionY + regionHeight * 0.95 }
|
|
1151
|
+
};
|
|
1152
|
+
}
|
|
1153
|
+
else {
|
|
1154
|
+
return {
|
|
1155
|
+
psp: { x: centerX, y: regionY + regionHeight * 0.075 },
|
|
1156
|
+
pss: { x: centerX, y: regionY + regionHeight * 0.2 },
|
|
1157
|
+
name: {
|
|
1158
|
+
x: colX + itemWidth * 0.1,
|
|
1159
|
+
y: regionY + regionHeight * 0.24,
|
|
1160
|
+
width: itemWidth * 0.8,
|
|
1161
|
+
height: regionHeight * 0.8
|
|
1162
|
+
}
|
|
1163
|
+
};
|
|
1164
|
+
}
|
|
1165
|
+
}
|
|
1166
|
+
}
|
|
1167
|
+
/**
|
|
1168
|
+
* Extract activity name using Tesseract OCR from canvas region
|
|
1169
|
+
*/
|
|
1170
|
+
async extractActivityName(canvas, region, area) {
|
|
1171
|
+
try {
|
|
1172
|
+
const worker = await this.initializeTesseract();
|
|
1173
|
+
const tempCanvas = document.createElement('canvas');
|
|
1174
|
+
tempCanvas.width = region.width;
|
|
1175
|
+
tempCanvas.height = region.height;
|
|
1176
|
+
const tempCtx = tempCanvas.getContext('2d');
|
|
1177
|
+
if (!tempCtx)
|
|
1178
|
+
return '';
|
|
1179
|
+
tempCtx.drawImage(canvas, region.x, region.y, region.width, region.height, 0, 0, region.width, region.height);
|
|
1180
|
+
// Handle vertical text rotation for top/bottom areas
|
|
1181
|
+
if (area === 'top' || area === 'bottom') {
|
|
1182
|
+
const rotatedCanvas = document.createElement('canvas');
|
|
1183
|
+
const rotatedCtx = rotatedCanvas.getContext('2d');
|
|
1184
|
+
if (!rotatedCtx)
|
|
1185
|
+
return '';
|
|
1186
|
+
rotatedCanvas.width = region.height;
|
|
1187
|
+
rotatedCanvas.height = region.width;
|
|
1188
|
+
rotatedCtx.translate(region.height / 2, region.width / 2);
|
|
1189
|
+
rotatedCtx.rotate(Math.PI / 2);
|
|
1190
|
+
rotatedCtx.drawImage(tempCanvas, -region.width / 2, -region.height / 2, region.width, region.height);
|
|
1191
|
+
tempCanvas.width = rotatedCanvas.width;
|
|
1192
|
+
tempCanvas.height = rotatedCanvas.height;
|
|
1193
|
+
tempCtx.clearRect(0, 0, tempCanvas.width, tempCanvas.height);
|
|
1194
|
+
tempCtx.drawImage(rotatedCanvas, 0, 0);
|
|
1195
|
+
}
|
|
1196
|
+
// Enhance image for OCR
|
|
1197
|
+
tempCtx.filter = 'contrast(150%) brightness(100%)';
|
|
1198
|
+
tempCtx.drawImage(tempCanvas, 0, 0);
|
|
1199
|
+
// Configure OCR
|
|
1200
|
+
const Tesseract = await import('tesseract.js');
|
|
1201
|
+
await worker.setParameters({
|
|
1202
|
+
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ',
|
|
1203
|
+
tessedit_pageseg_mode: area === 'top' || area === 'bottom'
|
|
1204
|
+
? Tesseract.PSM.SINGLE_LINE
|
|
1205
|
+
: Tesseract.PSM.SINGLE_BLOCK,
|
|
1206
|
+
preserve_interword_spaces: '1'
|
|
1207
|
+
});
|
|
1208
|
+
const { data: { text } } = await worker.recognize(tempCanvas);
|
|
1209
|
+
const cleanedText = this.normalizeActivityName(text);
|
|
1210
|
+
const bestMatch = this.findBestActivityMatch(cleanedText);
|
|
1211
|
+
return bestMatch || cleanedText;
|
|
1212
|
+
}
|
|
1213
|
+
catch (error) {
|
|
1214
|
+
console.warn('OCR extraction failed:', error);
|
|
1215
|
+
return '';
|
|
1216
|
+
}
|
|
1217
|
+
}
|
|
1218
|
+
/**
|
|
1219
|
+
* Initialize Tesseract worker for OCR
|
|
1220
|
+
*/
|
|
1221
|
+
async initializeTesseract() {
|
|
1222
|
+
if (!this.tesseractWorker) {
|
|
1223
|
+
const Tesseract = await import('tesseract.js');
|
|
1224
|
+
this.tesseractWorker = await Tesseract.createWorker('eng');
|
|
1225
|
+
await this.tesseractWorker.setParameters({
|
|
1226
|
+
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ',
|
|
1227
|
+
tessedit_pageseg_mode: Tesseract.PSM.SINGLE_BLOCK,
|
|
1228
|
+
preserve_interword_spaces: '1'
|
|
1229
|
+
});
|
|
1230
|
+
}
|
|
1231
|
+
return this.tesseractWorker;
|
|
1232
|
+
}
|
|
1233
|
+
/**
|
|
1234
|
+
* Normalize activity name for matching
|
|
1235
|
+
*/
|
|
1236
|
+
normalizeActivityName(text) {
|
|
1237
|
+
return text
|
|
1238
|
+
.toUpperCase()
|
|
1239
|
+
.trim()
|
|
1240
|
+
.replace(/[^A-Z\s]/g, '')
|
|
1241
|
+
.replace(/\s+/g, ' ')
|
|
1242
|
+
.replace(/\s*$/, '');
|
|
1243
|
+
}
|
|
1244
|
+
/**
|
|
1245
|
+
* Find best matching activity from pre-defined list
|
|
1246
|
+
*/
|
|
1247
|
+
findBestActivityMatch(detectedName) {
|
|
1248
|
+
const normalized = this.normalizeActivityName(detectedName);
|
|
1249
|
+
if (!normalized)
|
|
1250
|
+
return null;
|
|
1251
|
+
// Exact match first
|
|
1252
|
+
for (const [activityId] of SCM_ACTIVITIES_MAP) {
|
|
1253
|
+
if (this.normalizeActivityName(activityId) === normalized) {
|
|
1254
|
+
return activityId;
|
|
1255
|
+
}
|
|
1256
|
+
}
|
|
1257
|
+
// Fuzzy matching for partial matches
|
|
1258
|
+
for (const [activityId] of SCM_ACTIVITIES_MAP) {
|
|
1259
|
+
const activityNormalized = this.normalizeActivityName(activityId);
|
|
1260
|
+
if (activityNormalized.includes(normalized) || normalized.includes(activityNormalized)) {
|
|
1261
|
+
return activityId;
|
|
1262
|
+
}
|
|
1263
|
+
}
|
|
1264
|
+
return null;
|
|
1265
|
+
}
|
|
1266
|
+
/**
|
|
1267
|
+
* Calculate detection confidence score
|
|
1268
|
+
*/
|
|
1269
|
+
calculateDetectionConfidence(detected, matched, pss, psp) {
|
|
1270
|
+
let confidence = 0.3; // Base confidence
|
|
1271
|
+
// Text matching confidence
|
|
1272
|
+
const detectedNorm = this.normalizeActivityName(detected);
|
|
1273
|
+
const matchedNorm = this.normalizeActivityName(matched);
|
|
1274
|
+
if (detectedNorm === matchedNorm) {
|
|
1275
|
+
confidence += 0.5;
|
|
1276
|
+
}
|
|
1277
|
+
else if (detectedNorm.length > 0) {
|
|
1278
|
+
if (detectedNorm.includes(matchedNorm) || matchedNorm.includes(detectedNorm)) {
|
|
1279
|
+
confidence += 0.3;
|
|
1280
|
+
}
|
|
1281
|
+
else if (detectedNorm.length > 3) {
|
|
1282
|
+
const words1 = detectedNorm.split(' ').filter((w) => w.length > 2);
|
|
1283
|
+
const words2 = matchedNorm.split(' ').filter((w) => w.length > 2);
|
|
1284
|
+
const commonWords = words1.filter((w) => words2.includes(w));
|
|
1285
|
+
if (commonWords.length > 0) {
|
|
1286
|
+
confidence += 0.2;
|
|
1287
|
+
}
|
|
1288
|
+
else {
|
|
1289
|
+
confidence += 0.1;
|
|
1290
|
+
}
|
|
1291
|
+
}
|
|
1292
|
+
}
|
|
1293
|
+
// Color detection confidence
|
|
1294
|
+
if (pss !== 'white')
|
|
1295
|
+
confidence += 0.15;
|
|
1296
|
+
if (psp && psp !== 'white')
|
|
1297
|
+
confidence += 0.15;
|
|
1298
|
+
return Math.min(1.0, confidence);
|
|
1299
|
+
}
|
|
1300
|
+
/**
|
|
1301
|
+
* Map detected activities to definitions
|
|
1302
|
+
*/
|
|
1303
|
+
mapDetectedToDefinitions(detected) {
|
|
1304
|
+
const mapped = [];
|
|
1305
|
+
for (const [activityId, definition] of SCM_ACTIVITIES_MAP) {
|
|
1306
|
+
const detectedMatch = detected.find((d) => d.activityName === activityId);
|
|
1307
|
+
if (detectedMatch) {
|
|
1308
|
+
mapped.push({
|
|
1309
|
+
id: definition.id,
|
|
1310
|
+
typology: definition.typology,
|
|
1311
|
+
cluster: definition.cluster,
|
|
1312
|
+
psp: definition.hasPsp ? detectedMatch.psp : null,
|
|
1313
|
+
pss: detectedMatch.pss,
|
|
1314
|
+
detectionConfidence: detectedMatch.confidence
|
|
1315
|
+
});
|
|
1316
|
+
}
|
|
1317
|
+
else {
|
|
1318
|
+
mapped.push({
|
|
1319
|
+
id: definition.id,
|
|
1320
|
+
typology: definition.typology,
|
|
1321
|
+
cluster: definition.cluster,
|
|
1322
|
+
psp: definition.hasPsp ? 'white' : null,
|
|
1323
|
+
pss: 'white',
|
|
1324
|
+
detectionConfidence: 0
|
|
1325
|
+
});
|
|
1326
|
+
}
|
|
1327
|
+
}
|
|
1328
|
+
return mapped;
|
|
1329
|
+
}
|
|
1330
|
+
/**
|
|
1331
|
+
* Cleanup Tesseract worker
|
|
1332
|
+
*/
|
|
1333
|
+
async cleanup() {
|
|
1334
|
+
if (this.tesseractWorker) {
|
|
1335
|
+
await this.tesseractWorker.terminate();
|
|
1336
|
+
this.tesseractWorker = null;
|
|
1337
|
+
}
|
|
1338
|
+
}
|
|
1339
|
+
}
|
|
1340
|
+
|
|
1341
|
+
/**
|
|
1342
|
+
* TMA data type definitions for the extractor package
|
|
1343
|
+
* Based on the original useExtractTMA.ts interfaces
|
|
1344
|
+
*/
|
|
1345
|
+
// Baseline options; TmaExtractor's constructor spreads caller overrides on top.
const DEFAULT_CONFIG = {
    // Include SCM by default as per Phase 2 requirements; when true,
    // extractFromPdf also attempts the SCM extraction step.
    includeSCM: true,
    // NOTE(review): presumably the PDF worker script location, with 'auto'
    // resolved by the consumer of this option - confirm against PdfProcessor.
    workerSrc: 'auto',
    // NOTE(review): presumably the Tesseract worker script location - the
    // code that reads it is not visible here; confirm.
    tesseractWorkerSrc: 'auto',
    // Gates the console output of TmaExtractor.logDebug.
    debug: false,
    // 30 seconds, expressed in milliseconds.
    timeoutMs: 30 * 1000,
};
|
|
1352
|
+
|
|
1353
|
+
/**
 * Main TMA extractor class that orchestrates the extraction process.
 *
 * Wires together PDF access (PdfProcessor), the text-based extractors (name,
 * talent order, strength, typology, personal branding) and the optional SCM
 * extraction (ScmExtractor), then passes the combined record through
 * DataTransformer.transformSinglePersonResult.
 *
 * The SCM extractor may hold a resource (see its own `cleanup()`); call
 * `cleanup()` on this class once the instance is no longer needed.
 */
class TmaExtractor {
    /**
     * @param {object} [config] - Overrides merged on top of DEFAULT_CONFIG.
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_CONFIG, ...config };
        this.pdfProcessor = new PdfProcessor(this.config);
        this.scmExtractor = new ScmExtractor();
    }
    /**
     * Validates input file.
     *
     * @param {*} file - The file to check; passed to PdfProcessor.validatePdfFile.
     * @throws An "INVALID_FILE_TYPE" error (built by createError) when the
     *   processor reports the file as invalid.
     */
    validateFile(file) {
        const validation = this.pdfProcessor.validatePdfFile(file);
        if (!validation.isValid) {
            throw createError("INVALID_FILE_TYPE", validation.error || "Invalid file");
        }
    }
    /**
     * Extracts all talent data from page texts.
     *
     * @param {object} pageTexts - Page texts keyed by `talentOrder`,
     *   `strength` and `typologyAndBranding`.
     * @param {number} pageCount - Page count of the source PDF, forwarded to
     *   the extractors that take it.
     * @returns {object} `{ name, talents, strength, typology, personalbranding }`.
     * @throws Any extractor failure, wrapped as "TALENT_EXTRACTION_ERROR".
     */
    extractTalentData(pageTexts, pageCount) {
        try {
            const name = extractName(pageTexts.talentOrder, pageCount);
            const talents = extractTalentOrder(pageTexts.talentOrder, false, pageCount);
            const strength = extractStrength(pageTexts.strength, pageCount);
            const typology = extractTypology(pageTexts.typologyAndBranding);
            const personalbranding = extractPersonalBranding(pageTexts.typologyAndBranding, pageCount);
            return {
                name,
                talents,
                strength,
                typology,
                personalbranding,
            };
        }
        catch (error) {
            throw wrapError(error, "TALENT_EXTRACTION_ERROR", "Failed to extract talent data");
        }
    }
    /**
     * Processes a single PDF file and extracts TMA data.
     *
     * SCM extraction (when `config.includeSCM` is set) is best effort: any
     * failure there is logged through logDebug and the result is returned
     * without SCM data.
     *
     * @param {*} file - The PDF to process.
     * @returns {Promise<*>} The output of
     *   DataTransformer.transformSinglePersonResult for the extracted person.
     * @throws TmaExtractionError instances are re-thrown untouched; any
     *   other failure is wrapped as "PDF_PROCESSING_ERROR".
     */
    async extractFromPdf(file) {
        // Validate before entering the try block so validation errors are
        // never re-wrapped by the generic handler below.
        this.validateFile(file);
        try {
            // Get page count and validate format
            const pageCount = await this.pdfProcessor.getPageCount(file);
            const formatValidation = validateAndGetFormat(pageCount);
            if (!formatValidation.isValid) {
                // Fallback text keeps the error readable when the validator
                // supplies no message (mirrors validateFile).
                throw createError("UNSUPPORTED_FORMAT", formatValidation.error || "Unsupported TMA format");
            }
            // Extract page texts
            const pageTexts = await this.pdfProcessor.extractPageTexts(file, formatValidation.pages);
            // Extract talent data
            const talentData = this.extractTalentData(pageTexts, pageCount);
            // Add SCM extraction if requested (Phase 2 feature)
            if (this.config.includeSCM) {
                try {
                    this.logDebug("Starting SCM extraction...");
                    // Get SCM page number based on TMA format
                    const scmPageNumber = this.pdfProcessor.getScmPageNumber(pageCount);
                    this.logDebug(`SCM page number for ${pageCount}-page TMA: ${scmPageNumber}`);
                    const scmPage = await this.pdfProcessor.getPage(file, scmPageNumber);
                    if (scmPage) {
                        const scmData = await this.scmExtractor.extractScmData(scmPage);
                        if (scmData) {
                            talentData.scm = scmData;
                            this.logDebug(`SCM extraction successful: ${scmData.activities.length} activities extracted`);
                        }
                        else {
                            this.logDebug("SCM extraction returned null, continuing without SCM data");
                        }
                    }
                    else {
                        this.logDebug("No SCM page found, continuing without SCM data");
                    }
                }
                catch (error) {
                    // Continue without SCM data - don't break the main extraction
                    this.logDebug(`SCM extraction failed, continuing without SCM data: ${error}`);
                }
            }
            // Transform to cleaned format
            const { talents } = talentData;
            const toTema = (talent) => talent.tema;
            const singlePersonResult = {
                person: {
                    ...talentData,
                    top14Talents: talents.slice(0, 14).map(toTema),
                    top7Talents: talents.slice(0, 7).map(toTema),
                    low14Talents: talents.slice(-14).map(toTema),
                },
            };
            return DataTransformer.transformSinglePersonResult(singlePersonResult);
        }
        catch (error) {
            if (error instanceof TmaExtractionError) {
                throw error;
            }
            throw wrapError(error, "PDF_PROCESSING_ERROR", "Failed to process PDF file");
        }
    }
    /**
     * Releases resources held by the SCM extractor by delegating to its
     * `cleanup()`. Call this when the instance will not be used again; do
     * not call it while an extractFromPdf call on this instance is still
     * running.
     *
     * @returns {Promise<void>}
     */
    async cleanup() {
        await this.scmExtractor.cleanup();
    }
    /**
     * Logs debug messages if debug mode is enabled.
     *
     * @param {string} message - Text to print, prefixed with "[TMA-Extractor]".
     */
    logDebug(message) {
        if (this.config.debug) {
            console.log(`[TMA-Extractor] ${message}`);
        }
    }
}
|
|
1465
|
+
/**
 * Main package export function.
 * Simple API that creates an extractor instance and processes the file.
 *
 * The extractor is private to this call, so whatever its SCM extractor
 * allocated can never be reused by the caller; it is released through
 * `scmExtractor.cleanup()` before returning, whether extraction succeeded or
 * failed.
 *
 * @param {*} file - The PDF to process.
 * @param {object} [config] - Options forwarded to the TmaExtractor constructor.
 * @returns {Promise<*>} Whatever TmaExtractor#extractFromPdf resolves to.
 * @throws Whatever TmaExtractor#extractFromPdf rejects with; a cleanup
 *   failure is only logged and never replaces the extraction outcome.
 */
async function tmaExtractor(file, config = {}) {
    const extractor = new TmaExtractor(config);
    try {
        return await extractor.extractFromPdf(file);
    }
    finally {
        try {
            await extractor.scmExtractor.cleanup();
        }
        catch (cleanupError) {
            // Best effort: never let cleanup mask the extraction result/error.
            console.warn('SCM extractor cleanup failed:', cleanupError);
        }
    }
}
|
|
1473
|
+
|
|
1474
|
+
export { DEFAULT_CONFIG, TmaExtractionError, TmaExtractor, tmaExtractor as default };
|
|
1475
|
+
//# sourceMappingURL=index.js.map
|