@odda-ai/matching-core 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +34 -0
- package/src/ai/AIProvider.ts +159 -0
- package/src/ai/adapters/AnthropicAdapter.ts +42 -0
- package/src/ai/adapters/OllamaAdapter.ts +42 -0
- package/src/ai/adapters/OpenAIAdapter.ts +53 -0
- package/src/ai/adapters/index.ts +3 -0
- package/src/ai/factory.ts +48 -0
- package/src/ai/index.ts +5 -0
- package/src/ai/registry.ts +15 -0
- package/src/ai/types.ts +59 -0
- package/src/cv-parser/PDFParserService.ts +160 -0
- package/src/cv-parser/index.ts +2 -0
- package/src/cv-parser/types.ts +58 -0
- package/src/features/ai-cv-resume.service.ts +104 -0
- package/src/features/ai-talent.service.ts +49 -0
- package/src/features/cv-chunking.service.ts +510 -0
- package/src/features/index.ts +5 -0
- package/src/features/job-matcher.service.ts +41 -0
- package/src/features/prompts.ts +621 -0
- package/src/features/system-messages.ts +28 -0
- package/src/features/types.ts +55 -0
package/package.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@odda-ai/matching-core",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Core AI provider library with support for OpenAI, Ollama, and Anthropic",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "index.js",
|
|
7
|
+
"types": "index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./index.d.ts",
|
|
11
|
+
"import": "./index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"files": [
|
|
15
|
+
"src",
|
|
16
|
+
"index.js",
|
|
17
|
+
"index.d.ts"
|
|
18
|
+
],
|
|
19
|
+
"scripts": {
|
|
20
|
+
"build": "tsc",
|
|
21
|
+
"prepublishOnly": "npm run build",
|
|
22
|
+
"test": "echo \"Error: no test specified\" && exit 1"
|
|
23
|
+
},
|
|
24
|
+
"author": "",
|
|
25
|
+
"license": "ISC",
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"axios": "^1.13.4",
|
|
28
|
+
"pdf-parse": "^1.1.1"
|
|
29
|
+
},
|
|
30
|
+
"devDependencies": {
|
|
31
|
+
"@types/node": "^22.10.5",
|
|
32
|
+
"@types/pdf-parse": "^1.1.4"
|
|
33
|
+
}
|
|
34
|
+
}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import axios, { AxiosError, type AxiosInstance } from "axios";
|
|
2
|
+
import type {
|
|
3
|
+
AIAdapter,
|
|
4
|
+
BaseAIConfig,
|
|
5
|
+
ChatMessage,
|
|
6
|
+
AIResponse,
|
|
7
|
+
} from "./types.js";
|
|
8
|
+
import { AI_ADAPTERS } from "./registry.js";
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Classe generica per interrogare qualsiasi provider AI
|
|
12
|
+
*/
|
|
13
|
+
export class AIProvider {
|
|
14
|
+
private config: BaseAIConfig;
|
|
15
|
+
private client: AxiosInstance;
|
|
16
|
+
private adapter: AIAdapter;
|
|
17
|
+
|
|
18
|
+
constructor(
|
|
19
|
+
config: BaseAIConfig,
|
|
20
|
+
adapter: AIAdapter | keyof typeof AI_ADAPTERS,
|
|
21
|
+
) {
|
|
22
|
+
this.config = {
|
|
23
|
+
temperature: 0.7,
|
|
24
|
+
maxTokens: 1000,
|
|
25
|
+
timeout: 300000,
|
|
26
|
+
...config,
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
// Determina l'adapter da usare
|
|
30
|
+
if (!adapter) {
|
|
31
|
+
throw new Error("È necessario specificare un adapter");
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
if (typeof adapter === "string") {
|
|
35
|
+
const registeredAdapter = AI_ADAPTERS[adapter];
|
|
36
|
+
if (!registeredAdapter) {
|
|
37
|
+
throw new Error(`Adapter non trovato: ${adapter}`);
|
|
38
|
+
}
|
|
39
|
+
this.adapter = registeredAdapter;
|
|
40
|
+
} else {
|
|
41
|
+
this.adapter = adapter;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// Configura il client HTTP
|
|
45
|
+
this.client = this.createClient();
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
private createClient(): AxiosInstance {
|
|
49
|
+
const headers = {
|
|
50
|
+
...this.adapter.configureHeaders?.(this.config),
|
|
51
|
+
...this.config.headers,
|
|
52
|
+
};
|
|
53
|
+
|
|
54
|
+
const axiosConfig: any = {
|
|
55
|
+
baseURL: this.config.baseURL,
|
|
56
|
+
headers,
|
|
57
|
+
};
|
|
58
|
+
|
|
59
|
+
if (this.config.timeout !== undefined) {
|
|
60
|
+
axiosConfig.timeout = this.config.timeout;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
return axios.create(axiosConfig);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* Invia una chat completion al provider AI
|
|
68
|
+
*/
|
|
69
|
+
async chat(messages: ChatMessage[]): Promise<AIResponse> {
|
|
70
|
+
try {
|
|
71
|
+
const { endpoint, body } = this.adapter.transformRequest(
|
|
72
|
+
messages,
|
|
73
|
+
this.config,
|
|
74
|
+
);
|
|
75
|
+
const response = await this.client.post(endpoint, body);
|
|
76
|
+
|
|
77
|
+
return this.adapter.transformResponse(response);
|
|
78
|
+
} catch (error) {
|
|
79
|
+
if (axios.isAxiosError(error)) {
|
|
80
|
+
const errorMessage =
|
|
81
|
+
error.response?.data?.error?.message ||
|
|
82
|
+
error.response?.data?.message ||
|
|
83
|
+
error.message;
|
|
84
|
+
|
|
85
|
+
const axiosError = new Error(`Errore nella richiesta AI: ${errorMessage}`);
|
|
86
|
+
axiosError.name = "AIProviderError";
|
|
87
|
+
(axiosError as any).status = error.response?.status;
|
|
88
|
+
(axiosError as any).data = error.response?.data;
|
|
89
|
+
throw axiosError;
|
|
90
|
+
}
|
|
91
|
+
throw error;
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Invia un singolo prompt al provider AI
|
|
97
|
+
*/
|
|
98
|
+
async prompt(prompt: string, systemMessage?: string): Promise<AIResponse> {
|
|
99
|
+
const messages: ChatMessage[] = [];
|
|
100
|
+
|
|
101
|
+
if (systemMessage) {
|
|
102
|
+
messages.push({
|
|
103
|
+
role: "system",
|
|
104
|
+
content: systemMessage,
|
|
105
|
+
});
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
messages.push({
|
|
109
|
+
role: "user",
|
|
110
|
+
content: prompt,
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
return this.chat(messages);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* Aggiorna la configurazione del provider
|
|
118
|
+
*/
|
|
119
|
+
updateConfig(config: Partial<BaseAIConfig>): void {
|
|
120
|
+
this.config = { ...this.config, ...config };
|
|
121
|
+
|
|
122
|
+
// Ricrea il client se la configurazione HTTP è cambiata
|
|
123
|
+
if (config.baseURL || config.apiKey || config.headers || config.timeout) {
|
|
124
|
+
this.client = this.createClient();
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/**
|
|
129
|
+
* Ottiene la configurazione corrente
|
|
130
|
+
*/
|
|
131
|
+
getConfig(): BaseAIConfig {
|
|
132
|
+
return { ...this.config };
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Ottiene l'adapter corrente
|
|
137
|
+
*/
|
|
138
|
+
getAdapter(): AIAdapter {
|
|
139
|
+
return this.adapter;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Cambia l'adapter mantenendo la stessa configurazione
|
|
144
|
+
*/
|
|
145
|
+
setAdapter(adapter: AIAdapter | keyof typeof AI_ADAPTERS): void {
|
|
146
|
+
if (typeof adapter === "string") {
|
|
147
|
+
const registeredAdapter = AI_ADAPTERS[adapter];
|
|
148
|
+
if (!registeredAdapter) {
|
|
149
|
+
throw new Error(`Adapter non trovato: ${adapter}`);
|
|
150
|
+
}
|
|
151
|
+
this.adapter = registeredAdapter;
|
|
152
|
+
} else {
|
|
153
|
+
this.adapter = adapter;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
// Ricrea il client con i nuovi header dell'adapter
|
|
157
|
+
this.client = this.createClient();
|
|
158
|
+
}
|
|
159
|
+
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import type { AIAdapter, ChatMessage, BaseAIConfig, AIResponse } from '../types.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Adapter per Anthropic Claude
|
|
5
|
+
*/
|
|
6
|
+
export class AnthropicAdapter implements AIAdapter {
|
|
7
|
+
transformRequest(messages: ChatMessage[], config: BaseAIConfig) {
|
|
8
|
+
return {
|
|
9
|
+
endpoint: '/v1/messages',
|
|
10
|
+
body: {
|
|
11
|
+
model: config.model,
|
|
12
|
+
messages: messages.filter(m => m.role !== 'system'),
|
|
13
|
+
system: messages.find(m => m.role === 'system')?.content,
|
|
14
|
+
max_tokens: config.maxTokens || 1024,
|
|
15
|
+
temperature: config.temperature,
|
|
16
|
+
...config.additionalParams
|
|
17
|
+
}
|
|
18
|
+
};
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
transformResponse(response: any): AIResponse {
|
|
22
|
+
const data = response.data;
|
|
23
|
+
return {
|
|
24
|
+
content: data.content?.[0]?.text || '',
|
|
25
|
+
model: data.model,
|
|
26
|
+
usage: data.usage ? {
|
|
27
|
+
promptTokens: data.usage.input_tokens,
|
|
28
|
+
completionTokens: data.usage.output_tokens,
|
|
29
|
+
totalTokens: data.usage.input_tokens + data.usage.output_tokens
|
|
30
|
+
} : undefined,
|
|
31
|
+
raw: data
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
configureHeaders(config: BaseAIConfig): Record<string, string> {
|
|
36
|
+
return {
|
|
37
|
+
'Content-Type': 'application/json',
|
|
38
|
+
'x-api-key': config.apiKey || '',
|
|
39
|
+
'anthropic-version': '2023-06-01'
|
|
40
|
+
};
|
|
41
|
+
}
|
|
42
|
+
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import type { AIAdapter, ChatMessage, BaseAIConfig, AIResponse } from '../types.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Adapter per Ollama
|
|
5
|
+
*/
|
|
6
|
+
export class OllamaAdapter implements AIAdapter {
|
|
7
|
+
transformRequest(messages: ChatMessage[], config: BaseAIConfig) {
|
|
8
|
+
return {
|
|
9
|
+
endpoint: '/chat',
|
|
10
|
+
body: {
|
|
11
|
+
model: config.model,
|
|
12
|
+
messages,
|
|
13
|
+
stream: false,
|
|
14
|
+
options: {
|
|
15
|
+
temperature: config.temperature,
|
|
16
|
+
num_predict: config.maxTokens,
|
|
17
|
+
...config.additionalParams
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
};
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
transformResponse(response: any): AIResponse {
|
|
24
|
+
const data = response.data;
|
|
25
|
+
return {
|
|
26
|
+
content: data.message?.content || '',
|
|
27
|
+
model: data.model,
|
|
28
|
+
usage: data.prompt_eval_count && data.eval_count ? {
|
|
29
|
+
promptTokens: data.prompt_eval_count,
|
|
30
|
+
completionTokens: data.eval_count,
|
|
31
|
+
totalTokens: data.prompt_eval_count + data.eval_count
|
|
32
|
+
} : undefined,
|
|
33
|
+
raw: data
|
|
34
|
+
};
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
configureHeaders(): Record<string, string> {
|
|
38
|
+
return {
|
|
39
|
+
'Content-Type': 'application/json'
|
|
40
|
+
};
|
|
41
|
+
}
|
|
42
|
+
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import type { AIAdapter, ChatMessage, BaseAIConfig, AIResponse } from '../types.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Adapter per OpenAI e provider compatibili (Azure OpenAI, ecc.)
|
|
5
|
+
*/
|
|
6
|
+
export class OpenAIAdapter implements AIAdapter {
|
|
7
|
+
transformRequest(messages: ChatMessage[], config: BaseAIConfig) {
|
|
8
|
+
const body: any = {
|
|
9
|
+
model: config.model,
|
|
10
|
+
messages,
|
|
11
|
+
// temperature: config.temperature,
|
|
12
|
+
// ...config.additionalParams
|
|
13
|
+
};
|
|
14
|
+
|
|
15
|
+
// Usa max_completion_tokens per i nuovi modelli, max_tokens per i vecchi
|
|
16
|
+
// if (config.maxTokens) {
|
|
17
|
+
// if (config.model?.includes('gpt-4') || config.model?.includes('gpt-5')) {
|
|
18
|
+
// body.max_completion_tokens = config.maxTokens;
|
|
19
|
+
// } else {
|
|
20
|
+
// body.max_tokens = config.maxTokens;
|
|
21
|
+
// }
|
|
22
|
+
// }
|
|
23
|
+
|
|
24
|
+
return {
|
|
25
|
+
endpoint: '/chat/completions',
|
|
26
|
+
body
|
|
27
|
+
};
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
transformResponse(response: any): AIResponse {
|
|
31
|
+
const data = response.data;
|
|
32
|
+
return {
|
|
33
|
+
content: data.choices?.[0]?.message?.content || '',
|
|
34
|
+
model: data.model,
|
|
35
|
+
usage: data.usage ? {
|
|
36
|
+
promptTokens: data.usage.prompt_tokens,
|
|
37
|
+
completionTokens: data.usage.completion_tokens,
|
|
38
|
+
totalTokens: data.usage.total_tokens
|
|
39
|
+
} : undefined,
|
|
40
|
+
raw: data
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
configureHeaders(config: BaseAIConfig): Record<string, string> {
|
|
45
|
+
const headers: Record<string, string> = {
|
|
46
|
+
'Content-Type': 'application/json'
|
|
47
|
+
};
|
|
48
|
+
if (config.apiKey) {
|
|
49
|
+
headers['Authorization'] = `Bearer ${config.apiKey}`;
|
|
50
|
+
}
|
|
51
|
+
return headers;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { AIProvider } from './AIProvider.js';
|
|
2
|
+
import type { BaseAIConfig, AIAdapter } from './types.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Factory helper per creare provider comuni
|
|
6
|
+
*/
|
|
7
|
+
export const createAIProvider = {
|
|
8
|
+
openai: (apiKey: string, model = 'gpt-4', config?: Partial<BaseAIConfig>) =>
|
|
9
|
+
new AIProvider({
|
|
10
|
+
baseURL: 'https://api.openai.com/v1',
|
|
11
|
+
apiKey,
|
|
12
|
+
model,
|
|
13
|
+
...config
|
|
14
|
+
}, 'openai'),
|
|
15
|
+
|
|
16
|
+
ollama: (model = 'llama2', baseURL = 'http://localhost:11434/api', config?: Partial<BaseAIConfig>) =>
|
|
17
|
+
new AIProvider({
|
|
18
|
+
baseURL,
|
|
19
|
+
model,
|
|
20
|
+
...config
|
|
21
|
+
}, 'ollama'),
|
|
22
|
+
|
|
23
|
+
anthropic: (apiKey: string, model = 'claude-3-5-sonnet-20241022', config?: Partial<BaseAIConfig>) =>
|
|
24
|
+
new AIProvider({
|
|
25
|
+
baseURL: 'https://api.anthropic.com',
|
|
26
|
+
apiKey,
|
|
27
|
+
model,
|
|
28
|
+
...config
|
|
29
|
+
}, 'anthropic'),
|
|
30
|
+
|
|
31
|
+
azureOpenAI: (apiKey: string, endpoint: string, deployment: string, config?: Partial<BaseAIConfig>) =>
|
|
32
|
+
new AIProvider({
|
|
33
|
+
baseURL: `${endpoint}/openai/deployments/${deployment}`,
|
|
34
|
+
apiKey,
|
|
35
|
+
model: deployment,
|
|
36
|
+
headers: {
|
|
37
|
+
'api-key': apiKey
|
|
38
|
+
},
|
|
39
|
+
...config
|
|
40
|
+
}, 'azure'),
|
|
41
|
+
|
|
42
|
+
custom: (baseURL: string, adapter: AIAdapter, config?: Partial<BaseAIConfig>) =>
|
|
43
|
+
new AIProvider({
|
|
44
|
+
baseURL,
|
|
45
|
+
model: config?.model || 'default',
|
|
46
|
+
...config
|
|
47
|
+
}, adapter)
|
|
48
|
+
};
|
package/src/ai/index.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { OpenAIAdapter } from './adapters/OpenAIAdapter.js';
|
|
2
|
+
import { OllamaAdapter } from './adapters/OllamaAdapter.js';
|
|
3
|
+
import { AnthropicAdapter } from './adapters/AnthropicAdapter.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Registry of the available adapter instances, keyed by provider name.
 * AIProvider accepts these keys wherever an AIAdapter is expected.
 */
export const AI_ADAPTERS = {
  openai: new OpenAIAdapter(),
  ollama: new OllamaAdapter(),
  anthropic: new AnthropicAdapter(),
  // Compatibility aliases: Azure OpenAI speaks the same chat-completions
  // protocol, so both names reuse the OpenAI adapter (as fresh instances).
  azure: new OpenAIAdapter(),
  'azure-openai': new OpenAIAdapter()
} as const;
|
package/src/ai/types.ts
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
 * A single message in the provider-neutral chat format.
 */
export interface ChatMessage {
  // Typically "system" | "user" | "assistant"; kept as a plain string so
  // provider-specific roles (e.g. "tool") are not rejected.
  role: string;
  content: string;
  // Optional participant name, supported by some providers (e.g. OpenAI).
  name?: string;
}

/**
 * Normalized response returned by every adapter.
 */
export interface AIResponse {
  content: string;
  model?: string;
  // Token accounting, present only when the provider reports it.
  usage?: {
    promptTokens: number;
    completionTokens: number;
    totalTokens: number;
  } | undefined;
  // Raw provider payload, for callers needing provider-specific fields.
  raw?: any;
}

/**
 * Base configuration for an AI provider.
 */
export interface BaseAIConfig {
  baseURL: string;
  model: string;
  apiKey?: string;
  // Extra HTTP headers; merged over the adapter's own headers (user wins).
  headers?: Record<string, string>;
  temperature?: number;
  maxTokens?: number;
  // Request timeout in milliseconds.
  timeout?: number;
  // Catch-all for provider-specific settings (e.g. additionalParams).
  [key: string]: any;
}

/**
 * Adapter that translates requests and responses for a specific provider.
 */
export interface AIAdapter {
  /**
   * Transforms messages into the provider's request format.
   */
  transformRequest(messages: ChatMessage[], config: BaseAIConfig): {
    endpoint: string;
    body: any;
  };

  /**
   * Transforms the provider response into the standard format.
   */
  transformResponse(response: any): AIResponse;

  /**
   * Configures provider-specific HTTP headers (optional; e.g. Ollama
   * needs no auth header).
   */
  configureHeaders?(config: BaseAIConfig): Record<string, string>;
}
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import * as fs from 'fs';
|
|
2
|
+
import pdfParse from 'pdf-parse';
|
|
3
|
+
import type { PDFExtractionOptions, PDFExtractionResult } from './types.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Service for extracting text from PDF files via pdf-parse.
 */
export class PDFParserService {
  /**
   * Extracts text from a PDF file.
   * @param pdfPath Path of the PDF file, or a Buffer with its contents
   * @param options Extraction options (page range, formatting, separator)
   * @returns Extracted text, page count, optional per-page texts and metadata
   * @throws Error wrapping any read/parse failure
   */
  async extractText(
    pdfPath: string | Buffer,
    options: PDFExtractionOptions = {}
  ): Promise<PDFExtractionResult> {
    try {
      // Read the file when a path is given; a Buffer is used as-is.
      const dataBuffer = typeof pdfPath === 'string'
        ? await fs.promises.readFile(pdfPath)
        : pdfPath;

      // Extract the text with pdf-parse; `max` caps parsing at endPage.
      const pdfData = await pdfParse(dataBuffer, {
        max: options.endPage,
        pagerender: this.createPageRenderer(options)
      });

      // Per-page extraction when a page range is requested.
      // NOTE(review): pdf-parse's `max: i` returns the text of pages 1..i,
      // so each entry here appears to be the *cumulative* text up to page i,
      // not page i alone — and the whole document is re-parsed per page.
      // Verify against pdf-parse behavior before relying on pageTexts.
      const pageTexts: string[] = [];
      if (options.startPage || options.endPage) {
        const start = options.startPage || 1;
        const end = options.endPage || pdfData.numpages;

        for (let i = start; i <= end && i <= pdfData.numpages; i++) {
          const pageData = await pdfParse(dataBuffer, {
            max: i,
            pagerender: this.createPageRenderer(options)
          });
          pageTexts.push(pageData.text);
        }
      }

      // Apply the page range to the combined text.
      // NOTE(review): this slices *lines* of the text by page numbers
      // (lines.slice(start, end)), which treats one line as one page.
      // That only holds for single-line pages and looks like a bug —
      // confirm the intended semantics.
      let text = pdfData.text;
      if (options.startPage || options.endPage) {
        const lines = text.split('\n');
        const start = options.startPage ? (options.startPage - 1) : 0;
        const end = options.endPage;
        text = lines.slice(start, end).join(options.pageSeparator || '\n');
      }

      return {
        text: text.trim(),
        totalPages: pdfData.numpages,
        pageTexts: pageTexts.length > 0 ? pageTexts : undefined,
        metadata: this.extractMetadata(pdfData.info)
      };
    } catch (error) {
      // Normalize all failures into a single Error with context.
      throw new Error(`Errore nell'estrazione del testo dal PDF: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  /**
   * Extracts text from several PDFs concurrently.
   * @param pdfPaths Array of PDF paths or Buffers
   * @param options Extraction options applied to every document
   * @returns One result per input, in the same order; rejects if any fails
   */
  async extractTextFromMultiple(
    pdfPaths: (string | Buffer)[],
    options: PDFExtractionOptions = {}
  ): Promise<PDFExtractionResult[]> {
    const promises = pdfPaths.map(path => this.extractText(path, options));
    return Promise.all(promises);
  }

  /**
   * Checks whether a file looks like a valid PDF.
   * Only inspects the magic-number header; does not fully parse the file.
   * @param pdfPath Path of the PDF file or a Buffer
   */
  async isValidPDF(pdfPath: string | Buffer): Promise<boolean> {
    try {
      const dataBuffer = typeof pdfPath === 'string'
        ? await fs.promises.readFile(pdfPath)
        : pdfPath;

      // Check the PDF magic number (%PDF-).
      const header = dataBuffer.slice(0, 5).toString();
      return header === '%PDF-';
    } catch {
      // Unreadable file => not a valid PDF from the caller's perspective.
      return false;
    }
  }

  /**
   * Builds a custom page renderer that inserts newlines between text items
   * whose vertical position (transform[5]) differs, approximating the
   * original line layout. Returns undefined (pdf-parse default renderer)
   * unless preserveFormatting is set.
   */
  private createPageRenderer(options: PDFExtractionOptions) {
    if (!options.preserveFormatting) {
      return undefined;
    }

    return (pageData: any) => {
      return pageData.getTextContent().then((textContent: any) => {
        let lastY: number | null = null;
        let text = '';

        for (const item of textContent.items) {
          // A change in the y coordinate means a new visual line.
          if (lastY !== null && lastY !== item.transform[5]) {
            text += '\n';
          }
          text += item.str;
          lastY = item.transform[5];
        }

        return text;
      });
    };
  }

  /**
   * Maps pdf-parse's info object to the result's metadata shape.
   * Returns undefined when the document carries no info dictionary.
   */
  private extractMetadata(info: any): PDFExtractionResult['metadata'] {
    if (!info) return undefined;

    return {
      title: info.Title,
      author: info.Author,
      subject: info.Subject,
      keywords: info.Keywords,
      creator: info.Creator,
      producer: info.Producer,
      // NOTE(review): PDF dates are often in the "D:YYYYMMDD..." format,
      // which new Date() does not parse — these may come back Invalid Date.
      creationDate: info.CreationDate ? new Date(info.CreationDate) : undefined,
      modificationDate: info.ModDate ? new Date(info.ModDate) : undefined
    };
  }

  /**
   * Extracts the text of a single page.
   * @param pdfPath Path of the PDF file or a Buffer
   * @param pageNumber Page number (1-based)
   */
  async extractPageText(
    pdfPath: string | Buffer,
    pageNumber: number
  ): Promise<string> {
    const result = await this.extractText(pdfPath, {
      startPage: pageNumber,
      endPage: pageNumber
    });
    return result.text;
  }
}

/**
 * Shared singleton instance of the parser.
 */
export const pdfParser = new PDFParserService();
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
 * Options for extracting text from a PDF
 */
export interface PDFExtractionOptions {
  /**
   * Page number to start extraction from (1-based)
   */
  startPage?: number;

  /**
   * Page number to extract up to, inclusive (1-based)
   */
  endPage?: number;

  /**
   * When true, preserves formatting and line breaks using a custom
   * page renderer
   */
  preserveFormatting?: boolean;

  /**
   * Separator inserted between pages (defaults to "\n")
   */
  pageSeparator?: string;
}

/**
 * Result of a text extraction
 */
export interface PDFExtractionResult {
  /**
   * Text extracted from the PDF
   */
  text: string;

  /**
   * Total number of pages in the document
   */
  totalPages: number;

  /**
   * Text extracted for each page (only set when a page range was requested)
   */
  pageTexts?: string[] | undefined;

  /**
   * PDF metadata (when available in the document's info dictionary)
   */
  metadata?: {
    title?: string;
    author?: string;
    subject?: string;
    keywords?: string;
    creator?: string;
    producer?: string;
    creationDate?: Date | undefined;
    modificationDate?: Date | undefined;
  } | undefined;
}
|