email-origin-chain 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +425 -0
- package/dist/detectors/crisp-detector.d.ts +11 -0
- package/dist/detectors/crisp-detector.js +46 -0
- package/dist/detectors/index.d.ts +5 -0
- package/dist/detectors/index.js +11 -0
- package/dist/detectors/new-outlook-detector.d.ts +10 -0
- package/dist/detectors/new-outlook-detector.js +112 -0
- package/dist/detectors/outlook-empty-header-detector.d.ts +16 -0
- package/dist/detectors/outlook-empty-header-detector.js +64 -0
- package/dist/detectors/outlook-fr-detector.d.ts +10 -0
- package/dist/detectors/outlook-fr-detector.js +119 -0
- package/dist/detectors/outlook-reverse-fr-detector.d.ts +13 -0
- package/dist/detectors/outlook-reverse-fr-detector.js +86 -0
- package/dist/detectors/registry.d.ts +25 -0
- package/dist/detectors/registry.js +81 -0
- package/dist/detectors/reply-detector.d.ts +11 -0
- package/dist/detectors/reply-detector.js +82 -0
- package/dist/detectors/types.d.ts +38 -0
- package/dist/detectors/types.js +2 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +132 -0
- package/dist/inline-layer.d.ts +7 -0
- package/dist/inline-layer.js +116 -0
- package/dist/mime-layer.d.ts +15 -0
- package/dist/mime-layer.js +70 -0
- package/dist/types.d.ts +63 -0
- package/dist/types.js +2 -0
- package/dist/utils/cleaner.d.ts +16 -0
- package/dist/utils/cleaner.js +51 -0
- package/dist/utils.d.ts +17 -0
- package/dist/utils.js +221 -0
- package/docs/TEST_COVERAGE.md +54 -0
- package/docs/architecture/README.md +27 -0
- package/docs/architecture/phase1_cc_fix.md +223 -0
- package/docs/architecture/phase2_plugin_foundation.md +185 -0
- package/docs/architecture/phase3_fallbacks.md +62 -0
- package/docs/architecture/plugin_plan.md +318 -0
- package/docs/architecture/refactor_report.md +98 -0
- package/docs/detectors_usage.md +42 -0
- package/docs/walkthrough_address_fix.md +58 -0
- package/docs/walkthrough_deep_forward_fix.md +35 -0
- package/package.json +48 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Flo (yodjii)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
# email-origin-chain
|
|
2
|
+
|
|
3
|
+
**Uncover the full audit trail of your email threads.** Recursively deep-dives into forwards and replies to reconstruct the entire conversation history. Combines MIME traversal with multi-language text detection for a perfect message chain—giving you instant access to the original sender's details and the true source message.
|
|
4
|
+
|
|
5
|
+
## Architecture & Refactor
|
|
6
|
+
|
|
7
|
+
The library recently underwent a major refactor to a plugin-based architecture, improving compatibility and fixing recursion bugs.
|
|
8
|
+
|
|
9
|
+
Detailed documentation can be found in the [docs/architecture/](docs/architecture/README.md) directory:
|
|
10
|
+
- [Phase 1: Cc: Fix](docs/architecture/phase1_cc_fix.md)
|
|
11
|
+
- [Phase 2: Plugin Architecture](docs/architecture/phase2_plugin_foundation.md)
|
|
12
|
+
- [Phase 3: Full Compatibility (100%)](docs/architecture/phase3_fallbacks.md)
|
|
13
|
+
- [Deep Forward Fix Walkthrough](docs/walkthrough_deep_forward_fix.md)
|
|
14
|
+
- [Detector Usage & Priorities](docs/detectors_usage.md)
|
|
15
|
+
|
|
16
|
+
**✅ Test Coverage:** The library has been validated against **239 fixtures** from the `email-forward-parser-recursive` library with a **100% success rate** (239/239). This includes validating message bodies and ensuring non-message snippets are correctly identified. See [Test Coverage Report](docs/TEST_COVERAGE.md) for details.
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- **Hybrid Strategy**: Combines MIME recursion (`message/rfc822`) and inline text parsing
|
|
21
|
+
- **Reply & Forward Support**: Detects both traditional "Forwarded message" blocks and "On ... wrote:" reply headers in 15+ languages.
|
|
22
|
+
- **Robust Parsing**: Uses `mailparser` and `email-forward-parser` with custom detectors for Outlook Live, French headers, and more.
|
|
23
|
+
- **Type-Safe**: Full TypeScript support
|
|
24
|
+
- **Normalized Output**: Consistent result format with diagnostics
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
npm install email-origin-chain
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### CLI Utilities
|
|
33
|
+
You can test any email file directly using the extraction tool from the source repository (`bin/` and `tests/` are not shipped in the npm package):
|
|
34
|
+
```bash
|
|
35
|
+
npx tsx bin/extract.ts tests/fixtures/complex-forward.eml
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
```typescript
|
|
39
|
+
import { extractDeepestHybrid } from 'email-origin-chain';
|
|
40
|
+
|
|
41
|
+
// Process a full EML with hybrid strategy
|
|
42
|
+
const result = await extractDeepestHybrid(rawEmailString);
|
|
43
|
+
|
|
44
|
+
// Process ONLY the text/inline forwards (ignore MIME layer)
|
|
45
|
+
const textOnlyResult = await extractDeepestHybrid(rawText, { skipMimeLayer: true });
|
|
46
|
+
|
|
47
|
+
console.log(result.text); // The deepest original message
|
|
48
|
+
console.log(result.history); // Full conversation chain
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Options
|
|
52
|
+
|
|
53
|
+
| Option | Type | Default | Description |
|
|
54
|
+
| :--- | :--- | :--- | :--- |
|
|
55
|
+
| `skipMimeLayer` | `boolean` | `false` | If `true`, ignores MIME parsing (`rfc822`) and processes the input as raw text only. Ideal for inputs that are already stripped of headers. |
|
|
56
|
+
| `maxDepth` | `number` | `5` | Maximum number of recursion levels for MIME parsing. |
|
|
57
|
+
| `timeoutMs` | `number` | `5000` | Timeout for MIME processing to prevent blocking on huge files. |
|
|
58
|
+
|
|
59
|
+
## Response Format
|
|
60
|
+
|
|
61
|
+
The library returns a `ResultObject` with the following structure:
|
|
62
|
+
|
|
63
|
+
| Field | Type | Description |
|
|
64
|
+
| :--- | :--- | :--- |
|
|
65
|
+
| `from` | `object \| null` | `{ name?: string, address?: string }`. |
|
|
66
|
+
| `to` | `array` | List of primary recipients. |
|
|
67
|
+
| `cc` | `array` | List of CC recipients. |
|
|
68
|
+
| `subject` | `string \| null` | The original subject line of the deepest message. |
|
|
69
|
+
| `date_raw` | `string \| null` | The original date string found in the email headers. |
|
|
70
|
+
| `date_iso` | `string \| null` | ISO 8601 UTC representation (normalized via `any-date-parser`). |
|
|
71
|
+
| `text` | `string \| null` | Cleaned body content of the deepest message. |
|
|
72
|
+
| `attachments` | `array` | Metadata for MIME attachments found at the deepest level. |
|
|
73
|
+
| `history` | `array` | **Conversation Chaining**: Full audit trail of the discussion (see below). |
|
|
74
|
+
| `diagnostics` | `object` | Metadata about the parsing process. |
|
|
75
|
+
|
|
76
|
+
### Diagnostics Detail
|
|
77
|
+
|
|
78
|
+
- **`method`**: Strategy used to find the deepest message.
|
|
79
|
+
- `rfc822`: Found via recursive MIME attachments (highest reliability).
|
|
80
|
+
- `inline`: Found via text pattern detection (forwarded blocks).
|
|
81
|
+
- `fallback`: No forward found, returning current message info or best-effort extraction.
|
|
82
|
+
- **`depth`**: Number of forward levels traversed (0 for original email).
|
|
83
|
+
- **`parsedOk`**: `true` if at least a sender (`from`) and `subject` were successfully extracted.
|
|
84
|
+
- **`warnings`**: Array of non-fatal issues (e.g., date normalization failure).
|
|
85
|
+
|
|
86
|
+
### Conversation Chain Reconstruction (Full History)
|
|
87
|
+
|
|
88
|
+
Rather than just finding the "original" source, the library reconstructs the entire **Conversation Chain** (sometimes called *Email Threading* or *Message Chaining*). This allows you to audit every step of a transfer:
|
|
89
|
+
|
|
90
|
+
- **`history[0]`**: The **deepest** (oldest) message in the chain. Its fields mirror the top-level fields of the result object.
|
|
91
|
+
- **`history[1...n-1]`**: Intermediate forwards/messages.
|
|
92
|
+
- **`history[n]`**: The **root** (most recent) message you actually received.
|
|
93
|
+
|
|
94
|
+
Each history entry contains its own `from`, `to`, `cc`, `subject`, `date_iso`, `text`, and **`flags`** (array of strings). The contact fields (`from`, `to`, `cc`) are structured as objects containing:
|
|
95
|
+
- **`name`**: The display name (e.g., "John Doe").
|
|
96
|
+
- **`address`**: The email address (e.g., "john@example.com").
|
|
97
|
+
|
|
98
|
+
#### Possible Flags:
|
|
99
|
+
- `level:deepest`: The original source of the thread.
|
|
100
|
+
- `level:root`: The entry representing the received email itself.
|
|
101
|
+
- `trust:high_mime`: Metadata from a real `.eml` attachment (100% reliable).
|
|
102
|
+
- `trust:medium_inline`: Metadata extracted from text patterns (best effort).
|
|
103
|
+
- `method:crisp_engine`: Detected via standard international patterns (Crisp).
|
|
104
|
+
- `method:outlook_fr`: Detected via standard rules (French, Outlook).
|
|
105
|
+
- `method:outlook_reverse_fr`: Detected via reversed rules (Envoyé before De).
|
|
106
|
+
- `method:outlook_empty_header`: Detected via permissive rules (No date/email).
|
|
107
|
+
- `method:new_outlook`: Detected via modern localized headers (handles bolding and `mailto:` tags).
|
|
108
|
+
- `method:reply`: Detected via international reply patterns (`On ... wrote:`).
|
|
109
|
+
- `method:crisp`: Detected via standard international patterns (Crisp/Fallback).
|
|
110
|
+
- `content:silent_forward`: The user forwarded the message without adding any text.
|
|
111
|
+
- `date:unparseable`: A date string was found but could not be normalized to ISO.
|
|
112
|
+
|
|
113
|
+
### Typical Output Example
|
|
114
|
+
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"from": { "name": "Original Sender Name", "address": "original@source.com" },
|
|
118
|
+
"subject": "Initial Topic",
|
|
119
|
+
"text": "The very first message content.",
|
|
120
|
+
"history": [
|
|
121
|
+
{
|
|
122
|
+
"depth": 2,
|
|
123
|
+
"from": { "name": "Original Sender Name", "address": "original@source.com" },
|
|
124
|
+
"text": "The very first message content.",
|
|
125
|
+
"flags": ["method:outlook_fr", "trust:medium_inline", "level:deepest"]
|
|
126
|
+
},
|
|
127
|
+
{
|
|
128
|
+
"depth": 1,
|
|
129
|
+
"from": { "name": "Intermediate Person", "address": "inter@company.com" },
|
|
130
|
+
"text": "",
|
|
131
|
+
"flags": ["method:crisp", "trust:medium_inline", "content:silent_forward"]
|
|
132
|
+
},
|
|
133
|
+
{
|
|
134
|
+
"depth": 0,
|
|
135
|
+
"from": { "name": "Me", "address": "me@provider.com" },
|
|
136
|
+
"text": "Check this thread below!",
|
|
137
|
+
"flags": ["trust:high_mime", "level:root"]
|
|
138
|
+
}
|
|
139
|
+
],
|
|
140
|
+
"diagnostics": {
|
|
141
|
+
"method": "inline",
|
|
142
|
+
"depth": 2,
|
|
143
|
+
"parsedOk": true,
|
|
144
|
+
"warnings": []
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Examples
|
|
150
|
+
|
|
151
|
+
### 1. Simple Email (No Forward)
|
|
152
|
+
When no forward is detected, the library returns the metadata of the email itself.
|
|
153
|
+
|
|
154
|
+
```typescript
|
|
155
|
+
const email = `From: alice@example.com
|
|
156
|
+
Subject: Meeting Update
|
|
157
|
+
Date: Mon, 26 Jan 2026 15:00:00 +0100
|
|
158
|
+
|
|
159
|
+
Hey, the meeting is moved to 4 PM.`;
|
|
160
|
+
|
|
161
|
+
const result = await extractDeepestHybrid(email);
|
|
162
|
+
console.log(result.diagnostics.depth); // 0
|
|
163
|
+
console.log(result.from.address); // "alice@example.com"
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### 2. Double Inline Forward (Deep Extraction)
|
|
167
|
+
The library recursively follows "Forwarded message" blocks to find the original sender.
|
|
168
|
+
|
|
169
|
+
```typescript
|
|
170
|
+
const doubleForward = `
|
|
171
|
+
---------- Forwarded message ---------
|
|
172
|
+
From: Flo R. <florian.regalo@gmail.com>
|
|
173
|
+
Date: Mon, 26 Jan 2026 at 15:01
|
|
174
|
+
Subject: Fwd: original topic
|
|
175
|
+
|
|
176
|
+
---------- Forwarded message ---------
|
|
177
|
+
From: Original Sender <original@source.com>
|
|
178
|
+
Date: Mon, 26 Jan 2026 at 10:00
|
|
179
|
+
Subject: original topic
|
|
180
|
+
|
|
181
|
+
This is the very first message content.`;
|
|
182
|
+
|
|
183
|
+
const result = await extractDeepestHybrid(doubleForward);
|
|
184
|
+
console.log(result.diagnostics.depth); // 2
|
|
185
|
+
console.log(result.from.address); // "original@source.com"
|
|
186
|
+
console.log(result.text); // "This is the very first message content."
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### 3. Extreme Conversation Chain (5 Levels)
|
|
190
|
+
For complex corporate threads where a message is forwarded multiple times across different regional offices (e.g., mixing English and French headers).
|
|
191
|
+
|
|
192
|
+
```typescript
|
|
193
|
+
const extremeChain = `From: boss@corp.com
|
|
194
|
+
Date: Tue, 27 Jan 2026 02:35:18 +0100
|
|
195
|
+
Subject: FW: Final Review
|
|
196
|
+
|
|
197
|
+
Check the bottom of this long thread.
|
|
198
|
+
|
|
199
|
+
---------- Forwarded message ---------
|
|
200
|
+
From: "Intermediate Manager" <inter-2@corp.com>
|
|
201
|
+
Date: mardi 27 janvier 2026 à 00:30
|
|
202
|
+
Subject: Tr: Final Review
|
|
203
|
+
|
|
204
|
+
But it is quite normal!
|
|
205
|
+
|
|
206
|
+
De : "Employee" <real.end@gmail.com>
|
|
207
|
+
Envoyé : mardi 27 janvier 2026 à 00:30
|
|
208
|
+
À : "Recip" <inter-1@provider.com>
|
|
209
|
+
Objet : Fwd: Final Review
|
|
210
|
+
|
|
211
|
+
Great Yodjii, thank you
|
|
212
|
+
|
|
213
|
+
---------- Forwarded message ---------
|
|
214
|
+
From: <inter-1@provider.com>
|
|
215
|
+
Date: Tue, 27 Jan 2026 at 00:29
|
|
216
|
+
Subject: Fwd: original request
|
|
217
|
+
|
|
218
|
+
Ok noted, I am forwarding it back to you.
|
|
219
|
+
|
|
220
|
+
---------- Forwarded message ---------
|
|
221
|
+
From: <original@source.com>
|
|
222
|
+
Date: mardi 27 janvier 2026 à 00:28
|
|
223
|
+
Subject: original request
|
|
224
|
+
|
|
225
|
+
Hello, please forward this back to me.`;
|
|
226
|
+
|
|
227
|
+
const result = await extractDeepestHybrid(extremeChain);
|
|
228
|
+
console.log(result.diagnostics.depth); // 4 (5 messages total)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
**JSON Output Example (Extreme Case):**
|
|
232
|
+
|
|
233
|
+
```json
|
|
234
|
+
{
|
|
235
|
+
"from": { "address": "original@source.com" },
|
|
236
|
+
"subject": "original request",
|
|
237
|
+
"text": "Hello, please forward this back to me.",
|
|
238
|
+
"history": [
|
|
239
|
+
{
|
|
240
|
+
"depth": 4,
|
|
241
|
+
"from": { "address": "original@source.com" },
|
|
242
|
+
"text": "Hello, please forward this back to me.",
|
|
243
|
+
"flags": ["method:crisp", "trust:medium_inline", "level:deepest"]
|
|
244
|
+
},
|
|
245
|
+
{
|
|
246
|
+
"depth": 3,
|
|
247
|
+
"from": { "address": "inter-1@provider.com" },
|
|
248
|
+
"text": "Ok noted, I am forwarding it back to you.",
|
|
249
|
+
"flags": ["method:crisp", "trust:medium_inline"]
|
|
250
|
+
},
|
|
251
|
+
{
|
|
252
|
+
"depth": 2,
|
|
253
|
+
"from": { "name": "Employee", "address": "real.end@gmail.com" },
|
|
254
|
+
"text": "Great Yodjii, thank you",
|
|
255
|
+
"flags": ["method:outlook_empty_header", "trust:medium_inline"]
|
|
256
|
+
},
|
|
257
|
+
{
|
|
258
|
+
"depth": 1,
|
|
259
|
+
"from": { "name": "Intermediate Manager", "address": "inter-2@corp.com" },
|
|
260
|
+
"text": "But it is quite normal!",
|
|
261
|
+
"flags": ["method:crisp", "trust:medium_inline"]
|
|
262
|
+
},
|
|
263
|
+
{
|
|
264
|
+
"depth": 0,
|
|
265
|
+
"from": { "address": "boss@corp.com" },
|
|
266
|
+
"text": "Check the bottom of this long thread.",
|
|
267
|
+
"flags": ["trust:high_mime", "level:root"]
|
|
268
|
+
}
|
|
269
|
+
],
|
|
270
|
+
"diagnostics": {
|
|
271
|
+
"method": "inline",
|
|
272
|
+
"depth": 4,
|
|
273
|
+
"parsedOk": true,
|
|
274
|
+
"warnings": []
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
### 4. International Support (e.g., French)
|
|
280
|
+
The library automatically handles international headers like "De:", "Objet:", "Message transféré".
|
|
281
|
+
|
|
282
|
+
```typescript
|
|
283
|
+
const frenchEmail = `
|
|
284
|
+
---------- Message transféré ---------
|
|
285
|
+
De : Expert Auto <expert@assurance.fr>
|
|
286
|
+
Date : lun. 10 févr. 2025 à 11:39
|
|
287
|
+
Objet : Dossier #12345
|
|
288
|
+
|
|
289
|
+
Hello, here is your expertise report.`;
|
|
290
|
+
|
|
291
|
+
const result = await extractDeepestHybrid(frenchEmail);
|
|
292
|
+
console.log(result.from.name); // "Expert Auto"
|
|
293
|
+
console.log(result.date_iso); // "2025-02-10T10:39:00.000Z"
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## Extensions & Plugins (Custom Detectors)
|
|
297
|
+
|
|
298
|
+
The library allows you to inject **custom forward detectors** to handle specific corporate headers, regional formats, or proprietary email barriers that are not covered by the default detectors.
|
|
299
|
+
|
|
300
|
+
This system is built on **Dependency Injection**, meaning your custom logic lives in your application code, not deeper in `node_modules`.
|
|
301
|
+
|
|
302
|
+
### How to create a Plugin
|
|
303
|
+
Implement the `ForwardDetector` interface:
|
|
304
|
+
|
|
305
|
+
```typescript
|
|
306
|
+
import { extractDeepestHybrid, ForwardDetector, DetectionResult } from 'email-origin-chain';
|
|
307
|
+
|
|
308
|
+
class MyCustomDetector implements ForwardDetector {
|
|
309
|
+
// Unique name for your detector (will appear in 'diagnostics.method')
|
|
310
|
+
name = 'my-custom-detector';
|
|
311
|
+
|
|
312
|
+
// Priority: Lower number = Higher priority.
|
|
313
|
+
// -100 = Override Everything (Expert Plugins)
|
|
314
|
+
// -40 to -20 = Specific Built-in Detectors (Outlook, FR, etc.)
|
|
315
|
+
// 100 = Crisp (Default International Engine)
|
|
316
|
+
// 150 = Reply (Fallback)
|
|
317
|
+
priority = -100;
|
|
318
|
+
|
|
319
|
+
detect(text: string): DetectionResult {
|
|
320
|
+
// Example: Detects '--- START FORWARD ---'
|
|
321
|
+
const marker = '--- START FORWARD ---';
|
|
322
|
+
const idx = text.indexOf(marker);
|
|
323
|
+
|
|
324
|
+
if (idx !== -1) {
|
|
325
|
+
// Extracted body (text AFTER the marker)
|
|
326
|
+
const body = text.substring(idx + marker.length).trim();
|
|
327
|
+
|
|
328
|
+
// Text BEFORE the marker (the message from the forwarder)
|
|
329
|
+
const message = text.substring(0, idx).trim();
|
|
330
|
+
|
|
331
|
+
return {
|
|
332
|
+
found: true,
|
|
333
|
+
detector: this.name,
|
|
334
|
+
confidence: 'high',
|
|
335
|
+
message: message, // Important for history reconstruction
|
|
336
|
+
email: {
|
|
337
|
+
from: { name: 'Detected Sender', address: 'sender@example.com' },
|
|
338
|
+
subject: 'Extracted Subject',
|
|
339
|
+
date: new Date().toISOString(),
|
|
340
|
+
body: body
|
|
341
|
+
}
|
|
342
|
+
};
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
return { found: false, confidence: 'low' };
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
### How to use it
|
|
351
|
+
Pass your detector instance in the `options.customDetectors` array:
|
|
352
|
+
|
|
353
|
+
```typescript
|
|
354
|
+
const result = await extractDeepestHybrid(emailContent, {
|
|
355
|
+
customDetectors: [ new MyCustomDetector() ]
|
|
356
|
+
});
|
|
357
|
+
|
|
358
|
+
console.log(result.diagnostics.method); // "method:my-custom-detector"
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
---
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
### Malformed Inputs
|
|
365
|
+
If you pass a string that isn't an email (e.g., a simple welcome message), the library returns the text but sets `parsedOk` to `false`.
|
|
366
|
+
|
|
367
|
+
```typescript
|
|
368
|
+
const result = await extractDeepestHybrid("Welcome to our platform!");
|
|
369
|
+
|
|
370
|
+
console.log(result.from); // null
|
|
371
|
+
console.log(result.diagnostics.parsedOk); // false
|
|
372
|
+
console.log(result.text); // "Welcome to our platform!"
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
### Missing or Unparseable Dates
|
|
376
|
+
If a date cannot be normalized to ISO format, `date_iso` will be `null` and a warning will be added. You can still access the original string via `date_raw`.
|
|
377
|
+
|
|
378
|
+
```typescript
|
|
379
|
+
const result = await extractDeepestHybrid(emailWithBadDate);
|
|
380
|
+
|
|
381
|
+
if (!result.date_iso) {
|
|
382
|
+
console.warn(result.diagnostics.warnings[0]); // "Could not normalize date: ..."
|
|
383
|
+
console.log("Raw date was:", result.date_raw);
|
|
384
|
+
}
|
|
385
|
+
```
|
|
386
|
+
|
|
387
|
+
### Non-String Input
|
|
388
|
+
The library strictly requires a string input and will throw an Error otherwise.
|
|
389
|
+
|
|
390
|
+
```typescript
|
|
391
|
+
try {
|
|
392
|
+
await extractDeepestHybrid(null as any);
|
|
393
|
+
} catch (e) {
|
|
394
|
+
console.error(e.message); // "Input must be a string"
|
|
395
|
+
}
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
## The Expert Cleaner Utility
|
|
399
|
+
|
|
400
|
+
All built-in detectors use the `Cleaner` utility to ensure consistent text normalization across recursion levels.
|
|
401
|
+
|
|
402
|
+
### Key Features:
|
|
403
|
+
- **Normalization**: Unifies line breaks (`\r\n` -> `\n`), removes BOM, handles non-breaking spaces (`\u00A0`).
|
|
404
|
+
- **Memoization**: Cache layer to prevent re-processing the same text multiple times.
|
|
405
|
+
- **Quote Stripping**: Expertly removes `>` prefixes while preserving body structure.
|
|
406
|
+
- **Boundary Detection**: Uses the "Double Newline" rule found in professional parsers.
|
|
407
|
+
|
|
408
|
+
```typescript
|
|
409
|
+
import { Cleaner } from 'email-origin-chain/dist/utils/cleaner';
|
|
410
|
+
|
|
411
|
+
const normalized = Cleaner.normalize(rawText);
|
|
412
|
+
const bodyOnly = Cleaner.extractBody(lines, lastHeaderIndex);
|
|
413
|
+
const quoteFree = Cleaner.stripQuotes(bodyOnly);
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
## Strategy
|
|
417
|
+
|
|
418
|
+
1. **MIME Layer**: Recursively descends through `message/rfc822` attachments using `mailparser`.
|
|
419
|
+
2. **Inline Layer**: Iteratively scans the body for forwarded blocks using `email-forward-parser` patterns (supports multi-language).
|
|
420
|
+
3. **Date Normalization**: Uses `any-date-parser` and `luxon` for resilient international date parsing.
|
|
421
|
+
4. **Fallback**: Manual regex extraction if no structured headers are found.
|
|
422
|
+
|
|
423
|
+
## License
|
|
424
|
+
|
|
425
|
+
MIT - See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { ForwardDetector, DetectionResult } from './types';
|
|
2
|
+
/**
|
|
3
|
+
* Crisp detector - wraps the third-party email-forward-parser library.
|
|
4
|
+
* Generic engine: priority 100 ranks it after the specific detectors (lower number = higher priority).
|
|
5
|
+
*/
|
|
6
|
+
export declare class CrispDetector implements ForwardDetector {
|
|
7
|
+
readonly name = "crisp";
|
|
8
|
+
readonly priority = 100;
|
|
9
|
+
private parser;
|
|
10
|
+
detect(text: string): DetectionResult;
|
|
11
|
+
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.CrispDetector = void 0;
|
|
7
|
+
const email_forward_parser_1 = __importDefault(require("email-forward-parser"));
|
|
8
|
+
/**
 * CrispDetector - adapter around the third-party `email-forward-parser` library.
 *
 * This is the generic engine of the detector set: its priority of 100 places it
 * after the specific detectors (lower number = higher priority).
 */
class CrispDetector {
    constructor() {
        this.name = 'crisp';
        // Generic library-backed engine; specific detectors use lower numbers and outrank it.
        this.priority = 100;
        this.parser = new email_forward_parser_1.default();
    }
    /**
     * Runs the wrapped parser on `text` and maps its output to a DetectionResult.
     *
     * @param text Text to inspect for a forwarded message.
     * @returns `{ found: false, confidence: 'low' }` when the parser reports no
     *          forward or no email; otherwise a `found: true` result carrying the
     *          parsed sender, subject, date, body and the forwarder's own message.
     */
    detect(text) {
        const parsed = this.parser.read(text, undefined);
        const hasForward = Boolean(parsed?.forwarded && parsed?.email);
        if (!hasForward) {
            return { found: false, confidence: 'low' };
        }
        // The parser may expose the sender as a plain string, an object, or nothing.
        const rawFrom = parsed.email.from;
        let sender;
        if (typeof rawFrom === 'string') {
            sender = rawFrom;
        }
        else if (rawFrom) {
            sender = { name: rawFrom.name || '', address: rawFrom.address || '' };
        }
        else {
            sender = '';
        }
        // Falsy parser fields (empty string, null) are normalized to undefined.
        return {
            found: true,
            email: {
                from: sender,
                subject: parsed.email.subject || undefined,
                date: parsed.email.date || undefined,
                body: parsed.email.body || undefined
            },
            message: parsed.message || undefined,
            confidence: 'high'
        };
    }
}
|
|
46
|
+
exports.CrispDetector = CrispDetector;
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export { DetectorRegistry } from './registry';
|
|
2
|
+
export { CrispDetector } from './crisp-detector';
|
|
3
|
+
export { OutlookFRDetector } from './outlook-fr-detector';
|
|
4
|
+
export { NewOutlookDetector } from './new-outlook-detector';
|
|
5
|
+
export type { ForwardDetector, DetectionResult } from './types';
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.NewOutlookDetector = exports.OutlookFRDetector = exports.CrispDetector = exports.DetectorRegistry = void 0;
|
|
4
|
+
var registry_1 = require("./registry");
|
|
5
|
+
Object.defineProperty(exports, "DetectorRegistry", { enumerable: true, get: function () { return registry_1.DetectorRegistry; } });
|
|
6
|
+
var crisp_detector_1 = require("./crisp-detector");
|
|
7
|
+
Object.defineProperty(exports, "CrispDetector", { enumerable: true, get: function () { return crisp_detector_1.CrispDetector; } });
|
|
8
|
+
var outlook_fr_detector_1 = require("./outlook-fr-detector");
|
|
9
|
+
Object.defineProperty(exports, "OutlookFRDetector", { enumerable: true, get: function () { return outlook_fr_detector_1.OutlookFRDetector; } });
|
|
10
|
+
var new_outlook_detector_1 = require("./new-outlook-detector");
|
|
11
|
+
Object.defineProperty(exports, "NewOutlookDetector", { enumerable: true, get: function () { return new_outlook_detector_1.NewOutlookDetector; } });
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { ForwardDetector, DetectionResult } from './types';
|
|
2
|
+
/**
|
|
3
|
+
* Detector for "Plain Header" format (common in New Outlook, Outlook 2013, Mobile clients)
|
|
4
|
+
* Pattern: a block of localized "Label:" header lines such as From/De/Von, Subject/Objet/Betreff, Date/Sent/Envoyé, To/À/An (the implementation requires a From line and a Subject line).
|
|
5
|
+
*/
|
|
6
|
+
export declare class NewOutlookDetector implements ForwardDetector {
|
|
7
|
+
readonly name = "new_outlook";
|
|
8
|
+
readonly priority = -40;
|
|
9
|
+
detect(text: string): DetectionResult;
|
|
10
|
+
}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.NewOutlookDetector = void 0;
|
|
4
|
+
const cleaner_1 = require("../utils/cleaner");
|
|
5
|
+
/**
 * Localized spellings of the header labels recognized by NewOutlookDetector,
 * grouped by the logical header they stand for.
 */
const NEW_OUTLOOK_LABELS = {
    from: ['From', 'De', 'Von', 'Da', 'Od', 'Fra', 'Kimden', 'Van', 'Från', 'De ', 'Lähettäjä', 'Feladó', 'От'],
    date: ['Date', 'Sent', 'Envoyé', 'Gesendet', 'Inviato', 'Enviado', 'Data', 'Sendt', 'Lähetetty', 'Skickat', 'Datum', 'Dátum', 'Päivämäärä', 'Tarih', 'Дата'],
    subject: ['Subject', 'Objet', 'Betreff', 'Oggetto', 'Assunto', 'Asunto', 'Emne', 'Aihe', 'Ämne', 'Předmět', 'Predmet', 'Tárgy', 'Temat', 'Тема', 'Konu', 'Onderwerp'],
    to: ['To', 'À', 'A', 'An', 'Para', 'Til', 'Vastaanottaja', 'Till', 'Pro', 'Za', 'Címzett', 'Do', 'Кому', 'Kime', 'Aan']
};
/**
 * Compiles one case-insensitive matcher per label. A matcher accepts optional
 * leading whitespace and optional `*` / `_` emphasis markers around the label,
 * followed by a colon.
 */
const compileNewOutlookPatterns = (keys) => keys.map((key) => ({
    key,
    regex: new RegExp(`^\\s*[\\*_]*${key}[\\*_]*\\s*:`, 'i')
}));
// Compiled once at module load instead of once per (line, label) pair per call.
const NEW_OUTLOOK_PATTERNS = {
    from: compileNewOutlookPatterns(NEW_OUTLOOK_LABELS.from),
    date: compileNewOutlookPatterns(NEW_OUTLOOK_LABELS.date),
    subject: compileNewOutlookPatterns(NEW_OUTLOOK_LABELS.subject),
    to: compileNewOutlookPatterns(NEW_OUTLOOK_LABELS.to)
};
/**
 * Returns the first line of `searchLines` that matches any of `patterns`
 * (lines are scanned in order, labels in list order), or null when none match.
 *
 * @param searchLines Lines to scan.
 * @param patterns Compiled `{ key, regex }` matchers.
 * @param indexOffset Added to the reported index so that a match found in a
 *                    sub-window is expressed relative to the full line array.
 */
function findNewOutlookHeader(searchLines, patterns, indexOffset = 0) {
    for (let i = 0; i < searchLines.length; i++) {
        const line = searchLines[i];
        for (const { key, regex } of patterns) {
            if (regex.test(line)) {
                // The value is everything after the first colon on the line.
                const colonIndex = line.indexOf(':');
                return { index: indexOffset + i, line, key, value: line.substring(colonIndex + 1).trim() };
            }
        }
    }
    return null;
}
/**
 * Detector for "Plain Header" format (common in New Outlook, Outlook 2013, Mobile clients)
 * Pattern: Localized headers like From/De/Von, To/À/An, Date/Sent/Envoyé
 */
class NewOutlookDetector {
    constructor() {
        this.name = 'new_outlook';
        this.priority = -40; // Specific detector - High Priority (Override)
    }
    /**
     * Looks for a localized plain-text header block (From + Subject, optionally
     * Date and To) and splits the text around it.
     *
     * @param text Text to inspect.
     * @returns `{ found: false, confidence: 'low' }` when no From line exists or
     *          no Subject line sits in the header window around it; otherwise a
     *          `found: true` result with the parsed sender, subject, date, the
     *          body after the last header and the text preceding the block.
     */
    detect(text) {
        // 1. Normalization
        const normalized = cleaner_1.Cleaner.normalize(text);
        const lines = normalized.split('\n').map((l) => l.trimEnd());
        // 2. Identification: the first From-like line anchors the header block.
        const fromMatch = findNewOutlookHeader(lines, NEW_OUTLOOK_PATTERNS.from);
        if (!fromMatch)
            return { found: false, confidence: 'low' };
        const fromIndex = fromMatch.index;
        // The window starts up to two lines before From and stops at the first
        // empty line after it (end of headers), or after `windowLimit` lines.
        const windowLimit = 15;
        const searchStart = Math.max(0, fromIndex - 2);
        const searchEnd = Math.min(lines.length, fromIndex + windowLimit);
        const searchWindow = [];
        for (let i = searchStart; i < searchEnd; i++) {
            if (i > fromIndex && lines[i].trim() === '')
                break;
            searchWindow.push(lines[i]);
        }
        const findHeaderInWindow = (patterns) => findNewOutlookHeader(searchWindow, patterns, searchStart);
        // Subject is mandatory; Date and To are optional.
        const subject = findHeaderInWindow(NEW_OUTLOOK_PATTERNS.subject);
        if (!subject)
            return { found: false, confidence: 'low' };
        const date = findHeaderInWindow(NEW_OUTLOOK_PATTERNS.date);
        const to = findHeaderInWindow(NEW_OUTLOOK_PATTERNS.to);
        // Sender: address is taken from <...> or [...] (an optional "mailto:"
        // prefix is dropped); otherwise the whole value is used if it contains '@'.
        const fromValue = fromMatch.value;
        const emailMatch = fromValue.match(/[<\[](?:mailto:)?(.*?)[>\]]/i);
        const address = emailMatch ? emailMatch[1].trim() : (fromValue.includes('@') ? fromValue : '');
        const name = fromValue.replace(/[<\[].*?[>\]]/g, '').trim() || address;
        const dateIndex = date ? date.index : -1;
        const toIndex = to ? to.index : -1;
        const lastHeaderIndex = Math.max(fromIndex, subject.index, dateIndex, toIndex);
        // 3. Body extraction: everything after the last recognized header.
        const bodyContent = cleaner_1.Cleaner.extractBody(lines, lastHeaderIndex);
        // NOTE(review): the header patterns only allow whitespace, '*' or '_' before
        // the label, so a From line starting with '>' does not appear to be
        // matchable and this quote-stripping branch looks unreachable - confirm.
        const finalBody = fromMatch.line.startsWith('>') ? cleaner_1.Cleaner.stripQuotes(bodyContent) : bodyContent;
        // 4. Message (preceding text): look back up to five lines, skipping blank
        // ones, for a separator ("-- ... --" or "___") and cut the message there.
        let messageEndIndex = fromIndex;
        if (messageEndIndex > 0) {
            for (let k = 1; k <= 5; k++) {
                if (fromIndex - k < 0)
                    break;
                const prevLine = lines[fromIndex - k].trim();
                if (prevLine.match(/^-{2,}.*-{2,}$/) || prevLine.match(/^_{3,}$/)) {
                    messageEndIndex = fromIndex - k;
                    break;
                }
                if (prevLine === '')
                    continue;
                break;
            }
        }
        const message = messageEndIndex > 0 ? lines.slice(0, messageEndIndex).join('\n').trim() : undefined;
        return {
            found: true,
            email: {
                from: address ? { name: name.replace(/["']/g, ''), address: address } : name,
                subject: subject.value,
                date: date ? date.value : undefined,
                body: finalBody
            },
            message: message,
            confidence: 'medium'
        };
    }
}
|
|
112
|
+
exports.NewOutlookDetector = NewOutlookDetector;
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { ForwardDetector, DetectionResult } from './types';
|
|
2
|
+
/**
|
|
3
|
+
* Detector for Outlook forwards where the "Envoyé:" (Sent) header is present but empty.
|
|
4
|
+
* Example of a targeted block (note the empty "Envoyé:" value):
|
|
5
|
+
* ________________________________
|
|
6
|
+
* De: Florian M.
|
|
7
|
+
* Envoyé:
|
|
8
|
+
* À: Flo M.
|
|
9
|
+
* Objet: RE: ...
|
|
10
|
+
*/
|
|
11
|
+
export declare class OutlookEmptyHeaderDetector implements ForwardDetector {
|
|
12
|
+
readonly name = "outlook_empty_header";
|
|
13
|
+
readonly priority = 50;
|
|
14
|
+
private readonly HEADER_PATTERN;
|
|
15
|
+
detect(text: string): DetectionResult;
|
|
16
|
+
}
|