email-origin-chain 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +425 -0
  3. package/dist/detectors/crisp-detector.d.ts +11 -0
  4. package/dist/detectors/crisp-detector.js +46 -0
  5. package/dist/detectors/index.d.ts +5 -0
  6. package/dist/detectors/index.js +11 -0
  7. package/dist/detectors/new-outlook-detector.d.ts +10 -0
  8. package/dist/detectors/new-outlook-detector.js +112 -0
  9. package/dist/detectors/outlook-empty-header-detector.d.ts +16 -0
  10. package/dist/detectors/outlook-empty-header-detector.js +64 -0
  11. package/dist/detectors/outlook-fr-detector.d.ts +10 -0
  12. package/dist/detectors/outlook-fr-detector.js +119 -0
  13. package/dist/detectors/outlook-reverse-fr-detector.d.ts +13 -0
  14. package/dist/detectors/outlook-reverse-fr-detector.js +86 -0
  15. package/dist/detectors/registry.d.ts +25 -0
  16. package/dist/detectors/registry.js +81 -0
  17. package/dist/detectors/reply-detector.d.ts +11 -0
  18. package/dist/detectors/reply-detector.js +82 -0
  19. package/dist/detectors/types.d.ts +38 -0
  20. package/dist/detectors/types.js +2 -0
  21. package/dist/index.d.ts +6 -0
  22. package/dist/index.js +132 -0
  23. package/dist/inline-layer.d.ts +7 -0
  24. package/dist/inline-layer.js +116 -0
  25. package/dist/mime-layer.d.ts +15 -0
  26. package/dist/mime-layer.js +70 -0
  27. package/dist/types.d.ts +63 -0
  28. package/dist/types.js +2 -0
  29. package/dist/utils/cleaner.d.ts +16 -0
  30. package/dist/utils/cleaner.js +51 -0
  31. package/dist/utils.d.ts +17 -0
  32. package/dist/utils.js +221 -0
  33. package/docs/TEST_COVERAGE.md +54 -0
  34. package/docs/architecture/README.md +27 -0
  35. package/docs/architecture/phase1_cc_fix.md +223 -0
  36. package/docs/architecture/phase2_plugin_foundation.md +185 -0
  37. package/docs/architecture/phase3_fallbacks.md +62 -0
  38. package/docs/architecture/plugin_plan.md +318 -0
  39. package/docs/architecture/refactor_report.md +98 -0
  40. package/docs/detectors_usage.md +42 -0
  41. package/docs/walkthrough_address_fix.md +58 -0
  42. package/docs/walkthrough_deep_forward_fix.md +35 -0
  43. package/package.json +48 -0
package/dist/index.js ADDED
@@ -0,0 +1,132 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.extractDeepestHybrid = extractDeepestHybrid;
18
+ const mime_layer_1 = require("./mime-layer");
19
+ const inline_layer_1 = require("./inline-layer");
20
+ const utils_1 = require("./utils");
/**
 * Main entry point: extract the deepest forwarded email using a hybrid strategy.
 *
 * Unless `skipMimeLayer` is set, the input first goes through the MIME layer
 * (`processMime`), raced against a timer of `timeoutMs`. The body text it yields is
 * then handed to the inline layer (`processInline`) and both results are merged:
 * MIME metadata fills fields the inline layer left empty (only when the inline layer
 * reported `fallback`), and MIME attachments are listed ahead of inline ones.
 *
 * @param raw - Input text. Must be a string.
 * @param options - Optional settings. Defaults applied here: `maxDepth` 15,
 *   `timeoutMs` 10000, `skipMimeLayer` false, `customDetectors` [].
 * @returns The extraction result. Any error raised inside the MIME + inline pipeline
 *   (including the MIME timeout) is converted into a `fallback` result whose
 *   `diagnostics.warnings` carries the failure reason.
 * @throws {Error} When `raw` is not a string. In `skipMimeLayer` mode, errors from
 *   the inline layer also propagate because that call sits outside the try block.
 */
async function extractDeepestHybrid(raw, options) {
    // Validation
    if (typeof raw !== 'string') {
        throw new Error('Input must be a string');
    }
    const opts = {
        maxDepth: options?.maxDepth ?? 15,
        timeoutMs: options?.timeoutMs ?? 10000,
        skipMimeLayer: options?.skipMimeLayer ?? false,
        customDetectors: options?.customDetectors ?? []
    };
    const warnings = [];
    // Text-only mode: parse inline forwards only, no MIME parsing and no merge step.
    if (opts.skipMimeLayer) {
        return await (0, inline_layer_1.processInline)(raw, 0, [], opts.customDetectors);
    }
    try {
        // Step 1: MIME layer, bounded by a timeout. The timer is always cleared so it
        // cannot keep the event loop alive after the race settles.
        let timer;
        const mimeResult = await Promise.race([
            (0, mime_layer_1.processMime)(raw, opts),
            new Promise((_, reject) => {
                timer = setTimeout(() => reject(new Error('MIME parsing timeout')), opts.timeoutMs);
            })
        ]).finally(() => {
            if (timer)
                clearTimeout(timer);
        });
        // Step 2: Inline layer, continuing from the depth/history the MIME layer reached.
        const inlineResult = await (0, inline_layer_1.processInline)(mimeResult.rawBody, mimeResult.depth, mimeResult.history, opts.customDetectors);
        // Step 3: Align results
        let from = (0, utils_1.normalizeFrom)(inlineResult.from);
        let subject = inlineResult.subject;
        let date_raw = inlineResult.date_raw;
        let date_iso = inlineResult.date_iso;
        let text = inlineResult.text;
        // Only when no inline forward was found do we back-fill from the MIME headers.
        if (inlineResult.diagnostics.method === 'fallback' && mimeResult.metadata) {
            const m = mimeResult.metadata;
            if (!from && m.from?.value?.[0]) {
                from = (0, utils_1.normalizeFrom)({ name: m.from.value[0].name, address: m.from.value[0].address });
            }
            if (!subject && m.subject)
                subject = m.subject;
            if (!date_iso && m.date)
                date_iso = m.date.toISOString();
            if (!date_raw && m.date)
                date_raw = m.date.toString();
            if (!text)
                text = mimeResult.rawBody;
        }
        // Align the root entry of history. The inline layer returns history reversed,
        // so the root is the last element.
        if (inlineResult.history.length > 0) {
            const rootInHistory = inlineResult.history[inlineResult.history.length - 1];
            if (!rootInHistory.from && mimeResult.metadata) {
                const m = mimeResult.metadata;
                if (m.from?.value?.[0]) {
                    rootInHistory.from = (0, utils_1.normalizeFrom)({ name: m.from.value[0].name, address: m.from.value[0].address });
                }
                if (m.subject)
                    rootInHistory.subject = m.subject;
            }
        }
        // Step 4: Final enrichment
        const attachments = mimeResult.lastAttachments.map(att => ({
            filename: att.filename,
            contentType: att.contentType || 'application/octet-stream',
            size: att.size || 0
        }));
        date_iso = date_iso || (0, utils_1.normalizeDateToISO)(date_raw);
        // Destructure to exclude 'from' since we have our own normalized version
        const { from: _unusedFrom, ...restInlineResult } = inlineResult;
        const result = {
            ...restInlineResult,
            // Use our normalized/enriched values
            from,
            subject,
            date_raw,
            date_iso,
            text: (0, utils_1.cleanText)(text),
            attachments: [...attachments, ...inlineResult.attachments],
            diagnostics: {
                ...inlineResult.diagnostics,
                depth: mimeResult.depth + inlineResult.diagnostics.depth,
                method: (inlineResult.diagnostics.method === 'fallback' && mimeResult.isRfc822) ? 'rfc822' : inlineResult.diagnostics.method,
                parsedOk: !!(from && subject) || !!(from && inlineResult.diagnostics.method !== 'fallback'),
                warnings: [...warnings, ...inlineResult.diagnostics.warnings]
            }
        };
        return result;
    }
    catch (error) {
        // A thrown value is not guaranteed to be an Error; avoid reporting
        // "Fatal error: undefined" for strings, null, or other non-Error throws.
        const reason = (error && typeof error.message === 'string') ? error.message : String(error);
        return {
            from: null,
            subject: null,
            date_raw: null,
            date_iso: null,
            text: (0, utils_1.cleanText)(raw),
            attachments: [],
            history: [],
            diagnostics: {
                method: 'fallback',
                depth: 0,
                parsedOk: false,
                warnings: [`Fatal error: ${reason}`]
            }
        };
    }
}
132
+ __exportStar(require("./types"), exports);
@@ -0,0 +1,7 @@
1
+ import { ResultObject, HistoryEntry } from './types';
2
+ import { ForwardDetector } from './detectors/types';
/**
 * Follows a chain of inline forwards inside `text`, one level per loop iteration,
 * using a DetectorRegistry so several detection strategies (library-backed and
 * custom detectors) can be combined.
 *
 * @param text - Body text to scan for forwarded content.
 * @param depth - Depth already reached before this call; new levels are numbered from it.
 * @param baseHistory - History entries gathered so far; defaults to an empty list.
 * @param customDetectors - Extra detectors handed to the registry; defaults to none.
 * @returns The deepest message found, together with the full history and diagnostics.
 */
export declare function processInline(
    text: string,
    depth: number,
    baseHistory?: HistoryEntry[],
    customDetectors?: ForwardDetector[]
): Promise<ResultObject>;
@@ -0,0 +1,116 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.processInline = processInline;
4
+ const detectors_1 = require("./detectors");
5
+ const utils_1 = require("./utils");
/**
 * Process inline forwarded content level by level.
 * Uses a manual loop with DetectorRegistry to allow multiple strategies (lib, custom regexes, etc.)
 *
 * @param text - Body text to scan; it is trimmed before detection.
 * @param depth - Depth already reached by the caller. The loop stops once the
 *   absolute depth (this value plus the levels found here) reaches 15.
 * @param baseHistory - Entries collected so far, outermost first. The array itself is
 *   copied, but its entry objects are shared and may be updated (text, flags).
 * @param customDetectors - Extra detectors passed to the DetectorRegistry.
 * @returns A result describing the deepest level found. `history` is returned
 *   reversed (deepest first, root last). When at least one forward was detected,
 *   `diagnostics.method` is the deepest entry's `method:<detector>` flag and
 *   `parsedOk` is true; otherwise the method is `fallback` and `parsedOk` is false.
 */
async function processInline(text, depth, baseHistory = [], customDetectors = []) {
    const warnings = [];
    const registry = new detectors_1.DetectorRegistry(customDetectors);
    const history = [...baseHistory];
    let currentText = text.trim();
    const startingDepth = depth;
    let currentDepth = depth;
    const maxRecursiveDepth = 15; // Increased for deep chains
    // Ensure we have at least one entry representing the "current" starting point
    if (history.length === 0) {
        history.push({
            from: null,
            subject: null,
            date_raw: null,
            date_iso: null,
            text: '',
            depth: currentDepth,
            flags: ['level:root', 'trust:medium_inline']
        });
    }
    // Detection loop: This allows combining the library (CrispDetector)
    // with custom local detectors (OutlookFRDetector, etc.)
    while (currentDepth < maxRecursiveDepth) {
        const result = registry.detect(currentText);
        if (!result.found || !result.email) {
            // No more forwards detected: whatever is left belongs to the last level.
            const lastIdx = history.length - 1;
            history[lastIdx].text = (0, utils_1.cleanText)(currentText);
            break;
        }
        const email = result.email;
        // Update previous level's exclusive text (the part written above the forward).
        const previousIdx = history.length - 1;
        history[previousIdx].text = (0, utils_1.cleanText)(result.message || '');
        if (!history[previousIdx].text && !history[previousIdx].flags.includes('content:silent_forward')) {
            history[previousIdx].flags.push('content:silent_forward');
        }
        // Build flags
        const flags = [`method:${result.detector || 'unknown'}`, 'trust:medium_inline'];
        if (!email.body || email.body.trim() === '') {
            flags.push('content:silent_forward');
        }
        // Normalize date
        const dateIso = (0, utils_1.normalizeDateToISO)(email.date);
        if (email.date && !dateIso) {
            warnings.push(`Could not normalize date: "${email.date}"`);
            flags.push('date:unparseable');
        }
        // Normalize from address (fix patterns like "email [email]").
        // `typeof null` is 'object', so null must be excluded explicitly; otherwise a
        // detector reporting `from: null` would crash on the property reads below.
        const rawFrom = email.from;
        let fromNormalized = null;
        if (rawFrom && typeof rawFrom === 'object') {
            fromNormalized = { name: rawFrom.name, address: rawFrom.address };
        }
        else if (rawFrom) {
            fromNormalized = { address: rawFrom };
        }
        fromNormalized = (0, utils_1.normalizeFrom)(fromNormalized);
        // Add this forward level to history
        history.push({
            from: fromNormalized,
            subject: email.subject || null,
            date_raw: email.date || null,
            date_iso: dateIso,
            text: (0, utils_1.cleanText)(email.body || ''),
            depth: currentDepth + 1,
            flags: flags
        });
        // Continue with the body for next iteration
        currentText = (email.body || '').trim();
        currentDepth++;
    }
    // Mark the deepest entry
    if (currentDepth > startingDepth) {
        const deepestEntry = history[history.length - 1];
        if (!deepestEntry.flags.includes('level:deepest')) {
            deepestEntry.flags.push('level:deepest');
        }
        return {
            from: deepestEntry.from,
            subject: deepestEntry.subject,
            date_raw: deepestEntry.date_raw,
            date_iso: deepestEntry.date_iso,
            text: deepestEntry.text,
            attachments: [],
            history: history.slice().reverse(),
            diagnostics: {
                method: (deepestEntry.flags.find(f => f.startsWith('method:')) || 'inline'),
                depth: currentDepth - startingDepth,
                parsedOk: true,
                warnings: warnings
            }
        };
    }
    // No forwards found
    const currentEntry = history[history.length - 1];
    return {
        from: currentEntry.from,
        subject: currentEntry.subject,
        date_raw: currentEntry.date_raw,
        date_iso: currentEntry.date_iso,
        text: currentEntry.text || (0, utils_1.cleanText)(currentText),
        attachments: [],
        history: history.slice().reverse(),
        diagnostics: {
            method: 'fallback',
            depth: 0,
            parsedOk: false,
            warnings: warnings.length > 0 ? warnings : ['No forwarded content detected']
        }
    };
}
@@ -0,0 +1,15 @@
1
+ import { Attachment as MailparserAttachment } from 'mailparser';
2
+ import { Options, HistoryEntry } from './types';
/**
 * Outcome of the MIME layer (`processMime`).
 */
export interface MimeResult {
    /**
     * Text handed on to the inline layer: the parsed text of the last message, or
     * the unparsed source when that message had no text or parsing stopped early.
     */
    rawBody: string;
    /** Number of `message/rfc822` attachments that were descended into. */
    depth: number;
    /**
     * Attachments of the last parsed message; empty when the loop ended without a
     * final successful parse.
     */
    lastAttachments: MailparserAttachment[];
    /** True once at least one `message/rfc822` attachment has been descended into. */
    isRfc822: boolean;
    /** One entry per successfully parsed MIME level, outermost first. */
    history: HistoryEntry[];
    /**
     * Headers of the last parsed message. Absent when the loop ended without a
     * final successful parse.
     */
    metadata?: {
        from?: any;
        subject?: string;
        date?: Date;
    };
}
/**
 * Parses `raw` as a MIME message and keeps descending into the last
 * `message/rfc822` attachment of each level until none is left, the depth limit
 * (`options.maxDepth`, falling back to 5) is reached, or parsing fails.
 *
 * @param raw - Raw message source; a non-string value is rejected with an Error.
 * @param options - Extraction options; only `maxDepth` is read here.
 * @returns What the MIME layer found, ready to be fed to the inline layer.
 */
export declare function processMime(
    raw: string,
    options: Options
): Promise<MimeResult>;
@@ -0,0 +1,70 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.processMime = processMime;
4
+ const mailparser_1 = require("mailparser");
/**
 * MIME layer: parse the message and follow nested `message/rfc822` attachments.
 *
 * Each successfully parsed level is recorded in the returned history. When a level
 * carries one or more `message/rfc822` attachments with content, the last of them
 * becomes the next input. Any error raised while handling a level ends the walk
 * and the state gathered so far is returned without `metadata`.
 *
 * @param raw - Raw message source. Must be a string.
 * @param options - Extraction options; `maxDepth` bounds the walk (falls back to 5).
 * @returns The body text to scan next, the depth reached, the attachments of the
 *   last parsed message, whether an rfc822 part was followed, and the history.
 * @throws {Error} When `raw` is not a string.
 */
async function processMime(raw, options) {
    const depthLimit = options.maxDepth || 5;
    const levels = [];
    // Safety check
    if (typeof raw !== 'string') {
        throw new Error("MIME parser input must be a string");
    }
    let source = raw;
    let sawRfc822 = false;
    let carriedAttachments = [];
    let level = 0;
    // The loop counter only advances via `continue`, i.e. when we descend into a
    // nested message; `return` and `break` leave it untouched.
    for (; level < depthLimit; level++) {
        try {
            const message = await (0, mailparser_1.simpleParser)(source);
            // Record current level in history
            const entry = {
                from: message.from?.value?.[0]
                    ? { name: message.from.value[0].name, address: message.from.value[0].address }
                    : null,
                subject: message.subject || null,
                date_raw: message.date?.toString() || null,
                date_iso: message.date ? message.date.toISOString() : null,
                text: message.text || null, // Becomes the "exclusive" text once a nested forward is known
                depth: level,
                flags: ['trust:high_mime']
            };
            levels.push(entry);
            // Look for attached messages and follow the last one that has content
            const nested = message.attachments.filter(part => part.contentType === 'message/rfc822');
            if (nested.length > 0) {
                const innermost = nested[nested.length - 1];
                if (innermost.content) {
                    source = innermost.content.toString('utf8');
                    sawRfc822 = true;
                    // Attachments belong to the level we are leaving; start fresh
                    carriedAttachments = [];
                    continue;
                }
            }
            // Nothing further to descend into: this message is the final level
            return {
                rawBody: message.text || source,
                depth: level,
                lastAttachments: message.attachments,
                isRfc822: sawRfc822,
                history: levels,
                metadata: {
                    from: message.from,
                    subject: message.subject,
                    date: message.date
                }
            };
        }
        catch (error) {
            // Best effort: stop here and return what was gathered so far
            break;
        }
    }
    return {
        rawBody: source,
        depth: level,
        lastAttachments: carriedAttachments,
        isRfc822: sawRfc822,
        history: levels
    };
}
@@ -0,0 +1,63 @@
1
+ import { ForwardDetector, DetectionResult } from './detectors/types';
2
+ export { ForwardDetector, DetectionResult };
/**
 * A sender identity. Both parts are optional because either may be missing from
 * the parsed source.
 */
export interface EmailAddress {
    /** Display name, when one was found. */
    name?: string;
    /** Email address, when one was found. */
    address?: string;
}
/**
 * Summary of one attachment reported in a result.
 */
export interface Attachment {
    /** File name, when the attachment has one. */
    filename?: string;
    /**
     * MIME type. `extractDeepestHybrid` substitutes `application/octet-stream`
     * when the parsed attachment has none.
     */
    contentType: string;
    /** Size as reported by the parser; `extractDeepestHybrid` substitutes 0 when missing. */
    size: number;
    /** Optional payload. `extractDeepestHybrid` does not populate it. */
    content?: any;
}
/**
 * Information about how a result was obtained.
 */
export interface Diagnostics {
    /**
     * Strategy that produced the result.
     *
     * NOTE(review): when an inline forward is detected, `processInline` reports the
     * deepest entry's `method:<detector>` flag here, which is not one of the listed
     * literals — confirm whether this union should be widened.
     */
    method: 'rfc822' | 'inline' | 'fallback';
    /** Number of forward levels traversed (MIME levels plus inline levels). */
    depth: number;
    /** Whether the extraction is considered successful. */
    parsedOk: boolean;
    /** Non-fatal problems met along the way, plus the fatal error message if any. */
    warnings: string[];
}
/**
 * One level of the forward chain.
 */
export interface HistoryEntry {
    /** Sender at this level, or null when unknown. */
    from: EmailAddress | null;
    /** Subject at this level, or null when unknown. */
    subject: string | null;
    /** Date exactly as found in the source, or null when absent. */
    date_raw: string | null;
    /** Date normalized to ISO form, or null when absent or not parseable. */
    date_iso: string | null;
    /** Text belonging to this level only (content written above the next forward). */
    text: string | null;
    /** Position in the chain; 0 is the outermost message. */
    depth: number;
    /**
     * Free-form markers such as `level:root`, `level:deepest`, `trust:high_mime`,
     * `trust:medium_inline`, `method:<detector>`, `content:silent_forward` and
     * `date:unparseable`.
     */
    flags: string[];
}
/**
 * Result of an extraction: the fields of the deepest message found, plus the
 * chain that led to it.
 */
export interface ResultObject {
    /** Sender of the deepest message, or null when unknown. */
    from: EmailAddress | null;
    /** Subject of the deepest message, or null when unknown. */
    subject: string | null;
    /** Date of the deepest message as found in the source, or null. */
    date_raw: string | null;
    /** Date of the deepest message in ISO form, or null. */
    date_iso: string | null;
    /** Cleaned body text of the deepest message. */
    text: string | null;
    /** Attachments reported for the result. */
    attachments: Attachment[];
    /**
     * Chain of levels. `processInline` returns it deepest first with the root
     * last; it is empty in the fatal-error fallback of `extractDeepestHybrid`.
     */
    history: HistoryEntry[];
    /** How the result was obtained. */
    diagnostics: Diagnostics;
}
/**
 * Options for extraction behavior
 */
export interface Options {
    /**
     * Maximum depth to descend through MIME attachments.
     * Default: 15 when going through `extractDeepestHybrid`. `processMime` called
     * directly falls back to 5 when the value is missing or 0.
     * The inline layer does not read this option; it uses its own fixed cap of 15.
     */
    maxDepth?: number;
    /**
     * Maximum time in milliseconds to wait for MIME parsing before timeout.
     * Default: 10000ms
     */
    timeoutMs?: number;
    /**
     * Skip MIME layer processing and parse only inline forwards.
     * Use this when input is plain text body (not a full email with headers).
     * Default: false
     */
    skipMimeLayer?: boolean;
    /**
     * Custom forward detectors to register.
     * These will be added to the registry and used for detection.
     * Default: none
     */
    customDetectors?: ForwardDetector[];
}
package/dist/types.js ADDED
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,16 @@
/**
 * Static text-cleaning helpers.
 */
export declare class Cleaner {
    private static normalizationCache;
    /**
     * Normalizes whitespace artifacts: converts CRLF to LF, removes BOM characters,
     * drops non-breaking spaces at line ends, turns the remaining ones into regular
     * spaces, and trims the result. Returns an empty string for empty input.
     */
    static normalize(text: string): string;
    /**
     * Consistently strips quotes (>) and common Outlook leading indentation (4 spaces)
     */
    static stripQuotes(text: string): string;
    /**
     * Returns the body that follows a header block: skips the blank lines right
     * after `lastHeaderIndex`, then joins the remaining lines with newlines and
     * trims the result.
     */
    static extractBody(lines: string[], lastHeaderIndex: number): string;
}
@@ -0,0 +1,51 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.Cleaner = void 0;
/**
 * Static text-cleaning helpers used when parsing forwarded email content.
 */
class Cleaner {
    // Memo of already-normalized inputs, keyed by the original text.
    static normalizationCache = new Map();
    /**
     * Normalizes whitespace artifacts (BOM, nbsp, line breaks) and trims the result.
     * Results are memoized; the memo is emptied once it holds more than 200 entries.
     */
    static normalize(text) {
        if (!text) {
            return '';
        }
        const memo = this.normalizationCache;
        const known = memo.get(text);
        if (known !== undefined) {
            return known;
        }
        let cleaned = text.replace(/\r\n/gm, '\n');
        cleaned = cleaned.replace(/\uFEFF/gm, '');
        cleaned = cleaned.replace(/\u00A0$/gm, '');
        cleaned = cleaned.replace(/\u00A0/gm, ' ');
        cleaned = cleaned.trim();
        if (memo.size > 200) {
            memo.clear();
        }
        memo.set(text, cleaned);
        return cleaned;
    }
    /**
     * Strips quote markers (>) and the common Outlook 4-space leading indentation.
     */
    static stripQuotes(text) {
        const withoutEmptyQuotes = text.replace(/^(>+)\s?$/gm, ''); // Quote lines with no content
        const withoutQuoteMarks = withoutEmptyQuotes.replace(/^(>+)\s?/gm, ''); // Quote lines with content
        return withoutQuoteMarks.replace(/^(\ {4})\s?/gm, ''); // 4 spaces indentation
    }
    /**
     * Returns the body following a header block: blank lines right after
     * `lastHeaderIndex` are skipped, the rest is joined with newlines and trimmed.
     */
    static extractBody(lines, lastHeaderIndex) {
        let cursor = lastHeaderIndex + 1;
        // Advance past empty lines so the body starts on real content
        for (; cursor < lines.length; cursor++) {
            if (lines[cursor].trim() !== '') {
                break;
            }
        }
        const bodyLines = lines.slice(cursor);
        return bodyLines.join('\n').trim();
    }
}
50
+ exports.Cleaner = Cleaner;
51
+ Cleaner.normalizationCache = new Map();
@@ -0,0 +1,17 @@
1
+ import { ResultObject, EmailAddress } from './types';
2
+ export declare function normalizeDateToISO(dateRaw: string | Date | null | undefined): string | null;
3
+ export declare function cleanText(text: string | null | undefined): string | null;
4
+ /**
5
+ * Normalizes EmailAddress to fix edge cases like "email [email]" pattern
6
+ *
7
+ * Issue: Some email clients (Gmail, Outlook) produce formats like:
8
+ * "john.doe@example.com [john.doe@example.com]"
9
+ *
10
+ * email-forward-parser may parse this as:
11
+ * { name: "john.doe@example.com [john.doe@example.com]", address: "" }
12
+ *
13
+ * This function detects and fixes this pattern to:
14
+ * { name: null, address: "john.doe@example.com" }
15
+ */
16
+ export declare function normalizeFrom(from: EmailAddress | null | undefined): EmailAddress | null;
17
+ export declare function normalizeParserResult(parsed: any, method: 'inline' | 'fallback', depth: number, warnings?: string[]): ResultObject;