@mcinteerj/openclaw-gmail 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +117 -0
- package/index.ts +14 -0
- package/openclaw.plugin.json +11 -0
- package/package.json +54 -0
- package/src/accounts.ts +57 -0
- package/src/attachments.ts +29 -0
- package/src/channel.ts +360 -0
- package/src/config.ts +32 -0
- package/src/history-store.ts +42 -0
- package/src/html.ts +5 -0
- package/src/inbound.ts +183 -0
- package/src/monitor.ts +472 -0
- package/src/normalize.ts +42 -0
- package/src/onboarding.ts +214 -0
- package/src/outbound-check.test.ts +159 -0
- package/src/outbound-check.ts +233 -0
- package/src/outbound.ts +176 -0
- package/src/quoting.ts +230 -0
- package/src/runtime.ts +14 -0
- package/src/sanitize.test.ts +239 -0
- package/src/sanitize.ts +165 -0
- package/src/semaphore.ts +36 -0
- package/src/strip-quotes.ts +33 -0
- package/src/threading.ts +8 -0
package/src/sanitize.ts
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sanitise raw HTML email bodies into clean plain text.
|
|
3
|
+
*
|
|
4
|
+
* Goals:
|
|
5
|
+
* - Strip all HTML to readable text (no tags, no CSS, no scripts)
|
|
6
|
+
* - Remove tracking pixels, base64 images, inline junk
|
|
7
|
+
* - Preserve meaningful link URLs
|
|
8
|
+
* - Strip common email footer noise (signatures, disclaimers, "Sent from…")
|
|
9
|
+
* - Collapse excessive whitespace
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
// ── HTML entities ──────────────────────────────────────────────────────────
|
|
13
|
+
/**
 * Named character references recognised by `decodeEntities`, keyed by the
 * name without the surrounding `&` and `;`. Numeric references such as
 * `&#39;` are decoded generically and need no entry here.
 */
const NAMED_ENTITIES: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  nbsp: " ",
  mdash: "—",
  ndash: "–",
  lsquo: "\u2018",
  rsquo: "\u2019",
  ldquo: "\u201C",
  rdquo: "\u201D",
  bull: "\u2022",
  hellip: "\u2026",
  copy: "\u00A9",
  reg: "\u00AE",
  trade: "\u2122",
};

/**
 * Decode HTML character references in `text`.
 *
 * Hex (`&#x1F600;`), decimal (`&#128512;`) and the named references listed in
 * `NAMED_ENTITIES` are handled in a single pass, so the output of one
 * replacement is never decoded a second time (`&#38;lt;` yields `&lt;`, not
 * `<`). Unknown names are left untouched.
 *
 * @param text - Text that may contain character references.
 * @returns The text with every recognised reference replaced.
 */
function decodeEntities(text: string): string {
  return text.replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|([a-zA-Z][a-zA-Z0-9]*));/g,
    (match: string, hex?: string, dec?: string, name?: string): string => {
      if (name !== undefined) {
        // Own-property check: a bare index lookup would also find inherited
        // members such as "constructor" or "toString".
        return Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, name)
          ? NAMED_ENTITIES[name]
          : match;
      }
      const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? "", 10);
      // NUL, surrogates and out-of-range values are not valid scalar values;
      // emit U+FFFD instead (String.fromCodePoint would throw above 0x10FFFF).
      const isValid =
        Number.isFinite(codePoint) &&
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        !(codePoint >= 0xd800 && codePoint <= 0xdfff);
      // fromCodePoint (not fromCharCode) so astral characters such as emoji
      // become a proper surrogate pair instead of a truncated code unit.
      return isValid ? String.fromCodePoint(codePoint) : "\uFFFD";
    },
  );
}

// ── HTML → plain text ──────────────────────────────────────────────────────

/**
 * Convert an HTML string to plain text.
 *
 * The conversion is a fixed sequence of regex passes; their order matters
 * (for example, entities are decoded last so that an encoded `&lt;b&gt;` is
 * not mistaken for a tag and stripped).
 *
 * @param html - Raw HTML markup.
 * @returns Text with tags removed, line breaks for `<br>` and block-level
 *   elements, link targets appended as `text (url)`, and entities decoded.
 *   Whitespace is not normalised here.
 */
export function htmlToText(html: string): string {
  let s = html;

  // 1. Remove <style>, <script>, and <head> blocks entirely
  s = s.replace(/<style[\s>][\s\S]*?<\/style>/gi, "");
  s = s.replace(/<script[\s>][\s\S]*?<\/script>/gi, "");
  s = s.replace(/<head[\s>][\s\S]*?<\/head>/gi, "");

  // 2. Remove HTML comments
  s = s.replace(/<!--[\s\S]*?-->/g, "");

  // 3. Remove tracking pixels and junk images
  //    - 1×1 images (width/height = 1)
  //    - display:none images
  //    - base64 data URI images
  s = s.replace(/<img[^>]*(?:width\s*=\s*["']?1["']?(?=[\s>\/])|height\s*=\s*["']?1["']?(?=[\s>\/]))[^>]*\/?>/gi, "");
  s = s.replace(/<img[^>]*display\s*:\s*none[^>]*\/?>/gi, "");
  s = s.replace(/<img[^>]*src\s*=\s*["']data:[^"']*["'][^>]*\/?>/gi, "");

  // 4. Convert <br> variants to newlines, including ones that carry
  //    attributes (e.g. `<br clear="all">`).
  s = s.replace(/<br(?=[\s\/>])[^>]*>/gi, "\n");

  // 5. Block-level elements → newlines (before & after). The lookahead pins
  //    the end of the tag name so `<p` does not also match `<picture>`.
  const blockTags = "p|div|tr|li|h[1-6]|table|section|article|header|footer|blockquote|ul|ol|dd|dt|dl|pre|hr|figcaption";
  s = s.replace(new RegExp(`<\\/?(?:${blockTags})(?=[\\s/>])[^>]*>`, "gi"), "\n");

  // 6. Extract <a> links — keep URL when it differs from the link text
  s = s.replace(/<a\s[^>]*href\s*=\s*["']([^"']*)["'][^>]*>([\s\S]*?)<\/a>/gi,
    (_match, href: string, text: string) => {
      const linkText = text.replace(/<[^>]*>/g, "").trim();
      const cleanHref = href.trim();
      // Link with no visible text (e.g. wrapping an image): show the URL only.
      if (!linkText) return cleanHref ? `(${cleanHref})` : "";
      // Empty, in-page and mailto targets add nothing useful to the text.
      if (!cleanHref || cleanHref === "#" || cleanHref.startsWith("mailto:")) return linkText;
      // If the visible text IS the URL (or the URL minus its scheme), just show text
      if (linkText === cleanHref || linkText === cleanHref.replace(/^https?:\/\//, "")) {
        return linkText;
      }
      return `${linkText} (${cleanHref})`;
    },
  );

  // 7. Strip all remaining HTML tags
  s = s.replace(/<[^>]+>/g, "");

  // 8. Decode HTML entities
  s = decodeEntities(s);

  return s;
}
|
|
97
|
+
|
|
98
|
+
// ── Footer / junk removal ──────────────────────────────────────────────────
|
|
99
|
+
|
|
100
|
+
/**
 * Patterns for boilerplate that carries no message content. Every pattern is
 * global so that all occurrences are removed, not just the first one.
 */
const FOOTER_PATTERNS: RegExp[] = [
  // "Sent from my …"
  /^sent from my (?:iphone|ipad|galaxy|samsung|android|pixel|outlook|thunderbird|mail for windows).*$/gim,
  // "Get Outlook for …"
  /^get outlook for (?:ios|android|windows|mac).*$/gim,
  // Unsubscribe lines
  /^.*\bunsubscribe\b.*$/gim,
  // Confidentiality / disclaimer blocks: the matching line plus up to eight
  // directly following non-blank lines (i.e. the rest of the paragraph)
  /(?:^|\n).*(?:confidential(?:ity)?|disclaimer|privileged|intended recipient|legally privileged).*(?:\n(?!\n).*){0,8}/gim,
  // Signature separator: line starting with "-- " (RFC 3676) or just "--"
  /^--\s*$/gm,
  // Copyright footers
  /^.*©\s*\d{4}.*$/gim,
  /^.*(?:all rights reserved|privacy policy|terms of (?:service|use)).*$/gim,
];

/**
 * Remove common email footer junk from plain text.
 *
 * @param text - Plain-text email body.
 * @param stripSignature - When true (the default), everything from the first
 *   signature separator line (`--`) onwards is dropped before the individual
 *   patterns run.
 * @returns The text with every match of every `FOOTER_PATTERNS` entry
 *   replaced by an empty string. Line breaks around removed lines are left in
 *   place; whitespace is not normalised here.
 */
export function stripFooterJunk(text: string, stripSignature = true): string {
  let s = text;

  // If there's a signature separator, chop everything from it onwards
  if (stripSignature) {
    const sigIdx = text.search(/^--\s*$/m);
    if (sigIdx >= 0) {
      s = text.slice(0, sigIdx);
    }
  }

  // Remove individual footer lines. String.prototype.replace resets
  // lastIndex for global regexes, so sharing the pattern objects is safe.
  for (const pat of FOOTER_PATTERNS) {
    s = s.replace(pat, "");
  }

  return s;
}
|
|
137
|
+
|
|
138
|
+
// ── Whitespace cleanup ─────────────────────────────────────────────────────
|
|
139
|
+
|
|
140
|
+
/**
 * Tidy up whitespace in plain text.
 *
 * Each line is trimmed individually (so whitespace-only lines become empty),
 * runs of three or more newlines are reduced to exactly two, and finally the
 * whole string is trimmed.
 *
 * @param text - Text to normalise.
 * @returns The normalised text.
 */
export function cleanWhitespace(text: string): string {
  const trimmedLines: string[] = [];
  for (const rawLine of text.split("\n")) {
    trimmedLines.push(rawLine.trim());
  }

  const rejoined = trimmedLines.join("\n");
  const collapsed = rejoined.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
|
|
155
|
+
|
|
156
|
+
// ── Public API ─────────────────────────────────────────────────────────────
|
|
157
|
+
|
|
158
|
+
/**
 * Run the complete clean-up pipeline on an HTML email body: convert the HTML
 * to text, strip footer junk, then normalise whitespace.
 *
 * @param html - Raw HTML email body.
 * @param options - `stripSignature` controls whether content from the
 *   signature separator onwards is dropped; it defaults to true when omitted.
 * @returns The sanitised plain text.
 */
export function sanitizeEmailBody(html: string, options?: { stripSignature?: boolean }): string {
  const shouldStripSignature = options?.stripSignature ?? true;
  return cleanWhitespace(stripFooterJunk(htmlToText(html), shouldStripSignature));
}
|
package/src/semaphore.ts
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
 * Counting semaphore for limiting how many async tasks run at once.
 *
 * Waiters are resumed in FIFO order. Prefer `run()`, which pairs every
 * `acquire()` with a `release()` even when the task throws.
 *
 * NOTE: with a `max` below 1 (or NaN) no permit is ever available, so
 * `acquire()` never resolves on its own.
 */
export class Semaphore {
  /** Resolvers of callers currently blocked in `acquire()`, oldest first. */
  private readonly tasks: (() => void)[] = [];
  /** Number of permits that can be handed out immediately. */
  private count: number;

  /**
   * @param max - Maximum number of permits that may be held simultaneously.
   */
  constructor(private readonly max: number) {
    this.count = max;
  }

  /**
   * Take a permit, waiting until one is released if none is available.
   */
  async acquire(): Promise<void> {
    if (this.count > 0) {
      this.count--;
      return;
    }
    return new Promise<void>((resolve) => {
      this.tasks.push(resolve);
    });
  }

  /**
   * Give a permit back. If a caller is waiting, the permit is handed to the
   * oldest waiter directly; otherwise the free-permit count is increased.
   */
  release(): void {
    const next = this.tasks.shift();
    if (next) {
      next();
      return;
    }
    // Never exceed the configured capacity: a stray release() without a
    // matching acquire() must not widen the concurrency limit.
    if (this.count < this.max) {
      this.count++;
    }
  }

  /**
   * Run `task` while holding a permit.
   *
   * @param task - Async function to execute once a permit is available.
   * @returns Whatever `task` resolves to; rejections propagate unchanged.
   */
  async run<T>(task: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      return await task();
    } finally {
      // Always hand the permit back, even if the task throws.
      this.release();
    }
  }
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { sanitizeEmailBody } from "./sanitize.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Remove every `<blockquote>` element from `html`, including nested ones.
 *
 * Open and close tags are counted so that an outer quote is removed through
 * its own closing tag rather than through the first `</blockquote>` found.
 * A blockquote that is never closed is left in place unchanged.
 */
function removeBlockquotes(html: string): string {
  const tagRe = /<(\/?)blockquote\b[^>]*>/gi;
  let kept = "";
  let cursor = 0; // start of the text not yet copied into `kept`
  let depth = 0; // nesting level inside the blockquote being removed
  let openedAt = -1; // index of the outermost open tag currently being removed

  for (let m = tagRe.exec(html); m !== null; m = tagRe.exec(html)) {
    const isClose = m[1] === "/";
    if (depth === 0) {
      if (isClose) continue; // stray closing tag outside any quote
      kept += html.slice(cursor, m.index);
      openedAt = m.index;
      depth = 1;
    } else if (isClose) {
      depth -= 1;
      if (depth === 0) cursor = tagRe.lastIndex;
    } else {
      depth += 1;
    }
  }

  // Unbalanced markup: keep the unclosed element rather than guess its end.
  if (depth > 0) return kept + html.slice(openedAt);
  return kept + html.slice(cursor);
}

/**
 * Strip quoted reply content from an HTML email body.
 *
 * Blockquotes are removed first so that the Gmail quote-wrapper match below
 * cannot cut into the middle of one and leave its tail behind.
 *
 * @param html - HTML email body.
 * @returns The HTML without `<blockquote>` elements and without the first
 *   `<div class="gmail_quote">` span (from its open tag through the next
 *   `</div>`).
 */
export function stripQuotes(html: string): string {
  // Remove blockquotes (nesting-aware)
  const withoutBlockquotes = removeBlockquotes(html);

  // Remove Gmail quote div: first occurrence, up to the first closing </div>
  return withoutBlockquotes.replace(/<div class="gmail_quote"[^>]*>[\s\S]*?<\/div>/i, "");
}
|
|
20
|
+
|
|
21
|
+
/**
 * Produce the plain-text body of an email from whichever representation is
 * available.
 *
 * @param html - HTML body; used when non-empty. Quoted content is stripped
 *   and the result is passed through `sanitizeEmailBody`.
 * @param plain - Plain-text body; used only when `html` is empty or missing.
 *   A trailing "On …, … wrote:" block is removed and the result trimmed.
 * @param options - `stripSignature` is forwarded to the HTML pipeline and
 *   defaults to true; it is not consulted on the plain-text path.
 * @returns The extracted text, or an empty string when neither body is given.
 */
export function extractTextBody(html?: string, plain?: string, options?: { stripSignature?: boolean }): string {
  // HTML is preferred because its structure makes quotes easier to strip.
  if (html) {
    const stripSignature = options?.stripSignature ?? true;
    return sanitizeEmailBody(stripQuotes(html), { stripSignature });
  }

  if (!plain) {
    return "";
  }

  // Plain-text fallback: drop everything from the reply attribution line on.
  const withoutQuotedReply = plain.replace(/\nOn .+, .+ wrote:[\s\S]*$/, "");
  return withoutQuotedReply.trim();
}
|
package/src/threading.ts
ADDED