@mcinteerj/openclaw-gmail 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,165 @@
1
+ /**
2
+ * Sanitise raw HTML email bodies into clean plain text.
3
+ *
4
+ * Goals:
5
+ * - Strip all HTML to readable text (no tags, no CSS, no scripts)
6
+ * - Remove tracking pixels, base64 images, inline junk
7
+ * - Preserve meaningful link URLs
8
+ * - Strip common email footer noise (signatures, disclaimers, "Sent from…")
9
+ * - Collapse excessive whitespace
10
+ */
11
+
12
+ // ── HTML entities ──────────────────────────────────────────────────────────
13
/**
 * Named HTML entities understood by `decodeEntities`, keyed by the entity
 * name without the surrounding `&` and `;`.
 *
 * A `Map` is used instead of a plain object so that names which collide with
 * `Object.prototype` members (e.g. `&constructor;`) are never resolved.
 */
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map<string, string>([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
  ["mdash", "\u2014"],
  ["ndash", "\u2013"],
  ["lsquo", "\u2018"],
  ["rsquo", "\u2019"],
  ["ldquo", "\u201C"],
  ["rdquo", "\u201D"],
  ["bull", "\u2022"],
  ["hellip", "\u2026"],
  ["copy", "\u00A9"],
  ["reg", "\u00AE"],
  ["trade", "\u2122"],
]);

/**
 * Matches one character reference. Exactly one capture group is set:
 *   1 → hexadecimal digits (`&#x1F600;`)
 *   2 → decimal digits     (`&#128512;`)
 *   3 → entity name        (`&amp;`)
 */
const ENTITY_PATTERN = /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|([a-zA-Z][a-zA-Z0-9]*));/g;

/**
 * Turn a numeric code point into its string form, or return `fallback` when
 * the value is not a valid Unicode code point (so `String.fromCodePoint`
 * can never throw a `RangeError` on hostile input).
 */
function codePointToString(codePoint: number, fallback: string): string {
  if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
    return fallback;
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Decode HTML character references in `text`.
 *
 * All reference kinds are handled in a single pass, so the output of one
 * replacement is never re-scanned: `&amp;lt;` becomes the literal text `&lt;`,
 * not `<`. Numeric references use full code points, so characters outside the
 * Basic Multilingual Plane (emoji etc.) decode correctly.
 *
 * @param text Text that may contain `&name;`, `&#NNN;` or `&#xHHH;` references.
 * @returns The text with every recognised reference replaced. Unknown names
 *          and out-of-range numeric references are left untouched.
 */
function decodeEntities(text: string): string {
  return text.replace(
    ENTITY_PATTERN,
    (match: string, hex?: string, dec?: string, name?: string): string => {
      if (hex !== undefined) return codePointToString(parseInt(hex, 16), match);
      if (dec !== undefined) return codePointToString(parseInt(dec, 10), match);
      return NAMED_ENTITIES.get(name ?? "") ?? match;
    },
  );
}
42
+
43
+ // ── HTML → plain text ──────────────────────────────────────────────────────
44
+
45
/**
 * Convert an HTML string to plain text.
 *
 * The conversion is regex-based (no DOM parser) and runs as an ordered
 * pipeline; the order matters because later steps assume earlier ones have
 * already removed or rewritten their tags.
 *
 * Whitespace is NOT normalised here — the result may contain leading/trailing
 * blanks and runs of newlines.
 *
 * @param html Raw HTML markup.
 * @returns The text content with tags removed, links rendered as
 *          `text (url)` and character references decoded.
 */
export function htmlToText(html: string): string {
  let s = html;

  // 1. Drop <style>, <script> and <head> blocks together with their content.
  s = s.replace(/<style[\s>][\s\S]*?<\/style>/gi, "");
  s = s.replace(/<script[\s>][\s\S]*?<\/script>/gi, "");
  s = s.replace(/<head[\s>][\s\S]*?<\/head>/gi, "");

  // 2. Drop HTML comments.
  s = s.replace(/<!--[\s\S]*?-->/g, "");

  // 3. Drop tracking pixels and junk images:
  //    - images whose width or height attribute is exactly 1
  //    - images styled with display:none
  //    - images embedded as data: URIs
  s = s.replace(/<img[^>]*(?:width\s*=\s*["']?1["']?(?=[\s>\/])|height\s*=\s*["']?1["']?(?=[\s>\/]))[^>]*\/?>/gi, "");
  s = s.replace(/<img[^>]*display\s*:\s*none[^>]*\/?>/gi, "");
  s = s.replace(/<img[^>]*src\s*=\s*["']data:[^"']*["'][^>]*\/?>/gi, "");

  // 4. <br> → newline. Attributes are allowed (e.g. <br clear="all">); the
  //    lookahead keeps other tags that merely start with "br" from matching.
  s = s.replace(/<br(?=[\s\/>])[^>]*>/gi, "\n");

  // 5. End of a table cell → single space, so neighbouring cells in a row do
  //    not fuse into one word once the tags are stripped.
  s = s.replace(/<\/t[dh]\s*>/gi, " ");

  // 6. Block-level elements → newline (for both opening and closing tags).
  //    The lookahead enforces a tag-name boundary so that e.g. "p" does not
  //    match <picture> and "li" does not match <link>.
  const blockTags = "p|div|tr|li|h[1-6]|table|section|article|header|footer|blockquote|ul|ol|dd|dt|dl|pre|hr|figcaption";
  s = s.replace(new RegExp(`<\\/?(?:${blockTags})(?=[\\s/>])[^>]*>`, "gi"), "\n");

  // 7. Render <a> links — keep the URL when it adds information beyond the
  //    visible link text.
  s = s.replace(/<a\s[^>]*href\s*=\s*["']([^"']*)["'][^>]*>([\s\S]*?)<\/a>/gi,
    (_match, href: string, text: string) => {
      const linkText = text.replace(/<[^>]*>/g, "").trim();
      const cleanHref = href.trim();
      // Link without visible text (e.g. wrapping an image): show the bare URL.
      if (!linkText) return cleanHref ? `(${cleanHref})` : "";
      // Empty, fragment-only and mailto: targets add nothing useful.
      if (!cleanHref || cleanHref === "#" || cleanHref.startsWith("mailto:")) return linkText;
      // Visible text already is the URL (with or without scheme): show it once.
      if (linkText === cleanHref || linkText === cleanHref.replace(/^https?:\/\//, "")) {
        return linkText;
      }
      return `${linkText} (${cleanHref})`;
    },
  );

  // 8. Strip every remaining tag.
  s = s.replace(/<[^>]+>/g, "");

  // 9. Decode character references last, so decoded "<" / ">" characters are
  //    never mistaken for markup by the steps above.
  s = decodeEntities(s);

  return s;
}
97
+
98
+ // ── Footer / junk removal ──────────────────────────────────────────────────
99
+
100
/** Signature separator line: "--" optionally followed by whitespace ("-- " per RFC 3676). */
const SIGNATURE_SEPARATOR = /^--\s*$/m;

/**
 * Heuristic patterns for footer noise. Every pattern carries the `g` flag so
 * that ALL matching lines/paragraphs are removed, not just the first one.
 */
const FOOTER_PATTERNS: readonly RegExp[] = [
  // "Sent from my …"
  /^sent from my (?:iphone|ipad|galaxy|samsung|android|pixel|outlook|thunderbird|mail for windows).*$/gim,
  // "Get Outlook for …"
  /^get outlook for (?:ios|android|windows|mac).*$/gim,
  // Unsubscribe lines
  /^.*\bunsubscribe\b.*$/gim,
  // Confidentiality / disclaimer blocks: the matching line plus up to 8
  // directly following lines (stops at the first blank line)
  /(?:^|\n).*(?:confidential(?:ity)?|disclaimer|privileged|intended recipient|legally privileged).*(?:\n(?!\n).*){0,8}/gim,
  // Signature separator line itself (only still present when the signature
  // was not cut off beforehand)
  /^--\s*$/gm,
  // Copyright footers
  /^.*©\s*\d{4}.*$/gim,
  /^.*(?:all rights reserved|privacy policy|terms of (?:service|use)).*$/gim,
];

/**
 * Remove common email footer junk from plain text.
 *
 * @param text           Plain-text email body.
 * @param stripSignature When `true` (default), everything from the first
 *                       signature separator line ("--") onwards is dropped.
 * @returns The text with the signature (if requested) and every line or
 *          paragraph matching a footer pattern removed. Removed lines leave
 *          their line breaks behind; whitespace is not normalised here.
 */
export function stripFooterJunk(text: string, stripSignature = true): string {
  let s = text;

  // Cut at the first signature separator, if there is one.
  if (stripSignature) {
    const sigIdx = s.search(SIGNATURE_SEPARATOR);
    if (sigIdx >= 0) s = s.slice(0, sigIdx);
  }

  // Remove every occurrence of each footer pattern.
  for (const pat of FOOTER_PATTERNS) {
    s = s.replace(pat, "");
  }

  return s;
}
137
+
138
+ // ── Whitespace cleanup ─────────────────────────────────────────────────────
139
+
140
/**
 * Tidy up whitespace in plain text:
 *   - strip leading/trailing whitespace from every line (a whitespace-only
 *     line therefore becomes an empty line)
 *   - squeeze runs of three or more newlines down to exactly two
 *   - trim the result as a whole
 *
 * @param text Plain text, lines separated by "\n".
 * @returns The tidied text.
 */
export function cleanWhitespace(text: string): string {
  const trimmedLines: string[] = [];
  for (const rawLine of text.split("\n")) {
    trimmedLines.push(rawLine.trim());
  }

  const rejoined = trimmedLines.join("\n");
  const squeezed = rejoined.replace(/\n{3,}/g, "\n\n");
  return squeezed.trim();
}
155
+
156
+ // ── Public API ─────────────────────────────────────────────────────────────
157
+
158
/**
 * Full sanitisation pipeline:
 * HTML → text → normalise whitespace → strip junk → normalise whitespace.
 *
 * Whitespace is normalised BEFORE footer stripping as well as after it: the
 * footer patterns are anchored to line starts, so indentation left over from
 * the HTML source would otherwise keep lines such as "  Sent from my iPhone"
 * or an indented "--" separator from being recognised.
 *
 * @param html    Raw HTML email body.
 * @param options `stripSignature` (default `true`) controls whether everything
 *                after a "--" signature separator is dropped.
 * @returns Clean plain text.
 */
export function sanitizeEmailBody(html: string, options?: { stripSignature?: boolean }): string {
  const stripSignature = options?.stripSignature ?? true;
  const normalised = cleanWhitespace(htmlToText(html));
  const noJunk = stripFooterJunk(normalised, stripSignature);
  // Removing lines leaves empty lines behind, so tidy up once more.
  return cleanWhitespace(noJunk);
}
@@ -0,0 +1,36 @@
1
/**
 * Counting semaphore for limiting how many async tasks run at the same time.
 *
 * Waiters are served in FIFO order. A permit released while waiters are
 * queued is handed directly to the oldest waiter.
 */
export class Semaphore {
  /** Resolvers of pending `acquire()` calls, oldest first. */
  private tasks: (() => void)[] = [];
  /** Number of permits currently available. */
  private count: number;

  /**
   * @param max Maximum number of permits that can be held at once.
   * @throws RangeError when `max` is NaN or less than 1 — such a semaphore
   *         could never grant a permit and every `acquire()` would hang.
   */
  constructor(private readonly max: number) {
    if (Number.isNaN(max) || max < 1) {
      throw new RangeError(`Semaphore max must be >= 1, got ${max}`);
    }
    this.count = max;
  }

  /**
   * Take a permit, waiting until one is released if none is available.
   */
  async acquire(): Promise<void> {
    if (this.count > 0) {
      this.count--;
      return;
    }
    return new Promise<void>((resolve) => {
      this.tasks.push(resolve);
    });
  }

  /**
   * Give a permit back. If someone is waiting, the permit goes straight to
   * the oldest waiter; otherwise it returns to the pool. Releases beyond
   * `max` are ignored so a stray extra `release()` cannot raise the limit.
   */
  release(): void {
    const next = this.tasks.shift();
    if (next) {
      next();
      return;
    }
    if (this.count < this.max) {
      this.count++;
    }
  }

  /**
   * Run `task` while holding a permit. The permit is released when the task
   * settles, whether it resolves or rejects.
   *
   * @returns Whatever `task` resolves to; rejections propagate unchanged.
   */
  async run<T>(task: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      return await task();
    } finally {
      this.release();
    }
  }
}
@@ -0,0 +1,33 @@
1
+ import { sanitizeEmailBody } from "./sanitize.js";
2
+
3
/**
 * True when the opening tag's `class` attribute lists `gmail_quote` as one of
 * its (whitespace-separated) class names, regardless of attribute order or
 * additional classes.
 */
function hasGmailQuoteClass(openTag: string): boolean {
  const attr = /\sclass\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i.exec(openTag);
  const classes = attr?.[1] ?? attr?.[2] ?? attr?.[3] ?? "";
  return classes.toLowerCase().split(/\s+/).includes("gmail_quote");
}

/**
 * Remove every `<tagName>` element (including its content) whose opening tag
 * satisfies `shouldRemove`.
 *
 * Nested elements of the same tag name are balanced, so removal ends at the
 * element's own closing tag rather than at the first closing tag found. An
 * element that is never closed is removed through to the end of the input.
 */
function removeElements(
  html: string,
  tagName: string,
  shouldRemove: (openTag: string) => boolean,
): string {
  // Group 1 is "/" for a closing tag; the lookahead enforces a tag-name boundary.
  const tagPattern = new RegExp(`<(\\/?)${tagName}(?=[\\s/>])[^>]*>`, "gi");
  let kept = "";
  let resumeAt = 0; // start of the text not yet copied to `kept`
  let depth = 0; // nesting depth inside the element being removed (0 = outside)
  let tag: RegExpExecArray | null;

  while ((tag = tagPattern.exec(html)) !== null) {
    const isClosing = tag[1] === "/";
    if (depth === 0) {
      if (!isClosing && shouldRemove(tag[0])) {
        kept += html.slice(resumeAt, tag.index);
        depth = 1;
      }
      // Any other tag at depth 0 is ordinary content and is copied later.
    } else if (isClosing) {
      depth -= 1;
      if (depth === 0) resumeAt = tagPattern.lastIndex;
    } else {
      depth += 1;
    }
  }

  return depth > 0 ? kept : kept + html.slice(resumeAt);
}

/**
 * Strip quoted reply content from an HTML email body.
 *
 * Removes Gmail quote containers (`<div>` elements carrying the `gmail_quote`
 * class) and all `<blockquote>` elements, including nested ones.
 *
 * @param html HTML email body.
 * @returns The HTML without the quoted parts.
 */
export function stripQuotes(html: string): string {
  const withoutGmailQuotes = removeElements(html, "div", hasGmailQuoteClass);
  return removeElements(withoutGmailQuotes, "blockquote", () => true);
}
20
+
21
/**
 * Produce the readable text of an email from whichever body is available.
 *
 * @param html    HTML body; preferred when non-empty because its structure
 *                allows quoted replies to be removed before sanitising.
 * @param plain   Plain-text body, used only when there is no HTML body.
 * @param options `stripSignature` (default `true`) is forwarded to the HTML
 *                sanitiser. It has no effect on the plain-text fallback.
 * @returns The extracted text, or an empty string when neither body is given.
 */
export function extractTextBody(html?: string, plain?: string, options?: { stripSignature?: boolean }): string {
  // Prefer HTML for stripping structure
  if (html) {
    const stripped = stripQuotes(html);
    return sanitizeEmailBody(stripped, { stripSignature: options?.stripSignature ?? true });
  }
  // Fallback to plain text with regex stripping
  if (plain) {
    // Drop the trailing "On <date>, <sender> wrote:" block and everything
    // after it. "wrote:" may sit on the following line, because mail clients
    // hard-wrap long attribution lines.
    return plain.replace(/\nOn .+, .+(?: |\r?\n)wrote:[\s\S]*$/, "").trim();
  }
  return "";
}
@@ -0,0 +1,8 @@
1
+ import { type ChannelThreadingAdapter } from "openclaw/plugin-sdk";
2
+
3
/**
 * Threading adapter for the Gmail channel.
 *
 * `buildToolContext` copies the incoming context's `ReplyToId` into
 * `currentThreadTs` and passes `hasRepliedRef` through unchanged.
 */
export const gmailThreading: ChannelThreadingAdapter = {
  buildToolContext({ context, hasRepliedRef }) {
    const currentThreadTs = context.ReplyToId;
    return { currentThreadTs, hasRepliedRef };
  },
};