@realtimex/realtimex-alchemy 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,75 @@
1
+ import { SupabaseService } from './SupabaseService.js';
2
+ import { EventService } from './EventService.js';
3
/**
 * Fans a processing event out to three sinks: the in-process SSE stream,
 * the developer console, and (when a client is available) the Supabase
 * `processing_events` table.
 *
 * Persistence is strictly best-effort: a failure to store an event is
 * reported on the console and never propagates to the caller.
 */
export class ProcessingEventService {
    static instance;
    sseEvents = EventService.getInstance();

    /**
     * Returns the shared instance, creating it on first use.
     * @returns {ProcessingEventService}
     */
    static getInstance() {
        if (!this.instance) {
            this.instance = new ProcessingEventService();
        }
        return this.instance;
    }

    /**
     * Emits, prints and (if possible) persists a processing event.
     *
     * @param {object} event - Event fields read here: `eventType`, `message`,
     *   `details`, `agentState`, `userId`, `level`, `durationMs`, `metadata`.
     * @param {object} [supabase] - Client to persist with. When omitted, the
     *   service-role client is used if SupabaseService reports it is configured.
     * @returns {Promise<void>} Resolves once all sinks were attempted; never
     *   rejects because of a persistence problem.
     */
    async log(event, supabase) {
        // 1. Log to SSE for instant feedback in existing terminal
        this.sseEvents.emit({
            type: event.eventType,
            message: event.message,
            data: event.details
        });

        // Mirror to console for developer visibility
        const icon = event.eventType === 'error' ? '❌' : event.eventType === 'action' ? '⚡' : 'ℹ️';
        console.log(`[${new Date().toLocaleTimeString()}] ${icon} [${event.agentState}] ${event.message}`);

        // 2. Persist to Supabase for the new Advanced Terminal
        try {
            // Resolved inside the try: getServiceRoleClient() throws when the
            // service-role key is missing even though isConfigured() (which only
            // checks the anon key) returned true.
            const client = supabase
                || (SupabaseService.isConfigured() ? SupabaseService.getServiceRoleClient() : null);
            if (!client) {
                // Supabase not configured and no client provided - skip persistence
                return;
            }

            const userId = event.userId || await this.getFallbackUserId(client);
            if (!userId) {
                return;
            }

            const result = await client.from('processing_events').insert([{
                user_id: userId,
                event_type: event.eventType,
                agent_state: event.agentState,
                message: event.message,
                details: event.details || {},
                level: event.level || 'info',
                // `??` keeps a legitimate 0 ms duration instead of turning it into null.
                duration_ms: event.durationMs ?? null,
                metadata: event.metadata || {},
                created_at: new Date().toISOString()
            }]);

            // The insert may resolve with an `error` field instead of throwing.
            if (result?.error) {
                console.error('[ProcessingEventService] Failed to persist event:', result.error);
            }
        }
        catch (error) {
            console.error('[ProcessingEventService] Failed to persist event:', error);
        }
    }

    /**
     * Finds a user id to attribute an event to when the event carries none.
     * Tries the client's auth context first, then (only when SupabaseService
     * reports it is configured) the first row of `profiles`.
     *
     * @param {object} client - Supabase-like client; may be null.
     * @returns {Promise<string|null>} A user id, or null when none can be found.
     */
    async getFallbackUserId(client) {
        // Safe check for auth
        if (!client || !client.auth) {
            return null;
        }

        // Try to get user from client auth context first
        try {
            const authResult = await client.auth.getUser();
            const user = authResult?.data?.user;
            if (user) {
                return user.id;
            }
        }
        catch {
            // Deliberately ignored: an auth failure just means we try the next source.
        }

        // Fallback to searching DB if we have service role (unlikely in Zero-Env)
        if (SupabaseService.isConfigured()) {
            try {
                const { data } = await client.from('profiles').select('id').limit(1).maybeSingle();
                return data?.id || null;
            }
            catch {
                return null;
            }
        }
        return null;
    }
}
@@ -0,0 +1,40 @@
1
+ import axios from 'axios';
2
+ import puppeteer from 'puppeteer';
3
+ import TurndownService from 'turndown';
4
+ import { EventService } from './EventService.js';
5
/**
 * Fetches a page and converts it to Markdown using a two-tier strategy:
 * a plain HTTP GET first, then a headless browser when the quick fetch
 * fails or yields too little text.
 */
export class RouterService {
    turndown = new TurndownService();
    events = EventService.getInstance();

    /**
     * Extracts the content of `url` as Markdown.
     *
     * @param {string} url - Address of the page to extract.
     * @returns {Promise<string>} Markdown produced from the page HTML.
     * @throws {Error} "Failed to extract content from <url>" when the browser
     *   tier fails; the underlying error is attached as `cause`.
     */
    async extractContent(url) {
        this.events.emit({ type: 'router', message: `Attempting Tier 1 Extraction (Axios): ${url.substring(0, 30)}...` });
        try {
            // Tier 1: Fast Fetch
            const response = await axios.get(url, { timeout: 5000 });
            const markdown = this.turndown.turndown(response.data);
            // A short result is treated as "page needs rendering" and falls through to Tier 2.
            if (markdown.length > 500) {
                this.events.emit({ type: 'router', message: `Tier 1 Success (${markdown.length} chars)` });
                return markdown;
            }
        }
        catch (e) {
            this.events.emit({ type: 'router', message: `Tier 1 Failed, Falling back to Tier 2...` });
        }

        // Tier 2: Puppeteer
        this.events.emit({ type: 'router', message: `Attempting Tier 2 Extraction (Puppeteer)...` });
        let browser = null;
        try {
            browser = await puppeteer.launch({ headless: true });
            const page = await browser.newPage();
            await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
            const content = await page.content();
            const markdown = this.turndown.turndown(content);
            this.events.emit({ type: 'router', message: `Tier 2 Success (${markdown.length} chars)` });
            return markdown;
        }
        catch (e) {
            this.events.emit({ type: 'router', message: `Tier 2 Failed: ${e?.message ?? e}` });
            throw new Error(`Failed to extract content from ${url}`, { cause: e });
        }
        finally {
            // Always release the browser process, including when navigation or
            // rendering threw; otherwise every failed extraction leaks a browser.
            if (browser) {
                try {
                    await browser.close();
                }
                catch (closeError) {
                    this.events.emit({ type: 'router', message: `Tier 2 browser close failed: ${closeError?.message ?? closeError}` });
                }
            }
        }
    }
}
@@ -0,0 +1,49 @@
1
+ import { createClient } from '@supabase/supabase-js';
2
/**
 * Central place for building Supabase clients from environment variables.
 *
 * Reads SUPABASE_URL, SUPABASE_ANON_KEY and SUPABASE_SERVICE_ROLE_KEY from
 * `process.env`. The anon and service-role clients are each created once and
 * then reused.
 */
export class SupabaseService {
    static instance = null;
    static serviceRoleInstance = null;

    /**
     * Returns the shared anon-key client, creating it on first use.
     * @throws {Error} When SUPABASE_URL or SUPABASE_ANON_KEY is not set.
     */
    static getClient() {
        if (!process.env.SUPABASE_URL || !process.env.SUPABASE_ANON_KEY) {
            throw new Error('Supabase URL and Anon Key must be configured');
        }
        if (!this.instance) {
            this.instance = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_ANON_KEY, {
                auth: { persistSession: false }
            });
        }
        return this.instance;
    }

    /**
     * Returns the shared service-role client, creating it on first use.
     * Use isServiceRoleConfigured() to check beforehand whether this will throw.
     * @throws {Error} When SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY is not set.
     */
    static getServiceRoleClient() {
        if (!process.env.SUPABASE_URL || !process.env.SUPABASE_SERVICE_ROLE_KEY) {
            throw new Error('Missing Supabase Config. Please add SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY to your .env file.');
        }
        if (!this.serviceRoleInstance) {
            this.serviceRoleInstance = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_SERVICE_ROLE_KEY, {
                auth: {
                    autoRefreshToken: false,
                    persistSession: false
                }
            });
        }
        return this.serviceRoleInstance;
    }

    /**
     * Builds a fresh, uncached client for explicit credentials.
     *
     * @param {string} url - Supabase project URL.
     * @param {string} key - API key to use.
     * @param {string} [accessToken] - When given, sent as a Bearer token in the
     *   Authorization header of every request.
     */
    static createClient(url, key, accessToken) {
        const options = {
            auth: {
                persistSession: false,
                autoRefreshToken: false,
            }
        };
        if (accessToken) {
            options.global = {
                headers: {
                    Authorization: `Bearer ${accessToken}`
                }
            };
        }
        // Resolves to the module-level createClient import, not this static method.
        return createClient(url, key, options);
    }

    /**
     * True when the variables needed by getClient() are set
     * (SUPABASE_URL and SUPABASE_ANON_KEY). Says nothing about the
     * service-role key; see isServiceRoleConfigured().
     * @returns {boolean}
     */
    static isConfigured() {
        return !!(process.env.SUPABASE_URL && process.env.SUPABASE_ANON_KEY);
    }

    /**
     * True when the variables needed by getServiceRoleClient() are set
     * (SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY).
     * @returns {boolean}
     */
    static isServiceRoleConfigured() {
        return !!(process.env.SUPABASE_URL && process.env.SUPABASE_SERVICE_ROLE_KEY);
    }
}
@@ -0,0 +1,206 @@
1
+ import * as os from 'os';
2
+ import * as path from 'path';
3
+ import * as fs from 'fs';
4
+ import Database from 'better-sqlite3';
5
+ import { v4 as uuidv4 } from 'uuid';
6
/**
 * Locates browser history databases in their default per-platform locations
 * and checks that what was found looks like a browser history SQLite file.
 */
export class BrowserPathDetector {
    platform;
    homeDir;

    constructor() {
        this.platform = os.platform();
        this.homeDir = os.homedir();
    }

    /**
     * Get default browser history paths based on platform.
     *
     * @returns {Record<string, string[]>} Candidate paths keyed by browser name.
     *   For `firefox` the entry is the profiles directory, not a history file;
     *   `custom` is always empty.
     */
    getDefaultPaths() {
        if (this.platform === 'darwin') {
            // macOS
            return {
                chrome: [
                    path.join(this.homeDir, 'Library/Application Support/Google/Chrome/Default/History'),
                    path.join(this.homeDir, 'Library/Application Support/Google/Chrome/Profile 1/History'),
                ],
                firefox: [
                    // Firefox uses profiles with random names
                    path.join(this.homeDir, 'Library/Application Support/Firefox/Profiles'),
                ],
                safari: [
                    path.join(this.homeDir, 'Library/Safari/History.db'),
                ],
                edge: [
                    path.join(this.homeDir, 'Library/Application Support/Microsoft Edge/Default/History'),
                ],
                brave: [
                    path.join(this.homeDir, 'Library/Application Support/BraveSoftware/Brave-Browser/Default/History'),
                ],
                arc: [
                    path.join(this.homeDir, 'Library/Application Support/Arc/User Data/Default/History'),
                ],
                custom: [],
            };
        }
        if (this.platform === 'win32') {
            // Windows
            const appData = process.env.APPDATA || '';
            const localAppData = process.env.LOCALAPPDATA || '';
            return {
                chrome: [
                    path.join(localAppData, 'Google\\Chrome\\User Data\\Default\\History'),
                ],
                firefox: [
                    path.join(appData, 'Mozilla\\Firefox\\Profiles'),
                ],
                safari: [],
                edge: [
                    path.join(localAppData, 'Microsoft\\Edge\\User Data\\Default\\History'),
                ],
                brave: [
                    path.join(localAppData, 'BraveSoftware\\Brave-Browser\\User Data\\Default\\History'),
                ],
                arc: [],
                custom: [],
            };
        }
        // Linux (and any other platform)
        return {
            chrome: [
                path.join(this.homeDir, '.config/google-chrome/Default/History'),
            ],
            firefox: [
                path.join(this.homeDir, '.mozilla/firefox'),
            ],
            safari: [],
            edge: [
                path.join(this.homeDir, '.config/microsoft-edge/Default/History'),
            ],
            brave: [
                path.join(this.homeDir, '.config/BraveSoftware/Brave-Browser/Default/History'),
            ],
            arc: [],
            custom: [],
        };
    }

    /**
     * Find Firefox profile directories and locate places.sqlite.
     *
     * @param {string} profilesDir - Directory containing the Firefox profiles.
     * @returns {string|null} Path of the first `places.sqlite` found in a
     *   profile whose name ends in `.default` or contains `default-release`;
     *   null when none exists or the directory cannot be read.
     */
    findFirefoxHistory(profilesDir) {
        try {
            if (!fs.existsSync(profilesDir)) {
                return null;
            }
            for (const profile of fs.readdirSync(profilesDir)) {
                if (!profile.endsWith('.default') && !profile.includes('default-release')) {
                    continue;
                }
                const historyPath = path.join(profilesDir, profile, 'places.sqlite');
                if (fs.existsSync(historyPath)) {
                    return historyPath;
                }
            }
        }
        catch (err) {
            console.error('Error finding Firefox history:', err);
        }
        return null;
    }

    /**
     * Validate if a file is a valid SQLite database.
     * Uses copy-to-temp to avoid "database is locked" errors when browser is open.
     *
     * @param {string} filePath - File to check.
     * @returns {{valid: boolean, error?: string}} `valid: true` when the file
     *   opens as SQLite and has a `urls`, `moz_places` or `history_items` table;
     *   otherwise `valid: false` with a reason. Never throws.
     */
    validateSQLitePath(filePath) {
        let tempPath = null;
        let db = null;
        try {
            if (!fs.existsSync(filePath)) {
                return { valid: false, error: 'File does not exist' };
            }
            if (!fs.statSync(filePath).isFile()) {
                return { valid: false, error: 'Path is not a file' };
            }
            // Copy to temp file to avoid database lock issues
            tempPath = path.join(os.tmpdir(), `history_validate_${uuidv4()}.db`);
            fs.copyFileSync(filePath, tempPath);
            // Try to open the temp copy as SQLite database
            db = new Database(tempPath, { readonly: true });
            // Check if it has expected tables (moz_places for Firefox, urls for Chrome/others)
            const tables = db.prepare("SELECT name FROM sqlite_master WHERE type='table'").all();
            const tableNames = new Set(tables.map((t) => t.name));
            const hasHistoryTables = ['urls', 'moz_places', 'history_items'].some((name) => tableNames.has(name));
            if (!hasHistoryTables) {
                return { valid: false, error: 'Not a browser history database' };
            }
            return { valid: true };
        }
        catch (err) {
            return { valid: false, error: err?.message || 'Invalid SQLite database' };
        }
        finally {
            // Close the handle on every path (including a failing query) before
            // deleting the copy; an open handle would be leaked and can prevent
            // the temp file from being removed.
            if (db) {
                try {
                    db.close();
                }
                catch {
                    // Ignore close errors: validation outcome is already decided.
                }
            }
            if (tempPath && fs.existsSync(tempPath)) {
                try {
                    fs.unlinkSync(tempPath);
                }
                catch {
                    // Ignore cleanup errors
                }
            }
        }
    }

    /**
     * Detect all available browser history paths.
     *
     * @returns {Record<string, {browser: string, path: string, found: boolean, valid?: boolean, error?: string}>}
     *   One entry per browser from getDefaultPaths() except `custom`.
     */
    detectAll() {
        const results = {};
        for (const [browser, paths] of Object.entries(this.getDefaultPaths())) {
            if (browser === 'custom') {
                continue;
            }
            results[browser] = this.#detectOne(browser, paths);
        }
        return results;
    }

    /**
     * Detect a specific browser's history path.
     * Only the requested browser is probed, so no other browser's history is
     * copied or opened.
     *
     * @param {string} browser - Browser key as used by getDefaultPaths().
     * @returns {{browser: string, path: string, found: boolean, valid?: boolean, error?: string}}
     *   A not-found result for `custom` and for unknown names.
     */
    detect(browser) {
        const defaultPaths = this.getDefaultPaths();
        // hasOwn keeps inherited keys such as "constructor" from being treated as browsers.
        if (browser === 'custom' || !Object.hasOwn(defaultPaths, browser)) {
            return { browser, path: '', found: false };
        }
        return this.#detectOne(browser, defaultPaths[browser]);
    }

    /** Probes the candidate paths of one browser and validates the first hit. */
    #detectOne(browserType, paths) {
        let foundPath = null;
        if (browserType === 'firefox' && paths.length > 0) {
            // Special handling for Firefox: the candidate is a profiles directory.
            foundPath = this.findFirefoxHistory(paths[0]);
        }
        else {
            foundPath = paths.find((p) => fs.existsSync(p)) ?? null;
        }
        if (!foundPath) {
            return { browser: browserType, path: '', found: false };
        }
        const validation = this.validateSQLitePath(foundPath);
        return {
            browser: browserType,
            path: foundPath,
            found: true,
            valid: validation.valid,
            error: validation.error,
        };
    }
}
@@ -0,0 +1,176 @@
1
/**
 * URL Normalizer - Strips tracking parameters and normalizes URLs for deduplication
 */

// Common tracking parameters to strip.
// Lookups use the lowercased parameter name, so every entry is stored
// lowercased as well (otherwise mixed-case names such as hsCtaTracking could
// never match).
const TRACKING_PARAMS = new Set([
    // Google Analytics / Ads
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'gclid', 'gclsrc', 'dclid',
    // Facebook
    'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
    // Twitter/X
    'twclid', 's', 't', // Twitter share params
    // Microsoft/Bing
    'msclkid',
    // Email marketing
    'mc_cid', 'mc_eid', // Mailchimp
    'oly_enc_id', 'oly_anon_id', // Omeda
    '_hsenc', '_hsmi', 'hsCtaTracking', // HubSpot
    // Analytics & tracking
    '_ga', '_gl', // Google Analytics
    'ref', 'ref_src', 'ref_url', // Referrer tracking
    'source', 'src',
    'campaign', 'medium',
    // Social sharing
    'share', 'shared', 'via',
    // Session/user tracking
    'sessionid', 'session_id', 'sid',
    'userid', 'user_id', 'uid',
    'token', 'auth',
    // Misc tracking
    'pk_campaign', 'pk_kwd', 'pk_source', // Piwik/Matomo
    'zanpid', // Zanox
    'irclickid', // Impact Radius
    'affiliate', 'affiliate_id', 'aff_id',
    'clickid', 'click_id',
    'trk', 'tracking', 'track',
].map((name) => name.toLowerCase()));

// Parameters that might be tracking but context-dependent (be more careful).
// NOTE(review): not referenced by UrlNormalizer below; kept as in the original.
const SUSPICIOUS_PARAMS = new Set([
    'id', 'ref', 'source', 'from', 'origin',
]);

// Path segments that indicate a non-content page. A segment only counts when
// it is followed by the end of the path, "/" or "." so that e.g. "/auth" does
// not flag "/authors/..." and "/manage" does not flag "/management-tips".
const NON_CONTENT_SEGMENTS = [
    '/login', '/signin', '/sign-in', '/signup', '/sign-up', '/register',
    '/logout', '/signout', '/sign-out',
    '/auth', '/oauth', '/callback', '/sso',
    '/cart', '/checkout', '/basket', '/bag',
    '/account', '/profile', '/settings', '/preferences',
    '/admin', '/dashboard', '/manage',
    '/graphql',
    '/search', '/results',
];
const NON_CONTENT_SEGMENT_RE = new RegExp(`(?:${NON_CONTENT_SEGMENTS.join('|')})(?=$|[/.])`);

// Markers that flag a path wherever they appear (prefix-style patterns).
const NON_CONTENT_SUBSTRINGS = ['/api/', '/_next/', '/__'];

// Non-content file extensions
const NON_CONTENT_EXTENSIONS = [
    '.json', '.xml', '.js', '.css', '.map',
    '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.webp',
    '.woff', '.woff2', '.ttf', '.eot',
    '.pdf', '.zip', '.gz',
];

export class UrlNormalizer {
    /**
     * Normalize a URL by stripping tracking parameters and standardizing format.
     *
     * Lowercases the host, drops one trailing slash (except on the root path),
     * removes the fragment, removes tracking parameters (case-insensitive on the
     * name) and sorts the remaining parameters by name. Repeated parameters are
     * all kept, in their original relative order.
     *
     * @param {string} url - URL to normalize.
     * @returns {string} Normalized URL, or the input unchanged when it cannot be parsed.
     */
    static normalize(url) {
        try {
            const parsed = new URL(url);
            // 1. Lowercase the hostname
            parsed.hostname = parsed.hostname.toLowerCase();
            // 2. Default ports: nothing to do. The URL parser already drops the
            //    port that is the default for the scheme, and a port that is NOT
            //    the scheme default (e.g. http on 443) addresses a different
            //    endpoint and must be kept.
            // 3. Remove trailing slash from path (except root)
            if (parsed.pathname.length > 1 && parsed.pathname.endsWith('/')) {
                parsed.pathname = parsed.pathname.slice(0, -1);
            }
            // 4. Remove fragment/hash
            parsed.hash = '';
            // 5. Strip tracking parameters, keeping every occurrence of the rest
            const kept = [...parsed.searchParams].filter(([key]) => !TRACKING_PARAMS.has(key.toLowerCase()));
            // 6. Sort remaining params by name for consistent ordering. The sort
            //    is stable, so repeated names keep their value order.
            kept.sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
            const query = new URLSearchParams(kept).toString();
            parsed.search = query ? `?${query}` : '';
            return parsed.toString();
        }
        catch {
            // If URL parsing fails, return original
            return url;
        }
    }

    /**
     * Extract the canonical URL (scheme + host + path, no params).
     * Useful for aggressive deduplication.
     *
     * @param {string} url - URL to reduce.
     * @returns {string} Canonical form, or the input unchanged when it cannot be parsed.
     */
    static getCanonical(url) {
        try {
            const parsed = new URL(url);
            parsed.hostname = parsed.hostname.toLowerCase();
            parsed.search = '';
            parsed.hash = '';
            // Remove trailing slash
            if (parsed.pathname.length > 1 && parsed.pathname.endsWith('/')) {
                parsed.pathname = parsed.pathname.slice(0, -1);
            }
            return `${parsed.protocol}//${parsed.host}${parsed.pathname}`;
        }
        catch {
            return url;
        }
    }

    /**
     * Get a fingerprint for deduplication (hash of normalized URL).
     *
     * @param {string} url - URL to fingerprint.
     * @returns {string} Base-36 rendering of a signed 32-bit hash. Not
     *   cryptographic; distinct URLs can share a fingerprint.
     */
    static getFingerprint(url) {
        const normalized = this.normalize(url);
        // Simple hash for dedup - not cryptographic, just for comparison
        let hash = 0;
        for (let i = 0; i < normalized.length; i++) {
            hash = ((hash << 5) - hash) + normalized.charCodeAt(i);
            hash |= 0; // Convert to 32bit integer
        }
        return hash.toString(36);
    }

    /**
     * Check if two URLs are effectively the same (after normalization).
     * @returns {boolean}
     */
    static areSameUrl(url1, url2) {
        return this.normalize(url1) === this.normalize(url2);
    }

    /**
     * Extract domain from URL.
     * @returns {string} Lowercased hostname, or '' when the URL cannot be parsed.
     */
    static getDomain(url) {
        try {
            return new URL(url).hostname.toLowerCase();
        }
        catch {
            return '';
        }
    }

    /**
     * Check if URL matches common non-content patterns (login, cart, API
     * endpoints, static assets, ...).
     *
     * @param {string} url - URL to classify.
     * @returns {boolean} True when the path looks like a non-content page or
     *   asset; false otherwise and for unparseable input.
     */
    static isLikelyNonContent(url) {
        try {
            const pathname = new URL(url).pathname.toLowerCase();
            if (NON_CONTENT_SEGMENT_RE.test(pathname)) {
                return true;
            }
            if (NON_CONTENT_SUBSTRINGS.some((marker) => pathname.includes(marker))) {
                return true;
            }
            return NON_CONTENT_EXTENSIONS.some((ext) => pathname.endsWith(ext));
        }
        catch {
            return false;
        }
    }
}
@@ -0,0 +1,114 @@
1
/**
 * Text clean-up helpers for preparing email content for LLM processing.
 */
export class ContentCleaner {
    // Patterns that usually mark the START of a reply chain or a generic footer.
    // None of them uses the g/y flag, so sharing them across calls is stateless.
    static #TRUNCATION_PATTERNS = [
        /^On .* wrote:$/i,
        /^From: .* <.*>$/i,
        /^-----Original Message-----$/i,
        /^________________________________$/i,
        /^Sent from my iPhone$/i,
        /^Sent from my Android$/i,
        /^Get Outlook for/i,
        /^--$/ // Standard signature separator
    ];

    // Patterns for lines that should be stripped but NOT truncate the whole email
    static #NOISE_PATTERNS = [
        /view in browser/i,
        /click here to view/i,
        /legal notice/i,
        /all rights reserved/i,
        /privacy policy/i,
        /terms of service/i,
        /unsubscribe/i
    ];

    /**
     * Cleans email body by removing noise, quoted replies, and footers.
     * Optimized for LLM processing.
     *
     * Steps: convert HTML (when detected) to lightweight Markdown, drop quoted
     * (`>`) lines, stop at the first reply header / signature separator, drop
     * short boilerplate lines, collapse whitespace, and break up LLM control
     * token sequences. If cleaning leaves fewer than 20 characters while the
     * input had more, the first 3000 characters of the input are returned
     * instead (also with control token sequences broken up).
     *
     * @param {string} text - Raw email body, plain text or HTML.
     * @returns {string} Cleaned text; "" for empty/nullish input.
     */
    static cleanEmailBody(text) {
        if (!text) {
            return "";
        }
        const originalText = text;

        // 1. Detect if content is actually HTML
        if (/<[a-z][\s\S]*>/i.test(text)) {
            text = ContentCleaner.#htmlToMarkdown(text);
        }

        const cleanedLines = [];
        for (const line of text.split('\n')) {
            const lineStripped = line.trim();
            if (!lineStripped) {
                cleanedLines.push("");
                continue;
            }
            // 2. Quoted text removal (lines starting with >)
            if (lineStripped.startsWith('>')) {
                continue;
            }
            // 3. Truncation check: If we hit a reply header, we stop entirely
            if (ContentCleaner.#TRUNCATION_PATTERNS.some((pattern) => pattern.test(lineStripped))) {
                break;
            }
            // 4. Noise check: Strip boilerplate lines. Long lines are kept even
            //    when they mention e.g. "privacy policy", as they are likely prose.
            const isNoise = lineStripped.length < 100
                && ContentCleaner.#NOISE_PATTERNS.some((pattern) => pattern.test(lineStripped));
            if (isNoise) {
                continue;
            }
            cleanedLines.push(line);
        }

        // Reassemble
        text = cleanedLines.join('\n');
        // Collapse whitespace
        text = text.replace(/\n{3,}/g, '\n\n');
        text = text.replace(/[ \t]{2,}/g, ' ');

        // Safety Fallback: If cleaning stripped too much, return original text
        // truncated. It is still passed through token sanitization, because it is
        // headed for the same LLM prompt as the cleaned text.
        if (text.trim().length < 20 && originalText.trim().length > 20) {
            return ContentCleaner.#sanitizeLlmTokens(originalText.substring(0, 3000)).trim();
        }
        return ContentCleaner.#sanitizeLlmTokens(text).trim();
    }

    /** Lightweight HTML -> Markdown conversion followed by tag stripping and entity decoding. */
    static #htmlToMarkdown(html) {
        let text = html;
        // Style/Script removal first (strictly remove content), so their bodies
        // are never picked up by the structural rewrites below.
        text = text.replace(/<script\b[^>]*>.*?<\/script>/gsi, '');
        text = text.replace(/<style\b[^>]*>.*?<\/style>/gsi, '');
        // Structure: <br>, <p> -> Newlines.
        // `\b[^>]*` keeps <p ...> from also matching <pre>, <path>, ... and lets
        // attributes span lines.
        text = text.replace(/<br\s*\/?>/gi, '\n');
        text = text.replace(/<\/p>/gi, '\n\n');
        text = text.replace(/<p\b[^>]*>/gi, '');
        // Structure: Headers <h1>-<h6> -> # Title
        text = text.replace(/<h[1-6]\b[^>]*>(.*?)<\/h[1-6]>/gsi, (_match, inner) => `\n# ${inner}\n`);
        // Structure: Lists <li> -> - Item (`\b` keeps <link ...> from matching)
        text = text.replace(/<li\b[^>]*>(.*?)<\/li>/gsi, (_match, inner) => `\n- ${inner}`);
        text = text.replace(/<ul\b[^>]*>/gi, '');
        text = text.replace(/<\/ul>/gi, '\n');
        // Links: <a href="...">text</a> -> [text](href)
        text = text.replace(/<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>(.*?)<\/a>/gsi, (_match, href, content) => `[${content}](${href})`);
        // Images: <img src="..." alt="..."> -> ![alt](src)
        text = text.replace(/<img\s+(?:[^>]*?\s+)?src="([^"]*)"(?:[^>]*?\s+)?alt="([^"]*)"[^>]*>/gsi, (_match, src, alt) => `![${alt}](${src})`);
        // Final Strip of remaining tags
        text = text.replace(/<[^>]+>/g, ' ');
        // Entity decoding (Basic). `&amp;` is decoded LAST: decoding it first
        // would turn an escaped entity such as `&amp;lt;` into `&lt;` and then
        // wrongly into `<`.
        text = text.replace(/&nbsp;/gi, ' ');
        text = text.replace(/&lt;/gi, '<');
        text = text.replace(/&gt;/gi, '>');
        text = text.replace(/&quot;/gi, '"');
        text = text.replace(/&#39;/gi, "'");
        text = text.replace(/&amp;/gi, '&');
        return text;
    }

    /** Breaks up character sequences that LLMs may interpret as control tokens. */
    static #sanitizeLlmTokens(text) {
        return text
            .replace(/<\|/g, '< |')
            .replace(/\|>/g, '| >')
            .replace(/\[INST\]/gi, '[ INST ]')
            .replace(/\[\/INST\]/gi, '[ /INST ]');
    }
}