@realtimex/realtimex-alchemy 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/bin/realtimex-alchemy.js +55 -0
- package/dist/api/config/index.js +33 -0
- package/dist/api/index.js +237 -0
- package/dist/api/lib/ContentCleaner.js +114 -0
- package/dist/api/lib/types.js +1 -0
- package/dist/api/services/AlchemistService.js +241 -0
- package/dist/api/services/EventService.js +53 -0
- package/dist/api/services/LibrarianService.js +72 -0
- package/dist/api/services/MinerService.js +314 -0
- package/dist/api/services/ProcessingEventService.js +75 -0
- package/dist/api/services/RouterService.js +40 -0
- package/dist/api/services/SupabaseService.js +49 -0
- package/dist/api/utils/BrowserPathDetector.js +206 -0
- package/dist/api/utils/UrlNormalizer.js +176 -0
- package/dist/api/utils/contentCleaner.js +114 -0
- package/dist/api/utils/contentCleaner.test.js +96 -0
- package/dist/assets/index-7Lemtnxa.css +1 -0
- package/dist/assets/index-CRgCScOz.js +101 -0
- package/dist/email-automator-logo.svg +51 -0
- package/dist/favicon.svg +45 -0
- package/dist/index.html +18 -0
- package/package.json +80 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { SupabaseService } from './SupabaseService.js';
|
|
2
|
+
import { EventService } from './EventService.js';
|
|
3
|
+
export class ProcessingEventService {
|
|
4
|
+
static instance;
|
|
5
|
+
sseEvents = EventService.getInstance();
|
|
6
|
+
static getInstance() {
|
|
7
|
+
if (!this.instance) {
|
|
8
|
+
this.instance = new ProcessingEventService();
|
|
9
|
+
}
|
|
10
|
+
return this.instance;
|
|
11
|
+
}
|
|
12
|
+
async log(event, supabase) {
|
|
13
|
+
// 1. Log to SSE for instant feedback in existing terminal
|
|
14
|
+
this.sseEvents.emit({
|
|
15
|
+
type: event.eventType,
|
|
16
|
+
message: event.message,
|
|
17
|
+
data: event.details
|
|
18
|
+
});
|
|
19
|
+
// Mirror to console for developer visibility
|
|
20
|
+
const icon = event.eventType === 'error' ? '❌' : event.eventType === 'action' ? '⚡' : 'ℹ️';
|
|
21
|
+
console.log(`[${new Date().toLocaleTimeString()}] ${icon} [${event.agentState}] ${event.message}`);
|
|
22
|
+
// 2. Persist to Supabase for the new Advanced Terminal
|
|
23
|
+
const client = supabase || (SupabaseService.isConfigured() ? SupabaseService.getServiceRoleClient() : null);
|
|
24
|
+
if (client) {
|
|
25
|
+
try {
|
|
26
|
+
const userId = event.userId || await this.getFallbackUserId(client);
|
|
27
|
+
if (userId) {
|
|
28
|
+
await client.from('processing_events').insert([{
|
|
29
|
+
user_id: userId,
|
|
30
|
+
event_type: event.eventType,
|
|
31
|
+
agent_state: event.agentState,
|
|
32
|
+
message: event.message,
|
|
33
|
+
details: event.details || {},
|
|
34
|
+
level: event.level || 'info',
|
|
35
|
+
duration_ms: event.durationMs || null,
|
|
36
|
+
metadata: event.metadata || {},
|
|
37
|
+
created_at: new Date().toISOString()
|
|
38
|
+
}]);
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
catch (error) {
|
|
42
|
+
console.error('[ProcessingEventService] Failed to persist event:', error);
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
else {
|
|
46
|
+
// Supabase not configured and no client provided - skip persistence
|
|
47
|
+
// console.warn('[ProcessingEventService] Skipping persistence (no config)');
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
async getFallbackUserId(client) {
|
|
51
|
+
// Safe check for auth
|
|
52
|
+
if (!client || !client.auth)
|
|
53
|
+
return null;
|
|
54
|
+
// Try to get user from client auth context first
|
|
55
|
+
try {
|
|
56
|
+
const { data: { user } } = await client.auth.getUser();
|
|
57
|
+
if (user)
|
|
58
|
+
return user.id;
|
|
59
|
+
}
|
|
60
|
+
catch (e) {
|
|
61
|
+
// Ignore auth errors during fallback lookup
|
|
62
|
+
}
|
|
63
|
+
// Fallback to searching DB if we have service role (unlikely in Zero-Env)
|
|
64
|
+
if (SupabaseService.isConfigured()) {
|
|
65
|
+
try {
|
|
66
|
+
const { data } = await client.from('profiles').select('id').limit(1).maybeSingle();
|
|
67
|
+
return data?.id || null;
|
|
68
|
+
}
|
|
69
|
+
catch (e) {
|
|
70
|
+
return null;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
return null;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import puppeteer from 'puppeteer';
|
|
3
|
+
import TurndownService from 'turndown';
|
|
4
|
+
import { EventService } from './EventService.js';
|
|
5
|
+
export class RouterService {
|
|
6
|
+
turndown = new TurndownService();
|
|
7
|
+
events = EventService.getInstance();
|
|
8
|
+
async extractContent(url) {
|
|
9
|
+
this.events.emit({ type: 'router', message: `Attempting Tier 1 Extraction (Axios): ${url.substring(0, 30)}...` });
|
|
10
|
+
try {
|
|
11
|
+
// Tier 1: Fast Fetch
|
|
12
|
+
const response = await axios.get(url, { timeout: 5000 });
|
|
13
|
+
const html = response.data;
|
|
14
|
+
const markdown = this.turndown.turndown(html);
|
|
15
|
+
if (markdown.length > 500) {
|
|
16
|
+
this.events.emit({ type: 'router', message: `Tier 1 Success (${markdown.length} chars)` });
|
|
17
|
+
return markdown;
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
catch (e) {
|
|
21
|
+
this.events.emit({ type: 'router', message: `Tier 1 Failed, Falling back to Tier 2...` });
|
|
22
|
+
}
|
|
23
|
+
// Tier 2: Puppeteer
|
|
24
|
+
this.events.emit({ type: 'router', message: `Attempting Tier 2 Extraction (Puppeteer)...` });
|
|
25
|
+
try {
|
|
26
|
+
const browser = await puppeteer.launch({ headless: true });
|
|
27
|
+
const page = await browser.newPage();
|
|
28
|
+
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
|
|
29
|
+
const content = await page.content();
|
|
30
|
+
await browser.close();
|
|
31
|
+
const markdown = this.turndown.turndown(content);
|
|
32
|
+
this.events.emit({ type: 'router', message: `Tier 2 Success (${markdown.length} chars)` });
|
|
33
|
+
return markdown;
|
|
34
|
+
}
|
|
35
|
+
catch (e) {
|
|
36
|
+
this.events.emit({ type: 'router', message: `Tier 2 Failed: ${e.message}` });
|
|
37
|
+
throw new Error(`Failed to extract content from ${url}`);
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { createClient } from '@supabase/supabase-js';
|
|
2
|
+
export class SupabaseService {
|
|
3
|
+
static instance = null;
|
|
4
|
+
static serviceRoleInstance = null;
|
|
5
|
+
static getClient() {
|
|
6
|
+
if (!process.env.SUPABASE_URL || !process.env.SUPABASE_ANON_KEY) {
|
|
7
|
+
throw new Error('Supabase URL and Anon Key must be configured');
|
|
8
|
+
}
|
|
9
|
+
if (!this.instance) {
|
|
10
|
+
this.instance = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_ANON_KEY, {
|
|
11
|
+
auth: { persistSession: false }
|
|
12
|
+
});
|
|
13
|
+
}
|
|
14
|
+
return this.instance;
|
|
15
|
+
}
|
|
16
|
+
static getServiceRoleClient() {
|
|
17
|
+
if (!process.env.SUPABASE_URL || !process.env.SUPABASE_SERVICE_ROLE_KEY) {
|
|
18
|
+
throw new Error('Missing Supabase Config. Please add SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY to your .env file.');
|
|
19
|
+
}
|
|
20
|
+
if (!this.serviceRoleInstance) {
|
|
21
|
+
this.serviceRoleInstance = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_SERVICE_ROLE_KEY, {
|
|
22
|
+
auth: {
|
|
23
|
+
autoRefreshToken: false,
|
|
24
|
+
persistSession: false
|
|
25
|
+
}
|
|
26
|
+
});
|
|
27
|
+
}
|
|
28
|
+
return this.serviceRoleInstance;
|
|
29
|
+
}
|
|
30
|
+
static createClient(url, key, accessToken) {
|
|
31
|
+
const options = {
|
|
32
|
+
auth: {
|
|
33
|
+
persistSession: false,
|
|
34
|
+
autoRefreshToken: false,
|
|
35
|
+
}
|
|
36
|
+
};
|
|
37
|
+
if (accessToken) {
|
|
38
|
+
options.global = {
|
|
39
|
+
headers: {
|
|
40
|
+
Authorization: `Bearer ${accessToken}`
|
|
41
|
+
}
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
return createClient(url, key, options);
|
|
45
|
+
}
|
|
46
|
+
static isConfigured() {
|
|
47
|
+
return !!(process.env.SUPABASE_URL && process.env.SUPABASE_ANON_KEY);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import * as os from 'os';
|
|
2
|
+
import * as path from 'path';
|
|
3
|
+
import * as fs from 'fs';
|
|
4
|
+
import Database from 'better-sqlite3';
|
|
5
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
6
|
+
export class BrowserPathDetector {
|
|
7
|
+
platform;
|
|
8
|
+
homeDir;
|
|
9
|
+
constructor() {
|
|
10
|
+
this.platform = os.platform();
|
|
11
|
+
this.homeDir = os.homedir();
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Get default browser history paths based on platform
|
|
15
|
+
*/
|
|
16
|
+
getDefaultPaths() {
|
|
17
|
+
if (this.platform === 'darwin') {
|
|
18
|
+
// macOS
|
|
19
|
+
return {
|
|
20
|
+
chrome: [
|
|
21
|
+
path.join(this.homeDir, 'Library/Application Support/Google/Chrome/Default/History'),
|
|
22
|
+
path.join(this.homeDir, 'Library/Application Support/Google/Chrome/Profile 1/History'),
|
|
23
|
+
],
|
|
24
|
+
firefox: [
|
|
25
|
+
// Firefox uses profiles with random names
|
|
26
|
+
path.join(this.homeDir, 'Library/Application Support/Firefox/Profiles'),
|
|
27
|
+
],
|
|
28
|
+
safari: [
|
|
29
|
+
path.join(this.homeDir, 'Library/Safari/History.db'),
|
|
30
|
+
],
|
|
31
|
+
edge: [
|
|
32
|
+
path.join(this.homeDir, 'Library/Application Support/Microsoft Edge/Default/History'),
|
|
33
|
+
],
|
|
34
|
+
brave: [
|
|
35
|
+
path.join(this.homeDir, 'Library/Application Support/BraveSoftware/Brave-Browser/Default/History'),
|
|
36
|
+
],
|
|
37
|
+
arc: [
|
|
38
|
+
path.join(this.homeDir, 'Library/Application Support/Arc/User Data/Default/History'),
|
|
39
|
+
],
|
|
40
|
+
custom: [],
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
else if (this.platform === 'win32') {
|
|
44
|
+
// Windows
|
|
45
|
+
const appData = process.env.APPDATA || '';
|
|
46
|
+
const localAppData = process.env.LOCALAPPDATA || '';
|
|
47
|
+
return {
|
|
48
|
+
chrome: [
|
|
49
|
+
path.join(localAppData, 'Google\\Chrome\\User Data\\Default\\History'),
|
|
50
|
+
],
|
|
51
|
+
firefox: [
|
|
52
|
+
path.join(appData, 'Mozilla\\Firefox\\Profiles'),
|
|
53
|
+
],
|
|
54
|
+
safari: [],
|
|
55
|
+
edge: [
|
|
56
|
+
path.join(localAppData, 'Microsoft\\Edge\\User Data\\Default\\History'),
|
|
57
|
+
],
|
|
58
|
+
brave: [
|
|
59
|
+
path.join(localAppData, 'BraveSoftware\\Brave-Browser\\User Data\\Default\\History'),
|
|
60
|
+
],
|
|
61
|
+
arc: [],
|
|
62
|
+
custom: [],
|
|
63
|
+
};
|
|
64
|
+
}
|
|
65
|
+
else {
|
|
66
|
+
// Linux
|
|
67
|
+
return {
|
|
68
|
+
chrome: [
|
|
69
|
+
path.join(this.homeDir, '.config/google-chrome/Default/History'),
|
|
70
|
+
],
|
|
71
|
+
firefox: [
|
|
72
|
+
path.join(this.homeDir, '.mozilla/firefox'),
|
|
73
|
+
],
|
|
74
|
+
safari: [],
|
|
75
|
+
edge: [
|
|
76
|
+
path.join(this.homeDir, '.config/microsoft-edge/Default/History'),
|
|
77
|
+
],
|
|
78
|
+
brave: [
|
|
79
|
+
path.join(this.homeDir, '.config/BraveSoftware/Brave-Browser/Default/History'),
|
|
80
|
+
],
|
|
81
|
+
arc: [],
|
|
82
|
+
custom: [],
|
|
83
|
+
};
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Find Firefox profile directories and locate places.sqlite
|
|
88
|
+
*/
|
|
89
|
+
findFirefoxHistory(profilesDir) {
|
|
90
|
+
try {
|
|
91
|
+
if (!fs.existsSync(profilesDir))
|
|
92
|
+
return null;
|
|
93
|
+
const profiles = fs.readdirSync(profilesDir);
|
|
94
|
+
for (const profile of profiles) {
|
|
95
|
+
if (profile.endsWith('.default') || profile.includes('default-release')) {
|
|
96
|
+
const historyPath = path.join(profilesDir, profile, 'places.sqlite');
|
|
97
|
+
if (fs.existsSync(historyPath)) {
|
|
98
|
+
return historyPath;
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
catch (err) {
|
|
104
|
+
console.error('Error finding Firefox history:', err);
|
|
105
|
+
}
|
|
106
|
+
return null;
|
|
107
|
+
}
|
|
108
|
+
/**
|
|
109
|
+
* Validate if a file is a valid SQLite database
|
|
110
|
+
* Uses copy-to-temp to avoid "database is locked" errors when browser is open
|
|
111
|
+
*/
|
|
112
|
+
validateSQLitePath(filePath) {
|
|
113
|
+
let tempPath = null;
|
|
114
|
+
try {
|
|
115
|
+
if (!fs.existsSync(filePath)) {
|
|
116
|
+
return { valid: false, error: 'File does not exist' };
|
|
117
|
+
}
|
|
118
|
+
const stats = fs.statSync(filePath);
|
|
119
|
+
if (!stats.isFile()) {
|
|
120
|
+
return { valid: false, error: 'Path is not a file' };
|
|
121
|
+
}
|
|
122
|
+
// Copy to temp file to avoid database lock issues
|
|
123
|
+
const tempDir = os.tmpdir();
|
|
124
|
+
tempPath = path.join(tempDir, `history_validate_${uuidv4()}.db`);
|
|
125
|
+
fs.copyFileSync(filePath, tempPath);
|
|
126
|
+
// Try to open the temp copy as SQLite database
|
|
127
|
+
const db = new Database(tempPath, { readonly: true });
|
|
128
|
+
// Check if it has expected tables (moz_places for Firefox, urls for Chrome/others)
|
|
129
|
+
const tables = db.prepare("SELECT name FROM sqlite_master WHERE type='table'").all();
|
|
130
|
+
const tableNames = tables.map((t) => t.name);
|
|
131
|
+
db.close();
|
|
132
|
+
const hasHistoryTables = tableNames.includes('urls') ||
|
|
133
|
+
tableNames.includes('moz_places') ||
|
|
134
|
+
tableNames.includes('history_items');
|
|
135
|
+
if (!hasHistoryTables) {
|
|
136
|
+
return { valid: false, error: 'Not a browser history database' };
|
|
137
|
+
}
|
|
138
|
+
return { valid: true };
|
|
139
|
+
}
|
|
140
|
+
catch (err) {
|
|
141
|
+
return { valid: false, error: err.message || 'Invalid SQLite database' };
|
|
142
|
+
}
|
|
143
|
+
finally {
|
|
144
|
+
// Clean up temp file
|
|
145
|
+
if (tempPath && fs.existsSync(tempPath)) {
|
|
146
|
+
try {
|
|
147
|
+
fs.unlinkSync(tempPath);
|
|
148
|
+
}
|
|
149
|
+
catch {
|
|
150
|
+
// Ignore cleanup errors
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
/**
|
|
156
|
+
* Detect all available browser history paths
|
|
157
|
+
*/
|
|
158
|
+
detectAll() {
|
|
159
|
+
const defaultPaths = this.getDefaultPaths();
|
|
160
|
+
const results = {};
|
|
161
|
+
for (const [browser, paths] of Object.entries(defaultPaths)) {
|
|
162
|
+
if (browser === 'custom')
|
|
163
|
+
continue;
|
|
164
|
+
const browserType = browser;
|
|
165
|
+
let foundPath = null;
|
|
166
|
+
if (browserType === 'firefox' && paths.length > 0) {
|
|
167
|
+
// Special handling for Firefox
|
|
168
|
+
foundPath = this.findFirefoxHistory(paths[0]);
|
|
169
|
+
}
|
|
170
|
+
else {
|
|
171
|
+
// Check each potential path
|
|
172
|
+
for (const p of paths) {
|
|
173
|
+
if (fs.existsSync(p)) {
|
|
174
|
+
foundPath = p;
|
|
175
|
+
break;
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
if (foundPath) {
|
|
180
|
+
const validation = this.validateSQLitePath(foundPath);
|
|
181
|
+
results[browserType] = {
|
|
182
|
+
browser: browserType,
|
|
183
|
+
path: foundPath,
|
|
184
|
+
found: true,
|
|
185
|
+
valid: validation.valid,
|
|
186
|
+
error: validation.error,
|
|
187
|
+
};
|
|
188
|
+
}
|
|
189
|
+
else {
|
|
190
|
+
results[browserType] = {
|
|
191
|
+
browser: browserType,
|
|
192
|
+
path: '',
|
|
193
|
+
found: false,
|
|
194
|
+
};
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
return results;
|
|
198
|
+
}
|
|
199
|
+
/**
|
|
200
|
+
* Detect a specific browser's history path
|
|
201
|
+
*/
|
|
202
|
+
detect(browser) {
|
|
203
|
+
const all = this.detectAll();
|
|
204
|
+
return all[browser] || { browser, path: '', found: false };
|
|
205
|
+
}
|
|
206
|
+
}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL Normalizer - Strips tracking parameters and normalizes URLs for deduplication
|
|
3
|
+
*/
|
|
4
|
+
// Common tracking parameters to strip
|
|
5
|
+
const TRACKING_PARAMS = new Set([
|
|
6
|
+
// Google Analytics / Ads
|
|
7
|
+
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
|
|
8
|
+
'gclid', 'gclsrc', 'dclid',
|
|
9
|
+
// Facebook
|
|
10
|
+
'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
|
|
11
|
+
// Twitter/X
|
|
12
|
+
'twclid', 's', 't', // Twitter share params
|
|
13
|
+
// Microsoft/Bing
|
|
14
|
+
'msclkid',
|
|
15
|
+
// Email marketing
|
|
16
|
+
'mc_cid', 'mc_eid', // Mailchimp
|
|
17
|
+
'oly_enc_id', 'oly_anon_id', // Omeda
|
|
18
|
+
'_hsenc', '_hsmi', 'hsCtaTracking', // HubSpot
|
|
19
|
+
// Analytics & tracking
|
|
20
|
+
'_ga', '_gl', // Google Analytics
|
|
21
|
+
'ref', 'ref_src', 'ref_url', // Referrer tracking
|
|
22
|
+
'source', 'src',
|
|
23
|
+
'campaign', 'medium',
|
|
24
|
+
// Social sharing
|
|
25
|
+
'share', 'shared', 'via',
|
|
26
|
+
// Session/user tracking
|
|
27
|
+
'sessionid', 'session_id', 'sid',
|
|
28
|
+
'userid', 'user_id', 'uid',
|
|
29
|
+
'token', 'auth',
|
|
30
|
+
// Misc tracking
|
|
31
|
+
'pk_campaign', 'pk_kwd', 'pk_source', // Piwik/Matomo
|
|
32
|
+
'zanpid', // Zanox
|
|
33
|
+
'irclickid', // Impact Radius
|
|
34
|
+
'affiliate', 'affiliate_id', 'aff_id',
|
|
35
|
+
'clickid', 'click_id',
|
|
36
|
+
'trk', 'tracking', 'track',
|
|
37
|
+
]);
|
|
38
|
+
// Parameters that might be tracking but context-dependent (be more careful)
|
|
39
|
+
const SUSPICIOUS_PARAMS = new Set([
|
|
40
|
+
'id', 'ref', 'source', 'from', 'origin',
|
|
41
|
+
]);
|
|
42
|
+
export class UrlNormalizer {
|
|
43
|
+
/**
|
|
44
|
+
* Normalize a URL by stripping tracking parameters and standardizing format
|
|
45
|
+
*/
|
|
46
|
+
static normalize(url) {
|
|
47
|
+
try {
|
|
48
|
+
const parsed = new URL(url);
|
|
49
|
+
// 1. Lowercase the hostname
|
|
50
|
+
parsed.hostname = parsed.hostname.toLowerCase();
|
|
51
|
+
// 2. Remove default ports
|
|
52
|
+
if (parsed.port === '80' || parsed.port === '443') {
|
|
53
|
+
parsed.port = '';
|
|
54
|
+
}
|
|
55
|
+
// 3. Remove trailing slash from path (except root)
|
|
56
|
+
if (parsed.pathname.length > 1 && parsed.pathname.endsWith('/')) {
|
|
57
|
+
parsed.pathname = parsed.pathname.slice(0, -1);
|
|
58
|
+
}
|
|
59
|
+
// 4. Remove fragment/hash
|
|
60
|
+
parsed.hash = '';
|
|
61
|
+
// 5. Strip tracking parameters
|
|
62
|
+
const cleanParams = new URLSearchParams();
|
|
63
|
+
parsed.searchParams.forEach((value, key) => {
|
|
64
|
+
const lowerKey = key.toLowerCase();
|
|
65
|
+
if (!TRACKING_PARAMS.has(lowerKey)) {
|
|
66
|
+
cleanParams.set(key, value);
|
|
67
|
+
}
|
|
68
|
+
});
|
|
69
|
+
// 6. Sort remaining params for consistent ordering
|
|
70
|
+
const sortedParams = new URLSearchParams();
|
|
71
|
+
const keys = Array.from(cleanParams.keys()).sort();
|
|
72
|
+
for (const key of keys) {
|
|
73
|
+
sortedParams.set(key, cleanParams.get(key));
|
|
74
|
+
}
|
|
75
|
+
parsed.search = sortedParams.toString() ? `?${sortedParams.toString()}` : '';
|
|
76
|
+
return parsed.toString();
|
|
77
|
+
}
|
|
78
|
+
catch {
|
|
79
|
+
// If URL parsing fails, return original
|
|
80
|
+
return url;
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
/**
|
|
84
|
+
* Extract the canonical URL (domain + path, no params)
|
|
85
|
+
* Useful for aggressive deduplication
|
|
86
|
+
*/
|
|
87
|
+
static getCanonical(url) {
|
|
88
|
+
try {
|
|
89
|
+
const parsed = new URL(url);
|
|
90
|
+
parsed.hostname = parsed.hostname.toLowerCase();
|
|
91
|
+
parsed.search = '';
|
|
92
|
+
parsed.hash = '';
|
|
93
|
+
// Remove trailing slash
|
|
94
|
+
if (parsed.pathname.length > 1 && parsed.pathname.endsWith('/')) {
|
|
95
|
+
parsed.pathname = parsed.pathname.slice(0, -1);
|
|
96
|
+
}
|
|
97
|
+
return `${parsed.protocol}//${parsed.host}${parsed.pathname}`;
|
|
98
|
+
}
|
|
99
|
+
catch {
|
|
100
|
+
return url;
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
/**
|
|
104
|
+
* Get a fingerprint for deduplication (hash of normalized URL)
|
|
105
|
+
*/
|
|
106
|
+
static getFingerprint(url) {
|
|
107
|
+
const normalized = this.normalize(url);
|
|
108
|
+
// Simple hash for dedup - not cryptographic, just for comparison
|
|
109
|
+
let hash = 0;
|
|
110
|
+
for (let i = 0; i < normalized.length; i++) {
|
|
111
|
+
const char = normalized.charCodeAt(i);
|
|
112
|
+
hash = ((hash << 5) - hash) + char;
|
|
113
|
+
hash = hash & hash; // Convert to 32bit integer
|
|
114
|
+
}
|
|
115
|
+
return hash.toString(36);
|
|
116
|
+
}
|
|
117
|
+
/**
|
|
118
|
+
* Check if two URLs are effectively the same (after normalization)
|
|
119
|
+
*/
|
|
120
|
+
static areSameUrl(url1, url2) {
|
|
121
|
+
return this.normalize(url1) === this.normalize(url2);
|
|
122
|
+
}
|
|
123
|
+
/**
|
|
124
|
+
* Extract domain from URL
|
|
125
|
+
*/
|
|
126
|
+
static getDomain(url) {
|
|
127
|
+
try {
|
|
128
|
+
const parsed = new URL(url);
|
|
129
|
+
return parsed.hostname.toLowerCase();
|
|
130
|
+
}
|
|
131
|
+
catch {
|
|
132
|
+
return '';
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
/**
|
|
136
|
+
* Check if URL matches common non-content patterns
|
|
137
|
+
*/
|
|
138
|
+
static isLikelyNonContent(url) {
|
|
139
|
+
try {
|
|
140
|
+
const parsed = new URL(url);
|
|
141
|
+
const path = parsed.pathname.toLowerCase();
|
|
142
|
+
// Common non-content paths
|
|
143
|
+
const nonContentPaths = [
|
|
144
|
+
'/login', '/signin', '/sign-in', '/signup', '/sign-up', '/register',
|
|
145
|
+
'/logout', '/signout', '/sign-out',
|
|
146
|
+
'/auth', '/oauth', '/callback', '/sso',
|
|
147
|
+
'/cart', '/checkout', '/basket', '/bag',
|
|
148
|
+
'/account', '/profile', '/settings', '/preferences',
|
|
149
|
+
'/admin', '/dashboard', '/manage',
|
|
150
|
+
'/api/', '/graphql', '/_next/', '/__',
|
|
151
|
+
'/search', '/results',
|
|
152
|
+
];
|
|
153
|
+
for (const nonContent of nonContentPaths) {
|
|
154
|
+
if (path.startsWith(nonContent) || path.includes(nonContent)) {
|
|
155
|
+
return true;
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
// Non-content file extensions
|
|
159
|
+
const nonContentExtensions = [
|
|
160
|
+
'.json', '.xml', '.js', '.css', '.map',
|
|
161
|
+
'.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.webp',
|
|
162
|
+
'.woff', '.woff2', '.ttf', '.eot',
|
|
163
|
+
'.pdf', '.zip', '.gz',
|
|
164
|
+
];
|
|
165
|
+
for (const ext of nonContentExtensions) {
|
|
166
|
+
if (path.endsWith(ext)) {
|
|
167
|
+
return true;
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
return false;
|
|
171
|
+
}
|
|
172
|
+
catch {
|
|
173
|
+
return false;
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
export class ContentCleaner {
|
|
2
|
+
/**
|
|
3
|
+
* Cleans email body by removing noise, quoted replies, and footers.
|
|
4
|
+
* optimized for LLM processing.
|
|
5
|
+
*/
|
|
6
|
+
static cleanEmailBody(text) {
|
|
7
|
+
if (!text)
|
|
8
|
+
return "";
|
|
9
|
+
const originalText = text;
|
|
10
|
+
// 1. Detect if content is actually HTML
|
|
11
|
+
const isHtml = /<[a-z][\s\S]*>/i.test(text);
|
|
12
|
+
if (isHtml) {
|
|
13
|
+
// Lightweight HTML -> Markdown Conversion
|
|
14
|
+
// Structure: <br>, <p> -> Newlines
|
|
15
|
+
text = text.replace(/<br\s*\/?>/gi, '\n');
|
|
16
|
+
text = text.replace(/<\/p>/gi, '\n\n');
|
|
17
|
+
text = text.replace(/<p.*?>/gi, '');
|
|
18
|
+
// Structure: Headers <h1>-<h6> -> # Title
|
|
19
|
+
text = text.replace(/<h[1-6].*?>(.*?)<\/h[1-6]>/gsi, (match, p1) => `\n# ${p1}\n`);
|
|
20
|
+
// Structure: Lists <li> -> - Item
|
|
21
|
+
text = text.replace(/<li.*?>(.*?)<\/li>/gsi, (match, p1) => `\n- ${p1}`);
|
|
22
|
+
text = text.replace(/<ul.*?>/gi, '');
|
|
23
|
+
text = text.replace(/<\/ul>/gi, '\n');
|
|
24
|
+
// Links: <a href=\"...\">text</a> -> [text](href)
|
|
25
|
+
text = text.replace(/<a\s+(?:[^>]*?\s+)?href=\"([^\"]*)\"[^>]*>(.*?)<\/a>/gsi, (match, href, content) => `[${content}](${href})`);
|
|
26
|
+
// Images: <img src=\"...\" alt=\"...\"> -> 
|
|
27
|
+
text = text.replace(/<img\s+(?:[^>]*?\s+)?src=\"([^\"]*)\"(?:[^>]*?\s+)?alt=\"([^\"]*)\"[^>]*>/gsi, (match, src, alt) => ``);
|
|
28
|
+
// Style/Script removal (strictly remove content)
|
|
29
|
+
text = text.replace(/<script.*?>.*?<\/script>/gsi, '');
|
|
30
|
+
text = text.replace(/<style.*?>.*?<\/style>/gsi, '');
|
|
31
|
+
// Final Strip of remaining tags
|
|
32
|
+
text = text.replace(/<[^>]+>/g, ' ');
|
|
33
|
+
// Entity decoding (Basic)
|
|
34
|
+
text = text.replace(/ /gi, ' ');
|
|
35
|
+
text = text.replace(/&/gi, '&');
|
|
36
|
+
text = text.replace(/</gi, '<');
|
|
37
|
+
text = text.replace(/>/gi, '>');
|
|
38
|
+
text = text.replace(/"/gi, '"');
|
|
39
|
+
text = text.replace(/'/gi, "'");
|
|
40
|
+
}
|
|
41
|
+
const lines = text.split('\n');
|
|
42
|
+
const cleanedLines = [];
|
|
43
|
+
// Patterns that usually mark the START of a reply chain or a generic footer
|
|
44
|
+
const truncationPatterns = [
|
|
45
|
+
/^On .* wrote:$/i,
|
|
46
|
+
/^From: .* <.*>$/i,
|
|
47
|
+
/^-----Original Message-----$/i,
|
|
48
|
+
/^________________________________$/i,
|
|
49
|
+
/^Sent from my iPhone$/i,
|
|
50
|
+
/^Sent from my Android$/i,
|
|
51
|
+
/^Get Outlook for/i,
|
|
52
|
+
/^--$/ // Standard signature separator
|
|
53
|
+
];
|
|
54
|
+
// Patterns for lines that should be stripped but NOT truncate the whole email
|
|
55
|
+
const noisePatterns = [
|
|
56
|
+
/view in browser/i,
|
|
57
|
+
/click here to view/i,
|
|
58
|
+
/legal notice/i,
|
|
59
|
+
/all rights reserved/i,
|
|
60
|
+
/privacy policy/i,
|
|
61
|
+
/terms of service/i,
|
|
62
|
+
/unsubscribe/i
|
|
63
|
+
];
|
|
64
|
+
for (let line of lines) {
|
|
65
|
+
let lineStripped = line.trim();
|
|
66
|
+
if (!lineStripped) {
|
|
67
|
+
cleanedLines.push("");
|
|
68
|
+
continue;
|
|
69
|
+
}
|
|
70
|
+
// 2. Quoted text removal (lines starting with >)
|
|
71
|
+
if (lineStripped.startsWith('>')) {
|
|
72
|
+
continue;
|
|
73
|
+
}
|
|
74
|
+
// 3. Truncation check: If we hit a reply header, we stop entirely
|
|
75
|
+
let shouldTruncate = false;
|
|
76
|
+
for (const pattern of truncationPatterns) {
|
|
77
|
+
if (pattern.test(lineStripped)) {
|
|
78
|
+
shouldTruncate = true;
|
|
79
|
+
break;
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
if (shouldTruncate)
|
|
83
|
+
break;
|
|
84
|
+
// 4. Noise check: Strip boilerplate lines
|
|
85
|
+
let isNoise = false;
|
|
86
|
+
if (lineStripped.length < 100) {
|
|
87
|
+
for (const pattern of noisePatterns) {
|
|
88
|
+
if (pattern.test(lineStripped)) {
|
|
89
|
+
isNoise = true;
|
|
90
|
+
break;
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
if (isNoise)
|
|
95
|
+
continue;
|
|
96
|
+
cleanedLines.push(line);
|
|
97
|
+
}
|
|
98
|
+
// Reassemble
|
|
99
|
+
text = cleanedLines.join('\n');
|
|
100
|
+
// Collapse whitespace
|
|
101
|
+
text = text.replace(/\n{3,}/g, '\n\n');
|
|
102
|
+
text = text.replace(/[ \t]{2,}/g, ' ');
|
|
103
|
+
// Safety Fallback: If cleaning stripped too much, return original text truncated
|
|
104
|
+
if (text.trim().length < 20 && originalText.trim().length > 20) {
|
|
105
|
+
return originalText.substring(0, 3000).trim();
|
|
106
|
+
}
|
|
107
|
+
// Sanitize LLM Special Tokens
|
|
108
|
+
text = text.replace(/<\|/g, '< |');
|
|
109
|
+
text = text.replace(/\|>/g, '| >');
|
|
110
|
+
text = text.replace(/\[INST\]/gi, '[ INST ]');
|
|
111
|
+
text = text.replace(/\[\/INST\]/gi, '[ /INST ]');
|
|
112
|
+
return text.trim();
|
|
113
|
+
}
|
|
114
|
+
}
|