@realtimex/realtimex-alchemy 1.0.42 → 1.0.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/CHANGELOG.md CHANGED
@@ -5,6 +5,23 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.0.44] - 2026-01-26
9
+
10
+ ### Added
11
+ - **Setup Wizard**: Added a UI-based migration tool to streamline the initial setup. Users can now run database migrations directly from the setup interface.
12
+ - **Migration**: Added `POST /api/migrate` endpoint to trigger database migrations from the frontend, streaming real-time logs via SSE.
13
+
14
+ ### Improved
15
+ - **Scripts**: Enhanced `migrate.sh` to support non-interactive authentication (via Access Tokens or DB Passwords) and automatic `TOKEN_ENCRYPTION_KEY` generation for Edge Functions.
16
+
17
+ ## [1.0.43] - 2026-01-26
18
+
19
+ ### Added
20
+ - **Settings**: Added personalized "Blocked Tags" management in the Intelligence Engine settings, allowing users to override default filters.
21
+
22
+ ### Improved
23
+ - **Transmute Engine**: Optimized automatic engine creation to only generate newsletter pipelines for active categories (excluding "Other"), significantly reducing noise.
24
+
8
25
  ## [1.0.42] - 2026-01-26
9
26
 
10
27
  ### Added
package/dist/api/index.js CHANGED
@@ -2,6 +2,7 @@ import express from 'express';
2
2
  import cors from 'cors';
3
3
  import path from 'path';
4
4
  import { fileURLToPath } from 'url';
5
+ import { spawn } from 'child_process';
5
6
  import { MinerService } from './services/MinerService.js';
6
7
  import { AlchemistService } from './services/AlchemistService.js';
7
8
  import { LibrarianService } from './services/LibrarianService.js';
@@ -27,6 +28,109 @@ const events = EventService.getInstance();
27
28
  app.get('/health', (req, res) => {
28
29
  res.json({ status: 'active', platform: process.platform });
29
30
  });
31
// Run database migrations (SSE stream)
//
// POST /api/migrate
//   Body: { projectId (required), dbPassword?, accessToken? }
//   Responds 400 (JSON) when projectId is missing. Otherwise streams
//   `data: {"type","data"}` events (types: info, stdout, stderr, error, done)
//   and ends the response after a single final `done` event
//   ('success' | 'failed').
//
// NOTE(review): this handler spawns a shell script with credentials taken
// straight from the request body and performs no authentication itself —
// confirm that access to this route is restricted elsewhere.
app.post('/api/migrate', (req, res) => {
    // req.body is undefined when no body parser handled the request.
    const { projectId, dbPassword, accessToken } = req.body ?? {};
    if (!projectId) {
        return res.status(400).json({ error: 'Project ID is required' });
    }
    // Set up SSE for streaming output
    res.setHeader('Content-Type', 'text/event-stream');
    res.setHeader('Cache-Control', 'no-cache');
    res.setHeader('Connection', 'keep-alive');
    // Stream / process state.
    let clientConnected = true;
    let processCompleted = false;
    // Listen on the RESPONSE: on current Node versions the request's 'close'
    // event fires once the request body has been consumed, which would mark
    // the client as gone immediately.
    res.on('close', () => {
        clientConnected = false;
        if (!processCompleted) {
            // Don't kill the process - let the migration complete on its own.
            console.log('[Migrate] Client disconnected, but migration will continue');
        }
    });
    // Write one SSE event; silently dropped once the client is gone or the
    // response has been ended, so late child output can never write-after-end.
    const sendEvent = (type, data) => {
        if (!clientConnected || res.writableEnded) {
            return;
        }
        res.write(`data: ${JSON.stringify({ type, data })}\n\n`);
    };
    // Emit the terminal `done` event and close the response exactly once,
    // even when both 'error' and 'close' fire for the same child.
    const finish = (outcome) => {
        if (processCompleted) {
            return;
        }
        processCompleted = true;
        sendEvent('done', outcome);
        if (!res.writableEnded) {
            res.end();
        }
    };
    sendEvent('info', '🚀 Starting migration...');
    // Find the migrate.sh script - check multiple possible locations
    // In dev: api/../scripts/migrate.sh
    // In prod: dist/api/../../scripts/migrate.sh
    const possiblePaths = [
        path.join(__dirname, '..', 'scripts', 'migrate.sh'), // dev mode
        path.join(__dirname, '..', '..', 'scripts', 'migrate.sh'), // compiled dist/api/
        path.join(process.cwd(), 'scripts', 'migrate.sh') // fallback to cwd
    ];
    const scriptPath = possiblePaths.find((p) => fs.existsSync(p));
    if (!scriptPath) {
        sendEvent('error', `Migration script not found. Searched: ${possiblePaths.join(', ')}`);
        finish('failed');
        return;
    }
    // The project root is the directory that contains scripts/.
    const projectRoot = path.dirname(path.dirname(scriptPath));
    sendEvent('info', `Found script at: ${scriptPath}`);
    sendEvent('info', `Working directory: ${projectRoot}`);
    // Ensure PATH includes common locations for the supabase CLI. Built with
    // the platform delimiter, and an unset PATH no longer becomes the literal
    // string "undefined".
    const searchPath = [
        process.env.PATH,
        '/usr/local/bin',
        '/opt/homebrew/bin',
        path.join(projectRoot, 'node_modules', '.bin')
    ].filter(Boolean).join(path.delimiter);
    // Prepare environment - support both access token and database password
    const env = {
        ...process.env,
        SUPABASE_PROJECT_ID: String(projectId),
        PATH: searchPath
    };
    // Access token is preferred for non-interactive auth
    if (accessToken) {
        env.SUPABASE_ACCESS_TOKEN = String(accessToken);
        sendEvent('info', 'Using access token for authentication');
    }
    if (dbPassword) {
        env.SUPABASE_DB_PASSWORD = String(dbPassword);
        sendEvent('info', 'Using database password for authentication');
    }
    // Spawn the migration script in its own process group
    const child = spawn('bash', [scriptPath], {
        env,
        cwd: projectRoot,
        stdio: ['ignore', 'pipe', 'pipe'],
        detached: true // Run in separate process group
    });
    // Don't let the parent wait for this child
    child.unref();
    if (child.pid !== undefined) {
        sendEvent('info', `Process spawned with PID: ${child.pid}`);
    }
    // Forward a child stream line by line. Partial lines are buffered across
    // chunks and returned to the caller through the flush function.
    const forwardLines = (stream, type) => {
        let pending = '';
        // Decode at the stream level so multi-byte characters that straddle
        // chunk boundaries are not corrupted.
        stream.setEncoding('utf8');
        stream.on('data', (chunk) => {
            // Also split on bare \r so progress output is forwarded promptly.
            const parts = (pending + chunk).split(/\r?\n|\r/);
            pending = parts.pop();
            for (const line of parts) {
                if (line.trim()) {
                    sendEvent(type, line);
                }
            }
        });
        return () => {
            if (pending.trim()) {
                sendEvent(type, pending);
            }
            pending = '';
        };
    };
    const flushStdout = forwardLines(child.stdout, 'stdout');
    // Supabase CLI outputs progress to stderr, not always errors
    const flushStderr = forwardLines(child.stderr, 'stderr');
    child.on('close', (code, signal) => {
        if (processCompleted) {
            return;
        }
        // Emit any trailing output that did not end with a newline.
        flushStdout();
        flushStderr();
        if (code === 0) {
            sendEvent('info', '✅ Migration completed successfully!');
            finish('success');
        }
        else if (signal) {
            sendEvent('error', `Migration killed by signal: ${signal}`);
            finish('failed');
        }
        else {
            sendEvent('error', `Migration failed with exit code ${code}`);
            finish('failed');
        }
    });
    child.on('error', (err) => {
        if (processCompleted) {
            return;
        }
        sendEvent('error', `Failed to start migration: ${err.message}`);
        finish('failed');
    });
});
30
134
  // SSE Events
31
135
  app.get('/events', (req, res) => {
32
136
  res.setHeader('Content-Type', 'text/event-stream');
@@ -458,6 +562,22 @@ if (fs.existsSync(staticPath)) {
458
562
  res.status(500).json({ error: error.message || 'Engine brief fetch failed' });
459
563
  }
460
564
  });
565
// POST /api/engines/ensure-defaults
// Reads the caller's id from the `x-user-id` header, asks the transmute
// service to create any missing default newsletter engines for that user,
// and replies `{ success: true }`. Replies 401 when the header is absent and
// 500 when anything in the handler throws.
app.post('/api/engines/ensure-defaults', async (req, res) => {
    try {
        const { 'x-user-id': userId } = req.headers;
        if (userId) {
            console.log(`[API] Ensuring default engines for user ${userId}`);
            const client = getAuthenticatedSupabase(req);
            await transmuteService.ensureDefaultNewsletterEngines(userId, client);
            res.json({ success: true });
            return;
        }
        res.status(401).json({ error: 'Unauthorized: Missing User ID' });
    }
    catch (failure) {
        console.error('[API] Failed to ensure default engines:', failure);
        const message = failure.message || 'Failed to ensure default engines';
        res.status(500).json({ error: message });
    }
});
461
581
  // Client-side routing fallback (Bypass path-to-regexp error in Express 5)
462
582
  app.use((req, res, next) => {
463
583
  if (!req.path.startsWith('/api') && !req.path.startsWith('/events')) {
@@ -4,6 +4,7 @@ import { embeddingService } from './EmbeddingService.js';
4
4
  import { deduplicationService } from './DeduplicationService.js';
5
5
  import { SDKService } from './SDKService.js';
6
6
  import { ContentCleaner } from '../utils/contentCleaner.js';
7
+ import { transmuteService } from './TransmuteService.js';
7
8
  export class AlchemistService {
8
9
  processingEvents;
9
10
  router;
@@ -111,9 +112,8 @@ export class AlchemistService {
111
112
  let rawContent = result.content;
112
113
  finalUrl = result.finalUrl;
113
114
  if (rawContent && rawContent.length > 20) {
114
- // HIGHLIGHT: Payload Hygiene - Clean Markdown content after conversion
115
- // This strips JS/CSS patterns that survived Turndown
116
- const cleaned = ContentCleaner.cleanContent(rawContent);
115
+ // HIGHLIGHT: Payload Hygiene - Content is already cleaned by RouterService
116
+ const cleaned = rawContent;
117
117
  // Check if this is a login wall or paywall
118
118
  isGatedContent = ContentCleaner.isGatedContent(cleaned);
119
119
  if (isGatedContent) {
@@ -158,7 +158,7 @@ export class AlchemistService {
158
158
  summary: isGatedContent ? 'Login or subscription required to access this content.' : response.summary,
159
159
  category: response.category,
160
160
  entities: response.entities,
161
- tags: response.tags,
161
+ tags: (response.tags || []).map(t => t.toLowerCase().trim()),
162
162
  content: content,
163
163
  // Mark as dismissed if low score OR gated content
164
164
  is_dismissed: response.score < 50 || isGatedContent,
@@ -245,7 +245,11 @@ export class AlchemistService {
245
245
  },
246
246
  userId
247
247
  }, supabase);
248
- // 6. Trigger Background Persona Consolidation (don't await)
248
+ // 6. Trigger Background Engine Discovery (NEW: Dynamically create engines after sync)
249
+ transmuteService.ensureDefaultNewsletterEngines(userId, supabase).catch(err => {
250
+ console.error('[AlchemistService] Background engine discovery failed:', err);
251
+ });
252
+ // 7. Trigger Background Persona Consolidation (don't await)
249
253
  import('./PersonaService.js').then(({ personaService }) => {
250
254
  personaService.consolidatePersona(userId, supabase).catch(err => {
251
255
  console.error('[AlchemistService] Background persona update failed:', err);
@@ -105,12 +105,13 @@ Be concise, helpful, and professional.
105
105
  ...previousMessages.map(m => ({ role: m.role, content: m.content })),
106
106
  { role: 'user', content: finalPrompt } // Current turn with RAG context
107
107
  ];
108
+ console.log('[ChatService] Final Prompt being sent to LLM:', JSON.stringify(messages, null, 2));
108
109
  const response = await sdk.llm.chat(messages, {
109
110
  provider: settings.llm_provider || 'realtimexai',
110
- model: settings.llm_model || 'gpt-4o-mini' // Default to available model
111
+ model: settings.llm_model || 'gpt-4o'
111
112
  });
112
113
  console.log('[ChatService] LLM Response:', JSON.stringify(response, null, 2));
113
- const aiContent = response.response?.content || "I'm sorry, I couldn't generate a response.";
114
+ const aiContent = response.response?.content || "I'm sorry, I couldn't generate a response. The LLM returned empty content.";
114
115
  // 6. Save Assistant Message
115
116
  const { data: aiMsg, error: aiError } = await supabase
116
117
  .from('chat_messages')
@@ -1,10 +1,8 @@
1
1
  import axios from 'axios';
2
2
  import puppeteer from 'puppeteer';
3
- import TurndownService from 'turndown';
4
3
  import { EventService } from './EventService.js';
5
4
  import { ContentCleaner } from '../utils/contentCleaner.js';
6
5
  export class RouterService {
7
- turndown = new TurndownService();
8
6
  events = EventService.getInstance();
9
7
  async extractContent(url) {
10
8
  this.events.emit({ type: 'router', message: `Attempting Tier 1 Extraction (Axios): ${url.substring(0, 30)}...` });
@@ -26,9 +24,8 @@ export class RouterService {
26
24
  // But usually responseUrl is the reliable one.
27
25
  }
28
26
  const rawHtml = response.data;
29
- // Payload Hygiene: Sanitize HTML before Markdown conversion
30
- const sanitizedHtml = ContentCleaner.sanitizeHtml(rawHtml);
31
- const markdown = this.turndown.turndown(sanitizedHtml);
27
+ // Payload Hygiene: Full HTML Pipeline (Sanitize -> Markdown -> Polish)
28
+ const markdown = ContentCleaner.cleanContent(rawHtml);
32
29
  if (markdown.length > 500) {
33
30
  this.events.emit({ type: 'router', message: `Tier 1 Success (${markdown.length} chars) -> ${finalUrl.substring(0, 30)}...` });
34
31
  return { content: markdown, finalUrl };
@@ -47,9 +44,9 @@ export class RouterService {
47
44
  // Capture final URL from page object
48
45
  finalUrl = page.url();
49
46
  const content = await page.content();
50
- const sanitizedHtml = ContentCleaner.sanitizeHtml(content);
47
+ // Payload Hygiene: Full HTML Pipeline (Sanitize -> Markdown -> Polish)
48
+ const markdown = ContentCleaner.cleanContent(content);
51
49
  await browser.close();
52
- const markdown = this.turndown.turndown(sanitizedHtml);
53
50
  this.events.emit({ type: 'router', message: `Tier 2 Success (${markdown.length} chars) -> ${finalUrl.substring(0, 30)}...` });
54
51
  return { content: markdown, finalUrl };
55
52
  }
@@ -1,7 +1,7 @@
1
1
  import path from 'path';
2
2
  import os from 'os';
3
3
  import { SDKService } from './SDKService.js';
4
- import { ContentCleaner } from '../utils/contentCleaner.js';
4
+ import { BLOCKED_TAGS as DEFAULT_BLOCKED_TAGS } from '../../shared/constants.js';
5
5
  export class TransmuteService {
6
6
  /**
7
7
  * Run a specific engine pipeline
@@ -119,8 +119,8 @@ export class TransmuteService {
119
119
  summary: s.summary,
120
120
  url: bestUrl, // Best Available (Resolved) URL
121
121
  source_urls: uniqueUrls, // All associated direct URLs
122
- // Use ContentCleaner to strip JS/CSS noise
123
- content: s.content ? ContentCleaner.cleanContent(s.content) : undefined
122
+ // Return clean/raw content (already cleaned by RouterService)
123
+ content: s.content || undefined
124
124
  };
125
125
  });
126
126
  // 4. Construct high-fidelity System Prompt
@@ -169,16 +169,31 @@ export class TransmuteService {
169
169
  * Fetch relevant signals based on engine configuration
170
170
  */
171
171
  async fetchContextSignals(userId, config, supabase) {
172
+ const maxSignals = config.max_signals || 10;
172
173
  let query = supabase
173
174
  .from('signals')
174
175
  .select('*')
175
176
  .eq('user_id', userId)
176
177
  .order('score', { ascending: false })
177
- .limit(10);
178
- if (config.category && config.category !== 'All') {
179
- query = query.eq('category', config.category);
178
+ .limit(maxSignals);
179
+ // Support both single category (legacy) and multi-select categories
180
+ const categories = Array.isArray(config.category) ? config.category : (config.category ? [config.category] : []);
181
+ if (categories.length > 0 && !categories.includes('All')) {
182
+ // Use OR logic for multiple categories
183
+ query = query.in('category', categories);
180
184
  }
181
- const { data } = await query;
185
+ // HIGHLIGHT: Support tag-based filtering (Dynamic Tag Engines)
186
+ if (config.tag) {
187
+ const normalizedTag = config.tag.toLowerCase().trim();
188
+ console.log(`[Transmute] Filtering by tag: "${normalizedTag}" (original: "${config.tag}")`);
189
+ query = query.contains('tags', [normalizedTag]);
190
+ }
191
+ const { data, error } = await query;
192
+ if (error) {
193
+ console.error('[Transmute] Signal query failed:', error);
194
+ return [];
195
+ }
196
+ console.log(`[Transmute] Retrieved ${data?.length || 0} signals for user ${userId}`);
182
197
  return (data || []);
183
198
  }
184
199
  /**
@@ -222,6 +237,146 @@ export class TransmuteService {
222
237
  });
223
238
  return response.response?.content || "Failed to generate content.";
224
239
  }
240
+ /**
241
+ * Ensure default newsletter engines exist for each category
242
+ */
243
+ /**
244
+ * Ensure default newsletter engines exist for each category
245
+ * This is "Self-Healing": it creates missing engines based on discovered signals.
246
+ */
247
+ async ensureDefaultNewsletterEngines(userId, supabase) {
248
+ console.log(`[Transmute] Running Self-Healing Engine Discovery for user ${userId}...`);
249
+ // 1. Fetch Active Categories from Signals
250
+ // We only create engines for categories that actually have data.
251
+ const { data: signalStats } = await supabase
252
+ .from('signals')
253
+ .select('category')
254
+ .eq('user_id', userId);
255
+ const activeCategories = new Set(signalStats?.map(s => s.category).filter(Boolean) || []);
256
+ // 2. Fetch Existing Pipelines to avoid duplicates
257
+ const { data: existingEngines } = await supabase
258
+ .from('engines')
259
+ .select('title, config')
260
+ .eq('user_id', userId)
261
+ .eq('type', 'newsletter');
262
+ const existingTitles = new Set(existingEngines?.map(e => e.title) || []);
263
+ const existingCategories = new Set(existingEngines?.map(e => e.config?.category).filter(Boolean));
264
+ const existingTags = new Set(existingEngines?.map(e => e.config?.tag?.toLowerCase().trim()).filter(Boolean));
265
+ // 3. Create Engines for Active Categories (Exclude 'Other')
266
+ for (const category of activeCategories) {
267
+ // Filter out 'Other' (case-insensitive) and empty strings
268
+ if (!category || category.toLowerCase() === 'other')
269
+ continue;
270
+ // STRICT 1:1 CHECK: Skip if title exists OR category is already covered
271
+ const title = `${category} Daily`;
272
+ if (existingTitles.has(title) || existingCategories.has(category)) {
273
+ continue;
274
+ }
275
+ console.log(`[Transmute] Bootstrapping missing category engine: ${title}`);
276
+ const config = {
277
+ category,
278
+ execution_mode: 'desktop',
279
+ schedule: 'Daily',
280
+ llm_provider: 'realtimexai',
281
+ llm_model: 'gpt-4o',
282
+ max_signals: 30,
283
+ custom_prompt: `Create a comprehensive daily newsletter focused on ${category}. Highlight the most important developments, key insights, and actionable takeaways. Use a professional, insight-driven tone with clear structure: start with 'The Big Story' followed by 'Quick Hits' for other notable items.`
284
+ };
285
+ await supabase
286
+ .from('engines')
287
+ .insert({
288
+ user_id: userId,
289
+ title: title,
290
+ type: 'newsletter',
291
+ config: config,
292
+ status: 'active'
293
+ });
294
+ }
295
+ // 4. Dynamic Tag-Based Categories
296
+ // De-prioritized for now to reduce noise.
297
+ // Only run if the threshold is very high or if category mapping isn't enough.
298
+ // await this.ensureDynamicTagEngines(userId, supabase, existingTitles, existingTags);
299
+ }
300
+ /**
301
+ * Find popular tags and create engines for them
302
+ * Treat popular tags as "dynamic categories" with 1:1 mapping.
303
+ */
304
+ async ensureDynamicTagEngines(userId, supabase, existingTitles, existingTags) {
305
+ // 1. Fetch all tags for the user
306
+ const { data: signals } = await supabase
307
+ .from('signals')
308
+ .select('tags')
309
+ .eq('user_id', userId);
310
+ if (!signals || signals.length === 0)
311
+ return;
312
+ // 2. Fetch User Settings (for blocked tags)
313
+ const { data: settings } = await supabase
314
+ .from('alchemy_settings')
315
+ .select('blocked_tags')
316
+ .eq('user_id', userId)
317
+ .maybeSingle();
318
+ const userBlockedTags = new Set((settings?.blocked_tags || []).map(t => t.toLowerCase().trim()));
319
+ // Use user blocked tags if they exist, otherwise fallback to system defaults
320
+ const systemDefaults = Array.from(DEFAULT_BLOCKED_TAGS).map(t => t.toLowerCase().trim());
321
+ const BLOCKED_TAGS = userBlockedTags.size > 0 ? userBlockedTags : new Set(systemDefaults);
322
+ // Core category terms to skip (already handled by category discovery)
323
+ const CORE_CATEGORY_TERMS = new Set([
324
+ 'ai', 'ml', 'machine learning', 'artificial intelligence',
325
+ 'business', 'technology', 'tech', 'finance', 'financial',
326
+ 'crypto', 'cryptocurrency', 'bitcoin', 'science', 'scientific',
327
+ 'politics', 'political', 'government'
328
+ ]);
329
+ const tagCounts = new Map();
330
+ const DYNAMIC_THRESHOLD = 50; // Increased significantly to reduce noise for now
331
+ const MAX_NEW_PER_RUN = 3;
332
+ signals.forEach(s => {
333
+ const tags = (s.tags || []);
334
+ tags.forEach(tag => {
335
+ const lower = tag.toLowerCase().trim();
336
+ // Filter out: empty, short, blocked, or core category overlaps
337
+ // Added check for 'redirect' and other junk specifically
338
+ if (!lower || lower.length < 3 || BLOCKED_TAGS.has(lower) || CORE_CATEGORY_TERMS.has(lower) || lower.includes('redirect')) {
339
+ return;
340
+ }
341
+ tagCounts.set(lower, (tagCounts.get(lower) || 0) + 1);
342
+ });
343
+ });
344
+ // 3. Identify and Sort Candidates
345
+ const candidates = Array.from(tagCounts.entries())
346
+ .filter(([_, count]) => count >= DYNAMIC_THRESHOLD)
347
+ .sort((a, b) => b[1] - a[1]);
348
+ let createdCount = 0;
349
+ for (const [tag, count] of candidates) {
350
+ const displayName = tag.split(' ').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
351
+ const title = `${displayName} Daily`;
352
+ // STRICT 1:1 CHECK: Skip if title exists OR tag is already covered
353
+ if (existingTitles.has(title) || existingTags.has(tag)) {
354
+ continue;
355
+ }
356
+ console.log(`[Transmute] Creating dynamic tag engine for "${tag}" (${count} signals)`);
357
+ const config = {
358
+ tag,
359
+ execution_mode: 'desktop',
360
+ schedule: 'Daily',
361
+ llm_provider: 'realtimexai',
362
+ llm_model: 'gpt-4o'
363
+ };
364
+ const { error } = await supabase
365
+ .from('engines')
366
+ .insert({
367
+ user_id: userId,
368
+ title: title,
369
+ type: 'newsletter',
370
+ config: config,
371
+ status: 'active'
372
+ });
373
+ if (!error)
374
+ createdCount++;
375
+ }
376
+ if (createdCount > 0) {
377
+ console.log(`[Transmute] Successfully auto-created ${createdCount} dynamic tag engines.`);
378
+ }
379
+ }
225
380
  /**
226
381
  * Save the generated asset to DB
227
382
  */
@@ -0,0 +1,62 @@
1
+ import { JSDOM } from 'jsdom';
2
+ import { Readability } from '@mozilla/readability';
3
/**
 * HTML sanitizer built on real DOM parsing (JSDOM) with Mozilla Readability
 * for article extraction.
 */
export class DOMSanitizer {
    /**
     * Sanitize HTML using proper DOM parsing (not regex).
     *
     * Pipeline:
     *   1. Parse the markup into a DOM.
     *   2. Remove script/style/embed-like elements and the <head>.
     *   3. Return Readability's extracted article HTML when it finds one.
     *   4. Otherwise strip common UI noise and return the remaining body HTML.
     * If anything above throws, falls back to a regex that removes
     * <script>/<style> elements from the raw input.
     *
     * @param {string|Buffer} html - Raw HTML markup.
     * @returns {string} Cleaned HTML string (not Markdown, not Plain Text);
     *   "" for empty/missing input.
     */
    static sanitizeHtml(html) {
        if (!html)
            return "";
        try {
            // 1. Parse DOM
            const dom = new JSDOM(html, { url: "https://example.com" });
            const doc = dom.window.document;
            // 2. Pre-clean: Remove Toxic Tags immediately
            // We do this BEFORE Readability to ensure no scripts sneak in
            const toxicSelectors = [
                'script', 'style', 'noscript', 'svg', 'iframe', 'embed', 'object',
                'meta', 'link', 'head' // We only want body content
            ];
            let removedCount = 0;
            toxicSelectors.forEach(tag => {
                const elements = doc.querySelectorAll(tag);
                if (elements.length > 0) {
                    console.log(`[DOMSanitizer] Removing ${elements.length} <${tag}> tags`);
                    removedCount += elements.length;
                    elements.forEach(el => el.remove());
                }
            });
            console.log(`[DOMSanitizer] Removed ${removedCount} toxic elements total.`);
            // 3. Try Readability (Best for Articles)
            // Readability mutates the document it parses, so give it a clone;
            // the manual fallback below must see the pre-cleaned DOM intact.
            const reader = new Readability(doc.cloneNode(true));
            const article = reader.parse();
            if (article && article.content) {
                // Return Clean HTML with structure preserved (not textContent)
                return article.content;
            }
            // 4. Fallback: Manual Cleaning (If Readability fails)
            // Remove UI Noise
            const noiseSelectors = [
                'header', 'footer', 'nav', 'aside', 'form',
                '[role="alert"]', '[role="banner"]', '[role="dialog"]',
                '.ad', '.ads', '.advertisement', '.social-share', '#cookie-banner'
            ];
            noiseSelectors.forEach(selector => {
                try {
                    doc.querySelectorAll(selector).forEach(el => el.remove());
                }
                catch (e) {
                    // Ignore selector errors
                }
            });
            // Return whatever is left in the body as HTML
            return doc.body ? doc.body.innerHTML : "";
        }
        catch (error) {
            console.error('[DOMSanitizer] Parsing failed, falling back to regex strip', error);
            // Emergency fallback: strip <script>/<style> elements and return
            // the remaining HTML. Coerce first: the input may be a Buffer or
            // another non-string value, which has no .replace().
            return String(html).replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/gi, '');
        }
    }
}