@realtimex/realtimex-alchemy 1.0.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- package/LICENSE +21 -0
- package/bin/realtimex-alchemy.js +55 -0
- package/dist/api/config/index.js +33 -0
- package/dist/api/index.js +237 -0
- package/dist/api/lib/ContentCleaner.js +114 -0
- package/dist/api/lib/types.js +1 -0
- package/dist/api/services/AlchemistService.js +241 -0
- package/dist/api/services/EventService.js +53 -0
- package/dist/api/services/LibrarianService.js +72 -0
- package/dist/api/services/MinerService.js +314 -0
- package/dist/api/services/ProcessingEventService.js +75 -0
- package/dist/api/services/RouterService.js +40 -0
- package/dist/api/services/SupabaseService.js +49 -0
- package/dist/api/utils/BrowserPathDetector.js +206 -0
- package/dist/api/utils/UrlNormalizer.js +176 -0
- package/dist/api/utils/contentCleaner.js +114 -0
- package/dist/api/utils/contentCleaner.test.js +96 -0
- package/dist/assets/index-7Lemtnxa.css +1 -0
- package/dist/assets/index-CRgCScOz.js +101 -0
- package/dist/email-automator-logo.svg +51 -0
- package/dist/favicon.svg +45 -0
- package/dist/index.html +18 -0
- package/package.json +80 -0

package/dist/api/services/AlchemistService.js
@@ -0,0 +1,241 @@
+import OpenAI from 'openai';
+import { ProcessingEventService } from './ProcessingEventService.js';
+import { RouterService } from './RouterService.js';
+export class AlchemistService {
+    processingEvents;
+    router;
+    constructor() {
+        this.processingEvents = ProcessingEventService.getInstance();
+        this.router = new RouterService();
+    }
+    async process(entries, settings, supabase, userId, syncStartTime) {
+        if (!entries || entries.length === 0)
+            return;
+        // Track stats for completion event
+        const stats = {
+            total: entries.length,
+            signals: 0,
+            skipped: 0,
+            errors: 0
+        };
+        // Initialize OpenAI client with user settings
+        const config = {
+            baseUrl: settings.llm_base_url || 'http://localhost:11434',
+            model: settings.llm_model_name || 'llama3',
+            apiKey: settings.llm_api_key || 'ollama'
+        };
+        console.log('[AlchemistService] LLM Config:', {
+            baseUrl: config.baseUrl,
+            model: config.model,
+            hasApiKey: !!config.apiKey
+        });
+        const client = new OpenAI({
+            baseURL: config.baseUrl.endsWith('/v1') ? config.baseUrl : `${config.baseUrl}/v1`,
+            apiKey: config.apiKey,
+            dangerouslyAllowBrowser: true
+        });
+        for (const entry of entries) {
+            // Emit: Reading
+            await this.processingEvents.log({
+                eventType: 'analysis',
+                agentState: 'Reading',
+                message: `Reading content from: ${entry.url}`,
+                level: 'info',
+                userId
+            }, supabase);
+            try {
+                // 2. Extract Content via RouterService (Tier 1 → Tier 2 fallback)
+                let content = '';
+                try {
+                    const markdown = await this.router.extractContent(entry.url);
+                    if (markdown && markdown.length > 100) {
+                        // Truncate to avoid token limits (keep ~8000 chars)
+                        const truncated = markdown.length > 8000 ? markdown.substring(0, 8000) + '...' : markdown;
+                        content = `Page Title: ${entry.title}\nContent: ${truncated}`;
+                    }
+                    else {
+                        content = `Page Title: ${entry.title} (Content unavailable or too short)`;
+                    }
+                }
+                catch (scrapeError) {
+                    console.warn(`Extraction failed for ${entry.url}:`, scrapeError.message);
+                    content = `Page Title: ${entry.title}. URL: ${entry.url} (Extraction failed)`;
+                }
+                // Emit: Analyzing
+                const startAnalysis = Date.now();
+                await this.processingEvents.log({
+                    eventType: 'analysis',
+                    agentState: 'Thinking',
+                    message: `Analyzing relevance of: ${entry.title}`,
+                    level: 'info',
+                    userId
+                }, supabase);
+                // 3. LLM Analysis
+                const response = await this.analyzeContent(client, content, entry.url, config.model);
+                const duration = Date.now() - startAnalysis;
+                if (response.relevant) {
+                    // Emit: Signal Found
+                    await this.processingEvents.log({
+                        eventType: 'action',
+                        agentState: 'Signal',
+                        message: `Found signal: ${response.summary} (${response.score}%)`,
+                        level: 'info',
+                        metadata: response,
+                        durationMs: duration,
+                        userId
+                    }, supabase);
+                    // 4. Save Signal
+                    console.log('[AlchemistService] Saving signal to database...');
+                    const { error: insertError } = await supabase.from('signals').insert([{
+                        user_id: userId,
+                        url: entry.url,
+                        title: entry.title,
+                        score: response.score,
+                        summary: response.summary,
+                        category: response.category,
+                        entities: response.entities,
+                        tags: response.tags,
+                        content: content
+                    }]);
+                    if (insertError) {
+                        console.error('[AlchemistService] Insert error:', insertError);
+                        stats.errors++;
+                    }
+                    else {
+                        console.log('[AlchemistService] Signal saved successfully');
+                        stats.signals++;
+                    }
+                }
+                else {
+                    // Emit: Skipped
+                    await this.processingEvents.log({
+                        eventType: 'info',
+                        agentState: 'Skipped',
+                        message: `Irrelevant content (${response.score}%): ${entry.title}`,
+                        level: 'debug',
+                        durationMs: duration,
+                        userId
+                    }, supabase);
+                    stats.skipped++;
+                }
+            }
+            catch (error) {
+                console.error(`Error analyzing ${entry.url}:`, error);
+                await this.processingEvents.log({
+                    eventType: 'error',
+                    agentState: 'Error',
+                    message: `Analysis failed for ${entry.title}: ${error.message}`,
+                    level: 'error',
+                    userId
+                }, supabase);
+                stats.errors++;
+            }
+        }
+        // Emit: Sync Completed
+        const totalDuration = syncStartTime ? Date.now() - syncStartTime : 0;
+        await this.processingEvents.log({
+            eventType: 'info',
+            agentState: 'Completed',
+            message: `Sync completed: ${stats.signals} signals found, ${stats.skipped} skipped, ${stats.errors} errors`,
+            level: 'info',
+            durationMs: totalDuration,
+            metadata: {
+                is_completion: true,
+                total_urls: stats.total,
+                signals_found: stats.signals,
+                skipped: stats.skipped,
+                errors: stats.errors,
+                duration_seconds: Math.round(totalDuration / 1000)
+            },
+            userId
+        }, supabase);
+    }
+    async analyzeContent(client, content, url, model) {
+        const prompt = `
+Act as "The Alchemist", a high-level intelligence analyst.
+Analyze the value of the following article.
+
+Input:
+URL: ${url}
+Content: ${content}
+
+CRITICAL SCORING:
+High Score (80-100): Original research, concrete data points, contrarian insights, deep technical details, official documentation.
+Medium Score (50-79): Decent summaries, useful aggregate news, tutorials, reference material, software documentation.
+Low Score (0-49): Marketing fluff, SEO clickbait, generic listicles, navigation menus only, login pages, or site footers.
+
+Return STRICT JSON:
+{
+  "score": number (0-100),
+  "category": string (one of: AI & ML, Business, Politics, Technology, Finance, Crypto, Science, Other),
+  "summary": string (1-sentence concise gist),
+  "entities": string[],
+  "tags": string[] (3-5 relevant topic tags for categorization),
+  "relevant": boolean (true if score > 50)
+}
+`;
+        const completion = await client.chat.completions.create({
+            model: model,
+            messages: [
+                { role: 'system', content: 'You are a precise analyzer. Return ONLY valid JSON, no other text.' },
+                { role: 'user', content: prompt }
+            ]
+            // Note: response_format removed for compatibility with LM Studio and other local LLMs
+        });
+        const raw = completion.choices[0].message.content || '{}';
+        return this.parseRobustJSON(raw);
+    }
+    // For manual signal test from UI
+    async analyzeSignal(text, config) {
+        const client = new OpenAI({
+            baseURL: config.baseUrl.endsWith('/v1') ? config.baseUrl : `${config.baseUrl}/v1`,
+            apiKey: config.apiKey || 'ollama',
+            dangerouslyAllowBrowser: true
+        });
+        return this.analyzeContent(client, text, 'Manual Text Input', config.model);
+    }
+    async testConnection(config) {
+        try {
+            const client = new OpenAI({
+                apiKey: config.apiKey || 'ollama',
+                baseURL: config.baseUrl.endsWith('/v1') ? config.baseUrl : `${config.baseUrl}/v1`
+            });
+            const completion = await client.chat.completions.create({
+                messages: [{ role: 'user', content: 'Say "OK"' }],
+                model: config.model,
+                max_tokens: 5
+            });
+            return {
+                success: true,
+                message: `Connection successful!`,
+                model: config.model
+            };
+        }
+        catch (error) {
+            return {
+                success: false,
+                message: error.message || 'Connection failed'
+            };
+        }
+    }
+    parseRobustJSON(input) {
+        try {
+            const jsonMatch = input.match(/\{[\s\S]*\}/);
+            const jsonStr = jsonMatch ? jsonMatch[0] : input;
+            const cleaned = jsonStr
+                .replace(/<\|[\s\S]*?\|>/g, '')
+                .replace(/```json/g, '')
+                .replace(/```/g, '')
+                .trim();
+            const result = JSON.parse(cleaned);
+            // Ensure tags exists for backward compatibility if LLM fails to return it
+            if (!result.tags)
+                result.tags = result.entities || [];
+            return result;
+        }
+        catch (e) {
+            console.error("JSON Parse Error:", e, input);
+            return { score: 0, summary: 'Failed to parse', category: 'Error', entities: [], tags: [], relevant: false };
+        }
+    }
+}
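
An aside before the next file: parseRobustJSON exists because local models served through Ollama or LM Studio rarely return bare JSON. A minimal sketch of the failure modes it absorbs; the inputs below are hypothetical examples, not strings taken from the package:

import { AlchemistService } from './AlchemistService.js';

const alchemist = new AlchemistService();

// Markdown-fenced output: the /\{[\s\S]*\}/ match keeps only the brace span,
// so the fences never reach JSON.parse.
alchemist.parseRobustJSON('```json\n{"score": 72, "summary": "x", "relevant": true}\n```');
// -> { score: 72, summary: 'x', relevant: true, tags: [] } (tags backfilled because entities is absent)

// Chat-template tokens inside the brace span are stripped by the /<\|[\s\S]*?\|>/g pass.
alchemist.parseRobustJSON('{"score": 10, <|im_sep|> "relevant": false}');

// Anything unparseable degrades to a safe "not relevant" record instead of throwing,
// so one malformed model reply cannot crash the sync loop.
alchemist.parseRobustJSON('Sorry, I cannot answer that.');
// -> { score: 0, summary: 'Failed to parse', category: 'Error', entities: [], tags: [], relevant: false }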

package/dist/api/services/EventService.js
@@ -0,0 +1,53 @@
+import { v4 as uuidv4 } from 'uuid';
+export class EventService {
+    static instance;
+    clients = [];
+    eventHistory = [];
+    maxHistory = 50;
+    constructor() { }
+    static getInstance() {
+        if (!EventService.instance) {
+            EventService.instance = new EventService();
+        }
+        return EventService.instance;
+    }
+    addClient(res) {
+        this.clients.push(res);
+        // Send connection success
+        this.sendEvent(res, {
+            type: 'system',
+            message: 'Alchemy Engine Connected',
+            timestamp: new Date().toISOString()
+        });
+        // Send history
+        if (this.eventHistory.length > 0) {
+            this.sendEvent(res, {
+                type: 'history',
+                message: 'Event History',
+                data: this.eventHistory
+            });
+        }
+    }
+    removeClient(res) {
+        this.clients = this.clients.filter(client => client !== res);
+    }
+    emit(event) {
+        const fullEvent = {
+            id: uuidv4(),
+            timestamp: new Date().toISOString(),
+            ...event
+        };
+        // Add to history
+        this.eventHistory.unshift(fullEvent);
+        if (this.eventHistory.length > this.maxHistory) {
+            this.eventHistory.pop();
+        }
+        // Broadcast to all clients
+        this.clients.forEach(client => {
+            this.sendEvent(client, fullEvent);
+        });
+    }
+    sendEvent(res, event) {
+        res.write(`data: ${JSON.stringify(event)}\n\n`);
+    }
+}
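
sendEvent writes the standard SSE frame (data: ...\n\n), so EventService can sit behind any long-lived HTTP response. A minimal wiring sketch, assuming an Express app; the /events route here is illustrative, and the package's own route presumably lives in dist/api/index.js:

import express from 'express';
import { EventService } from './EventService.js';

const app = express();
const events = EventService.getInstance();

app.get('/events', (req, res) => {
    res.writeHead(200, {
        'Content-Type': 'text/event-stream',
        'Cache-Control': 'no-cache',
        Connection: 'keep-alive'
    });
    events.addClient(res);                           // greets the client and replays history
    req.on('close', () => events.removeClient(res)); // drop the connection from the broadcast list
});

app.listen(3000);

// Anywhere in the same process, a broadcast reaches every open connection:
events.emit({ type: 'system', message: 'Mining started' });

A browser can consume this with new EventSource('/events'); note that the 'history' frame replays up to maxHistory (50) buffered events on connect.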

package/dist/api/services/LibrarianService.js
@@ -0,0 +1,72 @@
+import { SupabaseService } from './SupabaseService.js';
+export class LibrarianService {
+    async getSignals(supabase) {
+        // Use provided client or try to get service role client (which may fail if no env)
+        const client = supabase || (SupabaseService.isConfigured() ? SupabaseService.getServiceRoleClient() : null);
+        if (!client) {
+            console.warn('[Librarian] Supabase not configured, returning empty signals');
+            return [];
+        }
+        const { data, error } = await client
+            .from('signals')
+            .select('*')
+            .order('created_at', { ascending: false });
+        if (error) {
+            console.error('[Librarian] Error fetching signals:', error);
+            return [];
+        }
+        return (data || []).map((s) => ({
+            id: s.id,
+            url: s.url,
+            title: s.title,
+            score: s.score,
+            summary: s.summary,
+            date: s.created_at,
+            category: s.category,
+            entities: s.entities,
+            content: s.content
+        }));
+    }
+    async saveSignal(metadata, content, supabase) {
+        const client = supabase || (SupabaseService.isConfigured() ? SupabaseService.getServiceRoleClient() : null);
+        if (!client) {
+            console.warn('[Librarian] Supabase not configured, signal not saved to cloud');
+            return;
+        }
+        // For saving, we need the user_id.
+        // If the client is scoped, we can get it from auth.
+        const { data: { user } } = await client.auth.getUser();
+        // If no user (e.g. using service role without user context), fallback or skip.
+        // But for Zero-Env, we expect a scoped client.
+        const { error } = await client
+            .from('signals')
+            .upsert({
+                url: metadata.url,
+                title: metadata.title,
+                score: metadata.score,
+                summary: metadata.summary,
+                category: metadata.category,
+                entities: metadata.entities || [],
+                content: content,
+                user_id: user?.id,
+                updated_at: new Date().toISOString()
+            }, {
+                onConflict: 'user_id, url' // Column names separated by comma and space
+            });
+        if (error) {
+            console.error('[Librarian] Error saving signal:', error);
+        }
+        else {
+            console.log('[Librarian] Signal saved/updated successfully for URL:', metadata.url);
+        }
+    }
+    async getSystemUserId() {
+        // Legacy fallback
+        if (SupabaseService.isConfigured()) {
+            const supabase = SupabaseService.getServiceRoleClient();
+            const { data } = await supabase.from('profiles').select('id').limit(1).maybeSingle();
+            return data?.id || null;
+        }
+        return null;
+    }
+}
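
The upsert above only behaves as an update when the signals table carries a unique constraint covering (user_id, url); Postgres rejects an ON CONFLICT target with no matching constraint. A hypothetical caller sketch, with placeholder project URL, key, and metadata values, and assuming the client holds a signed-in session (saveSignal reads user_id from auth.getUser()):

import { createClient } from '@supabase/supabase-js';
import { LibrarianService } from './LibrarianService.js';

const supabase = createClient('https://YOUR-PROJECT.supabase.co', 'YOUR_ANON_KEY');
const librarian = new LibrarianService();

// Saving the same (user_id, url) pair twice updates the existing row in place,
// which requires a unique constraint on signals(user_id, url).
await librarian.saveSignal({
    url: 'https://example.com/post',
    title: 'Example post',
    score: 85,
    summary: 'One-sentence gist of the page.',
    category: 'Technology',
    entities: ['Example Corp']
}, 'full extracted markdown...', supabase);

const signals = await librarian.getSignals(supabase); // newest first, mapped to the UI shape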

package/dist/api/services/MinerService.js
@@ -0,0 +1,314 @@
+import fs from 'fs/promises';
+import path from 'path';
+import sqlite3 from 'better-sqlite3';
+import { v4 as uuidv4 } from 'uuid';
+import { CONFIG } from '../config/index.js';
+import { ProcessingEventService } from './ProcessingEventService.js';
+import { UrlNormalizer } from '../utils/UrlNormalizer.js';
+export class MinerService {
+    processingEvents = ProcessingEventService.getInstance();
+    blacklist = [
+        'google.com/search',
+        'localhost:',
+        '127.0.0.1',
+        'facebook.com',
+        'twitter.com',
+        'instagram.com',
+        'linkedin.com/feed',
+    ];
+    async mineHistory(settings, supabase) {
+        // Extract enabled sources from settings
+        console.log('[MinerService] settings.custom_browser_paths:', JSON.stringify(settings.custom_browser_paths, null, 2));
+        const enabledSources = (settings.custom_browser_paths || []).filter((s) => {
+            console.log(`[MinerService] Checking source: ${s.path}, enabled: ${s.enabled} (${typeof s.enabled})`);
+            return s.enabled === true || s.enabled === 'true';
+        });
+        console.log('[MinerService] enabledSources:', enabledSources.length);
+        if (enabledSources.length === 0) {
+            this.processingEvents.log({
+                eventType: 'warning',
+                agentState: 'Idle',
+                message: 'No active browser sources configured.'
+            }, supabase);
+            return [];
+        }
+        const maxUrlsPerSync = settings.max_urls_per_sync || CONFIG.MAX_HISTORY_ITEMS;
+        const syncMode = settings.sync_mode || 'incremental';
+        const syncFromDate = settings.sync_from_date;
+        let allEntries = [];
+        // Verify user context
+        const { data: { user } } = await supabase.auth.getUser();
+        const userId = user?.id;
+        for (const source of enabledSources) {
+            const sourceStart = Date.now();
+            await this.processingEvents.log({
+                eventType: 'info',
+                agentState: 'Mining',
+                message: `Mining source: ${source.label} (${source.browser})...`,
+                level: 'info',
+                userId
+            }, supabase);
+            try {
+                const entries = await this.mineSource(source, supabase, userId, maxUrlsPerSync, settings);
+                allEntries = [...allEntries, ...entries];
+                const duration = Date.now() - sourceStart;
+                await this.processingEvents.log({
+                    eventType: 'info',
+                    agentState: 'Mining',
+                    message: `Found ${entries.length} URLs from ${source.label}`,
+                    details: { source: source.label, count: entries.length },
+                    level: 'info',
+                    durationMs: duration,
+                    userId
+                }, supabase);
+            }
+            catch (error) {
+                console.error(`Error mining ${source.label}:`, error);
+                const duration = Date.now() - sourceStart;
+                await this.processingEvents.log({
+                    eventType: 'error',
+                    agentState: 'Mining',
+                    message: `Failed to mine ${source.label}: ${error.message}`,
+                    details: { source: source.label, error: error.message },
+                    level: 'error',
+                    durationMs: duration,
+                    userId
+                }, supabase);
+            }
+        }
+        // Cross-source deduplication: if same URL appears in multiple browsers, keep only one
+        const seenUrls = new Set();
+        const uniqueEntries = [];
+        let crossSourceDupes = 0;
+        for (const entry of allEntries) {
+            if (!seenUrls.has(entry.url)) {
+                seenUrls.add(entry.url);
+                uniqueEntries.push(entry);
+            }
+            else {
+                crossSourceDupes++;
+            }
+        }
+        if (crossSourceDupes > 0) {
+            console.log(`[MinerService] Cross-source dedup: removed ${crossSourceDupes} duplicates`);
+            await this.processingEvents.log({
+                eventType: 'info',
+                agentState: 'Mining',
+                message: `Deduplication: ${crossSourceDupes} cross-browser duplicates removed`,
+                level: 'debug',
+                userId
+            }, supabase);
+        }
+        // Auto-clear sync_start_date after successful sync (like email-automator)
+        // This ensures next sync automatically uses incremental mode
+        if (userId && settings.sync_start_date) {
+            console.log('[MinerService] Auto-clearing sync_start_date after successful sync');
+            await supabase
+                .from('alchemy_settings')
+                .update({ sync_start_date: null })
+                .eq('user_id', userId);
+        }
+        return uniqueEntries;
+    }
+    async mineSource(source, supabase, userId, maxItems, settings) {
+        const historyPath = source.path;
+        if (!historyPath)
+            return [];
+        // Determine starting timestamp based on simplified sync logic
+        // Priority: sync_start_date > last_sync_checkpoint > checkpoint from history_checkpoints
+        let startTime;
+        if (settings?.sync_start_date) {
+            // User has set a manual sync start date - use it
+            startTime = new Date(settings.sync_start_date).getTime();
+            console.log(`[MinerService] Using sync_start_date: ${new Date(startTime).toISOString()}`);
+        }
+        else if (settings?.last_sync_checkpoint) {
+            // Use the last checkpoint from settings
+            startTime = new Date(settings.last_sync_checkpoint).getTime();
+            console.log(`[MinerService] Using last_sync_checkpoint: ${new Date(startTime).toISOString()}`);
+        }
+        else {
+            // Fall back to browser-specific checkpoint
+            startTime = await this.getCheckpoint(source.path, supabase);
+            console.log(`[MinerService] Using browser checkpoint: ${new Date(startTime).toISOString()}`);
+        }
+        try {
+            await fs.access(historyPath);
+            // Bypass SQLite lock by copying
+            const tempId = uuidv4();
+            const tempPath = path.join(CONFIG.DATA_DIR, `history_${tempId}_temp.db`);
+            await fs.mkdir(CONFIG.DATA_DIR, { recursive: true });
+            await fs.copyFile(historyPath, tempPath);
+            const db = new sqlite3(tempPath, { readonly: true });
+            // Adjust query based on browser type
+            let query = '';
+            let queryParamTime = 0;
+            // Normalize checkpoint time to browser-specific format for querying
+            queryParamTime = this.fromUnixMs(startTime, source.browser);
+            console.log(`[MinerService] Browser: ${source.browser}`);
+            console.log(`[MinerService] Start Time (Unix Ms): ${startTime}`);
+            console.log(`[MinerService] Query Param Time (Browser Format): ${queryParamTime}`);
+            if (source.browser === 'firefox') {
+                query = `
+                    SELECT url, title, visit_count, last_visit_date as last_visit_time
+                    FROM moz_places
+                    WHERE last_visit_date > ? AND url LIKE 'http%'
+                    ORDER BY last_visit_date DESC
+                    LIMIT ?
+                `;
+            }
+            else {
+                // Chrome, Edge, Brave, Arc, Safari (usually)
+                if (source.browser === 'safari') {
+                    // Safari uses Core Data timestamp (seconds since 2001-01-01)
+                    // Not fully implemented yet, but keeping placeholder
+                    query = `
+                        SELECT url, title, visit_count, last_visit_time
+                        FROM history_items
+                        WHERE last_visit_time > ?
+                        ORDER BY last_visit_time DESC
+                        LIMIT ?
+                    `;
+                }
+                else {
+                    query = `
+                        SELECT url, title, visit_count, last_visit_time
+                        FROM urls
+                        WHERE last_visit_time > ?
+                        ORDER BY last_visit_time DESC
+                        LIMIT ?
+                    `;
+                }
+            }
+            let rows = [];
+            const limit = maxItems || CONFIG.MAX_HISTORY_ITEMS;
+            try {
+                rows = db.prepare(query).all(queryParamTime, limit);
+            }
+            catch (sqlErr) {
+                console.warn(`SQL Error for ${source.label}:`, sqlErr);
+                // Fallback or skip
+            }
+            db.close();
+            await fs.unlink(tempPath);
+            // Track seen normalized URLs for deduplication within this batch
+            const seenUrls = new Set();
+            let skippedDuplicates = 0;
+            let skippedNonContent = 0;
+            let skippedBlacklist = 0;
+            const entries = rows
+                .filter(row => {
+                    if (!row.url)
+                        return false;
+                    // 1. Blacklist check (domain-level)
+                    if (this.blacklist.some(b => row.url.includes(b))) {
+                        skippedBlacklist++;
+                        return false;
+                    }
+                    // 2. Non-content URL check (login pages, APIs, assets, etc.)
+                    if (UrlNormalizer.isLikelyNonContent(row.url)) {
+                        skippedNonContent++;
+                        return false;
+                    }
+                    // 3. Normalize and deduplicate
+                    const normalizedUrl = UrlNormalizer.normalize(row.url);
+                    if (seenUrls.has(normalizedUrl)) {
+                        skippedDuplicates++;
+                        return false;
+                    }
+                    seenUrls.add(normalizedUrl);
+                    return true;
+                })
+                .map(row => ({
+                    id: uuidv4(),
+                    url: UrlNormalizer.normalize(row.url), // Store normalized URL
+                    title: row.title || 'Untitled',
+                    visit_count: row.visit_count || 1,
+                    // Normalize back to Unix Ms for internal storage/usage
+                    last_visit_time: this.toUnixMs(row.last_visit_time, source.browser),
+                    browser: source.browser
+                }));
+            // Log filtering stats
+            if (skippedDuplicates > 0 || skippedNonContent > 0 || skippedBlacklist > 0) {
+                console.log(`[MinerService] URL Filtering: ${skippedDuplicates} duplicates, ${skippedNonContent} non-content, ${skippedBlacklist} blacklisted`);
+            }
+            if (entries.length > 0) {
+                const newestTime = Math.max(...entries.map(e => e.last_visit_time));
+                await this.saveCheckpoint(source.path, newestTime, supabase, userId);
+                // Also update last_sync_checkpoint in settings for global tracking
+                if (userId && settings) {
+                    await supabase
+                        .from('alchemy_settings')
+                        .update({ last_sync_checkpoint: new Date(newestTime).toISOString() })
+                        .eq('user_id', userId);
+                }
+            }
+            return entries;
+        }
+        catch (error) {
+            throw error;
+        }
+    }
+    toUnixMs(timestamp, browser) {
+        if (!timestamp)
+            return Date.now();
+        if (browser === 'firefox') {
+            // Firefox: Microseconds -> Milliseconds
+            return Math.floor(timestamp / 1000);
+        }
+        else if (browser === 'safari') {
+            // Safari: Seconds since 2001-01-01 -> Unix Ms
+            // 978307200 is seconds between 1970 and 2001
+            return Math.floor((timestamp + 978307200) * 1000);
+        }
+        else {
+            // Chrome/Webkit: Microseconds since 1601-01-01
+            // Difference between 1601 and 1970 in ms: 11644473600000
+            return Math.floor((timestamp / 1000) - 11644473600000);
+        }
+    }
+    fromUnixMs(unixMs, browser) {
+        if (!unixMs)
+            return 0; // Default to beginning of time
+        if (browser === 'firefox') {
+            return unixMs * 1000;
+        }
+        else if (browser === 'safari') {
+            return (unixMs / 1000) - 978307200;
+        }
+        else {
+            // Chrome/Webkit
+            return (unixMs + 11644473600000) * 1000;
+        }
+    }
+    async getCheckpoint(browser, supabase) {
+        const { data } = await supabase
+            .from('history_checkpoints')
+            .select('last_visit_time')
+            .eq('browser', browser)
+            .limit(1)
+            .maybeSingle();
+        let checkpoint = data?.last_visit_time || 0;
+        // Sanity Check: If checkpoint is from the "future" (likely old raw Chrome timestamp)
+        // Chrome timestamps (microseconds) are ~10^16
+        // Unix Ms timestamps are ~10^12
+        // If checkpoint > 3000000000000 (Year 2065), it's definitely invalid/raw format.
+        if (checkpoint > 3000000000000) {
+            console.warn(`[MinerService] Checkpoint ${checkpoint} looks invalid (too large/raw format). Resetting to 0.`);
+            return 0;
+        }
+        return checkpoint;
+    }
+    async saveCheckpoint(browser, time, supabase, userId) {
+        if (!userId)
+            return;
+        await supabase
+            .from('history_checkpoints')
+            .upsert({
+                user_id: userId,
+                browser,
+                last_visit_time: time,
+                updated_at: new Date().toISOString()
+            }, { onConflict: 'user_id,browser' });
+    }
+}
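
Since the three history databases keep time on three different clocks, the epoch arithmetic in toUnixMs and fromUnixMs is the likeliest place for an off-by-an-epoch bug. A worked round-trip for one instant, 2024-01-01T00:00:00Z, chosen purely for illustration:

import { MinerService } from './MinerService.js';

const miner = new MinerService();
const unixMs = Date.UTC(2024, 0, 1); // 1704067200000

// fromUnixMs maps a Unix-ms checkpoint onto each browser's native clock:
miner.fromUnixMs(unixMs, 'chrome');  // 13348540800000000 (microseconds since 1601-01-01)
miner.fromUnixMs(unixMs, 'firefox'); // 1704067200000000  (microseconds since 1970-01-01)
miner.fromUnixMs(unixMs, 'safari');  // 725760000         (seconds since 2001-01-01)

// toUnixMs inverts each value back to the same Unix-ms instant:
miner.toUnixMs(13348540800000000, 'chrome');  // 1704067200000
miner.toUnixMs(1704067200000000, 'firefox');  // 1704067200000
miner.toUnixMs(725760000, 'safari');          // 1704067200000

Each pair composes to the identity, which is what checkpointing relies on: mineSource stores checkpoints in Unix ms via toUnixMs and translates them back with fromUnixMs only when binding the SQL query parameter.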