alif-digest 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/publish.yml +33 -0
- package/.husky/pre-commit +1 -0
- package/.prettierrc +7 -0
- package/LICENSE +21 -0
- package/README.md +131 -0
- package/dist/cli/commands/init.d.ts +1 -0
- package/dist/cli/commands/init.js +88 -0
- package/dist/cli/commands/init.js.map +1 -0
- package/dist/cli/commands/run.d.ts +4 -0
- package/dist/cli/commands/run.js +46 -0
- package/dist/cli/commands/run.js.map +1 -0
- package/dist/cli/commands/schedule.d.ts +1 -0
- package/dist/cli/commands/schedule.js +94 -0
- package/dist/cli/commands/schedule.js.map +1 -0
- package/dist/cli/index.d.ts +2 -0
- package/dist/cli/index.js +29 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core/config-manager.d.ts +14 -0
- package/dist/core/config-manager.js +65 -0
- package/dist/core/config-manager.js.map +1 -0
- package/dist/core/config-schema.d.ts +40 -0
- package/dist/core/config-schema.js +24 -0
- package/dist/core/config-schema.js.map +1 -0
- package/dist/core/default-keywords.d.ts +1 -0
- package/dist/core/default-keywords.js +10 -0
- package/dist/core/default-keywords.js.map +1 -0
- package/dist/core/filters/deduplicator.d.ts +10 -0
- package/dist/core/filters/deduplicator.js +34 -0
- package/dist/core/filters/deduplicator.js.map +1 -0
- package/dist/core/filters/keywords.d.ts +6 -0
- package/dist/core/filters/keywords.js +17 -0
- package/dist/core/filters/keywords.js.map +1 -0
- package/dist/core/orchestrator.d.ts +6 -0
- package/dist/core/orchestrator.js +44 -0
- package/dist/core/orchestrator.js.map +1 -0
- package/dist/core/pipeline.d.ts +15 -0
- package/dist/core/pipeline.js +140 -0
- package/dist/core/pipeline.js.map +1 -0
- package/dist/core/scheduler.d.ts +9 -0
- package/dist/core/scheduler.js +64 -0
- package/dist/core/scheduler.js.map +1 -0
- package/dist/core/scraper-types.d.ts +27 -0
- package/dist/core/scraper-types.js +3 -0
- package/dist/core/scraper-types.js.map +1 -0
- package/dist/core/scrapers/api-scraper.d.ts +4 -0
- package/dist/core/scrapers/api-scraper.js +46 -0
- package/dist/core/scrapers/api-scraper.js.map +1 -0
- package/dist/core/scrapers/arxiv-scraper.d.ts +4 -0
- package/dist/core/scrapers/arxiv-scraper.js +34 -0
- package/dist/core/scrapers/arxiv-scraper.js.map +1 -0
- package/dist/core/scrapers/json-scraper.d.ts +4 -0
- package/dist/core/scrapers/json-scraper.js +56 -0
- package/dist/core/scrapers/json-scraper.js.map +1 -0
- package/dist/core/scrapers/rss-scraper.d.ts +6 -0
- package/dist/core/scrapers/rss-scraper.js +32 -0
- package/dist/core/scrapers/rss-scraper.js.map +1 -0
- package/dist/core/scrapers/scrape-scraper.d.ts +4 -0
- package/dist/core/scrapers/scrape-scraper.js +49 -0
- package/dist/core/scrapers/scrape-scraper.js.map +1 -0
- package/dist/db/article-store.d.ts +22 -0
- package/dist/db/article-store.js +43 -0
- package/dist/db/article-store.js.map +1 -0
- package/dist/db/connection.d.ts +2 -0
- package/dist/db/connection.js +15 -0
- package/dist/db/connection.js.map +1 -0
- package/dist/db/migrate.d.ts +2 -0
- package/dist/db/migrate.js +60 -0
- package/dist/db/migrate.js.map +1 -0
- package/dist/db/schedule-store.d.ts +17 -0
- package/dist/db/schedule-store.js +23 -0
- package/dist/db/schedule-store.js.map +1 -0
- package/dist/db/source-health-store.d.ts +16 -0
- package/dist/db/source-health-store.js +31 -0
- package/dist/db/source-health-store.js.map +1 -0
- package/dist/providers/delivery/index.d.ts +18 -0
- package/dist/providers/delivery/index.js +2 -0
- package/dist/providers/delivery/index.js.map +1 -0
- package/dist/providers/delivery/slack.d.ts +6 -0
- package/dist/providers/delivery/slack.js +52 -0
- package/dist/providers/delivery/slack.js.map +1 -0
- package/dist/providers/delivery/webhook.d.ts +6 -0
- package/dist/providers/delivery/webhook.js +16 -0
- package/dist/providers/delivery/webhook.js.map +1 -0
- package/dist/providers/factory.d.ts +7 -0
- package/dist/providers/factory.js +33 -0
- package/dist/providers/factory.js.map +1 -0
- package/dist/providers/llm/anthropic.d.ts +12 -0
- package/dist/providers/llm/anthropic.js +43 -0
- package/dist/providers/llm/anthropic.js.map +1 -0
- package/dist/providers/llm/index.d.ts +10 -0
- package/dist/providers/llm/index.js +2 -0
- package/dist/providers/llm/index.js.map +1 -0
- package/dist/providers/llm/ollama.d.ts +12 -0
- package/dist/providers/llm/ollama.js +42 -0
- package/dist/providers/llm/ollama.js.map +1 -0
- package/dist/providers/llm/openrouter.d.ts +13 -0
- package/dist/providers/llm/openrouter.js +53 -0
- package/dist/providers/llm/openrouter.js.map +1 -0
- package/dist/providers/llm/utils.d.ts +6 -0
- package/dist/providers/llm/utils.js +45 -0
- package/dist/providers/llm/utils.js.map +1 -0
- package/dist/resources/default-feeds.json +650 -0
- package/dist/resources/index.d.ts +2 -0
- package/dist/resources/index.js +3 -0
- package/dist/resources/index.js.map +1 -0
- package/eslint.config.mjs +29 -0
- package/package.json +66 -0
- package/src/cli/commands/init.ts +94 -0
- package/src/cli/commands/run.ts +52 -0
- package/src/cli/commands/schedule.ts +99 -0
- package/src/cli/index.ts +34 -0
- package/src/core/config-manager.ts +72 -0
- package/src/core/config-schema.ts +31 -0
- package/src/core/default-keywords.ts +9 -0
- package/src/core/filters/deduplicator.ts +39 -0
- package/src/core/filters/keywords.ts +18 -0
- package/src/core/orchestrator.ts +47 -0
- package/src/core/pipeline.ts +171 -0
- package/src/core/scheduler.ts +74 -0
- package/src/core/scraper-types.ts +30 -0
- package/src/core/scrapers/api-scraper.ts +45 -0
- package/src/core/scrapers/arxiv-scraper.ts +35 -0
- package/src/core/scrapers/json-scraper.ts +54 -0
- package/src/core/scrapers/rss-scraper.ts +34 -0
- package/src/core/scrapers/scrape-scraper.ts +50 -0
- package/src/db/article-store.ts +75 -0
- package/src/db/connection.ts +17 -0
- package/src/db/migrate.ts +68 -0
- package/src/db/schedule-store.ts +41 -0
- package/src/db/source-health-store.ts +42 -0
- package/src/providers/delivery/index.ts +19 -0
- package/src/providers/delivery/slack.ts +55 -0
- package/src/providers/delivery/webhook.ts +16 -0
- package/src/providers/factory.ts +37 -0
- package/src/providers/llm/anthropic.ts +48 -0
- package/src/providers/llm/index.ts +8 -0
- package/src/providers/llm/ollama.ts +44 -0
- package/src/providers/llm/openrouter.ts +56 -0
- package/src/providers/llm/utils.ts +54 -0
- package/src/resources/default-feeds.json +650 -0
- package/src/resources/index.ts +3 -0
- package/tests/config-manager.test.ts +70 -0
- package/tests/db-integration.test.ts +72 -0
- package/tests/filters.test.ts +53 -0
- package/tests/llm-provider.test.ts +115 -0
- package/tsconfig.json +18 -0
- package/vitest.config.ts +13 -0

package/src/core/pipeline.ts
@@ -0,0 +1,171 @@
+import { Config } from './config-schema.js';
+import { ScraperOrchestrator } from './orchestrator.js';
+import { ScraperSource } from './scraper-types.js';
+import { KeywordScorer } from './filters/keywords.js';
+import { Deduplicator } from './filters/deduplicator.js';
+import { ArticleStore } from '../db/article-store.js';
+import { SourceHealthStore } from '../db/source-health-store.js';
+import { Database } from 'better-sqlite3';
+import { ProviderFactory } from '../providers/factory.js';
+import { LLMProvider } from '../providers/llm/index.js';
+import { DeliveryProvider, Digest } from '../providers/delivery/index.js';
+import { BASE_KEYWORDS } from './default-keywords.js';
+
+export class Pipeline {
+  private orchestrator: ScraperOrchestrator;
+  private scorer: KeywordScorer;
+  private deduplicator: Deduplicator;
+  private articleStore: ArticleStore;
+  private healthStore: SourceHealthStore;
+  private llm: LLMProvider;
+  private delivery: DeliveryProvider[];
+
+  constructor(
+    private config: Config,
+    db: Database,
+  ) {
+    this.orchestrator = new ScraperOrchestrator();
+
+    const mergedKeywords = {
+      ...BASE_KEYWORDS,
+      ...(config.preferences.customKeywords || {}),
+    };
+    this.scorer = new KeywordScorer(mergedKeywords);
+
+    this.deduplicator = new Deduplicator();
+    this.articleStore = new ArticleStore(db);
+    this.healthStore = new SourceHealthStore(db);
+    this.llm = ProviderFactory.createLLM(config);
+    this.delivery = ProviderFactory.createDelivery(config);
+  }
+
+  async run(sources: ScraperSource[], force = false) {
+    const cooldown = this.config.preferences.sourceCooldownMinutes;
+    const activeSources = sources.filter((s) => {
+      if (!force && this.healthStore.isThrottled(s.id, cooldown)) {
+        console.log(`[Pipeline] Skipping ${s.id} (last check was < ${cooldown}m ago)`);
+        return false;
+      }
+      return true;
+    });
+
+    let enrichedItems: any[] = [];
+    const digestDate = new Date().toISOString().split('T')[0];
+    let newItemsCount = 0;
+
+    if (activeSources.length > 0) {
+      console.log(`[Pipeline] Scraping ${activeSources.length} sources...`);
+      const results = await this.orchestrator.runAll(activeSources);
+
+      // Record health
+      for (const res of results) {
+        this.healthStore.record({
+          source: res.source,
+          status: res.status,
+          items_found: res.items.length,
+          error_message: res.error,
+        });
+      }
+
+      const allArticles = results.flatMap((r) => r.items);
+      console.log(`[Pipeline] Found ${allArticles.length} raw items.`);
+
+      // Filtering
+      const latestTimestamp = this.articleStore.getLatestTimestamp();
+      let newItems = allArticles;
+
+      if (latestTimestamp) {
+        const lastTime = new Date(latestTimestamp).getTime();
+        newItems = allArticles.filter((a) => {
+          if (!a.published_at) return true;
+          return new Date(a.published_at).getTime() > lastTime;
+        });
+      }
+      newItemsCount = newItems.length;
+      console.log(`[Pipeline] ${newItemsCount} new items after incremental filter.`);
+
+      // Scoring & Deduplication
+      const unique = this.deduplicator.process(newItems);
+      const highSignal = unique
+        .map((a) => ({ ...a, score: this.scorer.score(a) }))
+        .filter((a) => a.score >= this.config.preferences.signalThreshold)
+        .sort((a, b) => b.score - a.score);
+
+      console.log(`[Pipeline] ${highSignal.length} high-signal items selected.`);
+
+      if (highSignal.length > 0) {
+        // LLM Analysis
+        console.log('[Pipeline] Analyzing high-signal items with LLM...');
+        const analysisResults = await this.llm.analyze(
+          highSignal.map((a) => ({ title: a.title, content: a.content })),
+        );
+
+        enrichedItems = highSignal.map((article, idx) => ({
+          ...article,
+          summary: analysisResults[idx]?.summary || null,
+          category: analysisResults[idx]?.category || 'Uncategorized',
+        }));
+
+        // Persistence
+        for (const item of enrichedItems) {
+          this.articleStore.upsert({
+            ...item,
+            digest_date: digestDate,
+            delivered: 0,
+          } as any);
+        }
+      } else {
+        console.log('[Pipeline] No high-signal items in this batch.');
+      }
+    } else {
+      console.log('[Pipeline] All sources are cooled down. Checking database for pending items...');
+    }
+
+    // 5. Build Final Digest (Current + Pending from last 24h)
+    const pendingItems = this.articleStore.getPendingHighSignal(
+      this.config.preferences.signalThreshold,
+      24,
+    );
+
+    // Merge and deduplicate by ID (latest run wins)
+    const allToDeliverMap = new Map();
+    for (const item of pendingItems) allToDeliverMap.set(item.id, item);
+    for (const item of enrichedItems) allToDeliverMap.set(item.id, item);
+
+    const finalItemsToDeliver = Array.from(allToDeliverMap.values());
+
+    if (finalItemsToDeliver.length === 0) {
+      console.log('[Pipeline] No high-signal items to deliver.');
+      console.log('[Pipeline] Execution complete! 🥂');
+      return [];
+    }
+
+    // Delivery
+    const digest: Digest = {
+      items: finalItemsToDeliver.map((item) => ({
+        title: item.title,
+        url: item.url,
+        summary: item.summary,
+        category: item.category,
+        source: item.source,
+        score: item.score,
+      })),
+      metadata: {
+        total_new_items: newItemsCount,
+        total_selected: finalItemsToDeliver.length,
+        date: digestDate,
+      },
+    };
+
+    console.log(
+      `[Pipeline] Delivering ${finalItemsToDeliver.length} items to ${this.delivery.length} channels...`,
+    );
+    await Promise.all(this.delivery.map((d) => d.send(digest)));
+
+    // Mark as delivered
+    this.articleStore.markAsDelivered(finalItemsToDeliver.map((a) => a.id));
+
+    console.log('[Pipeline] Execution complete! 🥂');
+    return finalItemsToDeliver;
+  }
+}
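
The run() method above is the whole flow in order: cooldown-filter the sources, scrape, record source health, drop items older than the newest stored timestamp, deduplicate and keyword-score, LLM-enrich everything above the signal threshold, persist, merge with still-undelivered rows from the last 24 hours, deliver, and mark delivered. A minimal caller might look like the sketch below; this is not part of the published diff, the config literal shows only the preference fields pipeline.ts actually reads, and the Hacker News source entry is illustrative.

import { Database } from 'better-sqlite3';
import { Pipeline } from './pipeline.js';
import { ScraperSource } from './scraper-types.js';

// Assumed: an already-opened, already-migrated database (see the db/ files later in this diff).
declare const db: Database;

const sources: ScraperSource[] = [
  {
    id: 'hn',
    name: 'Hacker News',
    type: 'api',
    url: 'https://hn.algolia.com/api/v1/search?tags=front_page',
  },
];

// Only the fields pipeline.ts reads are shown; the real shape is Config from config-schema.ts.
const config = {
  preferences: { sourceCooldownMinutes: 60, signalThreshold: 5, customKeywords: {} },
} as any;

const delivered = await new Pipeline(config, db).run(sources); // run(sources, true) bypasses cooldowns
console.log(`${delivered.length} items delivered`);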

package/src/core/scheduler.ts
@@ -0,0 +1,74 @@
+import { ScheduleStore } from '../db/schedule-store.js';
+import { Database } from 'better-sqlite3';
+
+export class Scheduler {
+  private store: ScheduleStore;
+
+  constructor(db: Database) {
+    this.store = new ScheduleStore(db);
+  }
+
+  async add(name: string, cron: string, scheduledTime?: string) {
+    const id = Math.random().toString(36).substring(2, 9);
+    this.store.add({
+      id,
+      name,
+      cron,
+      scheduled_time: scheduledTime,
+      active: 1,
+    });
+    return id;
+  }
+
+  list() {
+    return this.store.getAll();
+  }
+
+  remove(id: string) {
+    this.store.delete(id);
+  }
+
+  // This would be called by a background daemon or a frequent cron
+  async checkAndRun(runner: () => Promise<void>) {
+    const schedules = this.store.getAll();
+    const now = new Date();
+    const currentHHmm = `${now.getHours().toString().padStart(2, '0')}:${now.getMinutes().toString().padStart(2, '0')}`;
+
+    for (const schedule of schedules) {
+      if (!schedule.active) continue;
+
+      const lastRun = schedule.last_run ? new Date(schedule.last_run) : new Date(0);
+      const diffHours = (now.getTime() - lastRun.getTime()) / (1000 * 60 * 60);
+
+      // Simple implementation:
+      // 1. If never run, run it instantly.
+      // 2. If it's a 'daily' schedule and has a scheduled_time, check if we've passed that time today and haven't run yet.
+      // 3. Otherwise fallback to the 24h cooldown.
+
+      let shouldRun = false;
+
+      if (!schedule.last_run) {
+        shouldRun = true;
+      } else if (schedule.cron === 'daily' && schedule.scheduled_time) {
+        // Run if:
+        // - Current time >= scheduled time
+        // - Last run was NOT today
+        const lastRunDate = lastRun.toISOString().split('T')[0];
+        const todayDate = now.toISOString().split('T')[0];
+
+        if (currentHHmm >= schedule.scheduled_time && lastRunDate !== todayDate) {
+          shouldRun = true;
+        }
+      } else if (diffHours >= 24) {
+        // Fallback for non-daily or simple daily without time
+        shouldRun = true;
+      }
+
+      if (shouldRun) {
+        console.log(`[Scheduler] Running job: ${schedule.name}`);
+        await runner();
+        this.store.updateLastRun(schedule.id, now.toISOString());
+      }
+    }
+  }
+}
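
checkAndRun() is a poll-style check rather than a cron engine; as the comment in the file notes, something external has to call it repeatedly. A hypothetical polling loop, not from the package, assuming a digest run as the job body and a 60-second tick (the interval is an assumption, nothing in the code prescribes it):

import { Database } from 'better-sqlite3';
import { Scheduler } from './scheduler.js';

declare const db: Database;
declare function runDigest(): Promise<void>; // assumed wrapper around pipeline.run(sources)

const scheduler = new Scheduler(db);

// Tick every minute; due schedules fire, the rest are skipped until due.
setInterval(() => {
  scheduler.checkAndRun(runDigest).catch((err) => {
    console.error('[Scheduler] tick failed:', err);
  });
}, 60_000);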

package/src/core/scraper-types.ts
@@ -0,0 +1,30 @@
+export interface ScrapedArticle {
+  id: string;
+  title: string;
+  url: string;
+  content?: string;
+  published_at?: string;
+  source: string;
+  metadata?: Record<string, unknown>;
+}
+
+export interface ScraperSource {
+  id: string;
+  name: string;
+  type: 'rss' | 'api' | 'scrape' | 'json';
+  url: string;
+  tier?: number;
+  tags?: string[];
+  mapping?: Record<string, string>;
+}
+
+export interface ScraperResult {
+  source: string;
+  status: 'ok' | 'error';
+  items: ScrapedArticle[];
+  error?: string;
+}
+
+export abstract class BaseScraper {
+  abstract scrape(source: ScraperSource): Promise<ScraperResult>;
+}
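
BaseScraper is the contract all five scrapers in this diff implement: resolve to a ScraperResult with status 'ok' and items, or status 'error' and a message, and never throw, so the orchestrator can record source health uniformly. A sketch of a hypothetical sixth scraper following the same contract (the Lobsters endpoint and field names are illustrative, not part of the package):

import axios from 'axios';
import { BaseScraper, ScraperSource, ScraperResult } from './scraper-types.js';

export class LobstersScraper extends BaseScraper {
  async scrape(source: ScraperSource): Promise<ScraperResult> {
    try {
      // e.g. https://lobste.rs/hottest.json returns an array of posts
      const { data } = await axios.get(source.url);
      return {
        source: source.id,
        status: 'ok',
        items: data.map((post: any) => ({
          id: post.short_id,
          title: post.title,
          url: post.url || post.comments_url,
          content: post.description || '',
          published_at: post.created_at,
          source: source.id,
        })),
      };
    } catch (error) {
      // Errors are reported in-band, matching the package's scrapers
      return {
        source: source.id,
        status: 'error',
        items: [],
        error: error instanceof Error ? error.message : String(error),
      };
    }
  }
}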

package/src/core/scrapers/api-scraper.ts
@@ -0,0 +1,45 @@
+import axios from 'axios';
+import { BaseScraper, ScraperSource, ScraperResult, ScrapedArticle } from '../scraper-types.js';
+
+export class ApiScraper extends BaseScraper {
+  async scrape(source: ScraperSource): Promise<ScraperResult> {
+    try {
+      const response = await axios.get(source.url);
+      const data = response.data;
+      let items: ScrapedArticle[] = [];
+
+      // Specialized logic based on source ID for common APIs
+      if (source.id === 'hn') {
+        items = data.hits.map((hit: any) => ({
+          id: hit.objectID,
+          title: hit.title,
+          url: hit.url || `https://news.ycombinator.com/item?id=${hit.objectID}`,
+          content: hit.story_text || '',
+          published_at: hit.created_at,
+          source: source.id,
+        }));
+      } else if (source.id.startsWith('arxiv')) {
+        // ArXiv returns XML/Atom, but we'll assume the URL handles it or use a specific XML parser if needed.
+        // For brevity in this generic API scraper, we'll keep it simple.
+        // In a real scenario, we'd use a dedicated library or cheerio for XML.
+        return {
+          source: source.id,
+          status: 'error',
+          items: [],
+          error: 'ArXiv requires specialized XML parsing',
+        };
+      } else {
+        return { source: source.id, status: 'error', items: [], error: 'Unsupported API source' };
+      }
+
+      return { source: source.id, status: 'ok', items };
+    } catch (error) {
+      return {
+        source: source.id,
+        status: 'error',
+        items: [],
+        error: error instanceof Error ? error.message : String(error),
+      };
+    }
+  }
+}

package/src/core/scrapers/arxiv-scraper.ts
@@ -0,0 +1,35 @@
+import axios from 'axios';
+import * as cheerio from 'cheerio';
+import { BaseScraper, ScraperSource, ScraperResult, ScrapedArticle } from '../scraper-types.js';
+
+export class ArxivScraper extends BaseScraper {
+  async scrape(source: ScraperSource): Promise<ScraperResult> {
+    try {
+      const response = await axios.get(source.url);
+      const $ = cheerio.load(response.data, { xmlMode: true });
+      const items: ScrapedArticle[] = [];
+
+      $('entry').each((_, el) => {
+        const $el = $(el);
+        const url = $el.find('id').text().trim();
+        items.push({
+          id: url,
+          title: $el.find('title').text().trim().replace(/\s+/g, ' '),
+          url,
+          content: $el.find('summary').text().trim().replace(/\s+/g, ' '),
+          published_at: $el.find('published').text().trim(),
+          source: source.id,
+        });
+      });
+
+      return { source: source.id, status: 'ok', items };
+    } catch (error) {
+      return {
+        source: source.id,
+        status: 'error',
+        items: [],
+        error: error instanceof Error ? error.message : String(error),
+      };
+    }
+  }
+}
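
The selectors above (entry, id, title, summary, published) match the Atom feed served by arXiv's export API, so a source for this scraper just points at a query URL. An illustrative entry follows; the category, query parameters, and the type value are guesses, since the routing from source definitions to ArxivScraper happens in code not shown in this excerpt:

import { ScraperSource } from '../scraper-types.js';

const arxivSource: ScraperSource = {
  id: 'arxiv_cs_ai', // ids starting with 'arxiv' are treated specially elsewhere in the package
  name: 'arXiv cs.AI',
  type: 'api',
  url: 'http://export.arxiv.org/api/query?search_query=cat:cs.AI&sortBy=submittedDate&sortOrder=descending&max_results=25',
};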

package/src/core/scrapers/json-scraper.ts
@@ -0,0 +1,54 @@
+import axios from 'axios';
+import { BaseScraper, ScraperSource, ScraperResult, ScrapedArticle } from '../scraper-types.js';
+
+export class JsonScraper extends BaseScraper {
+  async scrape(source: ScraperSource): Promise<ScraperResult> {
+    try {
+      const response = await axios.get(source.url);
+      const data = response.data;
+      let items: ScrapedArticle[] = [];
+
+      if (source.mapping) {
+        // Generic JSON array mapping
+        const list = Array.isArray(data) ? data : data[source.mapping.items || 'items'];
+        if (Array.isArray(list)) {
+          items = list.map((item: any, index: number) => ({
+            id: item.id || `json-${source.id}-${index}`,
+            title: item[source.mapping!.title || 'title'] || 'No Title',
+            url: item[source.mapping!.url || 'url'] || source.url,
+            content: item[source.mapping!.content || 'content'] || '',
+            published_at: item[source.mapping!.published_at || 'date'] || new Date().toISOString(),
+            source: source.id,
+          }));
+        }
+      } else if (source.id.startsWith('reddit')) {
+        items = data.data.children.map((child: any) => {
+          const post = child.data;
+          return {
+            id: post.name,
+            title: post.title,
+            url: post.url.startsWith('/') ? `https://reddit.com${post.url}` : post.url,
+            content: post.selftext || '',
+            published_at: new Date(post.created_utc * 1000).toISOString(),
+            source: source.id,
+          };
+        });
+      } else {
+        return {
+          source: source.id,
+          status: 'error',
+          items: [],
+          error: 'Unsupported JSON source or missing mapping',
+        };
+      }
+      return { source: source.id, status: 'ok', items };
+    } catch (error) {
+      return {
+        source: source.id,
+        status: 'error',
+        items: [],
+        error: error instanceof Error ? error.message : String(error),
+      };
+    }
+  }
+}
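
The mapping branch makes JsonScraper configuration-driven: each mapping value names the key to read from the feed's items, with the defaults items/title/url/content/date used when a key is omitted. A hypothetical source using it (the URL and field names are invented for illustration):

import { ScraperSource } from '../scraper-types.js';

const blogSource: ScraperSource = {
  id: 'acme-blog',
  name: 'Acme Engineering Blog',
  type: 'json',
  url: 'https://example.com/api/posts.json',
  mapping: {
    items: 'posts',          // the array lives at data.posts
    title: 'headline',       // item.headline  -> title
    url: 'permalink',        // item.permalink -> url
    content: 'body',         // item.body      -> content
    published_at: 'publishedAt',
  },
};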

package/src/core/scrapers/rss-scraper.ts
@@ -0,0 +1,34 @@
+import Parser from 'rss-parser';
+import { BaseScraper, ScraperSource, ScraperResult, ScrapedArticle } from '../scraper-types.js';
+
+export class RssScraper extends BaseScraper {
+  private parser: Parser;
+
+  constructor() {
+    super();
+    this.parser = new Parser();
+  }
+
+  async scrape(source: ScraperSource): Promise<ScraperResult> {
+    try {
+      const feed = await this.parser.parseURL(source.url);
+      const items: ScrapedArticle[] = feed.items.map((item) => ({
+        id: item.guid || item.link || `${source.id}-${item.title}`,
+        title: item.title || 'No Title',
+        url: item.link || '',
+        content: item.contentSnippet || item.content || '',
+        published_at: item.isoDate || item.pubDate,
+        source: source.id,
+      }));
+
+      return { source: source.id, status: 'ok', items };
+    } catch (error) {
+      return {
+        source: source.id,
+        status: 'error',
+        items: [],
+        error: error instanceof Error ? error.message : String(error),
+      };
+    }
+  }
+}

package/src/core/scrapers/scrape-scraper.ts
@@ -0,0 +1,50 @@
+import axios from 'axios';
+import * as cheerio from 'cheerio';
+import { BaseScraper, ScraperSource, ScraperResult, ScrapedArticle } from '../scraper-types.js';
+
+export class ScrapeScraper extends BaseScraper {
+  async scrape(source: ScraperSource): Promise<ScraperResult> {
+    try {
+      const response = await axios.get(source.url, {
+        headers: {
+          'User-Agent': 'Mozilla/5.0 (compatible; AlifBot/1.0)',
+        },
+      });
+      const $ = cheerio.load(response.data);
+      const items: ScrapedArticle[] = [];
+
+      if (source.id === 'github_trending') {
+        $('.Box-row').each((_, el) => {
+          const $el = $(el);
+          const title = $el.find('h2 a').text().trim().replace(/\s+/g, ' ');
+          const url = 'https://github.com' + $el.find('h2 a').attr('href');
+          const content = $el.find('p').text().trim();
+
+          items.push({
+            id: url,
+            title,
+            url,
+            content,
+            source: source.id,
+          });
+        });
+      } else {
+        return {
+          source: source.id,
+          status: 'error',
+          items: [],
+          error: 'Unsupported scraping source',
+        };
+      }
+
+      return { source: source.id, status: 'ok', items };
+    } catch (error) {
+      return {
+        source: source.id,
+        status: 'error',
+        items: [],
+        error: error instanceof Error ? error.message : String(error),
+      };
+    }
+  }
+}

package/src/db/article-store.ts
@@ -0,0 +1,75 @@
+import { Database } from 'better-sqlite3';
+
+export interface Article {
+  id: string;
+  title: string;
+  url: string;
+  source: string;
+  content?: string | null;
+  summary?: string | null;
+  category?: string;
+  published_at?: string;
+  score?: number;
+  digest_date: string;
+  delivered?: number;
+}
+
+export class ArticleStore {
+  constructor(private db: Database) {}
+
+  upsert(article: Article) {
+    const stmt = this.db.prepare(`
+      INSERT INTO articles (id, title, url, source, content, summary, category, published_at, score, digest_date, delivered)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+      ON CONFLICT(id) DO UPDATE SET
+        title = excluded.title,
+        summary = excluded.summary,
+        category = excluded.category,
+        score = excluded.score
+    `);
+
+    stmt.run(
+      article.id,
+      article.title,
+      article.url,
+      article.source,
+      article.content || null,
+      article.summary || null,
+      article.category || 'Uncategorized',
+      article.published_at || null,
+      article.score || 0,
+      article.digest_date,
+      article.delivered || 0,
+    );
+  }
+
+  getPendingHighSignal(threshold: number, hours: number = 24): Article[] {
+    const cutoff = new Date(Date.now() - hours * 60 * 60 * 1000).toISOString();
+    return this.db
+      .prepare(
+        `
+      SELECT * FROM articles
+      WHERE score >= ?
+        AND delivered = 0
+        AND summary IS NOT NULL
+        AND (published_at > ? OR digest_date > ?)
+    `,
+      )
+      .all(threshold, cutoff, cutoff.split('T')[0]) as Article[];
+  }
+
+  markAsDelivered(articleIds: string[]) {
+    const stmt = this.db.prepare('UPDATE articles SET delivered = 1 WHERE id = ?');
+    const transaction = this.db.transaction((ids: string[]) => {
+      for (const id of ids) stmt.run(id);
+    });
+    transaction(articleIds);
+  }
+
+  getLatestTimestamp(): string | null {
+    const result = this.db.prepare('SELECT MAX(published_at) as latest FROM articles').get() as {
+      latest: string | null;
+    };
+    return result?.latest || null;
+  }
+}
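
These four methods give the pipeline its incremental behaviour: upsert() keeps re-scraped items from duplicating while refreshing their summary, category, and score; getPendingHighSignal() resurfaces analyzed-but-undelivered rows; markAsDelivered() closes the loop; and getLatestTimestamp() anchors the incremental filter. A small round-trip sketch, with invented values:

import { Database } from 'better-sqlite3';
import { ArticleStore } from './article-store.js';

declare const db: Database; // assumed open and migrated
const store = new ArticleStore(db);

store.upsert({
  id: 'https://example.com/post', // illustrative article
  title: 'Example post',
  url: 'https://example.com/post',
  source: 'hn',
  summary: 'One-sentence summary.',
  score: 7,
  digest_date: new Date().toISOString().split('T')[0],
  delivered: 0,
});

const pending = store.getPendingHighSignal(5, 24); // threshold 5, last 24h
store.markAsDelivered(pending.map((a) => a.id));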

package/src/db/connection.ts
@@ -0,0 +1,17 @@
+import Database, { Database as DatabaseType } from 'better-sqlite3';
+import path from 'path';
+import fs from 'fs';
+
+export function createDatabase(dbPath: string): DatabaseType {
+  const dbDir = path.dirname(dbPath);
+  if (!fs.existsSync(dbDir)) {
+    fs.mkdirSync(dbDir, { recursive: true });
+  }
+
+  const db = new Database(dbPath);
+  db.pragma('journal_mode = WAL');
+  db.pragma('synchronous = NORMAL');
+  db.pragma('foreign_keys = ON');
+
+  return db;
+}

package/src/db/migrate.ts
@@ -0,0 +1,68 @@
+import { Database } from 'better-sqlite3';
+
+const MIGRATIONS = [
+  `
+  CREATE TABLE IF NOT EXISTS articles (
+    id TEXT PRIMARY KEY,
+    title TEXT NOT NULL,
+    url TEXT NOT NULL,
+    source TEXT NOT NULL,
+    content TEXT,
+    summary TEXT,
+    category TEXT,
+    published_at TEXT,
+    score INTEGER,
+    digest_date TEXT NOT NULL
+  );
+  `,
+  `
+  CREATE TABLE IF NOT EXISTS source_health (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    source TEXT NOT NULL,
+    status TEXT NOT NULL,
+    items_found INTEGER DEFAULT 0,
+    error_message TEXT,
+    last_check TEXT DEFAULT CURRENT_TIMESTAMP
+  );
+  `,
+  `
+  CREATE TABLE IF NOT EXISTS schedules (
+    id TEXT PRIMARY KEY,
+    name TEXT NOT NULL,
+    cron TEXT NOT NULL,
+    active INTEGER DEFAULT 1,
+    last_run TEXT
+  );
+  `,
+  `
+  ALTER TABLE schedules ADD COLUMN scheduled_time TEXT;
+  `,
+  `
+  ALTER TABLE articles ADD COLUMN delivered INTEGER DEFAULT 0;
+  `,
+];
+
+export function runMigrations(db: Database) {
+  db.transaction(() => {
+    // Basic migration tracking
+    db.prepare(
+      `
+      CREATE TABLE IF NOT EXISTS migrations (
+        id INTEGER PRIMARY KEY,
+        executed_at TEXT DEFAULT CURRENT_TIMESTAMP
+      )
+    `,
+    ).run();
+
+    const result = db.prepare('SELECT MAX(id) as lastId FROM migrations').get() as {
+      lastId: number | null;
+    };
+    const lastId = result?.lastId ?? -1;
+
+    for (let i = lastId + 1; i < MIGRATIONS.length; i++) {
+      console.log(`[Database] Running migration ${i}...`);
+      db.prepare(MIGRATIONS[i]).run();
+      db.prepare('INSERT INTO migrations (id) VALUES (?)').run(i);
+    }
+  })();
+}
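
Note the migration scheme: applied migrations are tracked by their index in the MIGRATIONS array, so new statements go at the end and existing entries are never reordered. Startup composes the two db helpers; a minimal sketch, with an illustrative path:

import { createDatabase } from './connection.js';
import { runMigrations } from './migrate.js';

const db = createDatabase('./data/alif.db'); // parent directory is created if missing
runMigrations(db); // idempotent: only migrations past the last recorded id run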