@de-otio/chaoskb-client 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/dist/cli/index.d.ts.map +1 -1
  2. package/dist/cli/index.js +12 -1
  3. package/dist/cli/index.js.map +1 -1
  4. package/dist/cli/mcp-server.d.ts +16 -1
  5. package/dist/cli/mcp-server.d.ts.map +1 -1
  6. package/dist/cli/mcp-server.js +29 -12
  7. package/dist/cli/mcp-server.js.map +1 -1
  8. package/dist/cli/tools/kb-ingest.d.ts +3 -1
  9. package/dist/cli/tools/kb-ingest.d.ts.map +1 -1
  10. package/dist/cli/tools/kb-ingest.js +45 -5
  11. package/dist/cli/tools/kb-ingest.js.map +1 -1
  12. package/dist/cli/tools/kb-query.d.ts +2 -0
  13. package/dist/cli/tools/kb-query.d.ts.map +1 -1
  14. package/dist/cli/tools/kb-query.js +11 -2
  15. package/dist/cli/tools/kb-query.js.map +1 -1
  16. package/dist/pipeline/content-pipeline.d.ts +2 -0
  17. package/dist/pipeline/content-pipeline.d.ts.map +1 -1
  18. package/dist/pipeline/content-pipeline.js +27 -1
  19. package/dist/pipeline/content-pipeline.js.map +1 -1
  20. package/dist/pipeline/extract.d.ts.map +1 -1
  21. package/dist/pipeline/extract.js +129 -4
  22. package/dist/pipeline/extract.js.map +1 -1
  23. package/dist/pipeline/fetch.d.ts +11 -0
  24. package/dist/pipeline/fetch.d.ts.map +1 -1
  25. package/dist/pipeline/fetch.js +153 -1
  26. package/dist/pipeline/fetch.js.map +1 -1
  27. package/dist/pipeline/file-extract.d.ts +16 -0
  28. package/dist/pipeline/file-extract.d.ts.map +1 -0
  29. package/dist/pipeline/file-extract.js +249 -0
  30. package/dist/pipeline/file-extract.js.map +1 -0
  31. package/dist/pipeline/index.d.ts +2 -0
  32. package/dist/pipeline/index.d.ts.map +1 -1
  33. package/dist/pipeline/index.js +2 -0
  34. package/dist/pipeline/index.js.map +1 -1
  35. package/dist/pipeline/types.d.ts +6 -0
  36. package/dist/pipeline/types.d.ts.map +1 -1
  37. package/dist/pipeline/validate.d.ts +36 -0
  38. package/dist/pipeline/validate.d.ts.map +1 -0
  39. package/dist/pipeline/validate.js +632 -0
  40. package/dist/pipeline/validate.js.map +1 -0
  41. package/dist/storage/source-repo.d.ts +2 -0
  42. package/dist/storage/source-repo.d.ts.map +1 -1
  43. package/dist/storage/source-repo.js +9 -2
  44. package/dist/storage/source-repo.js.map +1 -1
  45. package/dist/storage/types.d.ts +1 -0
  46. package/dist/storage/types.d.ts.map +1 -1
  47. package/dist/sync/ssh-signer.d.ts +4 -0
  48. package/dist/sync/ssh-signer.d.ts.map +1 -1
  49. package/dist/sync/ssh-signer.js +45 -2
  50. package/dist/sync/ssh-signer.js.map +1 -1
  51. package/package.json +5 -2
@@ -0,0 +1,632 @@
1
+ /**
2
+ * Content validation for the ingestion pipeline.
3
+ *
4
+ * Runs quality checks on fetched HTML and extracted content, returning
5
+ * structured issues. Errors block ingestion; warnings are surfaced
6
+ * to the user alongside the stored content.
7
+ *
8
+ * Checks are organised into three tiers:
9
+ * 1. Pattern matching — known services / phrases
10
+ * 2. Structural HTML — HTML characteristics regardless of service
11
+ * 3. Content heuristics — statistical properties of extracted text
12
+ *
13
+ * All patterns are English-only for now.
14
+ */
15
+ // ===== Thresholds ==========================================================
16
// Extracted-text length limits, in characters (compared against text.length).
// Text shorter than THIN is reported as an error; text of at least THIN but
// shorter than SHORT is reported as a warning.
const THIN_CONTENT_LIMIT = 50;
const SHORT_CONTENT_LIMIT = 200;

// Gates for the text/HTML heuristics: each of the checks below that uses one
// of these limits only applies while the extracted text is shorter than it.
const PAYWALL_CONTENT_LIMIT = 500;
const SOFT_404_CONTENT_LIMIT = 1_000;
const ERROR_PAGE_CONTENT_LIMIT = 500;
const MAINTENANCE_CONTENT_LIMIT = 500;
const REDIRECT_CONTENT_LIMIT = 300;
const LOGIN_FORM_CONTENT_LIMIT = 500;
const COOKIE_CONSENT_TEXT_LIMIT = 300; // gate for the cookie text patterns
const COOKIE_CONSENT_HTML_LIMIT = 200; // gate for the cookie HTML patterns
const ACCESS_RESTRICTED_CONTENT_LIMIT = 500;
const NAV_ONLY_CONTENT_LIMIT = 500;

// Content-to-HTML ratio (text.length / html.length).
// The ratio check is skipped entirely for HTML shorter than MIN_HTML_SIZE_FOR_RATIO.
// A ratio below the *_THRESHOLD is reported only when the HTML is longer than
// the matching *_HTML_MIN.
const MIN_HTML_SIZE_FOR_RATIO = 2_000;
const RATIO_ERROR_THRESHOLD = 0.01;
const RATIO_ERROR_HTML_MIN = 5_000;
const RATIO_WARNING_THRESHOLD = 0.03;
const RATIO_WARNING_HTML_MIN = 3_000;

// Repetitive content: needs at least MIN_SENTENCES_FOR_REPETITION sentences;
// flagged when unique/total falls below the ratio or one sentence occurs more
// than MAX_SENTENCE_REPEATS times.
const MIN_SENTENCES_FOR_REPETITION = 4;
const REPETITION_UNIQUE_RATIO = 0.4;
const MAX_SENTENCE_REPEATS = 3;

// Encoding garbage: ratios are (matching characters / text.length); the
// mojibake count is only evaluated for text shorter than MOJIBAKE_TEXT_LIMIT.
const REPLACEMENT_CHAR_RATIO = 0.05;
const MOJIBAKE_COUNT_THRESHOLD = 5;
const MOJIBAKE_TEXT_LIMIT = 2_000;
const CONTROL_CHAR_RATIO = 0.02;

// Zero-width characters: warn when more than this many are present.
const ZERO_WIDTH_WARN_THRESHOLD = 10;

/** Maximum number of HTML characters handed to the pattern-matching checks. */
const HTML_SCAN_LIMIT = 200_000;
47
+ // ===== Pattern sets ========================================================
48
+ // --- Bot / WAF / DDoS ------------------------------------------------------
49
/**
 * Markup fingerprints of anti-bot / WAF / DDoS challenge pages.
 * Tested against the raw HTML; any single match is treated as a block.
 */
const BOT_BLOCK_HTML_PATTERNS = [
    // -- Cloudflare challenge assets and element ids
    /challenges\.cloudflare\.com/i,
    /cf[-_]chl[-_]opt/i,
    /cf-browser-verification/i,
    /id=["']challenge-running["']/i,
    // -- Akamai
    /ak_bmsc/i,
    /_sec\/cp_challenge/i,
    // -- AWS WAF
    /awswaf/i,
    // -- Imperva / Incapsula
    /incap_ses/i,
    /visid_incap/i,
    // -- PerimeterX / HUMAN
    /perimeterx/i,
    /px-captcha/i,
    // -- DataDome
    /datadome/i,
    // -- Kasada
    /cd\.kasada\.io/i,
    // -- Generic DDoS-protection banners
    /ddos protection by/i,
    /sucuri website firewall/i,
    /protection by incapsula/i,
];

/**
 * Visible-text phrases shown by bot-detection and "access denied" pages.
 * Tested against the extracted text.
 */
const BOT_BLOCK_TEXT_PATTERNS = [
    // -- Cloudflare interstitial wording
    /checking if the site connection is secure/i,
    /attention required.{0,10}cloudflare/i,
    // -- Generic WAF wording ("access denied" must be a whole line by itself)
    /^access denied$/im,
    /you have been blocked/i,
    /request blocked/i,
    /this request was blocked by the security rules/i,
    /your (?:ip|access) (?:has been|is) (?:blocked|banned|restricted)/i,
    /automated (?:access|requests?) (?:detected|blocked)/i,
    /unusual traffic from your (?:computer|network)/i,
];
88
+ // --- CAPTCHA ---------------------------------------------------------------
89
/** Markup fingerprints of CAPTCHA widgets; tested against the raw HTML. */
const CAPTCHA_HTML_PATTERNS = [
    // -- reCAPTCHA / hCaptcha / DataDome captcha host
    /g-recaptcha/i,
    /h-captcha/i,
    /class=["'][^"']*hcaptcha/i,
    /captcha-delivery\.com/i,
    // -- Cloudflare Turnstile
    /challenges\.cloudflare\.com\/turnstile/i,
    /cf-turnstile/i,
    // -- Arkose Labs / FunCaptcha
    /funcaptcha/i,
    /arkoselabs\.com/i,
    // -- GeeTest
    /geetest/i,
    // -- Any element whose id starts with, or class contains, "captcha"
    /id=["']captcha/i,
    /class=["'][^"']*captcha/i,
];

/** Human-verification prompts; tested against the extracted text. */
const CAPTCHA_TEXT_PATTERNS = [
    /verify you are (?:a )?human/i,
    /complete the security check/i,
    /i[''\u2019]m not a robot/i,
    /prove you[''\u2019]re not a robot/i,
    /please (?:complete|solve) (?:the|this) (?:captcha|challenge|puzzle)/i,
];
113
+ // --- Soft 404 --------------------------------------------------------------
114
/** Matches a <title> element whose text contains "404" or a "not found" phrase. */
const SOFT_404_TITLE_RE = /<title[^>]*>[^<]*(?:404|not\s*found|page\s*(?:not|doesn[''\u2019]t)\s*(?:exist|found))[^<]*<\/title>/i;

/** "Not found" wording; tested against the extracted text. */
const SOFT_404_TEXT_PATTERNS = [
    // A line consisting solely of the not-found phrase
    /^(?:404|page not found|not found)\s*$/im,
    /this page (?:doesn[''\u2019]t|does not|could not) (?:exist|be found)/i,
    /the page you (?:are|were) looking for.*(?:not found|doesn[''\u2019]t exist|no longer exists|has been (?:removed|moved|deleted))/i,
    /we couldn[''\u2019]t find (?:that|this|the) page/i,
    /nothing (?:was )?found here/i,
    /oops.*(?:page|content).*(?:not found|gone|missing)/i,
    /(?:sorry|unfortunately).*(?:page|url).*(?:not found|doesn[''\u2019]t exist|no longer available)/i,
];

/** Matches a <meta> status tag (prerender-status-code or http-equiv="status") mentioning 404. */
const SOFT_404_META_RE = /<meta[^>]+(?:prerender-status-code|http-equiv=["']status["'])[^>]+(?:content=["']404["']|404)/i;
125
+ // --- Error page ------------------------------------------------------------
126
/**
 * Matches a <title> element containing "500", "error" or a server-error phrase.
 * NOTE(review): this also matches ordinary titles that merely contain "error"
 * or "500" as a substring; confirm that is acceptable for the callers.
 */
const ERROR_PAGE_TITLE_RE = /<title[^>]*>[^<]*(?:500|error|something went wrong|internal server error|service unavailable|bad gateway)[^<]*<\/title>/i;

/** Error-page wording applied to the extracted text regardless of its length. */
const ERROR_PAGE_TEXT_PATTERNS = [
    // A line consisting solely of a generic error phrase
    /^(?:something went wrong|an error (?:occurred|has occurred)|internal server error|server error|service unavailable|bad gateway|gateway timeout)\s*$/im,
    /we[''\u2019]re having (?:trouble|problems|issues|technical difficulties)/i,
    /unexpected error/i,
    /application error/i,
];

/** Weaker error wording, only consulted when the extracted text is short. */
const ERROR_PAGE_TEXT_GATED = [
    /please try again later/i,
];

/** Error-page element ids / class names; tested against the raw HTML. */
const ERROR_PAGE_HTML_PATTERNS = [
    /id=["']error-page["']/i,
    /class=["'][^"']*(?:error-page|error-container|error-boundary)/i,
    /next-error/i,
];
141
+ // --- Maintenance / coming-soon ---------------------------------------------
142
/** Maintenance / under-construction wording applied regardless of text length. */
const MAINTENANCE_TEXT_PATTERNS = [
    /(?:site|website|page) (?:is )?(?:under|undergoing) (?:maintenance|construction)/i,
    /we[''\u2019](?:re|ll be) (?:back|right back|up) (?:shortly|soon)/i,
    /(?:currently|temporarily) (?:unavailable|down for maintenance)/i,
    /under construction/i,
    /scheduled maintenance/i,
    /we are (?:updating|upgrading|performing maintenance)/i,
];

/** Weaker "coming soon" wording, only consulted when the extracted text is short. */
const MAINTENANCE_TEXT_GATED = [
    /coming soon/i,
];
153
+ // --- Cookie consent --------------------------------------------------------
154
/** Cookie-consent wording; tested against the extracted text. */
const COOKIE_TEXT_PATTERNS = [
    /we use cookies/i,
    /this (?:website|site) uses cookies/i,
    /cookie (?:policy|preferences|settings|consent)/i,
    /by continuing.*you (?:agree|consent)/i,
    /manage (?:your )?(?:cookie|privacy) (?:preferences|settings)/i,
    /accept (?:all|cookies)/i,
];

/** Cookie/consent banner class names and ids; tested against the raw HTML. */
const COOKIE_HTML_PATTERNS = [
    /class=["'][^"']*(?:cookie-consent|cookie-banner|cookie-wall|consent-wall|gdpr-banner)/i,
    /id=["'](?:cookie-consent|cookie-banner|consent)/i,
];
166
+ // --- Paywall / login wall --------------------------------------------------
167
/** Paywall / login-wall wording; tested against the extracted text. */
const PAYWALL_PATTERNS = [
    // -- Calls to subscribe, sign in or register
    /subscribe to (?:continue|read|access)/i,
    /sign (?:in|up) to (?:continue|read|access|view)/i,
    /log in to (?:continue|read|access|view)/i,
    /create an? (?:free )?account/i,
    // -- Membership / premium labels
    /members only/i,
    /premium content/i,
    /start your (?:free )?trial/i,
    /(?:this|the) (?:article|story|content|post) is (?:for|available to|exclusive to) (?:subscribers|members|premium)/i,
    // -- Metered-access counters
    /(?:free )?articles? remaining/i,
    /you(?:[''\u2019]ve| have) (?:reached|used|read) your (?:(?:free|monthly|weekly) )?(?:article|story)? ?limit/i,
    /register (?:for free )?to (?:continue|read|access)/i,
    /unlock (?:this|the|full) (?:article|story|content)/i,
    /already a (?:subscriber|member)/i,
];
182
+ // --- Redirect interstitial -------------------------------------------------
183
/** Redirect-interstitial wording; tested against the extracted text. */
const REDIRECT_TEXT_PATTERNS = [
    /you are (?:now )?being (?:redirected|transferred|forwarded)/i,
    /(?:click here|tap here) if you are not (?:automatically )?redirected/i,
    /if you are not redirected.*click/i,
    /redirecting (?:you )?(?:to|in \d+ seconds)/i,
];

/** Matches a <meta http-equiv="refresh" ...> tag that carries a url= target. */
const REDIRECT_META_REFRESH_RE = /<meta\s+http-equiv=["']refresh["'][^>]*url=/i;
190
+ // --- Age gate / geo-block --------------------------------------------------
191
/** Age-gate and geo-block wording; tested against the extracted text. */
const ACCESS_RESTRICTED_TEXT_PATTERNS = [
    // -- Age verification
    /(?:verify|confirm) (?:your|that you are).*(?:age|over \d+|at least \d+)/i,
    /you must be (?:\d+|of legal age)/i,
    // -- Region restriction
    /this content is (?:not available|unavailable|restricted) in your (?:country|region|area|location)/i,
    /(?:geo|geographic(?:ally)?|region(?:ally)?)[\s-](?:blocked|restricted|unavailable)/i,
    /content not available in your (?:country|region)/i,
];

/** Age-gate class names and ids; tested against the raw HTML. */
const ACCESS_RESTRICTED_HTML_PATTERNS = [
    /class=["'][^"']*(?:age-gate|age-verification|age-check)/i,
    /id=["']age-gate/i,
];
202
+ // --- Login form (HTML structure) -------------------------------------------
203
/** Login-form wording; tested against the extracted text. */
const LOGIN_TEXT_PATTERNS = [
    /sign in/i,
    /log in/i,
    /forgot (?:your )?password/i,
    /remember me/i,
    /don[''\u2019]t have an account/i,
];
210
+ // --- Meta robots -----------------------------------------------------------
211
/**
 * Matches a <meta> robots tag whose content includes "noindex", with the
 * `name` and `content` attributes in either order.
 */
const NOINDEX_RE = /<meta[^>]+(?:name=["']robots["'][^>]+content=["'][^"']*noindex|content=["'][^"']*noindex[^"']*["'][^>]+name=["']robots["'])/i;
212
+ // --- Encoding garbage ------------------------------------------------------
213
/**
 * Matches common mojibake sequences produced when UTF-8 bytes are decoded
 * with a single-byte Western encoding:
 *   - U+00C3 followed by U+00A9 / U+00A8 / U+00BC   (mis-decoded "é", "è", "ü")
 *   - U+00E2 followed by the mis-decoded tail of ’ “ ” (E2 80 99/9C/9D), in
 *     both forms the tail can take:
 *       ISO-8859-1:   U+0080 then U+0099 / U+009C / U+009D
 *       windows-1252: U+20AC then U+2122 / U+0153 / U+009D  ("â€™", "â€œ", ...)
 *     The windows-1252 form is the one browsers and Node produce for the
 *     "latin1" / "iso-8859-1" labels, so it must be covered as well.
 *   - U+00C2 followed by U+00A0..U+00BF              (mis-decoded Latin-1 symbols)
 * Global flag: intended for counting occurrences via String.prototype.match.
 */
const MOJIBAKE_RE = /\u00C3[\u00A9\u00A8\u00BC]|\u00E2(?:\u0080[\u0099\u009C\u009D]|\u20AC[\u2122\u0153\u009D])|\u00C2[\u00A0-\u00BF]/g;

/**
 * Matches C0 control characters and DEL, excluding tab (\x09), line feed
 * (\x0A) and carriage return (\x0D). Global flag: intended for counting.
 */
const CONTROL_CHAR_RE = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g;
215
+ // --- Zero-width character steganography ------------------------------------
216
/**
 * Matches zero-width and invisible formatting characters:
 *   U+200B..U+200F  zero-width space / joiners / directional marks
 *   U+202A..U+202E  bidirectional embedding and override controls
 *   U+2060..U+206F  word joiner, invisible operators, isolates, deprecated controls
 *   U+FEFF          zero-width no-break space (BOM)
 * U+202F (NARROW NO-BREAK SPACE) is deliberately NOT included: it is a
 * visible space used in ordinary typography, not a hidden character.
 * Global flag: intended for counting occurrences via String.prototype.match.
 */
const ZERO_WIDTH_RE = /[\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFEFF]/g;
217
+ // --- Prompt injection patterns ---------------------------------------------
218
+ const PROMPT_INJECTION_PATTERNS = [
219
+ // Instruction override attempts
220
+ /ignore (?:all )?(?:previous|prior|above|earlier) (?:instructions|prompts|context)/i,
221
+ /disregard (?:all )?(?:previous|prior|above|earlier) (?:instructions|prompts|context)/i,
222
+ /forget (?:all )?(?:previous|prior|above|earlier) (?:instructions|prompts|context)/i,
223
+ /override (?:all )?(?:previous|prior|above|earlier) (?:instructions|prompts|context)/i,
224
+ // System/role impersonation
225
+ /^system\s*:/im,
226
+ /you are now (?:a |an )?(?:new |different )?(?:AI|assistant|bot|agent)/i,
227
+ /your (?:new |real |actual )?(?:role|purpose|instructions?|directive) (?:is|are)\b/i,
228
+ /act(?:ing)? as (?:a |an )?(?:new |different )?\w+ (?:AI|assistant|agent)/i,
229
+ /entering (?:a )?(?:new |special |admin )?mode/i,
230
+ // Delimiter/framing escape
231
+ /<\/system>/i,
232
+ /\[\/INST\]/i,
233
+ /\[INST\]/i,
234
+ /<<\s*SYS\s*>>/i,
235
+ /END_SYSTEM/i,
236
+ /BEGIN_(?:USER|INSTRUCTIONS)/i,
237
+ // Meta-instruction patterns
238
+ /(?:important|critical|urgent)[\s:]+(?:system|security) (?:update|notice|message|override)/i,
239
+ /the (?:above|previous) (?:warning|message|instructions?) (?:is|are|was) (?:outdated|incorrect|old|deprecated)/i,
240
+ /do not (?:mention|reveal|disclose|tell|share) (?:this|these) (?:instructions?|prompt)/i,
241
+ /(?:when|if) (?:the )?(?:user|human) asks?\b.*(?:always|instead|actually)/i,
242
+ ];
243
+ // ===== Public API ==========================================================
244
/**
 * Run every validation check against a fetched page and its extracted content.
 *
 * All checks are executed and every issue found is returned (the function
 * does not stop at the first hit). Per the contract of this module, callers
 * treat `error`-severity issues as blocking ingestion and `warning`-severity
 * issues as informational.
 *
 * @param html      Raw HTML of the fetched page.
 * @param extracted Extraction result; only its `content` string is read here.
 * @returns Array of issue objects (`severity`, `code`, `message`), possibly empty.
 */
export function validateContent(html, extracted) {
    const text = extracted.content;
    // Pattern scans look at no more than the first HTML_SCAN_LIMIT characters
    // so that very large pages do not make the regex checks slow.
    let scanHtml = html;
    if (html.length > HTML_SCAN_LIMIT) {
        scanHtml = html.slice(0, HTML_SCAN_LIMIT);
    }
    const issues = [];

    // Tier 1 - pattern matching against known blockers
    checkBotBlocked(scanHtml, text, issues);
    checkCaptcha(scanHtml, text, issues);
    checkSoft404(scanHtml, text, issues);
    checkErrorPage(scanHtml, text, issues);
    checkMaintenancePage(text, issues);
    checkCookieConsentPage(scanHtml, text, issues);

    // Tier 2 - structural HTML analysis
    // (the ratio check deliberately receives the full, uncapped HTML)
    checkContentToHtmlRatio(html, text, issues);
    checkLoginFormPage(scanHtml, text, issues);
    checkMetaRobotsNoindex(scanHtml, issues);

    // Tier 3 - heuristics on the extracted text
    checkThinContent(text, issues);
    checkShortContent(text, issues);
    checkPaywall(text, issues);
    checkRedirectInterstitial(scanHtml, text, issues);
    checkAccessRestricted(scanHtml, text, issues);
    checkRepetitiveContent(text, issues);
    checkNavigationOnly(text, issues);
    checkEncodingGarbage(text, issues);
    checkZeroWidthCharacters(text, issues);
    checkPromptInjection(text, issues);

    return issues;
}
280
/**
 * Validate content extracted from a local file.
 *
 * Only the text-based heuristic checks are run; the checks that inspect HTML
 * or look for URL-fetch blockers are not applicable to local files.
 *
 * @param extracted Extraction result; only its `content` string is read here.
 * @returns Array of issue objects (`severity`, `code`, `message`), possibly empty.
 */
export function validateFileContent(extracted) {
    const text = extracted.content;
    const issues = [];
    // Executed in this order; each check appends to `issues` as needed.
    const textChecks = [
        checkThinContent,
        checkShortContent,
        checkRepetitiveContent,
        checkNavigationOnly,
        checkEncodingGarbage,
        checkZeroWidthCharacters,
        checkPromptInjection,
    ];
    for (const runCheck of textChecks) {
        runCheck(text, issues);
    }
    return issues;
}
298
+ // ===== Tier 1: Pattern matching ============================================
299
/**
 * Report a `bot-blocked` error when the page looks like an anti-bot / WAF
 * response. HTML fingerprints take precedence over text phrases; at most one
 * issue is appended.
 */
function checkBotBlocked(html, text, issues) {
    const anyMatch = (patterns, subject) => patterns.some((re) => re.test(subject));
    let message;
    if (anyMatch(BOT_BLOCK_HTML_PATTERNS, html)) {
        message = 'This page returned an anti-bot challenge instead of content. ' +
            'The site blocks automated requests.';
    }
    else if (anyMatch(BOT_BLOCK_TEXT_PATTERNS, text)) {
        message = 'This page returned an "Access Denied" or bot-detection response. ' +
            'The site blocks automated requests.';
    }
    else {
        return;
    }
    issues.push({ severity: 'error', code: 'bot-blocked', message });
}
318
/**
 * Report a `captcha` error when either the HTML contains a CAPTCHA widget
 * fingerprint or the extracted text contains a human-verification prompt.
 */
function checkCaptcha(html, text, issues) {
    const captchaFound = CAPTCHA_HTML_PATTERNS.some((re) => re.test(html)) ||
        CAPTCHA_TEXT_PATTERNS.some((re) => re.test(text));
    if (!captchaFound) {
        return;
    }
    issues.push({
        severity: 'error',
        code: 'captcha',
        message: 'This page contains a CAPTCHA challenge. ' +
            'The site requires human verification before serving content.',
    });
}
328
/**
 * Report a `soft-404` error for "not found" pages served with HTTP 200.
 *
 * Triggers on a matching <title> or status <meta> tag, or - only when the
 * extracted text is shorter than SOFT_404_CONTENT_LIMIT - on "not found"
 * wording in the text. At most one issue is appended.
 */
function checkSoft404(html, text, issues) {
    const markupSaysNotFound = SOFT_404_TITLE_RE.test(html) || SOFT_404_META_RE.test(html);
    const textSaysNotFound = () => text.length < SOFT_404_CONTENT_LIMIT &&
        SOFT_404_TEXT_PATTERNS.some((re) => re.test(text));
    if (markupSaysNotFound || textSaysNotFound()) {
        issues.push({
            severity: 'error',
            code: 'soft-404',
            message: 'This page appears to be a "not found" page that returned HTTP 200.',
        });
    }
}
345
/**
 * Report an `error-page` error when the page looks like an error/status page.
 *
 * Signals, evaluated in order until one matches:
 *   1. the <title> or an error-page element id/class in the HTML,
 *   2. strong error wording in the extracted text (any length),
 *   3. weaker wording, only when the text is shorter than ERROR_PAGE_CONTENT_LIMIT.
 * At most one issue is appended.
 */
function checkErrorPage(html, text, issues) {
    const anyMatch = (patterns, subject) => patterns.some((re) => re.test(subject));
    const looksLikeErrorPage = ERROR_PAGE_TITLE_RE.test(html) ||
        anyMatch(ERROR_PAGE_HTML_PATTERNS, html) ||
        anyMatch(ERROR_PAGE_TEXT_PATTERNS, text) ||
        (text.length < ERROR_PAGE_CONTENT_LIMIT && anyMatch(ERROR_PAGE_TEXT_GATED, text));
    if (!looksLikeErrorPage) {
        return;
    }
    issues.push({
        severity: 'error',
        code: 'error-page',
        message: 'This page appears to be an error or status page, not article content.',
    });
}
370
/**
 * Report a `maintenance-page` error for maintenance / under-construction
 * notices. Strong wording applies at any text length; the weaker "coming
 * soon" wording only when the text is shorter than MAINTENANCE_CONTENT_LIMIT.
 * At most one issue is appended.
 */
function checkMaintenancePage(text, issues) {
    let message;
    if (MAINTENANCE_TEXT_PATTERNS.some((re) => re.test(text))) {
        message = 'This page appears to be a maintenance or "under construction" notice.';
    }
    else if (text.length < MAINTENANCE_CONTENT_LIMIT &&
        MAINTENANCE_TEXT_GATED.some((re) => re.test(text))) {
        message = 'This page appears to be a maintenance or "coming soon" notice.';
    }
    else {
        return;
    }
    issues.push({ severity: 'error', code: 'maintenance-page', message });
}
387
/**
 * Report a `cookie-consent-only` error when the extraction seems to have
 * captured just a cookie-consent overlay.
 *
 * Two independent signals, each with its own length gate on the extracted text:
 * a consent banner in the HTML (text shorter than COOKIE_CONSENT_HTML_LIMIT),
 * or consent wording in the text (text shorter than COOKIE_CONSENT_TEXT_LIMIT).
 * At most one issue is appended.
 */
function checkCookieConsentPage(html, text, issues) {
    const length = text.length;
    const bannerInMarkup = length < COOKIE_CONSENT_HTML_LIMIT &&
        COOKIE_HTML_PATTERNS.some((re) => re.test(html));
    const consentOnly = bannerInMarkup ||
        (length < COOKIE_CONSENT_TEXT_LIMIT && COOKIE_TEXT_PATTERNS.some((re) => re.test(text)));
    if (consentOnly) {
        issues.push({
            severity: 'error',
            code: 'cookie-consent-only',
            message: 'The extracted content appears to be only a cookie-consent overlay. ' +
                'The actual page content was not captured.',
        });
    }
}
406
+ // ===== Tier 2: Structural HTML analysis ====================================
407
/**
 * Report a `low-content-ratio` issue when the extracted text is a very small
 * fraction of the HTML it came from.
 *
 * Skipped for HTML shorter than MIN_HTML_SIZE_FOR_RATIO. An error is raised
 * when the ratio is below RATIO_ERROR_THRESHOLD and the HTML is longer than
 * RATIO_ERROR_HTML_MIN; otherwise a warning when the ratio is below
 * RATIO_WARNING_THRESHOLD and the HTML is longer than RATIO_WARNING_HTML_MIN.
 */
function checkContentToHtmlRatio(html, text, issues) {
    const htmlLen = html.length;
    if (htmlLen < MIN_HTML_SIZE_FOR_RATIO) {
        return;
    }
    const ratio = text.length / htmlLen;
    const percent = (ratio * 100).toFixed(1);
    if (htmlLen > RATIO_ERROR_HTML_MIN && ratio < RATIO_ERROR_THRESHOLD) {
        issues.push({
            severity: 'error',
            code: 'low-content-ratio',
            message: `Only ${percent}% of the page HTML is visible text ` +
                `(${text.length} chars from ${htmlLen} bytes of HTML). ` +
                'The page is almost entirely scripts/markup with negligible readable content.',
        });
    }
    else if (htmlLen > RATIO_WARNING_HTML_MIN && ratio < RATIO_WARNING_THRESHOLD) {
        issues.push({
            severity: 'warning',
            code: 'low-content-ratio',
            message: `Only ${percent}% of the page HTML is visible text. ` +
                'The content may be incomplete or partially rendered.',
        });
    }
}
431
/**
 * Report a `login-form` warning when a short page contains a password input
 * and login wording. All three conditions must hold: extracted text shorter
 * than LOGIN_FORM_CONTENT_LIMIT, an <input type="password"> in the HTML, and
 * at least one login phrase in the text.
 */
function checkLoginFormPage(html, text, issues) {
    const isLoginPage = text.length < LOGIN_FORM_CONTENT_LIMIT &&
        // cheap structural test first: without a password field it is not a login page
        /<input[^>]+type=["']password["']/i.test(html) &&
        LOGIN_TEXT_PATTERNS.some((re) => re.test(text));
    if (!isLoginPage) {
        return;
    }
    issues.push({
        severity: 'warning',
        code: 'login-form',
        message: 'This page appears to be a login form rather than article content.',
    });
}
445
/**
 * Report a `noindex-page` warning when the HTML carries a robots <meta> tag
 * containing "noindex".
 */
function checkMetaRobotsNoindex(html, issues) {
    if (!NOINDEX_RE.test(html)) {
        return;
    }
    issues.push({
        severity: 'warning',
        code: 'noindex-page',
        message: 'This page has a "noindex" robots directive — the publisher does not intend it to be indexed.',
    });
}
454
+ // ===== Tier 3: Content heuristics ==========================================
455
/**
 * Report a `thin-content` error when the extracted text is shorter than
 * THIN_CONTENT_LIMIT characters.
 */
function checkThinContent(text, issues) {
    const length = text.length;
    if (length >= THIN_CONTENT_LIMIT) {
        return;
    }
    issues.push({
        severity: 'error',
        code: 'thin-content',
        message: `Extracted content is only ${length} characters — too short to be a real article. ` +
            'The page may be an error page, redirect landing, or access-restricted.',
    });
}
465
/**
 * Report a `short-content` warning when the extracted text is at least
 * THIN_CONTENT_LIMIT but fewer than SHORT_CONTENT_LIMIT characters long
 * (anything shorter is handled by the thin-content check instead).
 */
function checkShortContent(text, issues) {
    const length = text.length;
    const tooThin = length < THIN_CONTENT_LIMIT;
    const longEnough = length >= SHORT_CONTENT_LIMIT;
    if (tooThin || longEnough) {
        return;
    }
    issues.push({
        severity: 'warning',
        code: 'short-content',
        message: `Extracted content is only ${length} characters. ` +
            'The page may be truncated, paywalled, or only partially rendered.',
    });
}
475
/**
 * Report a `possible-paywall` warning when the extracted text is shorter than
 * PAYWALL_CONTENT_LIMIT and contains paywall / login-wall wording.
 */
function checkPaywall(text, issues) {
    const suspicious = text.length < PAYWALL_CONTENT_LIMIT &&
        PAYWALL_PATTERNS.some((re) => re.test(text));
    if (suspicious) {
        issues.push({
            severity: 'warning',
            code: 'possible-paywall',
            message: 'The extracted content is short and contains language suggesting a paywall or login wall. ' +
                'The stored content may be incomplete.',
        });
    }
}
487
/**
 * Report a `redirect-interstitial` warning when a short page (extracted text
 * shorter than REDIRECT_CONTENT_LIMIT) has either a meta-refresh tag with a
 * target URL or redirect wording in its text.
 */
function checkRedirectInterstitial(html, text, issues) {
    if (text.length >= REDIRECT_CONTENT_LIMIT) {
        return;
    }
    const isInterstitial = REDIRECT_META_REFRESH_RE.test(html) ||
        REDIRECT_TEXT_PATTERNS.some((re) => re.test(text));
    if (!isInterstitial) {
        return;
    }
    issues.push({
        severity: 'warning',
        code: 'redirect-interstitial',
        message: 'This page appears to be a redirect interstitial. ' +
            'The actual destination content was not captured.',
    });
}
501
/**
 * Report an `access-restricted` warning for age-verification or region
 * gates. An age-gate element in the HTML triggers at any text length;
 * restriction wording in the text triggers only when the text is shorter than
 * ACCESS_RESTRICTED_CONTENT_LIMIT. At most one issue is appended.
 */
function checkAccessRestricted(html, text, issues) {
    const gateInMarkup = ACCESS_RESTRICTED_HTML_PATTERNS.some((re) => re.test(html));
    const gated = gateInMarkup ||
        (text.length < ACCESS_RESTRICTED_CONTENT_LIMIT &&
            ACCESS_RESTRICTED_TEXT_PATTERNS.some((re) => re.test(text)));
    if (gated) {
        issues.push({
            severity: 'warning',
            code: 'access-restricted',
            message: 'This page appears to have an age-verification or region-restriction gate.',
        });
    }
}
518
/**
 * Report a `repetitive-content` warning when the extracted text consists
 * largely of repeated sentences.
 *
 * The text is split on runs of `.`, `!` or `?` followed by whitespace; the
 * pieces are trimmed, lower-cased and empty ones dropped. Nothing is reported
 * for fewer than MIN_SENTENCES_FOR_REPETITION sentences. Otherwise a warning
 * is raised when the share of distinct sentences is below
 * REPETITION_UNIQUE_RATIO, or any one sentence occurs more than
 * MAX_SENTENCE_REPEATS times.
 *
 * @param text   Extracted text to analyse.
 * @param issues Array the warning is appended to.
 */
function checkRepetitiveContent(text, issues) {
    const sentences = text
        .split(/[.!?]+\s+/)
        .map((s) => s.trim().toLowerCase())
        .filter((s) => s.length > 0);
    if (sentences.length < MIN_SENTENCES_FOR_REPETITION)
        return;
    const counts = new Map();
    // Track the highest repeat count while counting. Spreading every count
    // into Math.max(...) would pass one argument per distinct sentence and
    // can throw a RangeError on very large documents.
    let maxRepeats = 0;
    for (const s of sentences) {
        const seen = (counts.get(s) ?? 0) + 1;
        counts.set(s, seen);
        if (seen > maxRepeats) {
            maxRepeats = seen;
        }
    }
    const uniqueRatio = counts.size / sentences.length;
    if (uniqueRatio < REPETITION_UNIQUE_RATIO || maxRepeats > MAX_SENTENCE_REPEATS) {
        issues.push({
            severity: 'warning',
            code: 'repetitive-content',
            message: 'The extracted content appears highly repetitive, which may indicate ' +
                'a broken extraction, placeholder page, or auto-generated content.',
        });
    }
}
540
/**
 * Report a `navigation-only` warning when short extracted text (fewer than
 * NAV_ONLY_CONTENT_LIMIT characters) is made up mostly of menu-like lines.
 *
 * A line counts as menu-like when, after trimming, it is shorter than 30
 * characters and does not end in `.`, `!` or `?`. The warning is raised when
 * there are at least 3 non-blank lines and more than 70% of them are menu-like.
 */
function checkNavigationOnly(text, issues) {
    if (text.length >= NAV_ONLY_CONTENT_LIMIT) {
        return;
    }
    const trimmedLines = text
        .split('\n')
        .map((raw) => raw.trim())
        .filter((line) => line.length > 0);
    if (trimmedLines.length < 3) {
        return;
    }
    let navLikeCount = 0;
    for (const line of trimmedLines) {
        const endsLikeSentence = /[.!?]$/.test(line);
        if (line.length < 30 && !endsLikeSentence) {
            navLikeCount += 1;
        }
    }
    if (navLikeCount / trimmedLines.length > 0.7) {
        issues.push({
            severity: 'warning',
            code: 'navigation-only',
            message: 'The extracted content appears to be mostly navigation links or menu items, ' +
                'not article text.',
        });
    }
}
557
/**
 * Report an `encoding-garbage` warning when the extracted text shows signs of
 * encoding corruption. Three probes run in order and the first one that
 * fires ends the check, so at most one issue is appended:
 *   1. U+FFFD replacement characters exceed REPLACEMENT_CHAR_RATIO of the text;
 *   2. at least MOJIBAKE_COUNT_THRESHOLD mojibake sequences (only evaluated
 *      for text shorter than MOJIBAKE_TEXT_LIMIT);
 *   3. control characters exceed CONTROL_CHAR_RATIO of the text.
 * Empty text never triggers the ratio-based probes.
 */
function checkEncodingGarbage(text, issues) {
    const total = text.length;
    const occurrences = (re) => (text.match(re) ?? []).length;
    const warn = (message) => {
        issues.push({ severity: 'warning', code: 'encoding-garbage', message });
    };

    // 1. Replacement-character ratio
    const replacementCount = occurrences(/\uFFFD/g);
    if (total > 0 && replacementCount / total > REPLACEMENT_CHAR_RATIO) {
        warn('The extracted content contains excessive Unicode replacement characters, ' +
            'suggesting a character-encoding mismatch.');
        return;
    }

    // 2. Mojibake sequences (short texts only)
    if (total < MOJIBAKE_TEXT_LIMIT && occurrences(MOJIBAKE_RE) >= MOJIBAKE_COUNT_THRESHOLD) {
        warn('The extracted content shows signs of mojibake (encoding corruption). ' +
            'Characters may not display correctly.');
        return;
    }

    // 3. Control-character ratio
    const controlCount = occurrences(CONTROL_CHAR_RE);
    if (total > 0 && controlCount / total > CONTROL_CHAR_RATIO) {
        warn('The extracted content contains excessive control characters, ' +
            'suggesting binary data leaked into the text.');
    }
}
593
/**
 * Report a `zero-width-chars` warning when the extracted text contains more
 * than ZERO_WIDTH_WARN_THRESHOLD characters matched by ZERO_WIDTH_RE.
 */
function checkZeroWidthCharacters(text, issues) {
    const found = text.match(ZERO_WIDTH_RE);
    if (found === null || found.length <= ZERO_WIDTH_WARN_THRESHOLD) {
        return;
    }
    issues.push({
        severity: 'warning',
        code: 'zero-width-chars',
        message: `The extracted content contains ${found.length} zero-width or invisible Unicode characters, ` +
            'which may indicate hidden text or Unicode steganography.',
    });
}
604
/**
 * Report a `possible-prompt-injection` warning when the extracted text
 * matches any of PROMPT_INJECTION_PATTERNS.
 *
 * Exactly one warning is appended when at least one pattern matches; its
 * wording is stronger (and includes the match count) when three or more
 * distinct patterns match.
 */
function checkPromptInjection(text, issues) {
    const hitCount = PROMPT_INJECTION_PATTERNS.filter((re) => re.test(text)).length;
    if (hitCount === 0) {
        return;
    }
    const message = hitCount >= 3
        ? `The extracted content matches ${hitCount} prompt-injection patterns ` +
            '(instruction overrides, role impersonation, or delimiter escapes). ' +
            'This content has a high likelihood of containing adversarial text designed to manipulate an AI agent.'
        : 'The extracted content contains text that resembles a prompt-injection attempt ' +
            '(e.g. instruction overrides, system impersonation, or delimiter escapes). ' +
            'The content was stored but may contain adversarial text.';
    issues.push({ severity: 'warning', code: 'possible-prompt-injection', message });
}
632
+ //# sourceMappingURL=validate.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"validate.js","sourceRoot":"","sources":["../../pipeline/validate.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAYH,8EAA8E;AAE9E,MAAM,kBAAkB,GAAG,EAAE,CAAC;AAC9B,MAAM,mBAAmB,GAAG,GAAG,CAAC;AAChC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,sBAAsB,GAAG,IAAI,CAAC;AACpC,MAAM,wBAAwB,GAAG,GAAG,CAAC;AACrC,MAAM,yBAAyB,GAAG,GAAG,CAAC;AACtC,MAAM,sBAAsB,GAAG,GAAG,CAAC;AACnC,MAAM,wBAAwB,GAAG,GAAG,CAAC;AACrC,MAAM,yBAAyB,GAAG,GAAG,CAAC;AACtC,MAAM,yBAAyB,GAAG,GAAG,CAAC;AACtC,MAAM,+BAA+B,GAAG,GAAG,CAAC;AAC5C,MAAM,sBAAsB,GAAG,GAAG,CAAC;AAEnC,wBAAwB;AACxB,MAAM,uBAAuB,GAAG,IAAI,CAAC;AACrC,MAAM,qBAAqB,GAAG,IAAI,CAAC;AACnC,MAAM,oBAAoB,GAAG,IAAI,CAAC;AAClC,MAAM,uBAAuB,GAAG,IAAI,CAAC;AACrC,MAAM,sBAAsB,GAAG,IAAI,CAAC;AAEpC,qBAAqB;AACrB,MAAM,4BAA4B,GAAG,CAAC,CAAC;AACvC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AACpC,MAAM,oBAAoB,GAAG,CAAC,CAAC;AAE/B,mBAAmB;AACnB,MAAM,sBAAsB,GAAG,IAAI,CAAC;AACpC,MAAM,wBAAwB,GAAG,CAAC,CAAC;AACnC,MAAM,mBAAmB,GAAG,IAAI,CAAC;AACjC,MAAM,kBAAkB,GAAG,IAAI,CAAC;AAEhC,qCAAqC;AACrC,MAAM,yBAAyB,GAAG,EAAE,CAAC;AAErC,+EAA+E;AAC/E,MAAM,eAAe,GAAG,OAAO,CAAC;AAEhC,8EAA8E;AAE9E,8EAA8E;AAE9E,MAAM,uBAAuB,GAAG;IAC9B,aAAa;IACb,8BAA8B;IAC9B,mBAAmB;IACnB,0BAA0B;IAC1B,+BAA+B;IAC/B,SAAS;IACT,UAAU;IACV,qBAAqB;IACrB,UAAU;IACV,SAAS;IACT,sBAAsB;IACtB,YAAY;IACZ,cAAc;IACd,qBAAqB;IACrB,aAAa;IACb,aAAa;IACb,WAAW;IACX,WAAW;IACX,SAAS;IACT,iBAAiB;IACjB,kBAAkB;IAClB,qBAAqB;IACrB,0BAA0B;IAC1B,0BAA0B;CAC3B,CAAC;AAEF,MAAM,uBAAuB,GAAG;IAC9B,aAAa;IACb,4CAA4C;IAC5C,sCAAsC;IACtC,cAAc;IACd,mBAAmB;IACnB,wBAAwB;IACxB,kBAAkB;IAClB,iDAAiD;IACjD,mEAAmE;IACnE,sDAAsD;IACtD,iDAAiD;CAClD,CAAC;AAEF,8EAA8E;AAE9E,MAAM,qBAAqB,GAAG;IAC5B,cAAc;IACd,YAAY;IACZ,2BAA2B;IAC3B,wBAAwB;IACxB,uBAAuB;IACvB,yCAAyC;IACzC,eAAe;IACf,2BAA2B;IAC3B,aAAa;IACb,kBAAkB;IAClB,UAAU;IACV,UAAU;IACV,UAAU;IACV,iBAAiB;IACjB,0BAA0B;CAC3B,CAAC;AAEF,MAAM,qBAAqB,GAAG;IAC5B,8BAA8B;IAC9B,8BAA8B;IAC9B,2BAA2B;IAC3B,oCAAoC;IACpC,sEAAsE;CACvE,CAAC;AAEF,8EAA8E;AAE9E,MAAM,iBAAiB,GACrB,uGAAuG,CAAC;AAE1G,MAAM,sBAAsB,GAAG;IAC7B,yCAAyC;IACzC,uEAAuE;IACvE,kI
AAkI;IAClI,mDAAmD;IACnD,8BAA8B;IAC9B,qDAAqD;IACrD,kGAAkG;CACnG,CAAC;AAEF,MAAM,gBAAgB,GACpB,gGAAgG,CAAC;AAEnG,8EAA8E;AAE9E,MAAM,mBAAmB,GACvB,0HAA0H,CAAC;AAE7H,MAAM,wBAAwB,GAAG;IAC/B,uJAAuJ;IACvJ,2EAA2E;IAC3E,mBAAmB;IACnB,oBAAoB;CACrB,CAAC;AAEF,MAAM,qBAAqB,GAAG;IAC5B,yBAAyB;CAC1B,CAAC;AAEF,MAAM,wBAAwB,GAAG;IAC/B,wBAAwB;IACxB,gEAAgE;IAChE,aAAa;CACd,CAAC;AAEF,8EAA8E;AAE9E,MAAM,yBAAyB,GAAG;IAChC,kFAAkF;IAClF,mEAAmE;IACnE,iEAAiE;IACjE,qBAAqB;IACrB,wBAAwB;IACxB,uDAAuD;CACxD,CAAC;AAEF,MAAM,sBAAsB,GAAG;IAC7B,cAAc;CACf,CAAC;AAEF,8EAA8E;AAE9E,MAAM,oBAAoB,GAAG;IAC3B,iBAAiB;IACjB,qCAAqC;IACrC,iDAAiD;IACjD,uCAAuC;IACvC,+DAA+D;IAC/D,yBAAyB;CAC1B,CAAC;AAEF,MAAM,oBAAoB,GAAG;IAC3B,wFAAwF;IACxF,kDAAkD;CACnD,CAAC;AAEF,8EAA8E;AAE9E,MAAM,gBAAgB,GAAG;IACvB,wCAAwC;IACxC,kDAAkD;IAClD,0CAA0C;IAC1C,+BAA+B;IAC/B,eAAe;IACf,kBAAkB;IAClB,6BAA6B;IAC7B,mHAAmH;IACnH,gCAAgC;IAChC,8GAA8G;IAC9G,qDAAqD;IACrD,qDAAqD;IACrD,kCAAkC;CACnC,CAAC;AAEF,8EAA8E;AAE9E,MAAM,sBAAsB,GAAG;IAC7B,8DAA8D;IAC9D,uEAAuE;IACvE,mCAAmC;IACnC,6CAA6C;CAC9C,CAAC;AAEF,MAAM,wBAAwB,GAAG,8CAA8C,CAAC;AAEhF,8EAA8E;AAE9E,MAAM,+BAA+B,GAAG;IACtC,0EAA0E;IAC1E,mCAAmC;IACnC,oGAAoG;IACpG,qFAAqF;IACrF,mDAAmD;CACpD,CAAC;AAEF,MAAM,+BAA+B,GAAG;IACtC,0DAA0D;IAC1D,kBAAkB;CACnB,CAAC;AAEF,8EAA8E;AAE9E,MAAM,mBAAmB,GAAG;IAC1B,UAAU;IACV,SAAS;IACT,4BAA4B;IAC5B,cAAc;IACd,iCAAiC;CAClC,CAAC;AAEF,8EAA8E;AAE9E,MAAM,UAAU,GACd,8HAA8H,CAAC;AAEjI,8EAA8E;AAE9E,MAAM,WAAW,GAAG,oFAAoF,CAAC;AACzG,MAAM,eAAe,GAAG,mCAAmC,CAAC;AAE5D,8EAA8E;AAE9E,MAAM,aAAa,GAAG,kDAAkD,CAAC;AAEzE,8EAA8E;AAE9E,MAAM,yBAAyB,GAAG;IAChC,gCAAgC;IAChC,oFAAoF;IACpF,uFAAuF;IACvF,oFAAoF;IACpF,sFAAsF;IAEtF,4BAA4B;IAC5B,eAAe;IACf,wEAAwE;IACxE,oFAAoF;IACpF,2EAA2E;IAC3E,gDAAgD;IAEhD,2BAA2B;IAC3B,aAAa;IACb,aAAa;IACb,WAAW;IACX,gBAAgB;IAChB,aAAa;IACb,8BAA8B;IAE9B,4BAA4B;IAC5B,4FAA4F;IAC5F,gHAAgH;IAChH,wFAAwF;IACxF,2EAA2E;CAC5E,CAAC;AAEF,8EAA8E;AAE9E;;;;;;GAMG;AACH,MAAM,UAAU,eAAe,CAC7B,IAAY,EACZ,SAA2B;IAE3B,MAAM,MAAM,GAAmB,EAAE,CAAC;IAClC,MAAM,IAAI,GAAG,SAAS,CAAC,OAAO,CAAC;IAC/B,oDAAoD;IACpD,MAAM,QAAQ,GAAG,IA
AI,CAAC,MAAM,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAEvF,4EAA4E;IAC5E,eAAe,CAAC,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IACxC,YAAY,CAAC,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IACrC,YAAY,CAAC,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IACrC,cAAc,CAAC,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IACvC,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACnC,sBAAsB,CAAC,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IAE/C,4EAA4E;IAC5E,uBAAuB,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IAC5C,kBAAkB,CAAC,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IAC3C,sBAAsB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAEzC,4EAA4E;IAC5E,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAC/B,iBAAiB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAChC,YAAY,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAC3B,yBAAyB,CAAC,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IAClD,qBAAqB,CAAC,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IAC9C,sBAAsB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACrC,mBAAmB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAClC,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACnC,wBAAwB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACvC,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAEnC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,mBAAmB,CAAC,SAA2B;IAC7D,MAAM,MAAM,GAAmB,EAAE,CAAC;IAClC,MAAM,IAAI,GAAG,SAAS,CAAC,OAAO,CAAC;IAE/B,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAC/B,iBAAiB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAChC,sBAAsB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACrC,mBAAmB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAClC,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACnC,wBAAwB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACvC,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAEnC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,8EAA8E;AAE9E,SAAS,eAAe,CAAC,IAAY,EAAE,IAAY,EAAE,MAAsB;IACzE,IAAI,uBAAuB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QACtD,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,aAAa;YACnB,OAAO,EACL,+DAA+D;gBAC/D,qCAAqC;SACxC,CAAC,CAAC;QACH,OAAO;IACT,CAAC;IACD,IAAI,uBAAuB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QACtD,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,aAAa;YACnB
,OAAO,EACL,mEAAmE;gBACnE,qCAAqC;SACxC,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,YAAY,CAAC,IAAY,EAAE,IAAY,EAAE,MAAsB;IACtE,IAAI,qBAAqB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,qBAAqB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QACvG,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,SAAS;YACf,OAAO,EACL,0CAA0C;gBAC1C,8DAA8D;SACjE,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,YAAY,CAAC,IAAY,EAAE,IAAY,EAAE,MAAsB;IACtE,IAAI,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QAChE,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,UAAU;YAChB,OAAO,EAAE,oEAAoE;SAC9E,CAAC,CAAC;QACH,OAAO;IACT,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,GAAG,sBAAsB,IAAI,sBAAsB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAC7F,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,UAAU;YAChB,OAAO,EAAE,oEAAoE;SAC9E,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,cAAc,CAAC,IAAY,EAAE,IAAY,EAAE,MAAsB;IACxE,IAAI,mBAAmB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,wBAAwB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QACzF,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,YAAY;YAClB,OAAO,EAAE,uEAAuE;SACjF,CAAC,CAAC;QACH,OAAO;IACT,CAAC;IACD,IAAI,wBAAwB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QACvD,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,YAAY;YAClB,OAAO,EAAE,uEAAuE;SACjF,CAAC,CAAC;QACH,OAAO;IACT,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,GAAG,wBAAwB,IAAI,qBAAqB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAC9F,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,YAAY;YAClB,OAAO,EAAE,uEAAuE;SACjF,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,oBAAoB,CAAC,IAAY,EAAE,MAAsB;IAChE,IAAI,yBAAyB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QACxD,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,kBAAkB;YACxB,OAAO,EAAE,uEAAuE;SA
CjF,CAAC,CAAC;QACH,OAAO;IACT,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,GAAG,yBAAyB,IAAI,sBAAsB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAChG,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,kBAAkB;YACxB,OAAO,EAAE,gEAAgE;SAC1E,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,sBAAsB,CAAC,IAAY,EAAE,IAAY,EAAE,MAAsB;IAChF,IAAI,IAAI,CAAC,MAAM,GAAG,yBAAyB,IAAI,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAC9F,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,qBAAqB;YAC3B,OAAO,EACL,qEAAqE;gBACrE,2CAA2C;SAC9C,CAAC,CAAC;QACH,OAAO;IACT,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,GAAG,yBAAyB,IAAI,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAC9F,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,qBAAqB;YAC3B,OAAO,EACL,qEAAqE;gBACrE,2CAA2C;SAC9C,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,8EAA8E;AAE9E,SAAS,uBAAuB,CAAC,IAAY,EAAE,IAAY,EAAE,MAAsB;IACjF,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC;IAC5B,IAAI,OAAO,GAAG,uBAAuB;QAAE,OAAO;IAE9C,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC;IAEpC,IAAI,KAAK,GAAG,qBAAqB,IAAI,OAAO,GAAG,oBAAoB,EAAE,CAAC;QACpE,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,mBAAmB;YACzB,OAAO,EACL,QAAQ,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,qCAAqC;gBACrE,IAAI,IAAI,CAAC,MAAM,eAAe,OAAO,mBAAmB;gBACxD,8EAA8E;SACjF,CAAC,CAAC;QACH,OAAO;IACT,CAAC;IAED,IAAI,KAAK,GAAG,uBAAuB,IAAI,OAAO,GAAG,sBAAsB,EAAE,CAAC;QACxE,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,mBAAmB;YACzB,OAAO,EACL,QAAQ,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,sCAAsC;gBACtE,sDAAsD;SACzD,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,kBAAkB,CAAC,IAAY,EAAE,IAAY,EAAE,MAAsB;IAC5E,IAAI,IAAI,CAAC,MAAM,IAAI,wBAAwB;QAAE,OAAO;IACpD,kDAAkD;IAClD,IAAI,CAAC,mCAAmC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO;IAC5D,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAClD,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,YAAY;YAClB,OAAO,EACL,mE
AAmE;SACtE,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,sBAAsB,CAAC,IAAY,EAAE,MAAsB;IAClE,IAAI,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QAC1B,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,cAAc;YACpB,OAAO,EACL,8FAA8F;SACjG,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,8EAA8E;AAE9E,SAAS,gBAAgB,CAAC,IAAY,EAAE,MAAsB;IAC5D,IAAI,IAAI,CAAC,MAAM,GAAG,kBAAkB,EAAE,CAAC;QACrC,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,OAAO;YACjB,IAAI,EAAE,cAAc;YACpB,OAAO,EACL,6BAA6B,IAAI,CAAC,MAAM,gDAAgD;gBACxF,wEAAwE;SAC3E,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY,EAAE,MAAsB;IAC7D,IAAI,IAAI,CAAC,MAAM,IAAI,kBAAkB,IAAI,IAAI,CAAC,MAAM,GAAG,mBAAmB,EAAE,CAAC;QAC3E,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,eAAe;YACrB,OAAO,EACL,6BAA6B,IAAI,CAAC,MAAM,eAAe;gBACvD,mEAAmE;SACtE,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,YAAY,CAAC,IAAY,EAAE,MAAsB;IACxD,IAAI,IAAI,CAAC,MAAM,IAAI,qBAAqB;QAAE,OAAO;IACjD,IAAI,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAC/C,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,kBAAkB;YACxB,OAAO,EACL,2FAA2F;gBAC3F,uCAAuC;SAC1C,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,yBAAyB,CAAC,IAAY,EAAE,IAAY,EAAE,MAAsB;IACnF,IAAI,IAAI,CAAC,MAAM,IAAI,sBAAsB;QAAE,OAAO;IAClD,MAAM,cAAc,GAAG,wBAAwB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC3D,MAAM,eAAe,GAAG,sBAAsB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IACzE,IAAI,cAAc,IAAI,eAAe,EAAE,CAAC;QACtC,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,uBAAuB;YAC7B,OAAO,EACL,mDAAmD;gBACnD,kDAAkD;SACrD,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,qBAAqB,CAAC,IAAY,EAAE,IAAY,EAAE,MAAsB;IAC/E,IAAI,+BAA+B,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAC9D,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,mBAAmB;YACzB,OAAO,EAAE,2EAA2E;SACrF,CAAC,CAAC;QACH,OAAO;IACT,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,GAAG,+BAA+B,IAAI,+BAA+B,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAC/G,MAAM,CAAC,IAAI,CA
AC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,mBAAmB;YACzB,OAAO,EAAE,2EAA2E;SACrF,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,sBAAsB,CAAC,IAAY,EAAE,MAAsB;IAClE,MAAM,SAAS,GAAG,IAAI;SACnB,KAAK,CAAC,WAAW,CAAC;SAClB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;SAClC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAE/B,IAAI,SAAS,CAAC,MAAM,GAAG,4BAA4B;QAAE,OAAO;IAE5D,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;QAC1B,MAAM,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1C,CAAC;IAED,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC;IACnD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;IAEhD,IAAI,WAAW,GAAG,uBAAuB,IAAI,UAAU,GAAG,oBAAoB,EAAE,CAAC;QAC/E,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,oBAAoB;YAC1B,OAAO,EACL,sEAAsE;gBACtE,mEAAmE;SACtE,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAY,EAAE,MAAsB;IAC/D,IAAI,IAAI,CAAC,MAAM,IAAI,sBAAsB;QAAE,OAAO;IAElD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAClE,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO;IAE7B,wEAAwE;IACxE,MAAM,YAAY,GAAG,KAAK,CAAC,MAAM,CAC/B,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,EAAE,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CACxD,CAAC,MAAM,CAAC;IAET,IAAI,YAAY,GAAG,KAAK,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QACtC,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,iBAAiB;YACvB,OAAO,EACL,6EAA6E;gBAC7E,mBAAmB;SACtB,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,oBAAoB,CAAC,IAAY,EAAE,MAAsB;IAChE,8BAA8B;IAC9B,MAAM,gBAAgB,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;IAC9D,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,gBAAgB,GAAG,IAAI,CAAC,MAAM,GAAG,sBAAsB,EAAE,CAAC;QAC/E,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,kBAAkB;YACxB,OAAO,EACL,2EAA2E;gBAC3E,2CAA2C;SAC9C,CAAC,CAAC;QACH,OAAO;IACT,CAAC;IAED,+CAA+C;IAC/C,IAAI,IA
AI,CAAC,MAAM,GAAG,mBAAmB,EAAE,CAAC;QACtC,MAAM,aAAa,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAC7D,IAAI,aAAa,IAAI,wBAAwB,EAAE,CAAC;YAC9C,MAAM,CAAC,IAAI,CAAC;gBACV,QAAQ,EAAE,SAAS;gBACnB,IAAI,EAAE,kBAAkB;gBACxB,OAAO,EACL,uEAAuE;oBACvE,uCAAuC;aAC1C,CAAC,CAAC;YACH,OAAO;QACT,CAAC;IACH,CAAC;IAED,qBAAqB;IACrB,MAAM,YAAY,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;IAChE,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,GAAG,IAAI,CAAC,MAAM,GAAG,kBAAkB,EAAE,CAAC;QACvE,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,kBAAkB;YACxB,OAAO,EACL,+DAA+D;gBAC/D,8CAA8C;SACjD,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,wBAAwB,CAAC,IAAY,EAAE,MAAsB;IACpE,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAC1C,IAAI,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,yBAAyB,EAAE,CAAC;QAC1D,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,kBAAkB;YACxB,OAAO,EACL,kCAAkC,OAAO,CAAC,MAAM,+CAA+C;gBAC/F,0DAA0D;SAC7D,CAAC,CAAC;IACL,CAAC;AACH,CAAC;AAED,SAAS,oBAAoB,CAAC,IAAY,EAAE,MAAsB;IAChE,MAAM,eAAe,GAAa,EAAE,CAAC;IACrC,KAAK,MAAM,OAAO,IAAI,yBAAyB,EAAE,CAAC;QAChD,IAAI,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACvB,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;IAED,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO;IAEzC,IAAI,eAAe,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QAChC,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,2BAA2B;YACjC,OAAO,EACL,iCAAiC,eAAe,CAAC,MAAM,6BAA6B;gBACpF,qEAAqE;gBACrE,uGAAuG;SAC1G,CAAC,CAAC;IACL,CAAC;SAAM,CAAC;QACN,MAAM,CAAC,IAAI,CAAC;YACV,QAAQ,EAAE,SAAS;YACnB,IAAI,EAAE,2BAA2B;YACjC,OAAO,EACL,gFAAgF;gBAChF,4EAA4E;gBAC5E,0DAA0D;SAC7D,CAAC,CAAC;IACL,CAAC;AACH,CAAC"}
@@ -4,12 +4,14 @@ export declare class SourceRepository implements ISourceRepository {
4
4
  private readonly db;
5
5
  private readonly insertStmt;
6
6
  private readonly getByIdStmt;
7
+ private readonly getByUrlStmt;
7
8
  private readonly softDeleteStmt;
8
9
  private readonly restoreStmt;
9
10
  private readonly updateLastAccessedStmt;
10
11
  constructor(db: BetterSqlite3.Database);
11
12
  insert(source: Omit<SourceRecord, 'createdAt' | 'updatedAt' | 'lastAccessedAt'>): SourceRecord;
12
13
  getById(id: string): SourceRecord | null;
14
+ getByUrl(url: string): SourceRecord | null;
13
15
  list(filter?: SourceFilter, pagination?: PaginationOptions): SourceRecord[];
14
16
  count(filter?: SourceFilter): number;
15
17
  softDelete(id: string): boolean;