webcontext-ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +583 -0
- package/dist/browser/manager.d.ts +47 -0
- package/dist/browser/manager.d.ts.map +1 -0
- package/dist/browser/manager.js +215 -0
- package/dist/browser/manager.js.map +1 -0
- package/dist/cache/cache.d.ts +22 -0
- package/dist/cache/cache.d.ts.map +1 -0
- package/dist/cache/cache.js +150 -0
- package/dist/cache/cache.js.map +1 -0
- package/dist/chunking/chunker.d.ts +26 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +208 -0
- package/dist/chunking/chunker.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +406 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core/pipeline.d.ts +35 -0
- package/dist/core/pipeline.d.ts.map +1 -0
- package/dist/core/pipeline.js +476 -0
- package/dist/core/pipeline.js.map +1 -0
- package/dist/core/stream.d.ts +48 -0
- package/dist/core/stream.d.ts.map +1 -0
- package/dist/core/stream.js +72 -0
- package/dist/core/stream.js.map +1 -0
- package/dist/core/types.d.ts +259 -0
- package/dist/core/types.d.ts.map +1 -0
- package/dist/core/types.js +4 -0
- package/dist/core/types.js.map +1 -0
- package/dist/export/index.d.ts +3 -0
- package/dist/export/index.d.ts.map +1 -0
- package/dist/export/index.js +8 -0
- package/dist/export/index.js.map +1 -0
- package/dist/export/templates.d.ts +25 -0
- package/dist/export/templates.d.ts.map +1 -0
- package/dist/export/templates.js +76 -0
- package/dist/export/templates.js.map +1 -0
- package/dist/export/vectordb.d.ts +21 -0
- package/dist/export/vectordb.d.ts.map +1 -0
- package/dist/export/vectordb.js +101 -0
- package/dist/export/vectordb.js.map +1 -0
- package/dist/extractors/content.d.ts +23 -0
- package/dist/extractors/content.d.ts.map +1 -0
- package/dist/extractors/content.js +328 -0
- package/dist/extractors/content.js.map +1 -0
- package/dist/extractors/github.d.ts +19 -0
- package/dist/extractors/github.d.ts.map +1 -0
- package/dist/extractors/github.js +150 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/images.d.ts +20 -0
- package/dist/extractors/images.d.ts.map +1 -0
- package/dist/extractors/images.js +73 -0
- package/dist/extractors/images.js.map +1 -0
- package/dist/extractors/pdf.d.ts +11 -0
- package/dist/extractors/pdf.d.ts.map +1 -0
- package/dist/extractors/pdf.js +107 -0
- package/dist/extractors/pdf.js.map +1 -0
- package/dist/extractors/screenshot.d.ts +21 -0
- package/dist/extractors/screenshot.d.ts.map +1 -0
- package/dist/extractors/screenshot.js +85 -0
- package/dist/extractors/screenshot.js.map +1 -0
- package/dist/index.d.ts +70 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +206 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp-server.d.ts +3 -0
- package/dist/mcp-server.d.ts.map +1 -0
- package/dist/mcp-server.js +108 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/sdk/client.d.ts +48 -0
- package/dist/sdk/client.d.ts.map +1 -0
- package/dist/sdk/client.js +120 -0
- package/dist/sdk/client.js.map +1 -0
- package/dist/sdk/mcp.d.ts +12 -0
- package/dist/sdk/mcp.d.ts.map +1 -0
- package/dist/sdk/mcp.js +146 -0
- package/dist/sdk/mcp.js.map +1 -0
- package/dist/sdk/server.d.ts +5 -0
- package/dist/sdk/server.d.ts.map +1 -0
- package/dist/sdk/server.js +158 -0
- package/dist/sdk/server.js.map +1 -0
- package/dist/search/vector.d.ts +26 -0
- package/dist/search/vector.d.ts.map +1 -0
- package/dist/search/vector.js +142 -0
- package/dist/search/vector.js.map +1 -0
- package/dist/transformers/markdown.d.ts +21 -0
- package/dist/transformers/markdown.d.ts.map +1 -0
- package/dist/transformers/markdown.js +242 -0
- package/dist/transformers/markdown.js.map +1 -0
- package/dist/utils/dedup.d.ts +20 -0
- package/dist/utils/dedup.d.ts.map +1 -0
- package/dist/utils/dedup.js +61 -0
- package/dist/utils/dedup.js.map +1 -0
- package/dist/utils/index.d.ts +6 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +15 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/metrics.d.ts +16 -0
- package/dist/utils/metrics.d.ts.map +1 -0
- package/dist/utils/metrics.js +28 -0
- package/dist/utils/metrics.js.map +1 -0
- package/dist/utils/scheduler.d.ts +19 -0
- package/dist/utils/scheduler.d.ts.map +1 -0
- package/dist/utils/scheduler.js +63 -0
- package/dist/utils/scheduler.js.map +1 -0
- package/dist/utils/sitemap.d.ts +17 -0
- package/dist/utils/sitemap.d.ts.map +1 -0
- package/dist/utils/sitemap.js +118 -0
- package/dist/utils/sitemap.js.map +1 -0
- package/dist/utils/validation.d.ts +142 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +35 -0
- package/dist/utils/validation.js.map +1 -0
- package/dist/utils/webhook.d.ts +21 -0
- package/dist/utils/webhook.d.ts.map +1 -0
- package/dist/utils/webhook.js +108 -0
- package/dist/utils/webhook.js.map +1 -0
- package/package.json +109 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scheduler.d.ts","sourceRoot":"","sources":["../../src/utils/scheduler.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AAE5D;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,IAAI,CAA0C;IAEtD,QAAQ,CAAC,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,GAAG,KAAK,OAAO,CAAC,WAAW,CAAC,GAAG,IAAI;IAYjH,MAAM,CAAC,EAAE,EAAE,MAAM,GAAG,IAAI;IAKxB,SAAS,IAAI,IAAI;IAKjB,QAAQ,IAAI,MAAM,EAAE;IAEpB;;;;OAIG;IACH,OAAO,CAAC,cAAc;CAmBvB"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.CrawlScheduler = void 0;
|
|
4
|
+
/**
 * Simple cron-like scheduler for periodic re-crawling.
 *
 * Each job is one `setInterval` timer whose period is derived from the job's
 * cron expression by `cronToInterval`. The expression only selects a period:
 * the first run happens one period after `schedule()` is called.
 */
class CrawlScheduler {
    /** Active interval timers, keyed by job id. */
    jobs = new Map();

    /**
     * Register a periodic crawl job, replacing any job with the same id.
     *
     * @param {string} id - Job identifier.
     * @param {object} config - Reads `cron`, `urls`, `options` and the optional `onComplete` callback.
     * @param {Function} executor - Called as `executor(url, config.options)` for each URL, in order.
     * @returns {void}
     */
    schedule(id, config, executor) {
        this.cancel(id);
        const interval = this.cronToInterval(config.cron);
        const timer = setInterval(() => {
            // runJob handles its own errors, so this promise never rejects.
            void this.runJob(id, config, executor);
        }, interval);
        this.jobs.set(id, timer);
    }

    /**
     * Run one pass of a job: crawl every configured URL sequentially.
     *
     * A failure for one URL (in `executor` or in `onComplete`) is logged and
     * the remaining URLs are still processed, so a single bad page can neither
     * abort the pass nor surface as an unhandled promise rejection.
     *
     * @param {string} id - Job identifier, used only in log messages.
     * @param {object} config - Same object that was passed to `schedule`.
     * @param {Function} executor - Same function that was passed to `schedule`.
     * @returns {Promise<void>} Always resolves.
     */
    async runJob(id, config, executor) {
        try {
            for (const url of config.urls) {
                try {
                    const result = await executor(url, config.options);
                    config.onComplete?.(result);
                } catch (error) {
                    console.error(`[CrawlScheduler] job "${id}" failed for ${url}:`, error);
                }
            }
        } catch (error) {
            // Reached when `config.urls` itself cannot be iterated.
            console.error(`[CrawlScheduler] job "${id}" aborted:`, error);
        }
    }

    /**
     * Stop and forget the job with the given id. Unknown ids are ignored.
     *
     * @param {string} id - Job identifier.
     * @returns {void}
     */
    cancel(id) {
        const timer = this.jobs.get(id);
        if (timer === undefined) {
            return;
        }
        clearInterval(timer);
        this.jobs.delete(id);
    }

    /** Stop every registered job. */
    cancelAll() {
        for (const timer of this.jobs.values()) {
            clearInterval(timer);
        }
        this.jobs.clear();
    }

    /**
     * @returns {string[]} Ids of the currently scheduled jobs, in insertion order.
     */
    listJobs() {
        return [...this.jobs.keys()];
    }

    /**
     * Parse a cron expression into a millisecond interval.
     * Supports: *\/N for minutes/hours, a fixed minute and hour with any
     * day-of-month (treated as daily, whatever the day-of-week field says),
     * and common patterns. Falls back to 1 hour for unsupported expressions.
     *
     * @param {string} cron - Five-field cron expression.
     * @returns {number} Interval in milliseconds; always a positive integer
     *   that `setInterval` accepts without coercing it to 1 ms.
     */
    cronToInterval(cron) {
        const MINUTE = 60 * 1000;
        const HOUR = 60 * MINUTE;
        const DAY = 24 * HOUR;
        const FALLBACK = HOUR;
        // Timers use a signed 32-bit delay; larger values are coerced to 1 ms.
        const MAX_DELAY = 2147483647;

        // Convert a "*/N" field to milliseconds, or FALLBACK when N is unusable.
        const stepToMs = (field, unit) => {
            const step = Number.parseInt(field.slice(2), 10);
            if (!Number.isInteger(step) || step < 1) {
                return FALLBACK;
            }
            const ms = step * unit;
            return ms > MAX_DELAY ? FALLBACK : ms;
        };

        if (typeof cron !== 'string') {
            return FALLBACK;
        }
        const parts = cron.trim().split(/\s+/);
        if (parts.length < 5) {
            return FALLBACK;
        }
        const [minute, hour, dayOfMonth] = parts;

        // */N minutes (e.g., "*/5 * * * *" = every 5 min)
        if (minute.startsWith('*/')) {
            return stepToMs(minute, MINUTE);
        }
        // */N hours (e.g., "0 */2 * * *" = every 2 hours)
        if (hour.startsWith('*/')) {
            return stepToMs(hour, HOUR);
        }
        // Daily at specific time (e.g., "0 9 * * *" or "0 9 * * MON")
        if (minute !== '*' && hour !== '*' && dayOfMonth === '*') {
            return DAY;
        }
        // Every hour (e.g., "0 * * * *")
        if (minute !== '*' && hour === '*') {
            return HOUR;
        }
        // Every minute ("* * * * *")
        if (minute === '*') {
            return MINUTE;
        }
        return FALLBACK;
    }
}
|
|
62
|
+
exports.CrawlScheduler = CrawlScheduler;
|
|
63
|
+
//# sourceMappingURL=scheduler.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scheduler.js","sourceRoot":"","sources":["../../src/utils/scheduler.ts"],"names":[],"mappings":";;;AAEA;;;GAGG;AACH,MAAa,cAAc;IACjB,IAAI,GAAgC,IAAI,GAAG,EAAE,CAAC;IAEtD,QAAQ,CAAC,EAAU,EAAE,MAAsB,EAAE,QAA6D;QACxG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAChB,MAAM,QAAQ,GAAG,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAClD,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,IAAI,EAAE;YACnC,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;gBAC9B,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;gBACnD,MAAM,CAAC,UAAU,EAAE,CAAC,MAAM,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC,EAAE,QAAQ,CAAC,CAAC;QACb,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;IAC3B,CAAC;IAED,MAAM,CAAC,EAAU;QACf,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAChC,IAAI,KAAK,EAAE,CAAC;YAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAAC,CAAC;IAC5D,CAAC;IAED,SAAS;QACP,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE;YAAE,aAAa,CAAC,KAAK,CAAC,CAAC;QAC7D,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;IACpB,CAAC;IAED,QAAQ,KAAe,OAAO,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtD;;;;OAIG;IACK,cAAc,CAAC,IAAY;QACjC,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACvC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC,mBAAmB;QAEhE,MAAM,CAAC,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,AAAD,EAAG,SAAS,CAAC,GAAG,KAAK,CAAC;QAEtD,kDAAkD;QAClD,IAAI,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC;YAAE,OAAO,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC;QAC1E,kDAAkD;QAClD,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;YAAE,OAAO,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC;QAC3E,8DAA8D;QAC9D,IAAI,MAAM,KAAK,GAAG,IAAI,IAAI,KAAK,GAAG,IAAI,UAAU,KAAK,GAAG;YAAE,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC;QACrF,iCAAiC;QACjC,IAAI,MAAM,KAAK,GAAG,IAAI,IAAI,KAAK,GAAG;YAAE,OAAO,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC;QAC1D,6BAA6B;QAC7B,IAAI,MAAM,KAAK,GAAG;YAAE,OAAO,EAAE,GAAG,IAAI,CAAC;QAErC,OAAO,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC,mBAA
mB;IAC5C,CAAC;CACF;AAnDD,wCAmDC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { SitemapEntry } from '../core/types';
|
|
2
|
+
/**
|
|
3
|
+
* Parse sitemap.xml and sitemap index files.
|
|
4
|
+
* Supports: standard sitemaps, sitemap indexes, gzipped sitemaps.
|
|
5
|
+
*/
|
|
6
|
+
export declare class SitemapParser {
|
|
7
|
+
private userAgent;
|
|
8
|
+
constructor(userAgent?: string);
|
|
9
|
+
/** Parse a sitemap URL, handling both sitemap indexes and regular sitemaps */
|
|
10
|
+
parse(sitemapUrl: string): Promise<SitemapEntry[]>;
|
|
11
|
+
/** Discover sitemap URL from robots.txt or common locations */
|
|
12
|
+
discover(baseUrl: string): Promise<string | null>;
|
|
13
|
+
private fetchXml;
|
|
14
|
+
private parseEntries;
|
|
15
|
+
private parseSitemapIndex;
|
|
16
|
+
}
|
|
17
|
+
//# sourceMappingURL=sitemap.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap.d.ts","sourceRoot":"","sources":["../../src/utils/sitemap.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AAE7C;;;GAGG;AACH,qBAAa,aAAa;IACxB,OAAO,CAAC,SAAS,CAAS;gBAEd,SAAS,GAAE,MAAyB;IAIhD,8EAA8E;IACxE,KAAK,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAYxD,+DAA+D;IACzD,QAAQ,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;YA4BzC,QAAQ;IAUtB,OAAO,CAAC,YAAY;IAuBpB,OAAO,CAAC,iBAAiB;CAW1B"}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
+
if (mod && mod.__esModule) return mod;
|
|
20
|
+
var result = {};
|
|
21
|
+
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
+
__setModuleDefault(result, mod);
|
|
23
|
+
return result;
|
|
24
|
+
};
|
|
25
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
+
exports.SitemapParser = void 0;
|
|
27
|
+
const cheerio = __importStar(require("cheerio"));
|
|
28
|
+
/**
 * Parse sitemap.xml and sitemap index files.
 * Supports: standard sitemaps, sitemap indexes, gzipped sitemaps.
 */
class SitemapParser {
    /** Value sent as the `User-Agent` header on every request. */
    userAgent;

    /**
     * @param {string} [userAgent='WebContext/1.0'] - User-Agent header value.
     */
    constructor(userAgent = 'WebContext/1.0') {
        this.userAgent = userAgent;
    }

    /**
     * Parse a sitemap URL, handling both sitemap indexes and regular sitemaps.
     *
     * @param {string} sitemapUrl - URL of a sitemap or sitemap index.
     * @returns {Promise<object[]>} Flattened entries of the sitemap and all nested sitemaps.
     * @throws {Error} When any sitemap in the tree cannot be fetched.
     */
    async parse(sitemapUrl) {
        return this.parseRecursive(sitemapUrl, new Set());
    }

    /**
     * Recursive worker behind `parse`.
     *
     * @param {string} sitemapUrl - URL to fetch.
     * @param {Set<string>} visited - URLs already fetched during this `parse` call.
     *   An index that references itself, or two indexes that reference each
     *   other, would otherwise recurse forever.
     * @returns {Promise<object[]>} Entries found below `sitemapUrl`.
     */
    async parseRecursive(sitemapUrl, visited) {
        if (visited.has(sitemapUrl)) {
            return [];
        }
        visited.add(sitemapUrl);
        const xml = await this.fetchXml(sitemapUrl);
        const childSitemaps = this.parseSitemapIndex(xml);
        if (childSitemaps.length === 0) {
            return this.parseEntries(xml);
        }
        const nested = await Promise.all(childSitemaps.map((url) => this.parseRecursive(url, visited)));
        return nested.flat();
    }

    /**
     * Discover sitemap URL from robots.txt or common locations.
     *
     * robots.txt is requested from the site origin, because that is the only
     * place it is served from. The common sitemap paths are probed under the
     * given base URL first and then, when different, under the origin.
     *
     * @param {string} baseUrl - Site URL, with or without a path.
     * @returns {Promise<string | null>} The sitemap URL, or `null` when none was found.
     */
    async discover(baseUrl) {
        const base = baseUrl.replace(/\/$/, '');
        let origin = base;
        try {
            const parsed = new URL(baseUrl);
            // Opaque origins (e.g. file:) serialise to the string "null".
            if (parsed.origin !== 'null') {
                origin = parsed.origin;
            }
        } catch {
            // Not an absolute URL: keep using the string as given.
        }
        const headers = { 'User-Agent': this.userAgent };
        try {
            const res = await fetch(`${origin}/robots.txt`, { headers });
            if (res.ok) {
                const text = await res.text();
                // [ \t]* keeps the match on the directive's own line.
                const match = text.match(/^Sitemap:[ \t]*(\S.*)$/im);
                if (match) {
                    return match[1].trim();
                }
            }
        } catch {
            // Best effort: an unreachable robots.txt just means trying the fallbacks.
        }
        const commonPaths = ['/sitemap.xml', '/sitemap_index.xml'];
        const roots = [...new Set([base, origin])];
        for (const root of roots) {
            for (const path of commonPaths) {
                try {
                    const res = await fetch(`${root}${path}`, { method: 'HEAD', headers });
                    if (res.ok) {
                        return `${root}${path}`;
                    }
                } catch {
                    // Best effort: try the next candidate.
                }
            }
        }
        return null;
    }

    /**
     * Download a sitemap and return its XML text.
     *
     * A body that starts with the gzip magic bytes (as served for `.xml.gz`
     * files without a `Content-Encoding` header) is decompressed first.
     *
     * @param {string} url - Sitemap URL.
     * @returns {Promise<string>} XML document text, decoded as UTF-8.
     * @throws {Error} When the response status is not 2xx.
     */
    async fetchXml(url) {
        const res = await fetch(url, {
            headers: { 'User-Agent': this.userAgent },
        });
        if (!res.ok) {
            throw new Error(`Failed to fetch sitemap: ${url} (${res.status})`);
        }
        const bytes = new Uint8Array(await res.arrayBuffer());
        const isGzip = bytes.length > 2 && bytes[0] === 0x1f && bytes[1] === 0x8b;
        if (!isGzip) {
            return new TextDecoder().decode(bytes);
        }
        const inflated = new Blob([bytes]).stream().pipeThrough(new DecompressionStream('gzip'));
        return new Response(inflated).text();
    }

    /**
     * Extract the `<url>` entries of a regular sitemap.
     *
     * @param {string} xml - Sitemap document.
     * @returns {object[]} One object per `<url>` that has a non-empty `<loc>`:
     *   `url`, plus `lastmod`, `changefreq` and numeric `priority` when present.
     */
    parseEntries(xml) {
        const $ = cheerio.load(xml, { xmlMode: true });
        const entries = [];
        $('url').each((_, el) => {
            const node = $(el);
            const textOf = (tag) => node.find(tag).text().trim();
            const loc = textOf('loc');
            if (!loc) {
                return;
            }
            const entry = { url: loc };
            const lastmod = textOf('lastmod');
            const changefreq = textOf('changefreq');
            const priority = textOf('priority');
            if (lastmod) {
                entry.lastmod = lastmod;
            }
            if (changefreq) {
                entry.changefreq = changefreq;
            }
            if (priority) {
                // Skip unparsable values instead of storing NaN.
                const value = Number.parseFloat(priority);
                if (Number.isFinite(value)) {
                    entry.priority = value;
                }
            }
            entries.push(entry);
        });
        return entries;
    }

    /**
     * Extract the child sitemap URLs of a sitemap index.
     *
     * @param {string} xml - Sitemap or sitemap index document.
     * @returns {string[]} `<loc>` values under `<sitemapindex><sitemap>`; empty for a regular sitemap.
     */
    parseSitemapIndex(xml) {
        const $ = cheerio.load(xml, { xmlMode: true });
        const urls = [];
        $('sitemapindex sitemap loc').each((_, el) => {
            const loc = $(el).text().trim();
            if (loc) {
                urls.push(loc);
            }
        });
        return urls;
    }
}
|
|
117
|
+
exports.SitemapParser = SitemapParser;
|
|
118
|
+
//# sourceMappingURL=sitemap.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap.js","sourceRoot":"","sources":["../../src/utils/sitemap.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,iDAAmC;AAGnC;;;GAGG;AACH,MAAa,aAAa;IAChB,SAAS,CAAS;IAE1B,YAAY,YAAoB,gBAAgB;QAC9C,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC7B,CAAC;IAED,8EAA8E;IAC9E,KAAK,CAAC,KAAK,CAAC,UAAkB;QAC5B,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC;QAC5C,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC;QAEhD,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3B,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC7E,OAAO,OAAO,CAAC,IAAI,EAAE,CAAC;QACxB,CAAC;QAED,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAChC,CAAC;IAED,+DAA+D;IAC/D,KAAK,CAAC,QAAQ,CAAC,OAAe;QAC5B,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAExC,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,IAAI,aAAa,EAAE;gBAC5C,OAAO,EAAE,EAAE,YAAY,EAAE,IAAI,CAAC,SAAS,EAAE;aAC1C,CAAC,CAAC;YACH,IAAI,GAAG,CAAC,EAAE,EAAE,CAAC;gBACX,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;gBAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;gBAChD,IAAI,KAAK;oBAAE,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YACpC,CAAC;QACH,CAAC;QAAC,MAAM,CAAC,CAAA,CAAC;QAEV,MAAM,WAAW,GAAG,CAAC,cAAc,EAAE,oBAAoB,CAAC,CAAC;QAC3D,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,IAAI,GAAG,IAAI,EAAE,EAAE;oBACxC,MAAM,EAAE,MAAM;oBACd,OAAO,EAAE,EAAE,YAAY,EAAE,IAAI,CAAC,SAAS,EAAE;iBAC1C,CAAC,CAAC;gBACH,IAAI,GAAG,CAAC,EAAE;oBAAE,OAAO,GAAG,IAAI,GAAG,IAAI,EAAE,CAAC;YACtC,CAAC;YAAC,MAAM,CAAC,CAAA,CAAC;QACZ,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,KAAK,CAAC,QAAQ,CAAC,GAAW;QAChC,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC3B,OAAO,EAAE,EAAE,YAAY,EAAE,IAAI,CAAC,SAAS,EAAE;SAC1C,CAAC,CAAC;QACH,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,4BAA4B,GAAG,KAAK,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC;QACrE,CAAC;QACD,OAAO,GAAG,CAAC,IAAI,EAAE,CAAC;IACpB,CAAC;IAEO,YAAY,CAAC,GAAW;QAC9B,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAA
C,GAAG,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/C,MAAM,OAAO,GAAmB,EAAE,CAAC;QAEnC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAC5C,IAAI,CAAC,GAAG;gBAAE,OAAO;YAEjB,MAAM,KAAK,GAAiB,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;YACzC,MAAM,OAAO,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YACpD,MAAM,UAAU,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAC1D,MAAM,QAAQ,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAEtD,IAAI,OAAO;gBAAE,KAAK,CAAC,OAAO,GAAG,OAAO,CAAC;YACrC,IAAI,UAAU;gBAAE,KAAK,CAAC,UAAU,GAAG,UAAU,CAAC;YAC9C,IAAI,QAAQ;gBAAE,KAAK,CAAC,QAAQ,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;YAEpD,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;QAEH,OAAO,OAAO,CAAC;IACjB,CAAC;IAEO,iBAAiB,CAAC,GAAW;QACnC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/C,MAAM,IAAI,GAAa,EAAE,CAAC;QAE1B,CAAC,CAAC,0BAA0B,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YAC3C,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAChC,IAAI,GAAG;gBAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,OAAO,IAAI,CAAC;IACd,CAAC;CACF;AA7FD,sCA6FC"}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { CrawlOptions, WebContextConfig } from '../core/types';
|
|
3
|
+
/** Zod schemas for input validation */
|
|
4
|
+
export declare const urlSchema: z.ZodString;
|
|
5
|
+
export declare const crawlOptionsSchema: z.ZodObject<{
|
|
6
|
+
url: z.ZodString;
|
|
7
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
8
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
9
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
10
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
11
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
12
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
13
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
14
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
15
|
+
}, "passthrough", z.ZodTypeAny, z.objectOutputType<{
|
|
16
|
+
url: z.ZodString;
|
|
17
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
18
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
19
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
20
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
21
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
22
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
23
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
24
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
25
|
+
}, z.ZodTypeAny, "passthrough">, z.objectInputType<{
|
|
26
|
+
url: z.ZodString;
|
|
27
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
28
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
29
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
30
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
31
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
32
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
33
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
34
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
35
|
+
}, z.ZodTypeAny, "passthrough">>;
|
|
36
|
+
export declare const webContextConfigSchema: z.ZodObject<{
|
|
37
|
+
baseUrl: z.ZodString;
|
|
38
|
+
outputDir: z.ZodOptional<z.ZodString>;
|
|
39
|
+
crawlOptions: z.ZodOptional<z.ZodObject<{
|
|
40
|
+
url: z.ZodString;
|
|
41
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
42
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
43
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
44
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
45
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
46
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
47
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
48
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
49
|
+
}, "passthrough", z.ZodTypeAny, z.objectOutputType<{
|
|
50
|
+
url: z.ZodString;
|
|
51
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
52
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
53
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
54
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
55
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
56
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
57
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
58
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
59
|
+
}, z.ZodTypeAny, "passthrough">, z.objectInputType<{
|
|
60
|
+
url: z.ZodString;
|
|
61
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
62
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
63
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
64
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
65
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
66
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
67
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
68
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
69
|
+
}, z.ZodTypeAny, "passthrough">>>;
|
|
70
|
+
}, "passthrough", z.ZodTypeAny, z.objectOutputType<{
|
|
71
|
+
baseUrl: z.ZodString;
|
|
72
|
+
outputDir: z.ZodOptional<z.ZodString>;
|
|
73
|
+
crawlOptions: z.ZodOptional<z.ZodObject<{
|
|
74
|
+
url: z.ZodString;
|
|
75
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
76
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
77
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
78
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
79
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
80
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
81
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
82
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
83
|
+
}, "passthrough", z.ZodTypeAny, z.objectOutputType<{
|
|
84
|
+
url: z.ZodString;
|
|
85
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
86
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
87
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
88
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
89
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
90
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
91
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
92
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
93
|
+
}, z.ZodTypeAny, "passthrough">, z.objectInputType<{
|
|
94
|
+
url: z.ZodString;
|
|
95
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
96
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
97
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
98
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
99
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
100
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
101
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
102
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
103
|
+
}, z.ZodTypeAny, "passthrough">>>;
|
|
104
|
+
}, z.ZodTypeAny, "passthrough">, z.objectInputType<{
|
|
105
|
+
baseUrl: z.ZodString;
|
|
106
|
+
outputDir: z.ZodOptional<z.ZodString>;
|
|
107
|
+
crawlOptions: z.ZodOptional<z.ZodObject<{
|
|
108
|
+
url: z.ZodString;
|
|
109
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
110
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
111
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
112
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
113
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
114
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
115
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
116
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
117
|
+
}, "passthrough", z.ZodTypeAny, z.objectOutputType<{
|
|
118
|
+
url: z.ZodString;
|
|
119
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
120
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
121
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
122
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
123
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
124
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
125
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
126
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
127
|
+
}, z.ZodTypeAny, "passthrough">, z.objectInputType<{
|
|
128
|
+
url: z.ZodString;
|
|
129
|
+
depth: z.ZodOptional<z.ZodNumber>;
|
|
130
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
131
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
132
|
+
delay: z.ZodOptional<z.ZodNumber>;
|
|
133
|
+
respectRobotsTxt: z.ZodOptional<z.ZodBoolean>;
|
|
134
|
+
includeSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
135
|
+
includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
136
|
+
excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
137
|
+
}, z.ZodTypeAny, "passthrough">>>;
|
|
138
|
+
}, z.ZodTypeAny, "passthrough">>;
|
|
139
|
+
export declare function validateUrl(url: string): string;
|
|
140
|
+
export declare function validateCrawlOptions(options: unknown): CrawlOptions;
|
|
141
|
+
export declare function validateConfig(config: unknown): WebContextConfig;
|
|
142
|
+
//# sourceMappingURL=validation.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"validation.d.ts","sourceRoot":"","sources":["../../src/utils/validation.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AAE/D,uCAAuC;AACvC,eAAO,MAAM,SAAS,aAAuC,CAAC;AAE9D,eAAO,MAAM,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gCAUf,CAAC;AAEjB,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gCAInB,CAAC;AAEjB,wBAAgB,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAE/C;AAED,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,OAAO,GAAG,YAAY,CAEnE;AAED,wBAAgB,cAAc,CAAC,MAAM,EAAE,OAAO,GAAG,gBAAgB,CAEhE"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.validateConfig = exports.validateCrawlOptions = exports.validateUrl = exports.webContextConfigSchema = exports.crawlOptionsSchema = exports.urlSchema = void 0;
|
|
4
|
+
const zod_1 = require("zod");
|
|
5
|
+
/** Zod schemas for input validation */
|
|
6
|
+
exports.urlSchema = zod_1.z.string().url('Invalid URL format');
|
|
7
|
+
exports.crawlOptionsSchema = zod_1.z.object({
|
|
8
|
+
url: exports.urlSchema,
|
|
9
|
+
depth: zod_1.z.number().int().min(0, 'Depth must be >= 0').max(10, 'Depth must be <= 10').optional(),
|
|
10
|
+
maxPages: zod_1.z.number().int().min(1, 'maxPages must be >= 1').max(10000, 'maxPages must be <= 10000').optional(),
|
|
11
|
+
timeout: zod_1.z.number().int().min(1000, 'Timeout must be >= 1000ms').max(120000, 'Timeout must be <= 120000ms').optional(),
|
|
12
|
+
delay: zod_1.z.number().int().min(0, 'Delay must be >= 0').max(60000, 'Delay must be <= 60000ms').optional(),
|
|
13
|
+
respectRobotsTxt: zod_1.z.boolean().optional(),
|
|
14
|
+
includeSitemap: zod_1.z.boolean().optional(),
|
|
15
|
+
includePatterns: zod_1.z.array(zod_1.z.string()).optional(),
|
|
16
|
+
excludePatterns: zod_1.z.array(zod_1.z.string()).optional(),
|
|
17
|
+
}).passthrough();
|
|
18
|
+
exports.webContextConfigSchema = zod_1.z.object({
|
|
19
|
+
baseUrl: exports.urlSchema,
|
|
20
|
+
outputDir: zod_1.z.string().min(1, 'Output directory is required').optional(),
|
|
21
|
+
crawlOptions: exports.crawlOptionsSchema.optional(),
|
|
22
|
+
}).passthrough();
|
|
23
|
+
function validateUrl(url) {
|
|
24
|
+
return exports.urlSchema.parse(url);
|
|
25
|
+
}
|
|
26
|
+
exports.validateUrl = validateUrl;
|
|
27
|
+
function validateCrawlOptions(options) {
|
|
28
|
+
return exports.crawlOptionsSchema.parse(options);
|
|
29
|
+
}
|
|
30
|
+
exports.validateCrawlOptions = validateCrawlOptions;
|
|
31
|
+
function validateConfig(config) {
|
|
32
|
+
return exports.webContextConfigSchema.parse(config);
|
|
33
|
+
}
|
|
34
|
+
exports.validateConfig = validateConfig;
|
|
35
|
+
//# sourceMappingURL=validation.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"validation.js","sourceRoot":"","sources":["../../src/utils/validation.ts"],"names":[],"mappings":";;;AAAA,6BAAwB;AAGxB,uCAAuC;AAC1B,QAAA,SAAS,GAAG,OAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC;AAEjD,QAAA,kBAAkB,GAAG,OAAC,CAAC,MAAM,CAAC;IACzC,GAAG,EAAE,iBAAS;IACd,KAAK,EAAE,OAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,oBAAoB,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,qBAAqB,CAAC,CAAC,QAAQ,EAAE;IAC9F,QAAQ,EAAE,OAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,uBAAuB,CAAC,CAAC,GAAG,CAAC,KAAK,EAAE,2BAA2B,CAAC,CAAC,QAAQ,EAAE;IAC7G,OAAO,EAAE,OAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,2BAA2B,CAAC,CAAC,GAAG,CAAC,MAAM,EAAE,6BAA6B,CAAC,CAAC,QAAQ,EAAE;IACtH,KAAK,EAAE,OAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,oBAAoB,CAAC,CAAC,GAAG,CAAC,KAAK,EAAE,0BAA0B,CAAC,CAAC,QAAQ,EAAE;IACtG,gBAAgB,EAAE,OAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE;IACxC,cAAc,EAAE,OAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE;IACtC,eAAe,EAAE,OAAC,CAAC,KAAK,CAAC,OAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC/C,eAAe,EAAE,OAAC,CAAC,KAAK,CAAC,OAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;CAChD,CAAC,CAAC,WAAW,EAAE,CAAC;AAEJ,QAAA,sBAAsB,GAAG,OAAC,CAAC,MAAM,CAAC;IAC7C,OAAO,EAAE,iBAAS;IAClB,SAAS,EAAE,OAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,8BAA8B,CAAC,CAAC,QAAQ,EAAE;IACvE,YAAY,EAAE,0BAAkB,CAAC,QAAQ,EAAE;CAC5C,CAAC,CAAC,WAAW,EAAE,CAAC;AAEjB,SAAgB,WAAW,CAAC,GAAW;IACrC,OAAO,iBAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;AAC9B,CAAC;AAFD,kCAEC;AAED,SAAgB,oBAAoB,CAAC,OAAgB;IACnD,OAAO,0BAAkB,CAAC,KAAK,CAAC,OAAO,CAAiB,CAAC;AAC3D,CAAC;AAFD,oDAEC;AAED,SAAgB,cAAc,CAAC,MAAe;IAC5C,OAAO,8BAAsB,CAAC,KAAK,CAAC,MAAM,CAAqB,CAAC;AAClE,CAAC;AAFD,wCAEC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { CrawlResult, ContentDiff } from '../core/types';
|
|
2
|
+
/** Describes one webhook subscriber registered with `WebhookNotifier`. */
export interface WebhookConfig {
|
|
3
|
+
/** Endpoint that receives the POST requests; `unregister` matches subscribers by this value. */
url: string;
|
|
4
|
+
/**
 * Optional signing secret. When set, each request carries an
 * `X-Webhook-Signature` header holding the hex HMAC-SHA256 of the JSON
 * request body, keyed with this secret.
 */
secret?: string;
|
|
5
|
+
/** Event names this subscriber wants; a notification is sent only for events listed here. */
events: Array<'crawl.complete' | 'crawl.error' | 'content.changed'>;
|
|
6
|
+
/** Extra request headers, spread over the default `Content-Type: application/json`. */
headers?: Record<string, string>;
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Webhook notification system for crawl events.
|
|
10
|
+
* Sends POST requests to configured URLs when events occur.
|
|
11
|
+
*/
|
|
12
|
+
export declare class WebhookNotifier {
|
|
13
|
+
/** Registered subscribers, in registration order. */
private configs;
|
|
14
|
+
/** Adds a subscriber. No de-duplication is performed. */
register(config: WebhookConfig): void;
|
|
15
|
+
/** Removes every subscriber whose `url` equals the given value. */
unregister(url: string): void;
|
|
16
|
+
/**
 * Sends a `crawl.complete` event carrying a summary of the crawl (source,
 * pages processed, total tokens, duration, error count) to subscribers of
 * that event. Delivery failures do not reject the returned promise.
 */
notifyCrawlComplete(result: CrawlResult): Promise<void>;
|
|
17
|
+
/** Sends a `crawl.error` event with the given URL and error message to subscribers of that event. */
notifyCrawlError(url: string, error: string): Promise<void>;
|
|
18
|
+
/**
 * Sends a `content.changed` event listing, per diff, the URL and the
 * added/removed sections. Does nothing when `diffs` is empty.
 */
notifyContentChanged(diffs: ContentDiff[]): Promise<void>;
|
|
19
|
+
/** Internal: POSTs one payload to each given subscriber and waits for all attempts to settle. */
private send;
|
|
20
|
+
}
|
|
21
|
+
//# sourceMappingURL=webhook.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"webhook.d.ts","sourceRoot":"","sources":["../../src/utils/webhook.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AAEzD,MAAM,WAAW,aAAa;IAC5B,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,KAAK,CAAC,gBAAgB,GAAG,aAAa,GAAG,iBAAiB,CAAC,CAAC;IACpE,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAClC;AAED;;;GAGG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,OAAO,CAAuB;IAEtC,QAAQ,CAAC,MAAM,EAAE,aAAa,GAAG,IAAI;IAIrC,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI;IAIvB,mBAAmB,CAAC,MAAM,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC;IAgBvD,gBAAgB,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAU3D,oBAAoB,CAAC,KAAK,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;YAkBjD,IAAI;CAwBnB"}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Module-interop helper (looks like standard TypeScript compiler output — the
// package builds with `tsc`). Reuses `this.__createBinding` when one already
// exists; otherwise defines it here.
//
// Exposes `m[k]` on `o` under the name `k2` (defaults to `k`). With
// `Object.create` available, the property is defined from `m`'s own
// descriptor, except that a live getter `() => m[k]` is substituted when the
// descriptor is missing, is an accessor on a non-`__esModule` object, or is a
// writable/configurable data property. Without `Object.create`, the value is
// simply copied.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
// Module-interop helper. Reuses `this.__setModuleDefault` when present.
// Sets `o.default` to `v`: as an enumerable property via
// `Object.defineProperty` when `Object.create` is available, otherwise by
// plain assignment.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
// Module-interop helper. Reuses `this.__importStar` when present.
// Returns `mod` untouched when it is flagged `__esModule`. Otherwise builds a
// fresh namespace-like object: every own enumerable key of `mod` except
// "default" is bound through `__createBinding`, and `default` is set to `mod`
// itself through `__setModuleDefault`.
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
+
if (mod && mod.__esModule) return mod;
|
|
20
|
+
var result = {};
|
|
21
|
+
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
+
__setModuleDefault(result, mod);
|
|
23
|
+
return result;
|
|
24
|
+
};
|
|
25
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
+
exports.WebhookNotifier = void 0;
|
|
27
|
+
/**
 * Reduces a webhook URL to its origin for log output.
 * Webhook URLs frequently embed credentials in the path or query string, so
 * the full URL must not end up in logs.
 *
 * @param {string} url - Subscriber URL as registered.
 * @returns {string} The URL's origin, or a placeholder when it cannot be parsed.
 */
function describeWebhookTarget(url) {
    try {
        return new URL(url).origin;
    }
    catch {
        return '<invalid webhook url>';
    }
}
/**
 * Webhook notification system for crawl events.
 * Sends POST requests to configured URLs when events occur.
 *
 * Delivery is best-effort: the `notify*` methods never reject because a
 * subscriber is unreachable or answers with an error status. Failed deliveries
 * are reported with `console.warn` instead of being discarded silently.
 */
class WebhookNotifier {
    /** Registered subscribers, in registration order. */
    configs = [];
    /**
     * Adds a subscriber. No de-duplication is performed.
     * @param config - Subscriber description (`url`, `events`, optional `secret` and `headers`).
     */
    register(config) {
        this.configs.push(config);
    }
    /**
     * Removes every subscriber registered under the given URL.
     * @param {string} url - URL to match exactly.
     */
    unregister(url) {
        this.configs = this.configs.filter(c => c.url !== url);
    }
    /**
     * Notifies `crawl.complete` subscribers with a summary of the finished crawl.
     * @param result - Crawl result; `context.source` and `stats` are read.
     * @returns {Promise<void>} Resolves once every delivery attempt has settled.
     */
    async notifyCrawlComplete(result) {
        const subscribers = this.configs.filter(c => c.events.includes('crawl.complete'));
        const payload = {
            event: 'crawl.complete',
            timestamp: new Date().toISOString(),
            data: {
                source: result.context.source,
                pagesProcessed: result.stats.pagesProcessed,
                totalTokens: result.stats.totalTokens,
                duration: result.stats.duration,
                // Only the number of errors is sent, not their contents.
                errors: result.stats.errors.length,
            },
        };
        await this.send(subscribers, payload);
    }
    /**
     * Notifies `crawl.error` subscribers about a failed crawl.
     * @param {string} url - URL that was being crawled.
     * @param {string} error - Error message to report.
     * @returns {Promise<void>} Resolves once every delivery attempt has settled.
     */
    async notifyCrawlError(url, error) {
        const subscribers = this.configs.filter(c => c.events.includes('crawl.error'));
        const payload = {
            event: 'crawl.error',
            timestamp: new Date().toISOString(),
            data: { url, error },
        };
        await this.send(subscribers, payload);
    }
    /**
     * Notifies `content.changed` subscribers about changed pages.
     * Does nothing when `diffs` is empty.
     * @param diffs - Content diffs; `url`, `addedSections` and `removedSections` are forwarded.
     * @returns {Promise<void>} Resolves once every delivery attempt has settled.
     */
    async notifyContentChanged(diffs) {
        if (!diffs.length)
            return;
        const subscribers = this.configs.filter(c => c.events.includes('content.changed'));
        const payload = {
            event: 'content.changed',
            timestamp: new Date().toISOString(),
            data: {
                changedPages: diffs.length,
                diffs: diffs.map(d => ({
                    url: d.url,
                    addedSections: d.addedSections,
                    removedSections: d.removedSections,
                })),
            },
        };
        await this.send(subscribers, payload);
    }
    /**
     * POSTs one payload to each subscriber in parallel and waits for all
     * attempts to settle. Never rejects; failures are logged as warnings.
     * @param subscribers - Subscribers to deliver to.
     * @param payload - JSON-serializable event payload.
     * @returns {Promise<void>}
     */
    async send(subscribers, payload) {
        if (subscribers.length === 0)
            return;
        // Serialize once so the signed bytes are exactly the bytes that are sent.
        let body;
        try {
            body = JSON.stringify(payload);
        }
        catch (err) {
            console.warn(`[webhook] could not serialize "${payload?.event}" payload: ${err?.message ?? err}`);
            return;
        }
        const deliveries = subscribers.map(async (config) => {
            try {
                const headers = {
                    'Content-Type': 'application/json',
                    ...config.headers,
                };
                if (config.secret) {
                    // Loaded lazily so unsigned webhooks never touch the crypto module.
                    const { createHmac } = require('crypto');
                    headers['X-Webhook-Signature'] = createHmac('sha256', config.secret)
                        .update(body)
                        .digest('hex');
                }
                const response = await fetch(config.url, {
                    method: 'POST',
                    headers,
                    body,
                    signal: AbortSignal.timeout(10000),
                });
                const { ok, status } = response;
                // The response content is not needed; cancel the body so the
                // underlying connection is released instead of waiting for GC.
                await response.body?.cancel();
                if (!ok) {
                    throw new Error(`HTTP ${status}`);
                }
            }
            catch (err) {
                // Best-effort delivery: report the failure, never propagate it.
                console.warn(`[webhook] delivery of "${payload?.event}" to ${describeWebhookTarget(config.url)} failed: ${err?.message ?? err}`);
            }
        });
        await Promise.allSettled(deliveries);
    }
}
|
|
107
|
+
exports.WebhookNotifier = WebhookNotifier;
|
|
108
|
+
//# sourceMappingURL=webhook.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"webhook.js","sourceRoot":"","sources":["../../src/utils/webhook.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AASA;;;GAGG;AACH,MAAa,eAAe;IAClB,OAAO,GAAoB,EAAE,CAAC;IAEtC,QAAQ,CAAC,MAAqB;QAC5B,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC5B,CAAC;IAED,UAAU,CAAC,GAAW;QACpB,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,KAAK,GAAG,CAAC,CAAC;IACzD,CAAC;IAED,KAAK,CAAC,mBAAmB,CAAC,MAAmB;QAC3C,MAAM,WAAW,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,CAAC;QAClF,MAAM,OAAO,GAAG;YACd,KAAK,EAAE,gBAAgB;YACvB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,IAAI,EAAE;gBACJ,MAAM,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM;gBAC7B,cAAc,EAAE,MAAM,CAAC,KAAK,CAAC,cAAc;gBAC3C,WAAW,EAAE,MAAM,CAAC,KAAK,CAAC,WAAW;gBACrC,QAAQ,EAAE,MAAM,CAAC,KAAK,CAAC,QAAQ;gBAC/B,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM;aACnC;SACF,CAAC;QACF,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IACxC,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,GAAW,EAAE,KAAa;QAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC;QAC/E,MAAM,OAAO,GAAG;YACd,KAAK,EAAE,aAAa;YACpB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,IAAI,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE;SACrB,CAAC;QACF,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IACxC,CAAC;IAED,KAAK,CAAC,oBAAoB,CAAC,KAAoB;QAC7C,IAAI,CAAC,KAAK,CAAC,MAAM;YAAE,OAAO;QAC1B,MAAM,WAAW,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC,CAAC;QACnF,MAAM,OAAO,GAAG;YACd,KAAK,EAAE,iBAAiB;YACxB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,IAAI,EAAE;gBACJ,YAAY,EAAE,KAAK,CAAC,MAAM;gBAC1B,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;oBACrB,GAAG,EAAE,CAAC,CAAC,GAAG;oBACV,aAAa,EAAE,CAAC,CAAC,aAAa;oBAC9B,eAAe,EAAE,CAAC,CAAC,eAAe;iBACnC,CAAC,CAAC;aACJ;SACF,CAAC;QACF,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IACxC,CAAC;IAEO,KAAK,CAAC,IAAI,CAAC,WAA4B,EAAE,OAAY;QAC3D,MAAM,QAAQ,GAAG,W
AAW,CAAC,GAAG,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE;YAChD,IAAI,CAAC;gBACH,MAAM,OAAO,GAA2B;oBACtC,cAAc,EAAE,kBAAkB;oBAClC,GAAG,MAAM,CAAC,OAAO;iBAClB,CAAC;gBACF,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;oBAClB,MAAM,EAAE,UAAU,EAAE,GAAG,wDAAa,QAAQ,GAAC,CAAC;oBAC9C,MAAM,SAAS,GAAG,UAAU,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC;yBAClD,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;yBAC/B,MAAM,CAAC,KAAK,CAAC,CAAC;oBACjB,OAAO,CAAC,qBAAqB,CAAC,GAAG,SAAS,CAAC;gBAC7C,CAAC;gBACD,MAAM,KAAK,CAAC,MAAM,CAAC,GAAG,EAAE;oBACtB,MAAM,EAAE,MAAM;oBACd,OAAO;oBACP,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;oBAC7B,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,KAAK,CAAC;iBACnC,CAAC,CAAC;YACL,CAAC;YAAC,MAAM,CAAC,CAAA,CAAC;QACZ,CAAC,CAAC,CAAC;QACH,MAAM,OAAO,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;IACrC,CAAC;CACF;AA/ED,0CA+EC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "webcontext-ai",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Turn any web content into clean AI-ready context — with crawling, chunking, semantic search, and MCP tools",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"bin": {
|
|
8
|
+
"webcontext": "./dist/cli/index.js",
|
|
9
|
+
"webcontext-mcp": "./dist/mcp-server.js"
|
|
10
|
+
},
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"types": "./dist/index.d.ts",
|
|
14
|
+
"require": "./dist/index.js"
|
|
15
|
+
},
|
|
16
|
+
"./sdk/client": {
|
|
17
|
+
"types": "./dist/sdk/client.d.ts",
|
|
18
|
+
"require": "./dist/sdk/client.js"
|
|
19
|
+
},
|
|
20
|
+
"./sdk/mcp": {
|
|
21
|
+
"types": "./dist/sdk/mcp.d.ts",
|
|
22
|
+
"require": "./dist/sdk/mcp.js"
|
|
23
|
+
},
|
|
24
|
+
"./search": {
|
|
25
|
+
"types": "./dist/search/vector.d.ts",
|
|
26
|
+
"require": "./dist/search/vector.js"
|
|
27
|
+
},
|
|
28
|
+
"./utils": {
|
|
29
|
+
"types": "./dist/utils/index.d.ts",
|
|
30
|
+
"require": "./dist/utils/index.js"
|
|
31
|
+
},
|
|
32
|
+
"./export": {
|
|
33
|
+
"types": "./dist/export/index.d.ts",
|
|
34
|
+
"require": "./dist/export/index.js"
|
|
35
|
+
}
|
|
36
|
+
},
|
|
37
|
+
"files": [
|
|
38
|
+
"dist",
|
|
39
|
+
"README.md",
|
|
40
|
+
"LICENSE"
|
|
41
|
+
],
|
|
42
|
+
"scripts": {
|
|
43
|
+
"build": "tsc",
|
|
44
|
+
"prepublishOnly": "npm run build",
|
|
45
|
+
"dev": "ts-node src/index.ts",
|
|
46
|
+
"cli": "ts-node src/cli/index.ts",
|
|
47
|
+
"start": "node dist/cli/index.js",
|
|
48
|
+
"serve": "node dist/cli/index.js serve",
|
|
49
|
+
"test": "jest",
|
|
50
|
+
"lint": "eslint src/"
|
|
51
|
+
},
|
|
52
|
+
"keywords": [
|
|
53
|
+
"web-scraping",
|
|
54
|
+
"ai-context",
|
|
55
|
+
"llm",
|
|
56
|
+
"rag",
|
|
57
|
+
"markdown",
|
|
58
|
+
"crawling",
|
|
59
|
+
"web-crawler",
|
|
60
|
+
"documentation",
|
|
61
|
+
"firecrawl",
|
|
62
|
+
"mcp",
|
|
63
|
+
"vector-search",
|
|
64
|
+
"sitemap",
|
|
65
|
+
"tfidf",
|
|
66
|
+
"semantic-search",
|
|
67
|
+
"mcp-server",
|
|
68
|
+
"langchain",
|
|
69
|
+
"pdf-extraction",
|
|
70
|
+
"github",
|
|
71
|
+
"deduplication"
|
|
72
|
+
],
|
|
73
|
+
"author": "sumeethmoolya",
|
|
74
|
+
"repository": {
|
|
75
|
+
"type": "git",
|
|
76
|
+
"url": "https://github.com/Sumeeth-24/webScrapper-ai.git"
|
|
77
|
+
},
|
|
78
|
+
"homepage": "https://github.com/Sumeeth-24/webScrapper-ai#readme",
|
|
79
|
+
"license": "MIT",
|
|
80
|
+
"dependencies": {
|
|
81
|
+
"turndown": "7.2.0",
|
|
82
|
+
"cheerio": "1.0.0-rc.12",
|
|
83
|
+
"tiktoken": "1.0.15",
|
|
84
|
+
"commander": "12.1.0",
|
|
85
|
+
"ora": "5.4.1",
|
|
86
|
+
"chalk": "4.1.2",
|
|
87
|
+
"p-queue": "6.6.2",
|
|
88
|
+
"robots-parser": "3.0.1",
|
|
89
|
+
"lru-cache": "10.2.2",
|
|
90
|
+
"zod": "3.23.8",
|
|
91
|
+
"express": "4.19.2",
|
|
92
|
+
"cors": "2.8.5"
|
|
93
|
+
},
|
|
94
|
+
"optionalDependencies": {
|
|
95
|
+
"playwright": "1.44.0",
|
|
96
|
+
"pdf-parse": "1.1.1"
|
|
97
|
+
},
|
|
98
|
+
"devDependencies": {
|
|
99
|
+
"typescript": "5.4.5",
|
|
100
|
+
"@types/node": "20.12.12",
|
|
101
|
+
"@types/turndown": "5.0.4",
|
|
102
|
+
"@types/express": "4.17.21",
|
|
103
|
+
"@types/cors": "2.8.17",
|
|
104
|
+
"jest": "29.7.0",
|
|
105
|
+
"ts-jest": "29.1.4",
|
|
106
|
+
"ts-node": "10.9.2",
|
|
107
|
+
"eslint": "9.3.0"
|
|
108
|
+
}
|
|
109
|
+
}
|