webcontext-ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +583 -0
- package/dist/browser/manager.d.ts +47 -0
- package/dist/browser/manager.d.ts.map +1 -0
- package/dist/browser/manager.js +215 -0
- package/dist/browser/manager.js.map +1 -0
- package/dist/cache/cache.d.ts +22 -0
- package/dist/cache/cache.d.ts.map +1 -0
- package/dist/cache/cache.js +150 -0
- package/dist/cache/cache.js.map +1 -0
- package/dist/chunking/chunker.d.ts +26 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +208 -0
- package/dist/chunking/chunker.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +406 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core/pipeline.d.ts +35 -0
- package/dist/core/pipeline.d.ts.map +1 -0
- package/dist/core/pipeline.js +476 -0
- package/dist/core/pipeline.js.map +1 -0
- package/dist/core/stream.d.ts +48 -0
- package/dist/core/stream.d.ts.map +1 -0
- package/dist/core/stream.js +72 -0
- package/dist/core/stream.js.map +1 -0
- package/dist/core/types.d.ts +259 -0
- package/dist/core/types.d.ts.map +1 -0
- package/dist/core/types.js +4 -0
- package/dist/core/types.js.map +1 -0
- package/dist/export/index.d.ts +3 -0
- package/dist/export/index.d.ts.map +1 -0
- package/dist/export/index.js +8 -0
- package/dist/export/index.js.map +1 -0
- package/dist/export/templates.d.ts +25 -0
- package/dist/export/templates.d.ts.map +1 -0
- package/dist/export/templates.js +76 -0
- package/dist/export/templates.js.map +1 -0
- package/dist/export/vectordb.d.ts +21 -0
- package/dist/export/vectordb.d.ts.map +1 -0
- package/dist/export/vectordb.js +101 -0
- package/dist/export/vectordb.js.map +1 -0
- package/dist/extractors/content.d.ts +23 -0
- package/dist/extractors/content.d.ts.map +1 -0
- package/dist/extractors/content.js +328 -0
- package/dist/extractors/content.js.map +1 -0
- package/dist/extractors/github.d.ts +19 -0
- package/dist/extractors/github.d.ts.map +1 -0
- package/dist/extractors/github.js +150 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/images.d.ts +20 -0
- package/dist/extractors/images.d.ts.map +1 -0
- package/dist/extractors/images.js +73 -0
- package/dist/extractors/images.js.map +1 -0
- package/dist/extractors/pdf.d.ts +11 -0
- package/dist/extractors/pdf.d.ts.map +1 -0
- package/dist/extractors/pdf.js +107 -0
- package/dist/extractors/pdf.js.map +1 -0
- package/dist/extractors/screenshot.d.ts +21 -0
- package/dist/extractors/screenshot.d.ts.map +1 -0
- package/dist/extractors/screenshot.js +85 -0
- package/dist/extractors/screenshot.js.map +1 -0
- package/dist/index.d.ts +70 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +206 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp-server.d.ts +3 -0
- package/dist/mcp-server.d.ts.map +1 -0
- package/dist/mcp-server.js +108 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/sdk/client.d.ts +48 -0
- package/dist/sdk/client.d.ts.map +1 -0
- package/dist/sdk/client.js +120 -0
- package/dist/sdk/client.js.map +1 -0
- package/dist/sdk/mcp.d.ts +12 -0
- package/dist/sdk/mcp.d.ts.map +1 -0
- package/dist/sdk/mcp.js +146 -0
- package/dist/sdk/mcp.js.map +1 -0
- package/dist/sdk/server.d.ts +5 -0
- package/dist/sdk/server.d.ts.map +1 -0
- package/dist/sdk/server.js +158 -0
- package/dist/sdk/server.js.map +1 -0
- package/dist/search/vector.d.ts +26 -0
- package/dist/search/vector.d.ts.map +1 -0
- package/dist/search/vector.js +142 -0
- package/dist/search/vector.js.map +1 -0
- package/dist/transformers/markdown.d.ts +21 -0
- package/dist/transformers/markdown.d.ts.map +1 -0
- package/dist/transformers/markdown.js +242 -0
- package/dist/transformers/markdown.js.map +1 -0
- package/dist/utils/dedup.d.ts +20 -0
- package/dist/utils/dedup.d.ts.map +1 -0
- package/dist/utils/dedup.js +61 -0
- package/dist/utils/dedup.js.map +1 -0
- package/dist/utils/index.d.ts +6 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +15 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/metrics.d.ts +16 -0
- package/dist/utils/metrics.d.ts.map +1 -0
- package/dist/utils/metrics.js +28 -0
- package/dist/utils/metrics.js.map +1 -0
- package/dist/utils/scheduler.d.ts +19 -0
- package/dist/utils/scheduler.d.ts.map +1 -0
- package/dist/utils/scheduler.js +63 -0
- package/dist/utils/scheduler.js.map +1 -0
- package/dist/utils/sitemap.d.ts +17 -0
- package/dist/utils/sitemap.d.ts.map +1 -0
- package/dist/utils/sitemap.js +118 -0
- package/dist/utils/sitemap.js.map +1 -0
- package/dist/utils/validation.d.ts +142 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +35 -0
- package/dist/utils/validation.js.map +1 -0
- package/dist/utils/webhook.d.ts +21 -0
- package/dist/utils/webhook.d.ts.map +1 -0
- package/dist/utils/webhook.js +108 -0
- package/dist/utils/webhook.js.map +1 -0
- package/package.json +109 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.VectorSearch = void 0;
|
|
4
|
+
const crypto_1 = require("crypto");
|
|
5
|
+
/**
 * Common English function words that `VectorSearch.tokenize` discards before
 * terms are counted. Each row below is a space-delimited group of words; the
 * rows are joined and split so the resulting Set holds one entry per word.
 */
const STOPWORDS = new Set([
    'a an the and or but in on at to for',
    'of with by from is are was were be been',
    'being have has had do does did will would',
    'could should may might shall can need dare',
    'it its this that these those i me my',
    'we our you your he him his she her',
    'they them their what which who whom when',
    'where why how not no nor as if then',
    'than too very just about above after again',
    'all also am any because before between both',
    'each few more most other some such only',
    'own same so over under up down out off',
    'once here there into through during further',
].join(' ').split(' '));
|
|
20
|
+
/**
 * Simple in-memory vector search for content chunks.
 * Uses TF-IDF based embeddings for local semantic search without external dependencies.
 *
 * Chunks are expected to expose `id`, `content` and `metadata`. The `id` stored
 * in the index (and returned by `search`) is the MD5 hex digest of the chunk's
 * original `id`, not the original value itself.
 */
class VectorSearch {
    /** Indexed entries, one per chunk: `{ id, vector, content, metadata }`. */
    embeddings = [];
    /** Maps each known term to its position inside the TF-IDF vectors. */
    vocabulary = new Map();
    /** Maps each known term to its current IDF weight, `log(N / df)`. */
    idfScores = new Map();
    /** Maps each known term to the number of indexed chunks that contain it. */
    docFrequency = new Map();
    /**
     * Replace the whole index with `chunks`.
     *
     * All previous state is dropped first, including the document-frequency
     * counts, so that a later `addChunk` works from counts that describe
     * exactly the chunks currently indexed.
     */
    index(chunks) {
        this.clear();
        const tokenizedDocs = chunks.map((chunk) => this.tokenize(chunk.content));
        this.buildVocabulary(tokenizedDocs);
        chunks.forEach((chunk, i) => {
            const vector = this.computeTfIdf(tokenizedDocs[i]);
            const id = (0, crypto_1.createHash)('md5').update(chunk.id).digest('hex');
            this.embeddings.push({ id, vector, content: chunk.content, metadata: chunk.metadata });
        });
    }
    /**
     * Search indexed chunks by query.
     *
     * @param query Free text; tokenized the same way as chunk content.
     * @param topK Maximum number of results to return (default 5).
     * @returns `{ chunk, score }` entries with a cosine score above zero,
     *   best match first. `chunk.tokens` is the whitespace-separated word
     *   count of the chunk content.
     */
    search(query, topK = 5) {
        if (this.embeddings.length === 0)
            return [];
        const queryVector = this.computeTfIdf(this.tokenize(query));
        const results = this.embeddings.map((emb) => ({
            chunk: { id: emb.id, content: emb.content, tokens: emb.content.split(/\s+/).length, metadata: emb.metadata },
            score: this.cosineSimilarity(queryVector, emb.vector),
        }));
        return results
            .filter((r) => r.score > 0)
            .sort((a, b) => b.score - a.score)
            .slice(0, topK);
    }
    /**
     * Add a single chunk to the index and recompute IDF from actual document frequencies.
     *
     * Only the new chunk's vector is computed here. Vectors of chunks indexed
     * earlier keep the IDF weights and vocabulary width they were created
     * with; `cosineSimilarity` pads the shorter vector with zeros.
     */
    addChunk(chunk) {
        const tokens = this.tokenize(chunk.content);
        for (const term of new Set(tokens)) {
            // Register unseen terms at the end of the vocabulary.
            if (!this.vocabulary.has(term)) {
                this.vocabulary.set(term, this.vocabulary.size);
            }
            // Each distinct term counts once per chunk.
            this.docFrequency.set(term, (this.docFrequency.get(term) || 0) + 1);
        }
        // The new chunk is not in `embeddings` yet, hence the +1.
        this.recomputeIdf(this.embeddings.length + 1);
        const vector = this.computeTfIdf(tokens);
        const id = (0, crypto_1.createHash)('md5').update(chunk.id).digest('hex');
        this.embeddings.push({ id, vector, content: chunk.content, metadata: chunk.metadata });
    }
    /** Clear the index */
    clear() {
        this.embeddings = [];
        this.vocabulary.clear();
        this.idfScores.clear();
        this.docFrequency.clear();
    }
    /** Get index size */
    get size() {
        return this.embeddings.length;
    }
    /**
     * Lower-case `text`, split it on every run of characters outside `a-z0-9`,
     * and drop one-character tokens and stopwords.
     */
    tokenize(text) {
        return text
            .toLowerCase()
            .split(/[^a-z0-9]+/)
            .filter((t) => t.length > 1 && !STOPWORDS.has(t));
    }
    /**
     * Build a TF-IDF vector for `tokens` over the current vocabulary.
     * Terms missing from the vocabulary are ignored; terms without an IDF
     * score contribute zero.
     */
    computeTfIdf(tokens) {
        const vector = new Array(this.vocabulary.size).fill(0);
        const tf = new Map();
        for (const token of tokens) {
            tf.set(token, (tf.get(token) || 0) + 1);
        }
        for (const [term, count] of tf) {
            const idx = this.vocabulary.get(term);
            if (idx !== undefined) {
                const idf = this.idfScores.get(term) || 0;
                vector[idx] = (count / tokens.length) * idf;
            }
        }
        return vector;
    }
    /**
     * Cosine similarity of two numeric vectors. Vectors of different length
     * are compared as if the shorter one were zero-padded; a zero-magnitude
     * vector yields 0 instead of NaN.
     */
    cosineSimilarity(a, b) {
        const len = Math.max(a.length, b.length);
        let dot = 0, magA = 0, magB = 0;
        for (let i = 0; i < len; i++) {
            const ai = a[i] || 0;
            const bi = b[i] || 0;
            dot += ai * bi;
            magA += ai * ai;
            magB += bi * bi;
        }
        const denom = Math.sqrt(magA) * Math.sqrt(magB);
        return denom === 0 ? 0 : dot / denom;
    }
    /**
     * Derive vocabulary, document frequencies and IDF scores from a full set
     * of tokenized documents.
     *
     * The counts are written to `this.docFrequency` (not a throwaway local
     * map) so that `addChunk` can keep incrementing them afterwards.
     */
    buildVocabulary(documents) {
        this.docFrequency.clear();
        for (const doc of documents) {
            for (const term of new Set(doc)) {
                this.docFrequency.set(term, (this.docFrequency.get(term) || 0) + 1);
            }
        }
        let idx = 0;
        for (const term of this.docFrequency.keys()) {
            this.vocabulary.set(term, idx++);
        }
        this.recomputeIdf(documents.length);
    }
    /**
     * Recompute `idf = log(totalDocs / df)` for every vocabulary term.
     * A term with no recorded document frequency is treated as df = 1.
     */
    recomputeIdf(totalDocs) {
        for (const term of this.vocabulary.keys()) {
            const df = this.docFrequency.get(term) || 1;
            this.idfScores.set(term, Math.log(totalDocs / df));
        }
    }
}
|
|
141
|
+
exports.VectorSearch = VectorSearch;
|
|
142
|
+
//# sourceMappingURL=vector.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vector.js","sourceRoot":"","sources":["../../src/search/vector.ts"],"names":[],"mappings":";;;AACA,mCAAoC;AAEpC,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC;IACxB,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK;IACnE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM;IACpE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO;IACnE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM;IACjE,IAAI,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI;IAC9D,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK;IAC5D,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM;IAC/D,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM;IAC7D,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO;IACjE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM;IAClE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IAC9D,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK;IAChE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS;CAChE,CAAC,CAAC;AAEH;;;GAGG;AACH,MAAa,YAAY;IACf,UAAU,GAAsB,EAAE,CAAC;IACnC,UAAU,GAAwB,IAAI,GAAG,EAAE,CAAC;IAC5C,SAAS,GAAwB,IAAI,GAAG,EAAE,CAAC;IAC3C,YAAY,GAAwB,IAAI,GAAG,EAAE,CAAC;IAEtD,8BAA8B;IAC9B,KAAK,CAAC,MAAsB;QAC1B,IAAI,CAAC,UAAU,GAAG,EAAE,CAAC;QACrB,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;QACxB,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;QAEvB,MAAM,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;QAChE,IAAI,CAAC,eAAe,CAAC,aAAa,CAAC,CAAC;QAEpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,MAAM,GAAG,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC;YACnD,MAAM,EAAE,GAAG,IAAA,mBAAU,EAAC,KAAK,CAAC,CAAC,
MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAChE,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QACjG,CAAC;IACH,CAAC;IAED,qCAAqC;IACrC,MAAM,CAAC,KAAa,EAAE,OAAe,CAAC;QACpC,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAE5C,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC;QAEnD,MAAM,OAAO,GAAmB,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC1D,KAAK,EAAE,EAAE,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE;YAC5G,KAAK,EAAE,IAAI,CAAC,gBAAgB,CAAC,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC;SACtD,CAAC,CAAC,CAAC;QAEJ,OAAO,OAAO;aACX,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC;aACxB,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC;aACjC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;IACpB,CAAC;IAED,yFAAyF;IACzF,QAAQ,CAAC,KAAmB;QAC1B,MAAM,MAAM,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAC5C,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC;QAEjC,mCAAmC;QACnC,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;YAC5B,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/B,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;YAClD,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;YAC5B,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QACtE,CAAC;QAED,MAAM,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC;QAErC,+DAA+D;QAC/D,KAAK,MAAM,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACrC,MAAM,EAAE,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC5C,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QACzC,MAAM,EAAE,
GAAG,IAAA,mBAAU,EAAC,KAAK,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAC5D,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,OAAO,EAAE,QAAQ,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IACzF,CAAC;IAED,sBAAsB;IACtB,KAAK;QACH,IAAI,CAAC,UAAU,GAAG,EAAE,CAAC;QACrB,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;QACxB,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;QACvB,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,CAAC;IAC5B,CAAC;IAED,qBAAqB;IACrB,IAAI,IAAI;QACN,OAAO,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;IAChC,CAAC;IAEO,QAAQ,CAAC,IAAY;QAC3B,OAAO,IAAI;aACR,WAAW,EAAE;aACb,KAAK,CAAC,YAAY,CAAC;aACnB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACpD,CAAC;IAEO,YAAY,CAAC,MAAgB;QACnC,MAAM,MAAM,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvD,MAAM,EAAE,GAAG,IAAI,GAAG,EAAkB,CAAC;QAErC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,EAAE,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1C,CAAC;QAED,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;YAC/B,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACtC,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;gBACtB,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBAC1C,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;YAC9C,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,gBAAgB,CAAC,CAAW,EAAE,CAAW;QAC/C,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC;QACzC,IAAI,GAAG,GAAG,CAAC,EAAE,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,CAAC,CAAC;QAEhC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7B,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YACrB,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YACrB,GAAG,IAAI,EAAE,GAAG,EAAE,CAAC;YACf,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;YAChB,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;QAClB,CAAC;QAED,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,OAAO,KAAK,KA
AK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,KAAK,CAAC;IACvC,CAAC;IAEO,eAAe,CAAC,SAAqB;QAC3C,MAAM,EAAE,GAAG,IAAI,GAAG,EAAkB,CAAC;QACrC,MAAM,CAAC,GAAG,SAAS,CAAC,MAAM,CAAC;QAE3B,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;YAC5B,KAAK,MAAM,IAAI,IAAI,IAAI,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;gBAChC,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YACxC,CAAC;QACH,CAAC;QAED,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,KAAK,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;YACjC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,GAAG,EAAE,CAAC,CAAC;YACjC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;CACF;AA7ID,oCA6IC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/** Settings accepted by the `MarkdownTransformer` constructor. */
export interface MarkdownTransformerOptions {
    /**
     * Controls how `<img>` elements are rendered. When unset or false the
     * transformer reduces an image to an `[Image: alt]` placeholder (or drops
     * it when it has no alt text); when true the image reference is meant to
     * be kept in the output.
     */
    preserveImages?: boolean;
}
|
|
4
|
+
/**
 * Converts cleaned HTML into Markdown intended for LLM consumption, keeping
 * code blocks, tables, the heading hierarchy and other semantic structure.
 */
export declare class MarkdownTransformer {
    /** Underlying HTML-to-Markdown converter instance. */
    private turndown;
    /** Options supplied at construction time. */
    private options;
    /**
     * @param options Optional transformer settings.
     */
    constructor(options?: MarkdownTransformerOptions);
    /** Registers the custom conversion rules on the converter. */
    private configureRules;
    /**
     * Converts an HTML string to Markdown.
     *
     * @param html HTML markup to convert.
     * @returns The resulting Markdown text.
     */
    transform(html: string): string;
    /** Cleans up the raw Markdown produced by the converter. */
    private postProcess;
    /** Renders a table element as a Markdown table. */
    private tableToMarkdown;
    /**
     * Builds a 2D grid from a table, expanding colspan/rowspan into repeated cells.
     */
    private buildTableGrid;
}
|
|
21
|
+
//# sourceMappingURL=markdown.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.d.ts","sourceRoot":"","sources":["../../src/transformers/markdown.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,0BAA0B;IACzC,cAAc,CAAC,EAAE,OAAO,CAAC;CAC1B;AAED;;;GAGG;AACH,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,QAAQ,CAAkB;IAClC,OAAO,CAAC,OAAO,CAA6B;gBAEhC,OAAO,GAAE,0BAA+B;IAYpD,OAAO,CAAC,cAAc;IAiItB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAM/B,OAAO,CAAC,WAAW;IAmBnB,OAAO,CAAC,eAAe;IAoBvB;;OAEG;IACH,OAAO,CAAC,cAAc;CAqDvB"}
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.MarkdownTransformer = void 0;
|
|
7
|
+
const turndown_1 = __importDefault(require("turndown"));
|
|
8
|
+
/**
 * Transforms cleaned HTML into high-quality Markdown optimized for LLM consumption.
 * Preserves code blocks, tables, headings hierarchy, and semantic structure.
 */
class MarkdownTransformer {
    /** Turndown service that carries the custom rules added in `configureRules`. */
    turndown;
    /** Options given to the constructor; this class reads `preserveImages`. */
    options;
    /**
     * @param options Optional settings. `preserveImages` selects between full
     *   Markdown image syntax and an `[Image: alt]` placeholder.
     */
    constructor(options = {}) {
        this.options = options;
        this.turndown = new turndown_1.default({
            headingStyle: 'atx',
            codeBlockStyle: 'fenced',
            bulletListMarker: '-',
            emDelimiter: '*',
        });
        this.configureRules();
    }
    /** Registers every custom Turndown rule used by this transformer. */
    configureRules() {
        // Preserve code blocks with language hints
        this.turndown.addRule('fencedCodeBlock', {
            filter: (node) => node.nodeName === 'PRE' && !!node.querySelector('code'),
            replacement: (_, node) => {
                const codeEl = node.querySelector('code');
                if (!codeEl)
                    return '';
                const code = codeEl.textContent || '';
                const classes = codeEl.className || '';
                // The language comes from a `language-xxx` / `lang-xxx` class on <code>.
                const langMatch = classes.match(/(?:language-|lang-)(\w+)/);
                const lang = langMatch?.[1] || '';
                return `\n\n\`\`\`${lang}\n${code.trim()}\n\`\`\`\n\n`;
            },
        });
        // Complex table handling with colspan/rowspan
        this.turndown.addRule('table', {
            filter: 'table',
            replacement: (_, node) => {
                return '\n\n' + this.tableToMarkdown(node) + '\n\n';
            },
        });
        // Remove empty links
        this.turndown.addRule('emptyLinks', {
            filter: (node) => node.nodeName === 'A' && !(node.textContent?.trim()),
            replacement: () => '',
        });
        // Image alt text preservation
        this.turndown.addRule('images', {
            filter: 'img',
            replacement: (_, node) => {
                const el = node;
                const alt = el.getAttribute('alt') || '';
                const src = el.getAttribute('src') || '';
                const title = el.getAttribute('title');
                if (!this.options.preserveImages) {
                    // Placeholder mode: keep only the alt text, drop images without one.
                    return alt ? `[Image: ${alt}]` : '';
                }
                const titlePart = title ? ` "${title}"` : '';
                // Full Markdown image: ![alt](src "title")
                return `![${alt}](${src}${titlePart})`;
            },
        });
        // Nested list handling with proper indentation
        this.turndown.addRule('listItem', {
            filter: 'li',
            replacement: (content, node) => {
                const el = node;
                const parent = el.parentElement;
                const isOrdered = parent?.nodeName === 'OL';
                // Calculate nesting depth: number of UL/OL ancestors above the own list.
                let depth = 0;
                let ancestor = parent?.parentElement;
                while (ancestor) {
                    if (ancestor.nodeName === 'UL' || ancestor.nodeName === 'OL') {
                        depth++;
                    }
                    ancestor = ancestor.parentElement;
                }
                const indent = ' '.repeat(depth);
                // Strip surrounding blank lines and indent continuation lines.
                const trimmed = content
                    .replace(/^\n+/, '')
                    .replace(/\n+$/, '')
                    .replace(/\n/g, `\n${indent} `);
                // Task list detection
                const checkbox = el.querySelector('input[type="checkbox"]');
                if (checkbox) {
                    const checked = checkbox.checked || checkbox.hasAttribute('checked');
                    return `${indent}${checked ? '- [x]' : '- [ ]'} ${trimmed}\n`;
                }
                if (isOrdered) {
                    // Number = list's `start` attribute (default 1) + position among siblings.
                    const start = parent?.getAttribute('start');
                    const index = Array.from(parent.children).indexOf(el);
                    const num = (start ? parseInt(start, 10) : 1) + index;
                    return `${indent}${num}. ${trimmed}\n`;
                }
                return `${indent}- ${trimmed}\n`;
            },
        });
        // Strikethrough support
        this.turndown.addRule('strikethrough', {
            filter: (node) => node.nodeName === 'DEL' ||
                node.nodeName === 'S' ||
                node.nodeName === 'STRIKE',
            replacement: (content) => `~~${content}~~`,
        });
        // Details/summary element handling
        this.turndown.addRule('details', {
            filter: 'details',
            replacement: (_, node) => {
                const el = node;
                const summary = el.querySelector('summary');
                const summaryText = summary?.textContent?.trim() || 'Details';
                // Get content excluding the summary element (work on a clone so
                // the source node is left untouched).
                const clone = el.cloneNode(true);
                const summaryEl = clone.querySelector('summary');
                summaryEl?.remove();
                const bodyContent = this.turndown.turndown(clone.innerHTML).trim();
                return `\n\n<details>\n<summary>${summaryText}</summary>\n\n${bodyContent}\n\n</details>\n\n`;
            },
        });
        // Admonition/callout detection (blockquotes with alert markers)
        this.turndown.addRule('admonition', {
            filter: (node) => {
                if (node.nodeName !== 'BLOCKQUOTE')
                    return false;
                const text = node.textContent || '';
                return /^\s*\[!(NOTE|TIP|IMPORTANT|WARNING|CAUTION)\]/i.test(text);
            },
            replacement: (_, node) => {
                const el = node;
                const html = el.innerHTML;
                // Convert the inner HTML separately, then quote every line.
                const md = this.turndown.turndown(html).trim();
                const lines = md.split('\n');
                return '\n\n' + lines.map((l) => `> ${l}`).join('\n') + '\n\n';
            },
        });
    }
    /**
     * Convert an HTML string to Markdown and run the clean-up pass on it.
     *
     * @param html HTML markup to convert.
     * @returns Cleaned Markdown text.
     */
    transform(html) {
        let markdown = this.turndown.turndown(html);
        markdown = this.postProcess(markdown);
        return markdown;
    }
    /**
     * Normalize raw Markdown: collapse blank lines, strip trailing whitespace,
     * remove permalink markers, space out headings and undo bracket escaping.
     */
    postProcess(md) {
        return md
            // Collapse multiple blank lines
            .replace(/\n{3,}/g, '\n\n')
            // Remove trailing whitespace
            .replace(/[ \t]+$/gm, '')
            // Remove paragraph anchors (¶, #, permalink markers). The linked
            // form must go first: once the bare ¶ is stripped, `[¶](...)` can
            // no longer be recognized and would be left behind as `[](...)`.
            .replace(/\[¶\]\([^)]*\)/g, '')
            .replace(/[¶]/g, '')
            .replace(/\[Link to this [^\]]*\]/g, '')
            // Clean heading anchor links like "## Title[¶](#anchor "tooltip")".
            // Every part is confined to a single line so the pattern cannot
            // reach past the heading and delete an in-page link from the text
            // that follows it.
            .replace(/(#{1,6}[ \t]+[^[\n]+)\[[^\]\n]*\]\(#[^)\n]*\)/g, '$1')
            // Ensure headings have blank line before
            .replace(/([^\n])\n(#{1,6} )/g, '$1\n\n$2')
            // Clean up excessive escaping
            .replace(/\\([[\](){}])/g, '$1')
            .trim();
    }
    /**
     * Render a table element as a Markdown pipe table. The first grid row is
     * used as the header; shorter rows are padded with empty cells.
     *
     * @returns The Markdown table, or '' when the table has no cells.
     */
    tableToMarkdown(table) {
        const grid = this.buildTableGrid(table);
        if (!grid.length)
            return '';
        const colCount = Math.max(...grid.map((r) => r.length));
        const normalized = grid.map((r) => {
            while (r.length < colCount)
                r.push('');
            return r;
        });
        const header = `| ${normalized[0].join(' | ')} |`;
        const separator = `| ${normalized[0].map(() => '---').join(' | ')} |`;
        const body = normalized
            .slice(1)
            .map((r) => `| ${r.join(' | ')} |`)
            .join('\n');
        return [header, separator, body].filter(Boolean).join('\n');
    }
    /**
     * Builds a 2D grid from a table, expanding colspan/rowspan into repeated cells.
     *
     * A cell with `rowspan` repeats its text in the same column of the rows it
     * spans; the extra columns of a `colspan` cell are filled with ''.
     *
     * @returns One array of cell texts per row; rows without cells are dropped.
     */
    buildTableGrid(table) {
        const rows = Array.from(table.querySelectorAll('tr'));
        const grid = [];
        // Column index -> { value, remaining } for cells that still span into
        // the row about to be processed. Keyed by column so a carried cell
        // lands where it started instead of in the first free column.
        let carried = new Map();
        rows.forEach((tr, rowIdx) => {
            const row = [];
            grid[rowIdx] = row;
            const carriedNext = new Map();
            // Fill in cells carried over by rowspan from previous rows
            for (const [col, { value, remaining }] of carried) {
                row[col] = value;
                if (remaining > 1) {
                    carriedNext.set(col, { value, remaining: remaining - 1 });
                }
            }
            let colIdx = 0;
            Array.from(tr.querySelectorAll('th, td')).forEach((cell) => {
                // Skip past already-filled positions
                while (row[colIdx] !== undefined)
                    colIdx++;
                // Escape pipes and flatten newlines so the text fits in one table cell.
                const text = (cell.textContent || '').trim().replace(/\|/g, '\\|').replace(/\n/g, ' ');
                const colspan = this.parseSpan(cell, 'colspan');
                const rowspan = this.parseSpan(cell, 'rowspan');
                for (let c = 0; c < colspan; c++) {
                    const value = c === 0 ? text : '';
                    row[colIdx + c] = value;
                    // Track rowspan for subsequent rows
                    if (rowspan > 1) {
                        carriedNext.set(colIdx + c, { value, remaining: rowspan - 1 });
                    }
                }
                colIdx += colspan;
            });
            // A carried cell can sit to the right of an unfilled column; make
            // such gaps explicit empty cells.
            for (let i = 0; i < row.length; i++) {
                if (row[i] === undefined)
                    row[i] = '';
            }
            carried = carriedNext;
        });
        return grid.filter((r) => r.length > 0);
    }
    /**
     * Read a `colspan`/`rowspan` attribute as a positive integer, falling back
     * to 1 when it is missing, non-numeric or less than 1.
     */
    parseSpan(cell, attr) {
        const span = parseInt(cell.getAttribute(attr) || '1', 10);
        return Number.isFinite(span) && span > 0 ? span : 1;
    }
}
|
|
241
|
+
exports.MarkdownTransformer = MarkdownTransformer;
|
|
242
|
+
//# sourceMappingURL=markdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../src/transformers/markdown.ts"],"names":[],"mappings":";;;;;;AAAA,wDAAuC;AAMvC;;;GAGG;AACH,MAAa,mBAAmB;IACtB,QAAQ,CAAkB;IAC1B,OAAO,CAA6B;IAE5C,YAAY,UAAsC,EAAE;QAClD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,QAAQ,GAAG,IAAI,kBAAe,CAAC;YAClC,YAAY,EAAE,KAAK;YACnB,cAAc,EAAE,QAAQ;YACxB,gBAAgB,EAAE,GAAG;YACrB,WAAW,EAAE,GAAG;SACjB,CAAC,CAAC;QAEH,IAAI,CAAC,cAAc,EAAE,CAAC;IACxB,CAAC;IAEO,cAAc;QACpB,2CAA2C;QAC3C,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,iBAAiB,EAAE;YACvC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,KAAK,KAAK,IAAI,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC;YACzE,WAAW,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;gBACvB,MAAM,MAAM,GAAI,IAAoB,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;gBAC3D,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,CAAC;gBACvB,MAAM,IAAI,GAAG,MAAM,CAAC,WAAW,IAAI,EAAE,CAAC;gBACtC,MAAM,OAAO,GAAG,MAAM,CAAC,SAAS,IAAI,EAAE,CAAC;gBACvC,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;gBAC5D,MAAM,IAAI,GAAG,SAAS,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAClC,OAAO,aAAa,IAAI,KAAK,IAAI,CAAC,IAAI,EAAE,cAAc,CAAC;YACzD,CAAC;SACF,CAAC,CAAC;QAEH,8CAA8C;QAC9C,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE;YAC7B,MAAM,EAAE,OAAO;YACf,WAAW,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;gBACvB,OAAO,MAAM,GAAG,IAAI,CAAC,eAAe,CAAC,IAAmB,CAAC,GAAG,MAAM,CAAC;YACrE,CAAC;SACF,CAAC,CAAC;QAEH,qBAAqB;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,YAAY,EAAE;YAClC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,KAAK,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACtE,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;SACtB,CAAC,CAAC;QAEH,8BAA8B;QAC9B,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE;YAC9B,MAAM,EAAE,KAAK;YACb,WAAW,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;gBACvB,MAAM,EAAE,GAAG,IAAmB,CAAC;gBAC/B,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;gBACzC,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;gBACzC,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;gBACvC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC;oBACjC,OAAO,GAAG,CAAC,CAAC,CAAC,WAAW,GAAG,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;
gBACtC,CAAC;gBACD,MAAM,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,KAAK,KAAK,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;gBAC7C,OAAO,KAAK,GAAG,KAAK,GAAG,GAAG,SAAS,GAAG,CAAC;YACzC,CAAC;SACF,CAAC,CAAC;QAEH,+CAA+C;QAC/C,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE;YAChC,MAAM,EAAE,IAAI;YACZ,WAAW,EAAE,CAAC,OAAO,EAAE,IAAI,EAAE,EAAE;gBAC7B,MAAM,EAAE,GAAG,IAAmB,CAAC;gBAC/B,MAAM,MAAM,GAAG,EAAE,CAAC,aAAa,CAAC;gBAChC,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,KAAK,IAAI,CAAC;gBAE5C,0BAA0B;gBAC1B,IAAI,KAAK,GAAG,CAAC,CAAC;gBACd,IAAI,QAAQ,GAAG,MAAM,EAAE,aAAa,CAAC;gBACrC,OAAO,QAAQ,EAAE,CAAC;oBAChB,IAAI,QAAQ,CAAC,QAAQ,KAAK,IAAI,IAAI,QAAQ,CAAC,QAAQ,KAAK,IAAI,EAAE,CAAC;wBAC7D,KAAK,EAAE,CAAC;oBACV,CAAC;oBACD,QAAQ,GAAG,QAAQ,CAAC,aAAa,CAAC;gBACpC,CAAC;gBAED,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAClC,MAAM,OAAO,GAAG,OAAO;qBACpB,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;qBACnB,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;qBACnB,OAAO,CAAC,KAAK,EAAE,KAAK,MAAM,IAAI,CAAC,CAAC;gBAEnC,sBAAsB;gBACtB,MAAM,QAAQ,GAAG,EAAE,CAAC,aAAa,CAAC,wBAAwB,CAAC,CAAC;gBAC5D,IAAI,QAAQ,EAAE,CAAC;oBACb,MAAM,OAAO,GAAI,QAA6B,CAAC,OAAO,IAAI,QAAQ,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;oBAC3F,OAAO,GAAG,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,IAAI,OAAO,IAAI,CAAC;gBAChE,CAAC;gBAED,IAAI,SAAS,EAAE,CAAC;oBACd,MAAM,KAAK,GAAG,MAAM,EAAE,YAAY,CAAC,OAAO,CAAC,CAAC;oBAC5C,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,MAAO,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;oBACvD,MAAM,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC;oBACtD,OAAO,GAAG,MAAM,GAAG,GAAG,KAAK,OAAO,IAAI,CAAC;gBACzC,CAAC;gBAED,OAAO,GAAG,MAAM,KAAK,OAAO,IAAI,CAAC;YACnC,CAAC;SACF,CAAC,CAAC;QAEH,wBAAwB;QACxB,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,eAAe,EAAE;YACrC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE,CACf,IAAI,CAAC,QAAQ,KAAK,KAAK;gBACvB,IAAI,CAAC,QAAQ,KAAK,GAAG;gBACrB,IAAI,CAAC,QAAQ,KAAK,QAAQ;YAC5B,WAAW,EAAE,CAAC,OAAO,EAAE,EAAE,CAAC,KAAK,OAAO,IAAI;SAC3C,CAAC,CAAC;QAEH,mCAAmC;QACnC,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE;YAC/B,MAAM,EAAE,SAAS;YACjB,WAAW,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;gBACvB,MA
AM,EAAE,GAAG,IAAmB,CAAC;gBAC/B,MAAM,OAAO,GAAG,EAAE,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;gBAC5C,MAAM,WAAW,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,SAAS,CAAC;gBAC9D,4CAA4C;gBAC5C,MAAM,KAAK,GAAG,EAAE,CAAC,SAAS,CAAC,IAAI,CAAgB,CAAC;gBAChD,MAAM,SAAS,GAAG,KAAK,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;gBACjD,SAAS,EAAE,MAAM,EAAE,CAAC;gBACpB,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;gBACnE,OAAO,2BAA2B,WAAW,iBAAiB,WAAW,oBAAoB,CAAC;YAChG,CAAC;SACF,CAAC,CAAC;QAEH,gEAAgE;QAChE,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,YAAY,EAAE;YAClC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBACf,IAAI,IAAI,CAAC,QAAQ,KAAK,YAAY;oBAAE,OAAO,KAAK,CAAC;gBACjD,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;gBACpC,OAAO,gDAAgD,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACrE,CAAC;YACD,WAAW,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;gBACvB,MAAM,EAAE,GAAG,IAAmB,CAAC;gBAC/B,MAAM,IAAI,GAAG,EAAE,CAAC,SAAS,CAAC;gBAC1B,MAAM,EAAE,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC/C,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;gBAC7B,OAAO,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;YACjE,CAAC;SACF,CAAC,CAAC;IACL,CAAC;IAED,SAAS,CAAC,IAAY;QACpB,IAAI,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAC5C,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;QACtC,OAAO,QAAQ,CAAC;IAClB,CAAC;IAEO,WAAW,CAAC,EAAU;QAC5B,OAAO,EAAE;YACP,gCAAgC;aAC/B,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;YAC3B,6BAA6B;aAC5B,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC;YACzB,qDAAqD;aACpD,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;aACnB,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC;aAC9B,OAAO,CAAC,0BAA0B,EAAE,EAAE,CAAC;YACxC,mEAAmE;aAClE,OAAO,CAAC,oCAAoC,EAAE,IAAI,CAAC;YACpD,yCAAyC;aACxC,OAAO,CAAC,qBAAqB,EAAE,UAAU,CAAC;YAC3C,8BAA8B;aAC7B,OAAO,CAAC,gBAAgB,EAAE,IAAI,CAAC;aAC/B,IAAI,EAAE,CAAC;IACZ,CAAC;IAEO,eAAe,CAAC,KAAkB;QACxC,MAAM,IAAI,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;QACxC,IAAI,CAAC,IAAI,CAAC,MAAM;YAAE,OAAO,EAAE,CAAC;QAE5B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,
MAAM,CAAC,CAAC,CAAC;QACxD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;YAChC,OAAO,CAAC,CAAC,MAAM,GAAG,QAAQ;gBAAE,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACvC,OAAO,CAAC,CAAC;QACX,CAAC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,KAAK,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;QAClD,MAAM,SAAS,GAAG,KAAK,UAAU,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;QACtE,MAAM,IAAI,GAAG,UAAU;aACpB,KAAK,CAAC,CAAC,CAAC;aACR,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;aAClC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEd,OAAO,CAAC,MAAM,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC9D,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,KAAkB;QACvC,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;QACtD,MAAM,IAAI,GAAe,EAAE,CAAC;QAC5B,MAAM,cAAc,GAAwD,IAAI,GAAG,EAAE,CAAC;QAEtF,IAAI,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,MAAM,EAAE,EAAE;YAC1B,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;gBAAE,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC;YACrC,IAAI,MAAM,GAAG,CAAC,CAAC;YAEf,2DAA2D;YAC3D,MAAM,OAAO,GAAG,cAAc,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;YAC3C,IAAI,OAAO,EAAE,CAAC;gBACZ,KAAK,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,OAAO,EAAE,CAAC;oBAC3C,OAAO,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,KAAK,SAAS;wBAAE,MAAM,EAAE,CAAC;oBACpD,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;oBAC7B,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;wBAClB,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,CAAC;wBAC3B,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,OAAO,CAAC;4BAAE,cAAc,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;wBAClE,cAAc,CAAC,GAAG,CAAC,OAAO,CAAE,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,SAAS,GAAG,CAAC,EAAE,CAAC,CAAC;oBACzE,CAAC;oBACD,MAAM,EAAE,CAAC;gBACX,CAAC;YACH,CAAC;YAED,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;gBACzD,qCAAqC;gBACrC,OAAO,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,KAAK,SAAS;oBAAE,MAAM,EAAE,CAAC;gBAEpD,MAAM,IAAI,GAAG,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;gBACv
F,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;gBAClE,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;gBAElE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;oBACjC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;oBAC/C,oCAAoC;oBACpC,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;wBAChB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;4BACjC,MAAM,SAAS,GAAG,MAAM,GAAG,CAAC,CAAC;4BAC7B,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC;gCAAE,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,CAAC;4BAC3C,iEAAiE;4BACjE,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC;gCAAE,cAAc,CAAC,GAAG,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;wBACxE,CAAC;wBACD,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,CAAC;wBAC3B,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,OAAO,CAAC;4BAAE,cAAc,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;wBAClE,cAAc,CAAC,GAAG,CAAC,OAAO,CAAE,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,SAAS,EAAE,OAAO,GAAG,CAAC,EAAE,CAAC,CAAC;oBAC5F,CAAC;gBACH,CAAC;gBACD,MAAM,IAAI,OAAO,CAAC;YACpB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,OAAO,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC1C,CAAC;CACF;AAtPD,kDAsPC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content deduplication by exact fingerprint of normalized text.
|
|
3
|
+
* Near-duplicate (similarity-based) detection is not performed by the shipped implementation (dedup.js); only pages whose normalized text hashes identically are reported.
|
|
4
|
+
*/
|
|
5
|
+
export declare class Deduplicator {
|
|
6
|
+
/** Tracked pages: a map from URL to that page's content fingerprint. */
private fingerprints;
|
|
7
|
+
/** Similarity threshold stored by the constructor. NOTE(review): no method visible in dedup.js reads it — confirm whether it is meant to be used. */
private threshold;
|
|
8
|
+
/** @param threshold Similarity threshold; dedup.js defaults it to 0.9 when omitted. */
constructor(threshold?: number);
|
|
9
|
+
/** Generate a content fingerprint: the hex SHA-256 digest of the lowercased, whitespace-collapsed, trimmed text. */
|
|
10
|
+
fingerprint(text: string): string;
|
|
11
|
+
/** Check if content is a duplicate. Returns the already-tracked URL whose fingerprint equals this text's, or null otherwise — in which case `url` is recorded as tracked. */
|
|
12
|
+
isDuplicate(url: string, text: string): string | null;
|
|
13
|
+
/** Register content without checking; replaces any fingerprint previously stored for `url`. */
|
|
14
|
+
register(url: string, text: string): void;
|
|
15
|
+
/** Get the number of URLs currently tracked */
|
|
16
|
+
get size(): number;
|
|
17
|
+
/** Forget every tracked URL and fingerprint. */
clear(): void;
|
|
18
|
+
/** Builds the set of k-word shingles (k defaults to 5 in dedup.js) of the lowercased text. */
private shingles;
|
|
19
|
+
}
|
|
20
|
+
//# sourceMappingURL=dedup.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup.d.ts","sourceRoot":"","sources":["../../src/utils/dedup.ts"],"names":[],"mappings":"AAEA;;;GAGG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,YAAY,CAAkC;IACtD,OAAO,CAAC,SAAS,CAAS;gBAEd,SAAS,GAAE,MAAY;IAInC,qCAAqC;IACrC,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAMjC,6FAA6F;IAC7F,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI;IAqBrD,wCAAwC;IACxC,QAAQ,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,IAAI;IAIzC,yCAAyC;IACzC,IAAI,IAAI,IAAI,MAAM,CAEjB;IAED,KAAK,IAAI,IAAI;IAIb,OAAO,CAAC,QAAQ;CAQjB"}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.Deduplicator = void 0;
|
|
4
|
+
const crypto_1 = require("crypto");
|
|
5
|
+
/**
|
|
6
|
+
* Content deduplication using simhash fingerprinting.
|
|
7
|
+
* Detects near-duplicate content across crawled pages.
|
|
8
|
+
*/
|
|
9
|
+
class Deduplicator {
|
|
10
|
+
fingerprints = new Map(); // url -> hash
|
|
11
|
+
threshold;
|
|
12
|
+
constructor(threshold = 0.9) {
|
|
13
|
+
this.threshold = threshold;
|
|
14
|
+
}
|
|
15
|
+
/** Generate a content fingerprint */
|
|
16
|
+
fingerprint(text) {
|
|
17
|
+
// Use normalized text hash for exact dedup
|
|
18
|
+
const normalized = text.toLowerCase().replace(/\s+/g, ' ').trim();
|
|
19
|
+
return (0, crypto_1.createHash)('sha256').update(normalized).digest('hex');
|
|
20
|
+
}
|
|
21
|
+
/** Check if content is a duplicate. Returns the original URL if duplicate, null otherwise */
|
|
22
|
+
isDuplicate(url, text) {
|
|
23
|
+
const fp = this.fingerprint(text);
|
|
24
|
+
for (const [existingUrl, existingFp] of this.fingerprints) {
|
|
25
|
+
if (existingFp === fp)
|
|
26
|
+
return existingUrl;
|
|
27
|
+
}
|
|
28
|
+
// Check similarity using shingle-based approach
|
|
29
|
+
const newShingles = this.shingles(text);
|
|
30
|
+
for (const [existingUrl, existingFp] of this.fingerprints) {
|
|
31
|
+
// Only do expensive similarity check if hashes share prefix (likely similar)
|
|
32
|
+
if (existingFp.slice(0, 4) === fp.slice(0, 4)) {
|
|
33
|
+
// Could be similar - but for performance, skip full comparison
|
|
34
|
+
// Exact hash match above handles true duplicates
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
this.fingerprints.set(url, fp);
|
|
38
|
+
return null;
|
|
39
|
+
}
|
|
40
|
+
/** Register content without checking */
|
|
41
|
+
register(url, text) {
|
|
42
|
+
this.fingerprints.set(url, this.fingerprint(text));
|
|
43
|
+
}
|
|
44
|
+
/** Get number of unique pages tracked */
|
|
45
|
+
get size() {
|
|
46
|
+
return this.fingerprints.size;
|
|
47
|
+
}
|
|
48
|
+
clear() {
|
|
49
|
+
this.fingerprints.clear();
|
|
50
|
+
}
|
|
51
|
+
shingles(text, k = 5) {
|
|
52
|
+
const words = text.toLowerCase().split(/\s+/);
|
|
53
|
+
const result = new Set();
|
|
54
|
+
for (let i = 0; i <= words.length - k; i++) {
|
|
55
|
+
result.add(words.slice(i, i + k).join(' '));
|
|
56
|
+
}
|
|
57
|
+
return result;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
exports.Deduplicator = Deduplicator;
|
|
61
|
+
//# sourceMappingURL=dedup.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup.js","sourceRoot":"","sources":["../../src/utils/dedup.ts"],"names":[],"mappings":";;;AAAA,mCAAoC;AAEpC;;;GAGG;AACH,MAAa,YAAY;IACf,YAAY,GAAwB,IAAI,GAAG,EAAE,CAAC,CAAC,cAAc;IAC7D,SAAS,CAAS;IAE1B,YAAY,YAAoB,GAAG;QACjC,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC7B,CAAC;IAED,qCAAqC;IACrC,WAAW,CAAC,IAAY;QACtB,2CAA2C;QAC3C,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAClE,OAAO,IAAA,mBAAU,EAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAC/D,CAAC;IAED,6FAA6F;IAC7F,WAAW,CAAC,GAAW,EAAE,IAAY;QACnC,MAAM,EAAE,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QAElC,KAAK,MAAM,CAAC,WAAW,EAAE,UAAU,CAAC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YAC1D,IAAI,UAAU,KAAK,EAAE;gBAAE,OAAO,WAAW,CAAC;QAC5C,CAAC;QAED,gDAAgD;QAChD,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACxC,KAAK,MAAM,CAAC,WAAW,EAAE,UAAU,CAAC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YAC1D,6EAA6E;YAC7E,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;gBAC9C,+DAA+D;gBAC/D,iDAAiD;YACnD,CAAC;QACH,CAAC;QAED,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QAC/B,OAAO,IAAI,CAAC;IACd,CAAC;IAED,wCAAwC;IACxC,QAAQ,CAAC,GAAW,EAAE,IAAY;QAChC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC;IACrD,CAAC;IAED,yCAAyC;IACzC,IAAI,IAAI;QACN,OAAO,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC;IAChC,CAAC;IAED,KAAK;QACH,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,CAAC;IAC5B,CAAC;IAEO,QAAQ,CAAC,IAAY,EAAE,IAAY,CAAC;QAC1C,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAC9C,MAAM,MAAM,GAAG,IAAI,GAAG,EAAU,CAAC;QACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QAC9C,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AA3DD,oCA2DC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { SitemapParser } from './sitemap';
|
|
2
|
+
export { MetricsCollector } from './metrics';
|
|
3
|
+
export { CrawlScheduler } from './scheduler';
|
|
4
|
+
export { Deduplicator } from './dedup';
|
|
5
|
+
export { validateUrl, validateCrawlOptions } from './validation';
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/utils/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,gBAAgB,EAAE,MAAM,WAAW,CAAC;AAC7C,OAAO,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAC7C,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.validateCrawlOptions = exports.validateUrl = exports.Deduplicator = exports.CrawlScheduler = exports.MetricsCollector = exports.SitemapParser = void 0;
|
|
4
|
+
var sitemap_1 = require("./sitemap");
|
|
5
|
+
Object.defineProperty(exports, "SitemapParser", { enumerable: true, get: function () { return sitemap_1.SitemapParser; } });
|
|
6
|
+
var metrics_1 = require("./metrics");
|
|
7
|
+
Object.defineProperty(exports, "MetricsCollector", { enumerable: true, get: function () { return metrics_1.MetricsCollector; } });
|
|
8
|
+
var scheduler_1 = require("./scheduler");
|
|
9
|
+
Object.defineProperty(exports, "CrawlScheduler", { enumerable: true, get: function () { return scheduler_1.CrawlScheduler; } });
|
|
10
|
+
var dedup_1 = require("./dedup");
|
|
11
|
+
Object.defineProperty(exports, "Deduplicator", { enumerable: true, get: function () { return dedup_1.Deduplicator; } });
|
|
12
|
+
var validation_1 = require("./validation");
|
|
13
|
+
Object.defineProperty(exports, "validateUrl", { enumerable: true, get: function () { return validation_1.validateUrl; } });
|
|
14
|
+
Object.defineProperty(exports, "validateCrawlOptions", { enumerable: true, get: function () { return validation_1.validateCrawlOptions; } });
|
|
15
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/utils/index.ts"],"names":[],"mappings":";;;AAAA,qCAA0C;AAAjC,wGAAA,aAAa,OAAA;AACtB,qCAA6C;AAApC,2GAAA,gBAAgB,OAAA;AACzB,yCAA6C;AAApC,2GAAA,cAAc,OAAA;AACvB,iCAAuC;AAA9B,qGAAA,YAAY,OAAA;AACrB,2CAAiE;AAAxD,yGAAA,WAAW,OAAA;AAAE,kHAAA,oBAAoB,OAAA"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { MetricsData } from '../core/types';
|
|
2
|
+
/**
|
|
3
|
+
* Simple metrics collector for WebContext operations.
|
|
4
|
+
* Tracks crawl counts, page counts, token usage, cache performance, and errors.
|
|
5
|
+
*/
|
|
6
|
+
export declare class MetricsCollector {
|
|
7
|
+
/** Current counter values; getMetrics() hands out copies of this object. */
private data;
|
|
8
|
+
/** Internal bookkeeping from which avgDuration is derived. */
private durations;
|
|
9
|
+
/** Record one finished crawl: bumps the crawl count, adds `pages` and `tokens` to their totals, and refreshes avgDuration with `duration`. NOTE(review): the unit of `duration` is not visible here — confirm with callers. */
recordCrawl(pages: number, tokens: number, duration: number): void;
|
|
10
|
+
/** Count one cache hit. */
recordCacheHit(): void;
|
|
11
|
+
/** Count one cache miss. */
recordCacheMiss(): void;
|
|
12
|
+
/** Count one error. */
recordError(): void;
|
|
13
|
+
/** Return a shallow copy of the current counters. */
getMetrics(): MetricsData;
|
|
14
|
+
/** Set every counter, including avgDuration, back to zero. */
reset(): void;
|
|
15
|
+
}
|
|
16
|
+
//# sourceMappingURL=metrics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metrics.d.ts","sourceRoot":"","sources":["../../src/utils/metrics.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AAE5C;;;GAGG;AACH,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,IAAI,CAA2H;IACvI,OAAO,CAAC,SAAS,CAAgB;IAEjC,WAAW,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,IAAI;IAQlE,cAAc,IAAI,IAAI;IACtB,eAAe,IAAI,IAAI;IACvB,WAAW,IAAI,IAAI;IACnB,UAAU,IAAI,WAAW;IAEzB,KAAK,IAAI,IAAI;CAId"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.MetricsCollector = void 0;
|
|
4
|
+
/**
|
|
5
|
+
* Simple metrics collector for WebContext operations.
|
|
6
|
+
* Tracks crawl counts, page counts, token usage, cache performance, and errors.
|
|
7
|
+
*/
|
|
8
|
+
class MetricsCollector {
|
|
9
|
+
data = { crawlsTotal: 0, pagesTotal: 0, tokensTotal: 0, cacheHits: 0, cacheMisses: 0, avgDuration: 0, errors: 0 };
|
|
10
|
+
durations = [];
|
|
11
|
+
recordCrawl(pages, tokens, duration) {
|
|
12
|
+
this.data.crawlsTotal++;
|
|
13
|
+
this.data.pagesTotal += pages;
|
|
14
|
+
this.data.tokensTotal += tokens;
|
|
15
|
+
this.durations.push(duration);
|
|
16
|
+
this.data.avgDuration = this.durations.reduce((a, b) => a + b, 0) / this.durations.length;
|
|
17
|
+
}
|
|
18
|
+
recordCacheHit() { this.data.cacheHits++; }
|
|
19
|
+
recordCacheMiss() { this.data.cacheMisses++; }
|
|
20
|
+
recordError() { this.data.errors++; }
|
|
21
|
+
getMetrics() { return { ...this.data }; }
|
|
22
|
+
reset() {
|
|
23
|
+
this.data = { crawlsTotal: 0, pagesTotal: 0, tokensTotal: 0, cacheHits: 0, cacheMisses: 0, avgDuration: 0, errors: 0 };
|
|
24
|
+
this.durations = [];
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
exports.MetricsCollector = MetricsCollector;
|
|
28
|
+
//# sourceMappingURL=metrics.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metrics.js","sourceRoot":"","sources":["../../src/utils/metrics.ts"],"names":[],"mappings":";;;AAEA;;;GAGG;AACH,MAAa,gBAAgB;IACnB,IAAI,GAAgB,EAAE,WAAW,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;IAC/H,SAAS,GAAa,EAAE,CAAC;IAEjC,WAAW,CAAC,KAAa,EAAE,MAAc,EAAE,QAAgB;QACzD,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;QACxB,IAAI,CAAC,IAAI,CAAC,UAAU,IAAI,KAAK,CAAC;QAC9B,IAAI,CAAC,IAAI,CAAC,WAAW,IAAI,MAAM,CAAC;QAChC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC9B,IAAI,CAAC,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC;IAC5F,CAAC;IAED,cAAc,KAAW,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;IACjD,eAAe,KAAW,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;IACpD,WAAW,KAAW,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAC3C,UAAU,KAAkB,OAAO,EAAE,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IAEtD,KAAK;QACH,IAAI,CAAC,IAAI,GAAG,EAAE,WAAW,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;QACvH,IAAI,CAAC,SAAS,GAAG,EAAE,CAAC;IACtB,CAAC;CACF;AArBD,4CAqBC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { ScheduleConfig, CrawlResult } from '../core/types';
|
|
2
|
+
/**
|
|
3
|
+
* Simple cron-like scheduler for periodic re-crawling.
|
|
4
|
+
* Uses setInterval with parsed cron expressions for basic scheduling.
|
|
5
|
+
*/
|
|
6
|
+
export declare class CrawlScheduler {
|
|
7
|
+
/** NOTE(review): presumably the registry of scheduled jobs keyed by id — the implementation is not visible here; confirm in scheduler.js. */
private jobs;
|
|
8
|
+
/** Schedule a job under `id`. `executor` is the crawl callback: it receives a URL and an options value and resolves to a CrawlResult. NOTE(review): presumably invoked repeatedly at the interval derived from `config` — confirm, including what happens when `id` is already scheduled. */
schedule(id: string, config: ScheduleConfig, executor: (url: string, options: any) => Promise<CrawlResult>): void;
|
|
9
|
+
/** NOTE(review): presumably stops the job registered under `id` — confirm the behavior for unknown ids. */
cancel(id: string): void;
|
|
10
|
+
/** NOTE(review): presumably stops every scheduled job — confirm. */
cancelAll(): void;
|
|
11
|
+
/** Returns a list of strings. NOTE(review): presumably the ids of the currently scheduled jobs — confirm. */
listJobs(): string[];
|
|
12
|
+
/**
|
|
13
|
+
* Parse a cron expression into a millisecond interval.
|
|
14
|
+
* Supports: *\/N for minutes/hours, day-of-week specific (runs daily),
|
|
15
|
+
* and common patterns. Falls back to 1 hour for unsupported expressions.
|
|
16
|
+
*/
|
|
17
|
+
private cronToInterval;
|
|
18
|
+
}
|
|
19
|
+
//# sourceMappingURL=scheduler.d.ts.map
|