raguard 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +154 -0
- package/dist/index.d.mts +123 -0
- package/dist/index.d.ts +123 -0
- package/dist/index.js +896 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +890 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +58 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,896 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// src/utils/hashing.ts
|
|
4
|
+
/**
 * Build a 12-character lowercase hexadecimal identifier from 6 random bytes.
 *
 * The bytes come from `globalThis.crypto.getRandomValues` when that API is
 * present; otherwise each byte is drawn from `Math.random()`.
 *
 * @returns {string} 12 hex characters.
 */
function randomId() {
  const buffer = new Uint8Array(6);
  const hasWebCrypto = typeof globalThis.crypto !== "undefined" && globalThis.crypto.getRandomValues;
  if (hasWebCrypto) {
    globalThis.crypto.getRandomValues(buffer);
  } else {
    let position = 0;
    while (position < buffer.length) {
      buffer[position] = Math.floor(Math.random() * 256);
      position += 1;
    }
  }
  let hex = "";
  for (const byte of Array.from(buffer)) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
|
|
15
|
+
/**
 * Collect the distinct word n-grams of a text.
 *
 * The text is lower-cased, every character that is not a letter, combining
 * mark, digit, underscore or whitespace is removed, and the remainder is split
 * on whitespace. Empty tokens (produced by leading or trailing whitespace) are
 * discarded so they never become part of an n-gram.
 *
 * @param {string} text - Text to analyse.
 * @param {number} [n=3] - Number of words per n-gram.
 * @returns {Set<string>} Distinct n-grams, each a space-joined run of `n` words.
 *   Empty when the text has fewer than `n` words or `n` is less than 1.
 */
function extractNgrams(text, n = 3) {
  const ngrams = /* @__PURE__ */ new Set();
  // A non-positive window has no meaningful n-grams.
  if (!(n >= 1)) return ngrams;
  const words = text
    .toLowerCase()
    // Unicode-aware so accented and non-Latin letters are kept.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, "")
    .split(/\s+/)
    .filter((word) => word.length > 0);
  for (let i = 0; i <= words.length - n; i++) {
    ngrams.add(words.slice(i, i + n).join(" "));
  }
  return ngrams;
}
|
|
23
|
+
/**
 * Jaccard similarity between the word n-gram sets of two texts.
 *
 * @param {string} text1 - First text.
 * @param {string} text2 - Second text.
 * @param {number} [n=3] - Words per n-gram, forwarded to `extractNgrams`.
 * @returns {number} |intersection| / |union| of the two n-gram sets, or 0 when
 *   neither text yields any n-gram.
 */
function ngramOverlap(text1, text2, n = 3) {
  const left = extractNgrams(text1, n);
  const right = extractNgrams(text2, n);
  if (left.size === 0 && right.size === 0) {
    return 0;
  }
  const shared = [...left].filter((gram) => right.has(gram)).length;
  const unionSize = left.size + right.size - shared;
  if (unionSize === 0) {
    return 0;
  }
  return shared / unionSize;
}
|
|
34
|
+
/**
 * Extract the lower-cased host name from the first http(s) URL found in a string.
 *
 * The authority component ends at the first `/`, `\`, `?`, `#` or whitespace
 * character. Userinfo (`user:pass@`), a trailing port and a leading `www.` are
 * removed so the result can be compared against plain domain names. Without
 * this, a URL such as `https://evil.com?x=.nist.gov` would yield a "host" that
 * ends in `nist.gov`.
 *
 * @param {string} url - Source string expected to contain an http(s) URL.
 * @returns {string|null} The host name, or `null` when the input is not a
 *   string or contains no http(s) URL.
 */
function extractDomain(url) {
  if (typeof url !== "string") return null;
  const match = /https?:\/\/([^/\\?#\s]+)/i.exec(url);
  if (!match) return null;
  let host = match[1];
  // Everything up to the last "@" is userinfo, not part of the host.
  const atIndex = host.lastIndexOf("@");
  if (atIndex !== -1) host = host.slice(atIndex + 1);
  host = host
    .replace(/:\d*$/, "")
    .toLowerCase()
    .replace(/^www\./, "");
  return host.length > 0 ? host : null;
}
|
|
42
|
+
// Phrases treated as promotional language (all case-insensitive).
var PROMOTIONAL_PATTERNS = [
  /\bbest\s+ever\b/i,
  /\b100%\s+(?:safe|secure|guaranteed)\b/i,
  /\bact\s+now\b/i,
  /\blimited\s+time\b/i,
  /\bfree\s+(?:download|trial|offer)\b/i,
  /\bclick\s+here\b/i
];
// Minimum number of duplicated sentences for a text to count as repetitive.
var REPETITION_THRESHOLD = 3;
/**
 * Run three heuristic checks on a text and combine them into a score.
 *
 * - promotional: any entry of PROMOTIONAL_PATTERNS matches (+0.3)
 * - repetitive: among sentences longer than 10 characters (after trimming), at
 *   least REPETITION_THRESHOLD are case-insensitive duplicates (+0.4)
 * - unicodeAnomalies: the text contains a character in U+200B-U+200F,
 *   U+2028-U+202F or U+FEFF (+0.3)
 *
 * @param {string} text - Text to inspect.
 * @returns {{promotional: boolean, repetitive: boolean, unicodeAnomalies: boolean, score: number}}
 *   The three flags and their summed weight, capped at 1.
 */
function detectSuspiciousPatterns(text) {
  let promotional = false;
  for (const pattern of PROMOTIONAL_PATTERNS) {
    if (pattern.test(text)) {
      promotional = true;
      break;
    }
  }

  const normalizedSentences = [];
  for (const fragment of text.split(/[.!?]+/)) {
    const trimmed = fragment.trim();
    if (trimmed.length > 10) {
      normalizedSentences.push(trimmed.toLowerCase());
    }
  }
  const duplicateCount = normalizedSentences.length - new Set(normalizedSentences).size;
  const repetitive = normalizedSentences.length > 0 && duplicateCount >= REPETITION_THRESHOLD;

  const unicodeAnomalies = /[\u200B-\u200F\u2028-\u202F\uFEFF]/.test(text);

  // Weights are added in a fixed order so the floating-point sum is stable.
  const weighted = [
    [promotional, 0.3],
    [repetitive, 0.4],
    [unicodeAnomalies, 0.3]
  ];
  let score = 0;
  for (const [active, weight] of weighted) {
    if (active) score += weight;
  }
  return { promotional, repetitive, unicodeAnomalies, score: Math.min(score, 1) };
}
|
|
63
|
+
|
|
64
|
+
// src/models.ts
|
|
65
|
+
// Action recommended for a scan result; values are the serialized string forms.
var Recommendation = {
  PROCEED: "proceed",
  CITE_WITH_WARNING: "cite_with_warning",
  BLOCK: "block"
};
|
|
71
|
+
/**
 * A piece of retrieved content together with its metadata and identifier.
 */
var Document = class {
  content;
  metadata;
  doc_id;
  /**
   * @param {string|{content: string, metadata?: object, doc_id?: string}} input
   *   Either the raw content string, or an object carrying `content` and
   *   optionally `metadata` and `doc_id`. Missing metadata defaults to `{}`
   *   and a missing id is generated with `randomId()`.
   */
  constructor(input) {
    // A bare string is treated as an object that only carries `content`.
    const fields = typeof input === "string" ? { content: input } : input;
    this.content = fields.content;
    this.metadata = fields.metadata ?? {};
    this.doc_id = fields.doc_id ?? randomId();
  }
};
|
|
87
|
+
|
|
88
|
+
// src/config.ts
|
|
89
|
+
// Built-in configuration used for every option the caller does not override.
var DEFAULT_CONFIG = {
  // Overall risk at or above this value yields a "block" recommendation.
  riskThreshold: 0.7,
  // Overall risk at or above this value (but below riskThreshold) yields a warning.
  warningThreshold: 0.4,
  enabledDetectors: [
    "consensus_clustering",
    "semantic_anomaly",
    "source_reputation"
  ],
  consensus: {
    similarityThreshold: 0.75,
    minClusterSize: 2,
    weightSimilarity: 0.4,
    weightSourceDiversity: 0.3,
    weightTemporal: 0.2,
    weightLexical: 0.1
  },
  anomaly: {
    contamination: 0.1,
    minRelevanceScore: 0.3
  },
  reputation: {
    unknownDomainScore: 0.4,
    metadataWeight: 0.2,
    contentQualityWeight: 0.15
  }
};
/**
 * Combine caller overrides with DEFAULT_CONFIG.
 *
 * Top-level scalars fall back to the default when the override is `null` or
 * `undefined`; the `consensus`, `anomaly` and `reputation` sections are merged
 * key by key. The result never shares a nested object or the default
 * `enabledDetectors` array with DEFAULT_CONFIG, so mutating a returned
 * configuration cannot change the defaults seen by later callers.
 *
 * @param {object} [overrides] - Partial configuration; any falsy value means
 *   "use the defaults".
 * @returns {object} A complete configuration object.
 */
function mergeConfig(overrides) {
  // Route the "no overrides" case through the same copying path as real overrides.
  const source = overrides || {};
  return {
    riskThreshold: source.riskThreshold ?? DEFAULT_CONFIG.riskThreshold,
    warningThreshold: source.warningThreshold ?? DEFAULT_CONFIG.warningThreshold,
    enabledDetectors: source.enabledDetectors ?? [...DEFAULT_CONFIG.enabledDetectors],
    consensus: { ...DEFAULT_CONFIG.consensus, ...source.consensus },
    anomaly: { ...DEFAULT_CONFIG.anomaly, ...source.anomaly },
    reputation: { ...DEFAULT_CONFIG.reputation, ...source.reputation }
  };
}
|
|
126
|
+
|
|
127
|
+
// src/detectors/base.ts
|
|
128
|
+
/**
 * Common ancestor of the detector classes in this file.
 * It declares no fields or methods of its own; subclasses provide `name`
 * and `detect`.
 */
var BaseDetector = class {
};
|
|
130
|
+
|
|
131
|
+
// src/utils/math.ts
|
|
132
|
+
/**
 * Dot product of two numeric vectors.
 *
 * Iterates over the indices of `a`; `b` is read at the same indices.
 *
 * @param {ArrayLike<number>} a - First vector (determines the iteration length).
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} Sum of the element-wise products.
 */
function dot(a, b) {
  let accumulator = 0;
  let position = 0;
  while (position < a.length) {
    accumulator += a[position] * b[position];
    position += 1;
  }
  return accumulator;
}
|
|
139
|
+
/**
 * Euclidean (L2) length of a numeric vector.
 *
 * @param {ArrayLike<number>} a - Vector to measure.
 * @returns {number} Square root of the sum of squared components.
 */
function norm(a) {
  let sumOfSquares = 0;
  let position = 0;
  while (position < a.length) {
    const component = a[position];
    sumOfSquares += component * component;
    position += 1;
  }
  return Math.sqrt(sumOfSquares);
}
|
|
146
|
+
/**
 * Cosine similarity of two vectors.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has zero length.
 */
function cosineSimilarity(a, b) {
  const numerator = dot(a, b);
  const magnitudeA = norm(a);
  const magnitudeB = norm(b);
  const degenerate = magnitudeA === 0 || magnitudeB === 0;
  return degenerate ? 0 : numerator / (magnitudeA * magnitudeB);
}
|
|
153
|
+
/**
 * Pairwise cosine similarity of a list of vectors.
 *
 * Norms are computed once per vector. The diagonal is always 1; any pair
 * involving a zero-length vector gets similarity 0.
 *
 * @param {Array<ArrayLike<number>>} vectors - Vectors to compare.
 * @returns {number[][]} Symmetric n x n similarity matrix.
 */
function cosineSimilarityMatrix(vectors) {
  const size = vectors.length;
  const lengths = vectors.map((vector) => norm(vector));
  const result = [];
  for (let row = 0; row < size; row++) {
    result.push(new Array(size).fill(0));
  }
  for (let row = 0; row < size; row++) {
    result[row][row] = 1;
    // Only the upper triangle is computed; each value is mirrored below.
    for (let col = row + 1; col < size; col++) {
      let similarity = 0;
      if (lengths[row] !== 0 && lengths[col] !== 0) {
        similarity = dot(vectors[row], vectors[col]) / (lengths[row] * lengths[col]);
      }
      result[row][col] = similarity;
      result[col][row] = similarity;
    }
  }
  return result;
}
|
|
170
|
+
/**
 * Arithmetic mean of an array of numbers.
 *
 * @param {number[]} arr - Values to average.
 * @returns {number} The mean, or 0 for an empty array.
 */
function mean(arr) {
  const count = arr.length;
  if (count === 0) {
    return 0;
  }
  const total = arr.reduce((running, value) => running + value, 0);
  return total / count;
}
|
|
174
|
+
/**
 * Population standard deviation (divides by n, not n - 1).
 *
 * @param {number[]} arr - Values to measure.
 * @returns {number} The standard deviation, or 0 for an empty array.
 */
function std(arr) {
  const count = arr.length;
  if (count === 0) {
    return 0;
  }
  const center = mean(arr);
  const squaredDeviation = arr.reduce((running, value) => running + (value - center) ** 2, 0);
  return Math.sqrt(squaredDeviation / count);
}
|
|
180
|
+
/**
 * Restrict a value to the inclusive range [min, max].
 *
 * @param {number} value - Value to restrict.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} `value` limited to the range.
 */
function clamp(value, min, max) {
  const cappedAbove = Math.min(max, value);
  return Math.max(min, cappedAbove);
}
|
|
183
|
+
|
|
184
|
+
// src/utils/embeddings.ts
|
|
185
|
+
/**
 * Split a text into lower-cased word tokens.
 *
 * Every character that is not a letter, combining mark, digit, underscore or
 * whitespace becomes a separator. The character classes are Unicode-aware, so
 * accented and non-Latin words stay intact instead of being split or removed.
 * Tokens of length 1 are discarded.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance.
 */
function tokenize(text) {
  return text
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, " ")
    .split(/\s+/)
    .filter((word) => word.length > 1);
}
|
|
188
|
+
/**
 * Relative frequency of each token in a token list.
 *
 * @param {string[]} tokens - Tokens of one document.
 * @returns {Map<string, number>} Token -> occurrences / total token count,
 *   keyed in order of first appearance. Empty for an empty list.
 */
function termFrequency(tokens) {
  const counts = /* @__PURE__ */ new Map();
  for (const token of tokens) {
    const seen = counts.get(token) ?? 0;
    counts.set(token, seen + 1);
  }
  const total = tokens.length;
  if (!(total > 0)) {
    return counts;
  }
  const frequencies = /* @__PURE__ */ new Map();
  for (const [token, occurrences] of counts) {
    frequencies.set(token, occurrences / total);
  }
  return frequencies;
}
|
|
201
|
+
/**
 * Build the vocabulary and smoothed inverse-document-frequency table for a
 * collection of tokenized documents.
 *
 * Every term that occurs in at least one document is part of the vocabulary,
 * including terms that occur in all of them: with the smoothed formula
 * `ln((n + 1) / (df + 1)) + 1` such terms receive weight 1 rather than 0.
 * Dropping them would leave identical documents, or a single text, with an
 * empty vocabulary and therefore all-zero vectors.
 *
 * @param {string[][]} tokenizedDocs - One token list per document.
 * @returns {{vocab: string[], idf: Map<string, number>}} Sorted vocabulary and
 *   the IDF weight of each term.
 */
function buildIdf(tokenizedDocs) {
  const docFreq = /* @__PURE__ */ new Map();
  const n = tokenizedDocs.length;
  for (const tokens of tokenizedDocs) {
    // Count each term once per document.
    for (const token of new Set(tokens)) {
      docFreq.set(token, (docFreq.get(token) ?? 0) + 1);
    }
  }
  const idf = /* @__PURE__ */ new Map();
  const vocab = [];
  for (const [term, df] of docFreq) {
    idf.set(term, Math.log((n + 1) / (df + 1)) + 1);
    vocab.push(term);
  }
  vocab.sort();
  return { vocab, idf };
}
|
|
221
|
+
/**
 * Encode texts as L2-normalized TF-IDF vectors over a shared vocabulary.
 *
 * The vocabulary and IDF weights are derived from the given texts themselves,
 * so vectors from separate calls are not comparable with each other.
 *
 * @param {string[]} texts - Texts to encode.
 * @returns {number[][]} One vector per text. When the vocabulary is empty each
 *   vector is `[0]`; a text with no weighted terms yields an all-zero vector.
 */
function encodeTexts(texts) {
  const tokenLists = texts.map((text) => tokenize(text));
  const { vocab, idf } = buildIdf(tokenLists);
  if (vocab.length === 0) {
    return texts.map(() => [0]);
  }
  return tokenLists.map((tokens) => {
    const tf = termFrequency(tokens);
    const weights = vocab.map((term) => (tf.get(term) ?? 0) * (idf.get(term) ?? 0));
    const magnitude = Math.sqrt(weights.reduce((running, weight) => running + weight * weight, 0));
    // Leave all-zero vectors untouched to avoid dividing by zero.
    return magnitude > 0 ? weights.map((weight) => weight / magnitude) : weights;
  });
}
|
|
249
|
+
|
|
250
|
+
// src/detectors/consensus.ts
|
|
251
|
+
/**
 * Average-linkage agglomerative clustering over a precomputed distance matrix.
 *
 * Starting from singletons, the pair of clusters with the smallest mean
 * pairwise distance is merged repeatedly. Merging stops when that smallest
 * distance exceeds `distanceThreshold`, when no pair can be selected, or when
 * a single cluster remains. Ties go to the first pair encountered.
 *
 * @param {number[][]} distanceMatrix - Square matrix of pairwise distances.
 * @param {number} distanceThreshold - Largest mean distance still merged.
 * @returns {number[]} Cluster label per item; labels are numbered from 0 in
 *   order of first appearance.
 */
function agglomerativeClustering(distanceMatrix, distanceThreshold) {
  const itemCount = distanceMatrix.length;
  // Each group lists its member indices; group order mirrors creation order.
  const groups = Array.from({ length: itemCount }, (_, index) => [index]);

  while (groups.length > 1) {
    let bestDistance = Infinity;
    let keep = -1;
    let absorb = -1;
    for (let p = 0; p < groups.length; p++) {
      for (let q = p + 1; q < groups.length; q++) {
        let sum = 0;
        let pairs = 0;
        for (const a of groups[p]) {
          for (const b of groups[q]) {
            sum += distanceMatrix[a][b];
            pairs++;
          }
        }
        const linkage = sum / pairs;
        if (linkage < bestDistance) {
          bestDistance = linkage;
          keep = p;
          absorb = q;
        }
      }
    }
    if (bestDistance > distanceThreshold || keep === -1) break;
    groups[keep].push(...groups[absorb]);
    groups.splice(absorb, 1);
  }

  // Map every item to the group that finally owns it.
  const ownerOf = new Array(itemCount);
  for (const group of groups) {
    for (const member of group) {
      ownerOf[member] = group;
    }
  }
  const labelOfGroup = /* @__PURE__ */ new Map();
  const labels = new Array(itemCount);
  for (let item = 0; item < itemCount; item++) {
    const owner = ownerOf[item];
    if (!labelOfGroup.has(owner)) {
      labelOfGroup.set(owner, labelOfGroup.size);
    }
    labels[item] = labelOfGroup.get(owner);
  }
  return labels;
}
|
|
314
|
+
/**
 * Detector that clusters documents by embedding similarity and scores every
 * sufficiently large cluster.
 *
 * A cluster's risk is the weighted sum of its mean internal similarity, its
 * lack of source diversity, how tightly its dates are grouped, and its mean
 * word n-gram overlap.
 */
var ConsensusClusteringDetector = class extends BaseDetector {
  name = "consensus_clustering";
  similarityThreshold;
  minClusterSize;
  wSim;
  wSrc;
  wTemp;
  wLex;
  /**
   * @param {object} config - The `consensus` section of the configuration.
   */
  constructor(config) {
    super();
    const {
      similarityThreshold,
      minClusterSize,
      weightSimilarity,
      weightSourceDiversity,
      weightTemporal,
      weightLexical
    } = config;
    this.similarityThreshold = similarityThreshold;
    this.minClusterSize = minClusterSize;
    this.wSim = weightSimilarity;
    this.wSrc = weightSourceDiversity;
    this.wTemp = weightTemporal;
    this.wLex = weightLexical;
  }
  /**
   * @param {Array} documents - Documents exposing `content` and `metadata`.
   * @param {object} [options] - May carry precomputed `embeddings`, one per document.
   * @returns {object} Detector result: `detectorName`, `riskScore` (highest
   *   cluster risk, capped at 1), sorted `flaggedIndices` and `details`.
   */
  detect(documents, options) {
    const docCount = documents.length;
    if (docCount < this.minClusterSize) {
      return {
        detectorName: this.name,
        riskScore: 0,
        flaggedIndices: [],
        details: { reason: "too_few_documents" }
      };
    }

    const embeddings = options?.embeddings ?? encodeTexts(documents.map((doc) => doc.content));
    const simMatrix = cosineSimilarityMatrix(embeddings);
    // Distance = 1 - similarity, limited to [0, 2]; the diagonal is exactly 0.
    const distMatrix = Array.from(
      { length: docCount },
      (_, row) => Array.from(
        { length: docCount },
        (__, col) => row === col ? 0 : Math.max(0, Math.min(2, 1 - simMatrix[row][col]))
      )
    );
    const labels = agglomerativeClustering(distMatrix, 1 - this.similarityThreshold);

    // Group document indices by cluster label, in order of first appearance.
    const membersByLabel = /* @__PURE__ */ new Map();
    labels.forEach((label, idx) => {
      const bucket = membersByLabel.get(label);
      if (bucket) {
        bucket.push(idx);
      } else {
        membersByLabel.set(label, [idx]);
      }
    });
    const suspicious = [...membersByLabel].filter(
      ([, members]) => members.length >= this.minClusterSize
    );
    if (suspicious.length === 0) {
      return {
        detectorName: this.name,
        riskScore: 0,
        flaggedIndices: [],
        details: { clusters_found: 0 }
      };
    }

    const round4 = (value) => Math.round(value * 1e4) / 1e4;
    const allFlagged = /* @__PURE__ */ new Set();
    const clusterScores = suspicious.map(([clusterId, members]) => {
      for (const member of members) allFlagged.add(member);
      const simUniformity = this.intraClusterSimilarity(simMatrix, members);
      const sourceDiversity = this.sourceDiversity(documents, members);
      const temporalScore = this.temporalClustering(documents, members);
      const lexicalScore = this.lexicalOverlap(documents, members);
      const clusterRisk = this.wSim * simUniformity + this.wSrc * (1 - sourceDiversity) + this.wTemp * temporalScore + this.wLex * lexicalScore;
      return {
        cluster_id: clusterId,
        size: members.length,
        members,
        risk_score: round4(clusterRisk),
        similarity_uniformity: round4(simUniformity),
        source_diversity: round4(sourceDiversity),
        temporal_clustering: round4(temporalScore),
        lexical_overlap: round4(lexicalScore)
      };
    });

    const overallRisk = Math.max(...clusterScores.map((entry) => entry.risk_score));
    return {
      detectorName: this.name,
      riskScore: Math.min(overallRisk, 1),
      flaggedIndices: Array.from(allFlagged).sort((a, b) => a - b),
      details: {
        clusters_found: suspicious.length,
        clusters: clusterScores
      }
    };
  }
  /**
   * Mean pairwise similarity among the given members (0 for fewer than two).
   */
  intraClusterSimilarity(simMatrix, members) {
    if (members.length < 2) return 0;
    const sims = members.flatMap(
      (first, pos) => members.slice(pos + 1).map((second) => simMatrix[first][second])
    );
    return mean(sims);
  }
  /**
   * Number of distinct source domains divided by the cluster size.
   * Members with no `metadata.source`, or a source from which no domain can
   * be extracted, all count as the single source "__unknown__".
   */
  sourceDiversity(documents, members) {
    if (members.length === 0) return 1;
    const distinct = new Set(
      members.map((idx) => {
        const source = documents[idx].metadata.source;
        return source ? extractDomain(source) ?? "__unknown__" : "__unknown__";
      })
    );
    return distinct.size / members.length;
  }
  /**
   * Score how closely the members' dates are grouped: 1 within a day,
   * 0.7 within a week, 0.3 within 30 days, otherwise 0. Reads `metadata.date`
   * or `metadata.published_date`; unparsable dates are ignored, and fewer than
   * two usable dates give 0.
   */
  temporalClustering(documents, members) {
    const timestamps = [];
    for (const idx of members) {
      const raw = documents[idx].metadata.date ?? documents[idx].metadata.published_date;
      if (!raw) continue;
      try {
        const millis = new Date(raw).getTime();
        if (!isNaN(millis)) timestamps.push(millis);
      } catch {
        continue;
      }
    }
    if (timestamps.length < 2) return 0;
    const spanDays = (Math.max(...timestamps) - Math.min(...timestamps)) / 864e5;
    const bands = [
      [1, 1],
      [7, 0.7],
      [30, 0.3]
    ];
    for (const [maxDays, score] of bands) {
      if (spanDays <= maxDays) return score;
    }
    return 0;
  }
  /**
   * Mean pairwise n-gram overlap of the members' contents (0 for fewer than two).
   */
  lexicalOverlap(documents, members) {
    if (members.length < 2) return 0;
    const overlaps = members.flatMap(
      (first, pos) => members.slice(pos + 1).map(
        (second) => ngramOverlap(documents[first].content, documents[second].content)
      )
    );
    return mean(overlaps);
  }
};
|
|
468
|
+
|
|
469
|
+
// src/detectors/anomaly.ts
|
|
470
|
+
// Case-insensitive markers of negation or contradiction in a text.
var NEGATION_PATTERNS = [
  // Contrastive connectives
  /\bhowever\b/i,
  /\bbut\b/i,
  /\bon the contrary\b/i,
  /\bactually\b.*\bnot\b/i,
  // Explicit refutation
  /\bfalse\b/i,
  /\bincorrect\b/i,
  /\bcontradicts?\b/i,
  /\bdespite\b.*\bclaims?\b/i,
  // Plain negation
  /\bis not\b/i,
  /\bare not\b/i,
  /\bwas not\b/i,
  /\bnever\b/i,
  /\bno evidence\b/i
];
|
|
485
|
+
/**
 * Score each embedding by how far it sits from the others.
 *
 * For every vector the mean cosine distance to all other vectors is computed
 * and standardized to a negated z-score (larger distance -> lower score).
 * Scores are then shifted by a cutoff taken from the sorted scores at index
 * `max(0, floor(n * contamination) - 1)`; callers treat negative results as
 * outliers.
 *
 * NOTE(review): whenever floor(n * contamination) <= 1 the cutoff is the
 * minimum score, so no shifted score is negative — confirm this is intended.
 *
 * @param {number[][]} embeddings - One vector per document.
 * @param {number} contamination - Fraction used to pick the cutoff.
 * @returns {number[]} Shifted scores; all 0.1 when every mean distance is equal.
 */
function distanceBasedOutlierScores(embeddings, contamination) {
  const count = embeddings.length;
  const simMatrix = cosineSimilarityMatrix(embeddings);
  const avgDistances = simMatrix.map((row, i) => {
    const totalDistance = row.reduce(
      (running, similarity, j) => i === j ? running : running + (1 - similarity),
      0
    );
    return totalDistance / (count - 1);
  });

  const center = mean(avgDistances);
  const spread = std(avgDistances);
  if (spread === 0) {
    return new Array(count).fill(0.1);
  }

  const zScores = avgDistances.map((distance) => -(distance - center) / spread);
  const ascending = zScores.slice().sort((a, b) => a - b);
  const cutoffIndex = Math.max(0, Math.floor(count * contamination) - 1);
  const cutoff = ascending[cutoffIndex];
  return zScores.map((score) => score - cutoff);
}
|
|
507
|
+
/**
 * Detector that combines four per-document signals into a risk score:
 * embedding outliers, density of negation/contradiction markers, low
 * relevance to the query, and groups of low-relevance documents that are
 * highly similar to each other.
 */
var SemanticAnomalyDetector = class extends BaseDetector {
  name = "semantic_anomaly";
  contamination;
  minRelevanceScore;
  /**
   * @param {object} config - The `anomaly` section of the configuration.
   */
  constructor(config) {
    super();
    this.contamination = config.contamination;
    this.minRelevanceScore = config.minRelevanceScore;
  }
  /**
   * @param {Array} documents - Documents exposing `content`.
   * @param {object} [options] - May carry `query`, precomputed `embeddings`
   *   (one per document) and `queryEmbedding` (array whose first entry is the
   *   query vector).
   * @returns {object} Detector result: `detectorName`, `riskScore` (highest
   *   per-document risk), sorted `flaggedIndices` and per-signal `details`.
   */
  detect(documents, options) {
    if (documents.length < 3) {
      return {
        detectorName: this.name,
        riskScore: 0,
        flaggedIndices: [],
        details: { reason: "too_few_documents_for_anomaly_detection" }
      };
    }
    const embeddings = options?.embeddings ?? encodeTexts(documents.map((d) => d.content));
    const outlierScores = distanceBasedOutlierScores(
      embeddings,
      Math.min(this.contamination, (documents.length - 1) / documents.length)
    );
    let relevanceScores = null;
    const lowRelevanceFlags = [];
    if (options?.query) {
      relevanceScores = this.queryRelevance(documents, embeddings, options);
      for (let i = 0; i < relevanceScores.length; i++) {
        if (relevanceScores[i] < this.minRelevanceScore) {
          lowRelevanceFlags.push(i);
        }
      }
    }
    const contradictionScores = this.detectContradictions(documents);
    const coordinatedFlags = this.checkCoordinatedInjection(
      embeddings,
      relevanceScores
    );
    const flagged = /* @__PURE__ */ new Set();
    const docRisks = new Array(documents.length).fill(0);
    // Outliers: negative shifted score; flagged only when the margin exceeds 0.3.
    for (let i = 0; i < outlierScores.length; i++) {
      if (outlierScores[i] < 0) {
        const anomalyStrength = Math.abs(outlierScores[i]);
        docRisks[i] += anomalyStrength * 0.4;
        if (anomalyStrength > 0.3) flagged.add(i);
      }
    }
    // Contradiction density adds risk but never flags a document on its own.
    for (let i = 0; i < contradictionScores.length; i++) {
      docRisks[i] += contradictionScores[i] * 0.3;
    }
    for (const i of lowRelevanceFlags) {
      docRisks[i] += 0.15;
      flagged.add(i);
    }
    for (const i of coordinatedFlags) {
      docRisks[i] += 0.25;
      flagged.add(i);
    }
    for (let i = 0; i < docRisks.length; i++) {
      docRisks[i] = clamp(docRisks[i], 0, 1);
    }
    const overallRisk = Math.max(...docRisks);
    const round4 = (value) => Math.round(value * 1e4) / 1e4;
    return {
      detectorName: this.name,
      riskScore: Math.min(overallRisk, 1),
      flaggedIndices: Array.from(flagged).sort((a, b) => a - b),
      details: {
        outlier_scores: outlierScores.map(round4),
        contradiction_scores: contradictionScores.map(round4),
        relevance_scores: relevanceScores?.map(round4) ?? null,
        coordinated_injection_indices: coordinatedFlags,
        per_document_risk: docRisks.map(round4)
      }
    };
  }
  /**
   * Cosine similarity between each document and the query.
   *
   * A supplied query embedding is used only when it is non-zero and has the
   * same dimension as the document embeddings. Otherwise the documents and the
   * query are encoded together with `encodeTexts`, so that they share one
   * vocabulary: a query encoded on its own lives in a different vector space
   * and cannot be compared with the document vectors.
   *
   * @param {Array} documents - Documents exposing `content`.
   * @param {Array} embeddings - Document embeddings, one per document.
   * @param {object} options - Scan options; `options.query` must be set.
   * @returns {number[]} One relevance score per document.
   */
  queryRelevance(documents, embeddings, options) {
    const supplied = options.queryEmbedding?.[0];
    const dimension = embeddings[0]?.length;
    if (supplied && supplied.length === dimension && norm(supplied) > 0) {
      return embeddings.map((emb) => cosineSimilarity(emb, supplied));
    }
    const joint = encodeTexts([...documents.map((d) => d.content), options.query]);
    const queryVector = joint[joint.length - 1];
    return documents.map((_, i) => cosineSimilarity(joint[i], queryVector));
  }
  /**
   * Score each document by the number of NEGATION_PATTERNS it matches per 100
   * whitespace-separated words, times 0.2 and capped at 1.
   */
  detectContradictions(documents) {
    return documents.map((doc) => {
      const text = doc.content.toLowerCase();
      const contradictionCount = NEGATION_PATTERNS.filter(
        (p) => p.test(text)
      ).length;
      const wordCount = Math.max(text.split(/\s+/).length, 1);
      const density = contradictionCount / (wordCount / 100);
      return Math.min(density * 0.2, 1);
    });
  }
  /**
   * Find low-relevance documents (relevance below 0.4) that have another
   * low-relevance document with embedding similarity above 0.8.
   *
   * @param {Array} embeddings - Document embeddings.
   * @param {number[]|null} relevanceScores - Per-document relevance, or null
   *   when no query was given (nothing is flagged then).
   * @returns {number[]} Indices of the documents flagged.
   */
  checkCoordinatedInjection(embeddings, relevanceScores) {
    if (!relevanceScores) return [];
    const lowRelevance = relevanceScores.map((s, i) => s < 0.4 ? i : -1).filter((i) => i >= 0);
    if (lowRelevance.length < 2) return [];
    const subEmbeddings = lowRelevance.map((i) => embeddings[i]);
    const simMatrix = cosineSimilarityMatrix(subEmbeddings);
    const flagged = [];
    for (let i = 0; i < lowRelevance.length; i++) {
      let maxSim = 0;
      for (let j = 0; j < lowRelevance.length; j++) {
        if (i !== j) maxSim = Math.max(maxSim, simMatrix[i][j]);
      }
      if (maxSim > 0.8) flagged.push(lowRelevance[i]);
    }
    return flagged;
  }
};
|
|
617
|
+
|
|
618
|
+
// src/detectors/reputation.ts
|
|
619
|
+
// Built-in reputation scores per domain (higher means more trusted).
var TRUSTED_DOMAINS = {
  // Security vendors
  "crowdstrike.com": 0.95,
  "mandiant.com": 0.95,
  "paloaltonetworks.com": 0.9,
  "fortinet.com": 0.9,
  "sentinelone.com": 0.9,
  "trendmicro.com": 0.9,
  "kaspersky.com": 0.85,
  "sophos.com": 0.9,
  "elastic.co": 0.85,

  // Government / standards
  "nist.gov": 0.95,
  "cisa.gov": 0.95,
  "nvd.nist.gov": 0.95,
  "mitre.org": 0.95,
  "us-cert.gov": 0.95,

  // Major tech
  "microsoft.com": 0.9,
  "google.com": 0.9,
  "aws.amazon.com": 0.9,
  "cloud.google.com": 0.9,
  "docs.github.com": 0.85,

  // Academic / research
  "arxiv.org": 0.85,
  "ieee.org": 0.9,
  "acm.org": 0.9,
  "usenix.org": 0.9,
  "springer.com": 0.85,

  // AI / ML specific
  "openai.com": 0.85,
  "anthropic.com": 0.85,
  "huggingface.co": 0.8,
  "langchain.com": 0.8,
  "llamaindex.ai": 0.8,

  // Known documentation
  "docs.python.org": 0.9,
  "stackoverflow.com": 0.75,
  "wikipedia.org": 0.7
};

// Hosting domains scored as low trust when they appear in a document's domain.
var LOW_TRUST_PATTERNS = [
  "blogspot.com",
  "wordpress.com",
  "medium.com",
  "substack.com",
  "pastebin.com",
  "hastebin.com"
];

// Metadata keys counted when measuring metadata completeness.
var EXPECTED_METADATA_FIELDS = ["source", "author", "date", "title"];
|
|
668
|
+
/**
 * Detector that scores each document's reputation from its source domain,
 * metadata completeness and content quality. The overall risk is
 * 1 - (lowest document reputation).
 */
var SourceReputationDetector = class extends BaseDetector {
  name = "source_reputation";
  unknownDomainScore;
  metadataWeight;
  contentQualityWeight;
  trustedDomains;
  /**
   * @param {object} config - The `reputation` section of the configuration.
   * @param {Object<string, number>} [customTrustedDomains] - Extra or
   *   overriding domain scores merged over the built-in table.
   */
  constructor(config, customTrustedDomains) {
    super();
    this.unknownDomainScore = config.unknownDomainScore;
    this.metadataWeight = config.metadataWeight;
    this.contentQualityWeight = config.contentQualityWeight;
    this.trustedDomains = { ...TRUSTED_DOMAINS, ...customTrustedDomains };
  }
  /**
   * @param {Array} documents - Documents exposing `content` and `metadata`.
   * @param {object} [_options] - Unused.
   * @returns {object} Detector result; documents with reputation below 0.4
   *   are listed in `flaggedIndices`.
   */
  detect(documents, _options) {
    if (documents.length === 0) {
      return {
        detectorName: this.name,
        riskScore: 0,
        flaggedIndices: [],
        details: { reason: "no_documents" }
      };
    }
    const docReputations = documents.map((doc) => this.scoreDocument(doc));
    const flagged = [];
    docReputations.forEach((rep, i) => {
      if (rep.reputation_score < 0.4) flagged.push(i);
    });
    const minRep = Math.min(
      ...docReputations.map((r) => r.reputation_score)
    );
    const overallRisk = 1 - minRep;
    return {
      detectorName: this.name,
      riskScore: Math.min(overallRisk, 1),
      flaggedIndices: flagged,
      details: {
        per_document_reputation: docReputations
      }
    };
  }
  /**
   * Weighted reputation of one document. The domain weight is whatever is
   * left after the metadata and content-quality weights.
   *
   * @returns {object} The final score, its three components (each rounded to
   *   4 decimals) and the extracted domain.
   */
  scoreDocument(doc) {
    const domainScore = this.domainScore(doc);
    const metadataScore = this.metadataCompleteness(doc);
    const qualityScore = this.contentQuality(doc);
    const domainWeight = 1 - this.metadataWeight - this.contentQualityWeight;
    const finalScore = domainWeight * domainScore + this.metadataWeight * metadataScore + this.contentQualityWeight * qualityScore;
    const round4 = (value) => Math.round(value * 1e4) / 1e4;
    return {
      reputation_score: round4(finalScore),
      domain_score: round4(domainScore),
      metadata_score: round4(metadataScore),
      quality_score: round4(qualityScore),
      domain: this.getDomain(doc)
    };
  }
  /**
   * Trust score of the document's domain.
   *
   * Lookup order: exact match in the trusted table; the longest trusted parent
   * domain (score x 0.9), so `docs.aws.amazon.com` resolves through
   * `aws.amazon.com`; a low-trust hosting pattern (0.3); otherwise the
   * unknown-domain score. Only own properties of the table are consulted, so
   * host names such as `constructor` cannot match inherited object members.
   */
  domainScore(doc) {
    const domain = this.getDomain(doc);
    if (!domain) return this.unknownDomainScore;
    if (Object.hasOwn(this.trustedDomains, domain)) {
      return this.trustedDomains[domain];
    }
    const parts = domain.split(".");
    // Try every parent suffix with at least two labels, longest first.
    for (let start = 1; start <= parts.length - 2; start++) {
      const parent = parts.slice(start).join(".");
      if (Object.hasOwn(this.trustedDomains, parent)) {
        return this.trustedDomains[parent] * 0.9;
      }
    }
    for (const pattern of LOW_TRUST_PATTERNS) {
      if (domain.includes(pattern)) return 0.3;
    }
    return this.unknownDomainScore;
  }
  /**
   * Domain of the document's `metadata.source`, falling back to
   * `metadata.url` when the source is missing or empty.
   *
   * @returns {string|null} The domain, or null when neither field yields one.
   */
  getDomain(doc) {
    const source = doc.metadata.source || doc.metadata.url || "";
    return source ? extractDomain(source) : null;
  }
  /**
   * Fraction of EXPECTED_METADATA_FIELDS that have a truthy value.
   */
  metadataCompleteness(doc) {
    const present = EXPECTED_METADATA_FIELDS.filter(
      (field) => doc.metadata[field]
    ).length;
    return present / EXPECTED_METADATA_FIELDS.length;
  }
  /**
   * Content quality in [0, 1]: 0 for blank content, otherwise 1 minus
   * penalties for promotional (0.4), repetitive (0.3) and unicode-anomaly
   * (0.3) findings.
   */
  contentQuality(doc) {
    if (!doc.content.trim()) return 0;
    const patterns = detectSuspiciousPatterns(doc.content);
    const penalty = (patterns.promotional ? 0.4 : 0) + (patterns.repetitive ? 0.3 : 0) + (patterns.unicodeAnomalies ? 0.3 : 0);
    return Math.max(1 - penalty, 0);
  }
};
|
|
761
|
+
|
|
762
|
+
// src/detectors/index.ts
|
|
763
|
+
// Maps each detector name to a factory that builds the detector from the
// matching section of the full configuration.
var DETECTOR_REGISTRY = {
  consensus_clustering: (settings) => {
    return new ConsensusClusteringDetector(settings.consensus);
  },
  semantic_anomaly: (settings) => {
    return new SemanticAnomalyDetector(settings.anomaly);
  },
  source_reputation: (settings) => {
    return new SourceReputationDetector(settings.reputation);
  }
};
|
|
768
|
+
|
|
769
|
+
// src/raguard.ts
|
|
770
|
+
/**
 * Local-mode scanner: runs every enabled detector over a set of documents and
 * combines their results into a single risk score and recommendation.
 */
var RAGuard = class _RAGuard {
  /** Effective configuration as returned by `mergeConfig` (plus any detector override). */
  config;
  /** Detector instances, in the order listed in `config.enabledDetectors`. */
  detectors;
  /**
   * @param options - Optional settings. `options.config` is passed to
   *   `mergeConfig`; `options.detectors`, when truthy, replaces
   *   `config.enabledDetectors`.
   * @throws {Error} If `options.apiKey` is truthy (API mode is not available),
   *   or if an enabled detector name is not registered.
   */
  constructor(options) {
    if (options?.apiKey) {
      throw new Error(
        'RAGuard API mode is coming soon! For now, use local mode (no API key needed):\n\n const guard = new RAGuard(); // local mode\n const result = await guard.scan(docs, { query: "..." });\n\nFollow https://www.raguard.deepseal.ai for API launch updates.'
      );
    }
    this.config = mergeConfig(options?.config);
    if (options?.detectors) {
      this.config.enabledDetectors = options.detectors;
    }
    this.detectors = this.initDetectors();
  }
  /**
   * Instantiate one detector per name in `config.enabledDetectors`.
   *
   * @returns Array of detector instances, in configuration order.
   * @throws {Error} If a name is not an own entry of the detector registry.
   */
  initDetectors() {
    const instances = [];
    for (const name of this.config.enabledDetectors) {
      // Own-property check: a plain `DETECTOR_REGISTRY[name]` lookup would also
      // resolve inherited members such as "constructor" or "toString" and
      // silently push a non-detector value into the list.
      const factory = Object.prototype.hasOwnProperty.call(DETECTOR_REGISTRY, name) ? DETECTOR_REGISTRY[name] : void 0;
      if (typeof factory !== "function") {
        throw new Error(
          `Unknown detector: ${String(name)}. Available: ${Object.keys(DETECTOR_REGISTRY).join(", ")}`
        );
      }
      instances.push(factory(this.config));
    }
    return instances;
  }
  /**
   * Scan documents for adversarial threats.
   *
   * @param documents - Array of documents (Document objects, plain objects, or strings).
   * @param options - Scan options including the original query.
   * @returns ScanResult with risk scores, flagged documents, and recommendation.
   */
  async scan(documents, options) {
    // Timer starts before normalization so latency covers the whole call.
    const start = performance.now();
    const normalized = _RAGuard.normalizeDocuments(documents);
    return this.scanLocal(normalized, options?.query, start);
  }
  /**
   * Convenience method: scan and return only safe documents.
   *
   * @param documents - Array of documents to scan.
   * @param options - Scan options including the original query.
   * @returns Array of normalized documents whose index was not flagged by any detector.
   */
  async filter(documents, options) {
    // Normalize up front so the returned items are Document instances; `scan`
    // normalizes again, which passes existing Document instances through as-is.
    const normalized = _RAGuard.normalizeDocuments(documents);
    const result = await this.scan(normalized, options);
    const flaggedSet = new Set(result.flaggedDocuments);
    return normalized.filter((_, i) => !flaggedSet.has(i));
  }
  /**
   * Run all detectors synchronously and aggregate their results.
   *
   * @param documents - Normalized documents (each exposing `content`).
   * @param query - Original query; when falsy no query embedding is computed.
   * @param startTime - `performance.now()` timestamp used to compute latency.
   * @returns Aggregated scan result.
   */
  scanLocal(documents, query, startTime) {
    const embeddings = encodeTexts(documents.map((d) => d.content));
    const queryEmbedding = query ? encodeTexts([query]) : void 0;
    const detectorResults = {};
    const allFlagged = /* @__PURE__ */ new Set();
    for (const detector of this.detectors) {
      const result = detector.detect(documents, {
        query,
        embeddings,
        queryEmbedding
      });
      detectorResults[detector.name] = result;
      for (const idx of result.flaggedIndices) {
        allFlagged.add(idx);
      }
    }
    // Overall risk is the worst (highest) score reported by any detector.
    const riskScores = Object.values(detectorResults).map((r) => r.riskScore);
    const overallRisk = riskScores.length > 0 ? Math.max(...riskScores) : 0;
    let recommendation;
    if (overallRisk >= this.config.riskThreshold) {
      recommendation = "block" /* BLOCK */;
    } else if (overallRisk >= this.config.warningThreshold) {
      recommendation = "cite_with_warning" /* CITE_WITH_WARNING */;
    } else {
      recommendation = "proceed" /* PROCEED */;
    }
    const latencyMs = performance.now() - startTime;
    return {
      safe: overallRisk < this.config.warningThreshold,
      // Rounded to 4 decimal places.
      overallRiskScore: Math.round(overallRisk * 1e4) / 1e4,
      recommendation,
      flaggedDocuments: Array.from(allFlagged).sort((a, b) => a - b),
      detectors: detectorResults,
      scanId: randomId() + randomId(),
      // Rounded to 2 decimal places.
      latencyMs: Math.round(latencyMs * 100) / 100,
      documentsScanned: documents.length
    };
  }
  /**
   * Normalize various input formats into Document objects.
   *
   * Accepted items: Document instances (returned as-is), strings, objects with
   * a `content` key, and objects with a `page_content` key (with optional
   * `metadata`).
   *
   * @param documents - Array of items to normalize.
   * @returns Array of Document instances, one per input item.
   * @throws {Error} If an object has neither `content` nor `page_content`.
   * @throws {TypeError} If an item is not a string or a non-null object.
   */
  static normalizeDocuments(documents) {
    return documents.map((doc) => {
      if (doc instanceof Document) return doc;
      if (typeof doc === "string") return new Document(doc);
      if (typeof doc === "object" && doc !== null) {
        if ("content" in doc) {
          return new Document(doc);
        }
        if ("page_content" in doc) {
          return new Document({
            content: doc.page_content,
            metadata: doc.metadata
          });
        }
        // JSON.stringify throws on circular references and BigInt values; fall
        // back to listing the keys so the intended error is still raised.
        let described;
        try {
          described = JSON.stringify(doc);
        } catch {
          described = `{${Object.keys(doc).join(", ")}}`;
        }
        throw new Error(
          `Object must have 'content' or 'page_content' key: ${described}`
        );
      }
      // `typeof null` is "object", which would be a misleading message.
      throw new TypeError(`Unsupported document type: ${doc === null ? "null" : typeof doc}`);
    });
  }
};
|
|
886
|
+
|
|
887
|
+
// src/index.ts
|
|
888
|
+
// Library version string, exported below as `VERSION`.
// NOTE(review): presumably kept in sync with package.json — confirm in the build setup.
var VERSION = "0.0.0";
|
|
889
|
+
|
|
890
|
+
// Public CommonJS exports of the bundle: default configuration, the Document
// class, the RAGuard scanner, the Recommendation values and the version string.
exports.DEFAULT_CONFIG = DEFAULT_CONFIG;
|
|
891
|
+
exports.Document = Document;
|
|
892
|
+
exports.RAGuard = RAGuard;
|
|
893
|
+
exports.Recommendation = Recommendation;
|
|
894
|
+
exports.VERSION = VERSION;
|
|
895
|
+
//# sourceMappingURL=index.js.map
|
|
896
|
+
//# sourceMappingURL=index.js.map
|