@sourcepress/knowledge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +4 -0
- package/.turbo/turbo-test.log +21 -0
- package/dist/__tests__/graph-builder.test.d.ts +2 -0
- package/dist/__tests__/graph-builder.test.d.ts.map +1 -0
- package/dist/__tests__/graph-builder.test.js +122 -0
- package/dist/__tests__/graph-builder.test.js.map +1 -0
- package/dist/__tests__/graph-ops.test.d.ts +2 -0
- package/dist/__tests__/graph-ops.test.d.ts.map +1 -0
- package/dist/__tests__/graph-ops.test.js +181 -0
- package/dist/__tests__/graph-ops.test.js.map +1 -0
- package/dist/__tests__/ingestion.test.d.ts +2 -0
- package/dist/__tests__/ingestion.test.d.ts.map +1 -0
- package/dist/__tests__/ingestion.test.js +108 -0
- package/dist/__tests__/ingestion.test.js.map +1 -0
- package/dist/__tests__/json-file-store.test.d.ts +2 -0
- package/dist/__tests__/json-file-store.test.d.ts.map +1 -0
- package/dist/__tests__/json-file-store.test.js +180 -0
- package/dist/__tests__/json-file-store.test.js.map +1 -0
- package/dist/__tests__/knowledge-engine.test.d.ts +2 -0
- package/dist/__tests__/knowledge-engine.test.d.ts.map +1 -0
- package/dist/__tests__/knowledge-engine.test.js +152 -0
- package/dist/__tests__/knowledge-engine.test.js.map +1 -0
- package/dist/__tests__/knowledge-store.test.d.ts +2 -0
- package/dist/__tests__/knowledge-store.test.d.ts.map +1 -0
- package/dist/__tests__/knowledge-store.test.js +97 -0
- package/dist/__tests__/knowledge-store.test.js.map +1 -0
- package/dist/__tests__/scraper.test.d.ts +2 -0
- package/dist/__tests__/scraper.test.d.ts.map +1 -0
- package/dist/__tests__/scraper.test.js +66 -0
- package/dist/__tests__/scraper.test.js.map +1 -0
- package/dist/__tests__/sitemap-parser.test.d.ts +2 -0
- package/dist/__tests__/sitemap-parser.test.d.ts.map +1 -0
- package/dist/__tests__/sitemap-parser.test.js +75 -0
- package/dist/__tests__/sitemap-parser.test.js.map +1 -0
- package/dist/graph-builder.d.ts +17 -0
- package/dist/graph-builder.d.ts.map +1 -0
- package/dist/graph-builder.js +98 -0
- package/dist/graph-builder.js.map +1 -0
- package/dist/graph-ops.d.ts +21 -0
- package/dist/graph-ops.d.ts.map +1 -0
- package/dist/graph-ops.js +108 -0
- package/dist/graph-ops.js.map +1 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -0
- package/dist/ingestion/index.d.ts +4 -0
- package/dist/ingestion/index.d.ts.map +1 -0
- package/dist/ingestion/index.js +3 -0
- package/dist/ingestion/index.js.map +1 -0
- package/dist/ingestion/scraper.d.ts +22 -0
- package/dist/ingestion/scraper.d.ts.map +1 -0
- package/dist/ingestion/scraper.js +118 -0
- package/dist/ingestion/scraper.js.map +1 -0
- package/dist/ingestion/sitemap-parser.d.ts +32 -0
- package/dist/ingestion/sitemap-parser.d.ts.map +1 -0
- package/dist/ingestion/sitemap-parser.js +104 -0
- package/dist/ingestion/sitemap-parser.js.map +1 -0
- package/dist/ingestion/types.d.ts +58 -0
- package/dist/ingestion/types.d.ts.map +1 -0
- package/dist/ingestion/types.js +2 -0
- package/dist/ingestion/types.js.map +1 -0
- package/dist/json-file-store.d.ts +19 -0
- package/dist/json-file-store.d.ts.map +1 -0
- package/dist/json-file-store.js +100 -0
- package/dist/json-file-store.js.map +1 -0
- package/dist/knowledge-engine.d.ts +45 -0
- package/dist/knowledge-engine.d.ts.map +1 -0
- package/dist/knowledge-engine.js +160 -0
- package/dist/knowledge-engine.js.map +1 -0
- package/dist/knowledge-store.d.ts +14 -0
- package/dist/knowledge-store.d.ts.map +1 -0
- package/dist/knowledge-store.js +40 -0
- package/dist/knowledge-store.js.map +1 -0
- package/dist/types.d.ts +67 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +26 -0
- package/src/__tests__/graph-builder.test.ts +129 -0
- package/src/__tests__/graph-ops.test.ts +189 -0
- package/src/__tests__/ingestion.test.ts +127 -0
- package/src/__tests__/json-file-store.test.ts +206 -0
- package/src/__tests__/knowledge-engine.test.ts +177 -0
- package/src/__tests__/knowledge-store.test.ts +111 -0
- package/src/__tests__/scraper.test.ts +74 -0
- package/src/__tests__/sitemap-parser.test.ts +85 -0
- package/src/graph-builder.ts +109 -0
- package/src/graph-ops.ts +129 -0
- package/src/index.ts +27 -0
- package/src/ingestion/index.ts +10 -0
- package/src/ingestion/scraper.ts +137 -0
- package/src/ingestion/sitemap-parser.ts +119 -0
- package/src/ingestion/types.ts +57 -0
- package/src/json-file-store.ts +127 -0
- package/src/knowledge-engine.ts +217 -0
- package/src/knowledge-store.ts +49 -0
- package/src/types.ts +76 -0
- package/tsconfig.json +5 -0
- package/vitest.config.ts +2 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap-parser.test.js","sourceRoot":"","sources":["../../src/__tests__/sitemap-parser.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EAAE,aAAa,EAAE,MAAM,gCAAgC,CAAC;AAE/D,MAAM,cAAc,GAAG;;;;;;;;;;UAUb,CAAC;AAEX,SAAS,WAAW,CAAC,GAAW,EAAE,MAAM,GAAG,GAAG;IAC7C,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC;QAChC,EAAE,EAAE,MAAM,IAAI,GAAG,IAAI,MAAM,GAAG,GAAG;QACjC,MAAM;QACN,UAAU,EAAE,IAAI;QAChB,IAAI,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC;KAChC,CAAuC,CAAC;AAC1C,CAAC;AAED,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC,CAAC;QAC9D,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QAErE,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAEzD,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,aAAa,CAAC,CAAC;QAC1E,MAAM,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;QAC/B,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAEhC,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,SAAS,CAAC,CAAC;QAClE,MAAM,CAAC,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC;QAC3B,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sBAAsB,EAAE,KAAK,IAAI,EAAE;QACrC,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,WAAW,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC,CAAC;QACvD,MAAM,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACtF,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2BAA2B,EAAE,KAAK,IAAI,EAAE;QAC1C,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,WAAW,CAAC,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC;IAChG,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,KAAK,IAAI,EAAE;QACjD,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC,CAAC;QAC9D,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,iCAA
iC,CAAC,CAAC;QACrE,MAAM,QAAQ,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,EAAE;YAC1C,WAAW,EAAE,iCAAiC;YAC9C,OAAO,EAAE,CAAC,aAAa,CAAC;SACxB,CAAC,CAAC;QACH,MAAM,CAAC,QAAQ,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACjC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACpE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,KAAK,IAAI,EAAE;QACjD,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC,CAAC;QAC9D,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACrE,MAAM,QAAQ,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,EAAE;YAC1C,WAAW,EAAE,iCAAiC;YAC9C,OAAO,EAAE,CAAC,SAAS,CAAC;SACpB,CAAC,CAAC;QACH,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC/D,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,mBAAmB;IACrD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8BAA8B,EAAE,KAAK,IAAI,EAAE;QAC7C,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC,CAAC;QAC9D,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACrE,MAAM,QAAQ,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,EAAE;YAC1C,WAAW,EAAE,iCAAiC;YAC9C,OAAO,EAAE,CAAC,aAAa,EAAE,SAAS,CAAC;YACnC,OAAO,EAAE,CAAC,SAAS,CAAC;SACpB,CAAC,CAAC;QACH,MAAM,CAAC,QAAQ,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACjC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACpE,CAAC,CAAC,CAAC;AACJ,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { ExtractedEntity, ExtractedRelation, KnowledgeGraph } from "./types.js";
|
|
2
|
+
/** Options accepted by the GraphBuilder constructor. */
export interface GraphBuilderOptions {
|
|
3
|
+
/** Relations whose confidence is below this value are dropped by build(); the implementation defaults it to 0. */
minConfidence?: number;
|
|
4
|
+
}
|
|
5
|
+
/**
 * Collects extracted entities and relations and assembles them into a
 * KnowledgeGraph (entities, filtered relations, clusters, build timestamp,
 * and the number of distinct source files seen).
 */
export declare class GraphBuilder {
|
|
6
|
+
// Map of entity name -> stored entity copy.
private entities;
|
|
7
|
+
// Every relation passed to addRelations(), stored as shallow copies.
private relations;
|
|
8
|
+
// Set of source_file values seen on added entities and relations.
private sourceFiles;
|
|
9
|
+
// Threshold applied to relation confidence in build().
private minConfidence;
|
|
10
|
+
/** @param options Optional settings; a missing minConfidence is treated as 0. */
constructor(options?: GraphBuilderOptions);
|
|
11
|
+
/**
 * Adds entities keyed by name. When a name is already present the alias lists
 * are merged (de-duplicated) and the entity with the strictly higher
 * confidence is kept.
 */
addEntities(entities: ExtractedEntity[]): void;
|
|
12
|
+
/** Returns the builder's internal name -> entity map (the live map, not a copy). */
getEntities(): Map<string, ExtractedEntity>;
|
|
13
|
+
/** Appends shallow copies of the given relations and records their source files. */
addRelations(relations: ExtractedRelation[]): void;
|
|
14
|
+
/**
 * Produces the graph: relations with confidence >= minConfidence, a copy of the
 * entity map, the computed clusters, an ISO-8601 built_at timestamp, and
 * file_count (number of distinct source files).
 */
build(): KnowledgeGraph;
|
|
15
|
+
// Groups entities connected by the filtered relations (union-find); only groups
// with at least two members become clusters.
private buildClusters;
|
|
16
|
+
}
|
|
17
|
+
//# sourceMappingURL=graph-builder.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"graph-builder.d.ts","sourceRoot":"","sources":["../src/graph-builder.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAiB,eAAe,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAEpG,MAAM,WAAW,mBAAmB;IACnC,aAAa,CAAC,EAAE,MAAM,CAAC;CACvB;AAED,qBAAa,YAAY;IACxB,OAAO,CAAC,QAAQ,CAA2C;IAC3D,OAAO,CAAC,SAAS,CAA2B;IAC5C,OAAO,CAAC,WAAW,CAA0B;IAC7C,OAAO,CAAC,aAAa,CAAS;gBAElB,OAAO,GAAE,mBAAwB;IAI7C,WAAW,CAAC,QAAQ,EAAE,eAAe,EAAE,GAAG,IAAI;IAmB9C,WAAW,IAAI,GAAG,CAAC,MAAM,EAAE,eAAe,CAAC;IAI3C,YAAY,CAAC,SAAS,EAAE,iBAAiB,EAAE,GAAG,IAAI;IAOlD,KAAK,IAAI,cAAc;IAYvB,OAAO,CAAC,aAAa;CAkDrB"}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
 * Accumulates extracted entities and relations and assembles them into a
 * knowledge graph with connected-component clusters.
 */
export class GraphBuilder {
	/** Entity name -> stored entity copy. */
	entities = new Map();
	/** Every relation added so far (shallow copies). */
	relations = [];
	/** Distinct `source_file` values seen on entities and relations. */
	sourceFiles = new Set();
	/** Relations below this confidence are dropped by `build()`. */
	minConfidence;

	/**
	 * @param {{ minConfidence?: number }} [options] Builder settings.
	 *   `minConfidence` defaults to 0 (keep every relation).
	 */
	constructor(options = {}) {
		this.minConfidence = options.minConfidence ?? 0;
	}

	/**
	 * Adds entities keyed by name. For a name that is already present the alias
	 * lists are merged without duplicates, and the entity with the strictly
	 * higher confidence wins.
	 *
	 * @param {Array<object>} entities Extracted entities to add.
	 * @returns {void}
	 */
	addEntities(entities) {
		for (const entity of entities) {
			this.sourceFiles.add(entity.source_file);
			const existing = this.entities.get(entity.name);
			if (!existing) {
				// Store a copy so later alias merges never touch the caller's object.
				this.entities.set(entity.name, { ...entity });
				continue;
			}
			const mergedAliases = Array.from(
				new Set([...(existing.aliases || []), ...(entity.aliases || [])]),
			);
			if (entity.confidence > existing.confidence) {
				this.entities.set(entity.name, { ...entity, aliases: mergedAliases });
			} else {
				existing.aliases = mergedAliases;
			}
		}
	}

	/**
	 * @returns {Map<string, object>} The builder's internal entity map (live, not a copy).
	 */
	getEntities() {
		return this.entities;
	}

	/**
	 * Appends shallow copies of the given relations and records their source files.
	 *
	 * @param {Array<object>} relations Extracted relations to add.
	 * @returns {void}
	 */
	addRelations(relations) {
		for (const relation of relations) {
			this.sourceFiles.add(relation.source_file);
			this.relations.push({ ...relation });
		}
	}

	/**
	 * Builds the graph from everything added so far.
	 *
	 * @returns {object} Graph with `entities` (copied map), `relations`
	 *   (confidence >= minConfidence), `clusters`, `built_at` (ISO-8601 string)
	 *   and `file_count` (distinct source files).
	 */
	build() {
		const filteredRelations = this.relations.filter((r) => r.confidence >= this.minConfidence);
		const clusters = this.buildClusters(filteredRelations);
		return {
			entities: new Map(this.entities),
			relations: filteredRelations,
			clusters,
			built_at: new Date().toISOString(),
			file_count: this.sourceFiles.size,
		};
	}

	/**
	 * Groups entities into connected components using the given relations.
	 * Relations that reference an unknown entity are ignored. Only components
	 * with at least two members become clusters; `coherence_score` is the mean
	 * confidence of the relations inside the component, rounded to two decimals.
	 *
	 * @param {Array<object>} relations Relations already filtered by confidence.
	 * @returns {Array<object>} Clusters in order of first member appearance.
	 */
	buildClusters(relations) {
		const parent = new Map();

		// Iterative find with path compression: a recursive version can exhaust
		// the call stack when unions happen to form one long parent chain.
		const find = (x) => {
			if (!parent.has(x)) parent.set(x, x);
			let root = x;
			while (parent.get(root) !== root) root = parent.get(root);
			let node = x;
			while (node !== root) {
				const next = parent.get(node);
				parent.set(node, root);
				node = next;
			}
			return root;
		};
		const union = (a, b) => {
			const rootA = find(a);
			const rootB = find(b);
			if (rootA !== rootB) parent.set(rootB, rootA);
		};
		const isInternal = (relation) =>
			this.entities.has(relation.from_entity) && this.entities.has(relation.to_entity);

		for (const name of this.entities.keys()) find(name);
		for (const relation of relations) {
			if (isInternal(relation)) union(relation.from_entity, relation.to_entity);
		}

		// Accumulate confidence per component in a single pass (roots are final
		// now), instead of re-scanning every relation against every member list.
		const confidenceByRoot = new Map();
		for (const relation of relations) {
			if (!isInternal(relation)) continue;
			const root = find(relation.from_entity);
			const totals = confidenceByRoot.get(root) ?? { sum: 0, count: 0 };
			totals.sum += relation.confidence;
			totals.count += 1;
			confidenceByRoot.set(root, totals);
		}

		const groups = new Map();
		for (const name of this.entities.keys()) {
			const root = find(name);
			const members = groups.get(root);
			if (members) members.push(name);
			else groups.set(root, [name]);
		}

		const clusters = [];
		let clusterId = 0;
		for (const [root, members] of groups) {
			if (members.length < 2) continue;
			const totals = confidenceByRoot.get(root);
			const coherence = totals && totals.count > 0 ? totals.sum / totals.count : 0;
			clusters.push({
				id: `cluster-${clusterId++}`,
				name: members[0],
				entities: members,
				coherence_score: Math.round(coherence * 100) / 100,
			});
		}
		return clusters;
	}
}
|
|
98
|
+
//# sourceMappingURL=graph-builder.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"graph-builder.js","sourceRoot":"","sources":["../src/graph-builder.ts"],"names":[],"mappings":"AAMA,MAAM,OAAO,YAAY;IAChB,QAAQ,GAAiC,IAAI,GAAG,EAAE,CAAC;IACnD,SAAS,GAAwB,EAAE,CAAC;IACpC,WAAW,GAAgB,IAAI,GAAG,EAAE,CAAC;IACrC,aAAa,CAAS;IAE9B,YAAY,UAA+B,EAAE;QAC5C,IAAI,CAAC,aAAa,GAAG,OAAO,CAAC,aAAa,IAAI,CAAC,CAAC;IACjD,CAAC;IAED,WAAW,CAAC,QAA2B;QACtC,KAAK,MAAM,MAAM,IAAI,QAAQ,EAAE,CAAC;YAC/B,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;YACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YAChD,IAAI,QAAQ,EAAE,CAAC;gBACd,MAAM,aAAa,GAAG,KAAK,CAAC,IAAI,CAC/B,IAAI,GAAG,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,IAAI,EAAE,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,CAAC,CACjE,CAAC;gBACF,IAAI,MAAM,CAAC,UAAU,GAAG,QAAQ,CAAC,UAAU,EAAE,CAAC;oBAC7C,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,EAAE,GAAG,MAAM,EAAE,OAAO,EAAE,aAAa,EAAE,CAAC,CAAC;gBACvE,CAAC;qBAAM,CAAC;oBACP,QAAQ,CAAC,OAAO,GAAG,aAAa,CAAC;gBAClC,CAAC;YACF,CAAC;iBAAM,CAAC;gBACP,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,EAAE,GAAG,MAAM,EAAE,CAAC,CAAC;YAC/C,CAAC;QACF,CAAC;IACF,CAAC;IAED,WAAW;QACV,OAAO,IAAI,CAAC,QAAQ,CAAC;IACtB,CAAC;IAED,YAAY,CAAC,SAA8B;QAC1C,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;YAClC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC;YAC3C,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,EAAE,GAAG,QAAQ,EAAE,CAAC,CAAC;QACtC,CAAC;IACF,CAAC;IAED,KAAK;QACJ,MAAM,iBAAiB,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,IAAI,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3F,MAAM,QAAQ,GAAG,IAAI,CAAC,aAAa,CAAC,iBAAiB,CAAC,CAAC;QACvD,OAAO;YACN,QAAQ,EAAE,IAAI,GAAG,CAAC,IAAI,CAAC,QAAQ,CAAC;YAChC,SAAS,EAAE,iBAAiB;YAC5B,QAAQ;YACR,QAAQ,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YAClC,UAAU,EAAE,IAAI,CAAC,WAAW,CAAC,IAAI;SACjC,CAAC;IACH,CAAC;IAEO,aAAa,CAAC,SAA8B;QACnD,MAAM,MAAM,GAAwB,IAAI,GAAG,EAAE,CAAC;QAC9C,MAAM,IAAI,GAAG,CAAC,CAAS,EAAU,EAAE;YAClC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;gBAAE,MAAM,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YACrC,kGAAkG;YAClG,IAAI,MAAM,CAAC,GAAG,
CAAC,CAAC,CAAC,KAAK,CAAC;gBAAE,MAAM,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC;YAC7D,kGAAkG;YAClG,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC;QACvB,CAAC,CAAC;QACF,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,CAAS,EAAQ,EAAE;YAC5C,MAAM,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YACtB,MAAM,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YACtB,IAAI,KAAK,KAAK,KAAK;gBAAE,MAAM,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAC/C,CAAC,CAAC;QAEF,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE;YAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACpD,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;YAClC,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBACtF,KAAK,CAAC,QAAQ,CAAC,WAAW,EAAE,QAAQ,CAAC,SAAS,CAAC,CAAC;YACjD,CAAC;QACF,CAAC;QAED,MAAM,MAAM,GAA0B,IAAI,GAAG,EAAE,CAAC;QAChD,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC;YACzC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;YACxB,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;YAC5C,kGAAkG;YAClG,MAAM,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9B,CAAC;QAED,MAAM,QAAQ,GAAoB,EAAE,CAAC;QACrC,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,KAAK,MAAM,CAAC,EAAE,OAAO,CAAC,IAAI,MAAM,EAAE,CAAC;YAClC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;gBAAE,SAAS;YACjC,MAAM,gBAAgB,GAAG,SAAS,CAAC,MAAM,CACxC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CACvE,CAAC;YACF,MAAM,SAAS,GACd,gBAAgB,CAAC,MAAM,GAAG,CAAC;gBAC1B,CAAC,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,gBAAgB,CAAC,MAAM;gBACtF,CAAC,CAAC,CAAC,CAAC;YACN,QAAQ,CAAC,IAAI,CAAC;gBACb,EAAE,EAAE,WAAW,SAAS,EAAE,EAAE;gBAC5B,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC;gBAChB,QAAQ,EAAE,OAAO;gBACjB,eAAe,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,GAAG,GAAG;aAClD,CAAC,CAAC;QACJ,CAAC;QACD,OAAO,QAAQ,CAAC;IACjB,CAAC;CACD"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { ContentFile } from "@sourcepress/core";
|
|
2
|
+
import type { GraphQueryResult, KnowledgeGap, KnowledgeGraph, StaleContent } from "./types.js";
|
|
3
|
+
/** Summary counters returned by GraphOps.getStats(). */
export interface GraphStats {
|
|
4
|
+
/** Number of entries in the graph's entity map. */
entity_count: number;
|
|
5
|
+
/** Number of relations in the graph. */
relation_count: number;
|
|
6
|
+
/** Number of clusters in the graph. */
cluster_count: number;
|
|
7
|
+
/** The graph's file_count value, passed through unchanged. */
file_count: number;
|
|
8
|
+
/** The graph's built_at value, passed through unchanged. */
built_at: string;
|
|
9
|
+
}
|
|
10
|
+
/** Read-only queries over an already-built KnowledgeGraph. */
export declare class GraphOps {
|
|
11
|
+
// The graph passed to the constructor.
private graph;
|
|
12
|
+
// Lower-cased entity name or alias -> canonical entity name; built once in the constructor.
private aliasIndex;
|
|
13
|
+
/** @param graph Graph to query; the alias index is built from its entities at construction time. */
constructor(graph: KnowledgeGraph);
|
|
14
|
+
// Builds the lower-cased name/alias lookup map from the graph's entities.
private buildAliasIndex;
|
|
15
|
+
// Returns the input when it is an exact entity name, otherwise a case-insensitive
// lookup through aliasIndex; null when nothing matches.
private resolveEntityName;
|
|
16
|
+
/**
 * Looks up an entity by name or alias and returns it together with the
 * relations that touch it, the related entities present in the graph, and the
 * distinct source files involved. Returns null when the name cannot be resolved.
 */
query(nameOrAlias: string): GraphQueryResult | null;
|
|
17
|
+
/**
 * Returns one gap per graph entity whose name does not appear
 * (case-insensitively) in the body or frontmatter title of any content file.
 */
findGaps(contentFiles: ContentFile[]): KnowledgeGap[];
|
|
18
|
+
/**
 * Returns content files whose provenance lists at least one source file with a
 * timestamp later than the content's generated_at. Files without
 * provenance.generated_at or provenance.source_files are skipped.
 * @param knowledgeTimestamps Map of source path -> last-change timestamp.
 */
findStale(contentFiles: ContentFile[], knowledgeTimestamps: Record<string, string>): StaleContent[];
|
|
19
|
+
/** Returns entity, relation and cluster counts plus file_count and built_at of the graph. */
getStats(): GraphStats;
|
|
20
|
+
}
|
|
21
|
+
//# sourceMappingURL=graph-ops.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"graph-ops.d.ts","sourceRoot":"","sources":["../src/graph-ops.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,KAAK,EAEX,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,YAAY,EACZ,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,UAAU;IAC1B,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;CACjB;AAED,qBAAa,QAAQ;IACpB,OAAO,CAAC,KAAK,CAAiB;IAC9B,OAAO,CAAC,UAAU,CAAsB;gBAE5B,KAAK,EAAE,cAAc;IAKjC,OAAO,CAAC,eAAe;IAWvB,OAAO,CAAC,iBAAiB;IAKzB,KAAK,CAAC,WAAW,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI;IA0BnD,QAAQ,CAAC,YAAY,EAAE,WAAW,EAAE,GAAG,YAAY,EAAE;IAqBrD,SAAS,CACR,YAAY,EAAE,WAAW,EAAE,EAC3B,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GACzC,YAAY,EAAE;IA2BjB,QAAQ,IAAI,UAAU;CAStB"}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
 * Read-only queries over an already-built knowledge graph: entity lookup by
 * name or alias, content-gap detection, staleness detection and summary stats.
 */
export class GraphOps {
	/** The graph passed to the constructor. */
	graph;
	/** Lower-cased name or alias -> canonical entity name. */
	aliasIndex;

	/**
	 * @param {object} graph Graph with `entities` (Map), `relations`, `clusters`,
	 *   `file_count` and `built_at`.
	 */
	constructor(graph) {
		this.graph = graph;
		this.aliasIndex = this.buildAliasIndex();
	}

	/**
	 * Builds the case-insensitive lookup table. Canonical entity names are
	 * registered first and are never overwritten by another entity's alias, so
	 * looking an entity up by its own name (in any letter case) always resolves
	 * to that entity.
	 *
	 * @returns {Map<string, string>} Lower-cased key -> canonical entity name.
	 */
	buildAliasIndex() {
		const index = new Map();
		for (const name of this.graph.entities.keys()) {
			index.set(name.toLowerCase(), name);
		}
		const canonicalKeys = new Set(index.keys());
		for (const [name, entity] of this.graph.entities) {
			for (const alias of entity.aliases || []) {
				const key = alias.toLowerCase();
				if (!canonicalKeys.has(key)) index.set(key, name);
			}
		}
		return index;
	}

	/**
	 * @param {string} nameOrAlias Exact entity name, or a name/alias in any case.
	 * @returns {string | null} Canonical entity name, or null when unknown.
	 */
	resolveEntityName(nameOrAlias) {
		if (this.graph.entities.has(nameOrAlias)) return nameOrAlias;
		return this.aliasIndex.get(nameOrAlias.toLowerCase()) ?? null;
	}

	/**
	 * Looks up an entity and everything directly connected to it.
	 *
	 * @param {string} nameOrAlias Entity name or alias.
	 * @returns {object | null} `{ entity, relations, related_entities, files }`,
	 *   or null when the name cannot be resolved.
	 */
	query(nameOrAlias) {
		const resolvedName = this.resolveEntityName(nameOrAlias);
		if (!resolvedName) return null;
		const entity = this.graph.entities.get(resolvedName);
		if (!entity) return null;

		const relations = this.graph.relations.filter(
			(r) => r.from_entity === resolvedName || r.to_entity === resolvedName,
		);
		const relatedNames = new Set();
		for (const r of relations) {
			if (r.from_entity === resolvedName) relatedNames.add(r.to_entity);
			if (r.to_entity === resolvedName) relatedNames.add(r.from_entity);
		}
		// Related names that are not entities in the graph are silently dropped.
		const related_entities = [];
		for (const name of relatedNames) {
			const related = this.graph.entities.get(name);
			if (related) related_entities.push(related);
		}
		const files = new Set([entity.source_file]);
		for (const r of relations) files.add(r.source_file);

		return { entity, relations, related_entities, files: Array.from(files) };
	}

	/**
	 * Finds entities that no content file mentions. A mention is a
	 * case-insensitive substring match of the entity name in the file body or
	 * in its frontmatter title.
	 *
	 * @param {Array<object>} contentFiles Content files (`body`, `frontmatter.title`).
	 * @returns {Array<object>} One gap record per unmentioned entity.
	 */
	findGaps(contentFiles) {
		// Lower-case every file once instead of once per entity; tolerate files
		// that lack a body or frontmatter rather than throwing.
		const haystacks = contentFiles.map((c) => ({
			body: String(c.body ?? "").toLowerCase(),
			title: String(c.frontmatter?.title ?? "").toLowerCase(),
		}));
		const gaps = [];
		for (const [name, entity] of this.graph.entities) {
			const needle = name.toLowerCase();
			const mentioned = haystacks.some(
				(h) => h.body.includes(needle) || h.title.includes(needle),
			);
			if (mentioned) continue;
			gaps.push({
				entity_name: name,
				entity_type: entity.type,
				knowledge_file_count: 1,
				content_file_count: 0,
				reason: `Entity "${name}" (${entity.type}) exists in knowledge but has no corresponding content`,
			});
		}
		return gaps;
	}

	/**
	 * Finds content whose sources changed after the content was generated.
	 * Timestamps are compared as instants when both parse as dates, so values
	 * with different precision or timezone offsets are ordered correctly; when
	 * either value cannot be parsed the comparison falls back to plain string
	 * ordering.
	 *
	 * @param {Array<object>} contentFiles Content files; those without
	 *   `provenance.generated_at` or `provenance.source_files` are skipped.
	 * @param {Record<string, string>} knowledgeTimestamps Source path -> last-change timestamp.
	 * @returns {Array<object>} One record per stale content file.
	 */
	findStale(contentFiles, knowledgeTimestamps) {
		const isLater = (a, b) => {
			const timeA = Date.parse(a);
			const timeB = Date.parse(b);
			if (Number.isNaN(timeA) || Number.isNaN(timeB)) return a > b;
			return timeA > timeB;
		};
		const stale = [];
		for (const content of contentFiles) {
			if (!content.provenance?.generated_at || !content.provenance?.source_files) continue;
			const generatedAt = content.provenance.generated_at;
			const staleSources = [];
			let newestSourceChange = "";
			for (const sourcePath of content.provenance.source_files) {
				const sourceTimestamp = knowledgeTimestamps[sourcePath];
				if (!sourceTimestamp || !isLater(sourceTimestamp, generatedAt)) continue;
				staleSources.push(sourcePath);
				if (newestSourceChange === "" || isLater(sourceTimestamp, newestSourceChange)) {
					newestSourceChange = sourceTimestamp;
				}
			}
			if (staleSources.length > 0) {
				stale.push({
					content_path: content.path,
					generated_at: generatedAt,
					newest_source_change: newestSourceChange,
					stale_sources: staleSources,
					reason: `Content generated at ${generatedAt} but source(s) updated at ${newestSourceChange}`,
				});
			}
		}
		return stale;
	}

	/**
	 * @returns {object} Entity, relation and cluster counts plus the graph's
	 *   `file_count` and `built_at`.
	 */
	getStats() {
		return {
			entity_count: this.graph.entities.size,
			relation_count: this.graph.relations.length,
			cluster_count: this.graph.clusters.length,
			file_count: this.graph.file_count,
			built_at: this.graph.built_at,
		};
	}
}
|
|
108
|
+
//# sourceMappingURL=graph-ops.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"graph-ops.js","sourceRoot":"","sources":["../src/graph-ops.ts"],"names":[],"mappings":"AAiBA,MAAM,OAAO,QAAQ;IACZ,KAAK,CAAiB;IACtB,UAAU,CAAsB;IAExC,YAAY,KAAqB;QAChC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,eAAe,EAAE,CAAC;IAC1C,CAAC;IAEO,eAAe;QACtB,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;QACxC,KAAK,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;YAClD,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,IAAI,CAAC,CAAC;YACpC,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;gBAC1C,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,WAAW,EAAE,EAAE,IAAI,CAAC,CAAC;YACtC,CAAC;QACF,CAAC;QACD,OAAO,KAAK,CAAC;IACd,CAAC;IAEO,iBAAiB,CAAC,WAAmB;QAC5C,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,WAAW,CAAC;YAAE,OAAO,WAAW,CAAC;QAC7D,OAAO,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,WAAW,CAAC,WAAW,EAAE,CAAC,IAAI,IAAI,CAAC;IAC/D,CAAC;IAED,KAAK,CAAC,WAAmB;QACxB,MAAM,YAAY,GAAG,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC;QACzD,IAAI,CAAC,YAAY;YAAE,OAAO,IAAI,CAAC;QAC/B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM;YAAE,OAAO,IAAI,CAAC;QAEzB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAC5C,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,KAAK,YAAY,IAAI,CAAC,CAAC,SAAS,KAAK,YAAY,CACrE,CAAC;QACF,MAAM,YAAY,GAAG,IAAI,GAAG,EAAU,CAAC;QACvC,KAAK,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;YAC3B,IAAI,CAAC,CAAC,WAAW,KAAK,YAAY;gBAAE,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;YAClE,IAAI,CAAC,CAAC,SAAS,KAAK,YAAY;gBAAE,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QACnE,CAAC;QACD,MAAM,gBAAgB,GAAsB,EAAE,CAAC;QAC/C,KAAK,MAAM,IAAI,IAAI,YAAY,EAAE,CAAC;YACjC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,CAAC;gBAAE,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACjC,CAAC;QACD,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;QAChC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;QAC9B,KAAK,MAAM,CAAC,IAAI,SAAS;YAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QAEpD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,gBAAgB,EAAE,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;IAC
1E,CAAC;IAED,QAAQ,CAAC,YAA2B;QACnC,MAAM,IAAI,GAAmB,EAAE,CAAC;QAChC,KAAK,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;YAClD,MAAM,YAAY,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;gBAC9C,MAAM,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;gBACvC,MAAM,UAAU,GAAG,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;gBACnE,OAAO,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,IAAI,UAAU,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC;YAC1F,CAAC,CAAC,CAAC,MAAM,CAAC;YACV,IAAI,YAAY,KAAK,CAAC,EAAE,CAAC;gBACxB,IAAI,CAAC,IAAI,CAAC;oBACT,WAAW,EAAE,IAAI;oBACjB,WAAW,EAAE,MAAM,CAAC,IAAI;oBACxB,oBAAoB,EAAE,CAAC;oBACvB,kBAAkB,EAAE,CAAC;oBACrB,MAAM,EAAE,WAAW,IAAI,MAAM,MAAM,CAAC,IAAI,wDAAwD;iBAChG,CAAC,CAAC;YACJ,CAAC;QACF,CAAC;QACD,OAAO,IAAI,CAAC;IACb,CAAC;IAED,SAAS,CACR,YAA2B,EAC3B,mBAA2C;QAE3C,MAAM,KAAK,GAAmB,EAAE,CAAC;QACjC,KAAK,MAAM,OAAO,IAAI,YAAY,EAAE,CAAC;YACpC,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,YAAY,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,YAAY;gBAAE,SAAS;YACrF,MAAM,WAAW,GAAG,OAAO,CAAC,UAAU,CAAC,YAAY,CAAC;YACpD,MAAM,YAAY,GAAa,EAAE,CAAC;YAClC,IAAI,kBAAkB,GAAG,EAAE,CAAC;YAC5B,KAAK,MAAM,UAAU,IAAI,OAAO,CAAC,UAAU,CAAC,YAAY,EAAE,CAAC;gBAC1D,MAAM,eAAe,GAAG,mBAAmB,CAAC,UAAU,CAAC,CAAC;gBACxD,IAAI,eAAe,IAAI,eAAe,GAAG,WAAW,EAAE,CAAC;oBACtD,YAAY,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;oBAC9B,IAAI,eAAe,GAAG,kBAAkB;wBAAE,kBAAkB,GAAG,eAAe,CAAC;gBAChF,CAAC;YACF,CAAC;YACD,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC7B,KAAK,CAAC,IAAI,CAAC;oBACV,YAAY,EAAE,OAAO,CAAC,IAAI;oBAC1B,YAAY,EAAE,WAAW;oBACzB,oBAAoB,EAAE,kBAAkB;oBACxC,aAAa,EAAE,YAAY;oBAC3B,MAAM,EAAE,wBAAwB,WAAW,6BAA6B,kBAAkB,EAAE;iBAC5F,CAAC,CAAC;YACJ,CAAC;QACF,CAAC;QACD,OAAO,KAAK,CAAC;IACd,CAAC;IAED,QAAQ;QACP,OAAO;YACN,YAAY,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI;YACtC,cAAc,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM;YAC3C,aAAa,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM;YACzC,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,UAAU;YACjC,QAAQ,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ;SAC7B,CAAC;IACH,CAAC;CACD"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export { InMemoryKnowledgeStore } from "./knowledge-store.js";
|
|
2
|
+
export { JsonFileStore } from "./json-file-store.js";
|
|
3
|
+
export { KnowledgeEngine } from "./knowledge-engine.js";
|
|
4
|
+
export { GraphBuilder, type GraphBuilderOptions } from "./graph-builder.js";
|
|
5
|
+
export { GraphOps, type GraphStats } from "./graph-ops.js";
|
|
6
|
+
export type { ExtractionResult, ExtractedEntity, ExtractedRelation, EntityCluster, KnowledgeGraph, GraphQueryResult, KnowledgeGap, StaleContent, KnowledgeStoreBackend, KnowledgeFileFilter, } from "./types.js";
|
|
7
|
+
export { Scraper } from "./ingestion/scraper.js";
|
|
8
|
+
export { SitemapParser } from "./ingestion/sitemap-parser.js";
|
|
9
|
+
export type { Fetcher, ScrapeResult, SitemapSection, SitemapResult, SitemapRunOptions, BatchProgress, } from "./ingestion/types.js";
|
|
10
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,KAAK,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AAC5E,OAAO,EAAE,QAAQ,EAAE,KAAK,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAC3D,YAAY,EACX,gBAAgB,EAChB,eAAe,EACf,iBAAiB,EACjB,aAAa,EACb,cAAc,EACd,gBAAgB,EAChB,YAAY,EACZ,YAAY,EACZ,qBAAqB,EACrB,mBAAmB,GACnB,MAAM,YAAY,CAAC;AACpB,OAAO,EAAE,OAAO,EAAE,MAAM,wBAAwB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,+BAA+B,CAAC;AAC9D,YAAY,EACX,OAAO,EACP,YAAY,EACZ,cAAc,EACd,aAAa,EACb,iBAAiB,EACjB,aAAa,GACb,MAAM,sBAAsB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export { InMemoryKnowledgeStore } from "./knowledge-store.js";
|
|
2
|
+
export { JsonFileStore } from "./json-file-store.js";
|
|
3
|
+
export { KnowledgeEngine } from "./knowledge-engine.js";
|
|
4
|
+
export { GraphBuilder } from "./graph-builder.js";
|
|
5
|
+
export { GraphOps } from "./graph-ops.js";
|
|
6
|
+
export { Scraper } from "./ingestion/scraper.js";
|
|
7
|
+
export { SitemapParser } from "./ingestion/sitemap-parser.js";
|
|
8
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,YAAY,EAA4B,MAAM,oBAAoB,CAAC;AAC5E,OAAO,EAAE,QAAQ,EAAmB,MAAM,gBAAgB,CAAC;AAa3D,OAAO,EAAE,OAAO,EAAE,MAAM,wBAAwB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,+BAA+B,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/ingestion/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,YAAY,EACX,OAAO,EACP,YAAY,EACZ,cAAc,EACd,aAAa,EACb,iBAAiB,EACjB,aAAa,GACb,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/ingestion/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { Fetcher, ScrapeResult } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Validate a URL against SSRF risks.
|
|
4
|
+
* Throws if the URL targets a private/loopback address or non-http(s) scheme.
|
|
5
|
+
*/
|
|
6
|
+
export declare function validateUrl(rawUrl: string): URL;
|
|
7
|
+
/**
|
|
8
|
+
* Scrapes a URL into readable content.
|
|
9
|
+
* Uses linkedom for DOM parsing (works in all runtimes) + Mozilla Readability for extraction.
|
|
10
|
+
*/
|
|
11
|
+
export declare class Scraper {
|
|
12
|
+
private fetcher;
|
|
13
|
+
constructor(fetcher?: Fetcher);
|
|
14
|
+
scrape(url: string): Promise<ScrapeResult>;
|
|
15
|
+
extractFromHtml(url: string, html: string): ScrapeResult;
|
|
16
|
+
/**
|
|
17
|
+
* Lightweight HTML to markdown conversion.
|
|
18
|
+
* Handles common elements: headings, paragraphs, links, lists, bold, italic, code.
|
|
19
|
+
*/
|
|
20
|
+
private htmlToMarkdown;
|
|
21
|
+
}
|
|
22
|
+
//# sourceMappingURL=scraper.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scraper.d.ts","sourceRoot":"","sources":["../../src/ingestion/scraper.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAaxD;;;GAGG;AACH,wBAAgB,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,GAAG,CA6B/C;AAED;;;GAGG;AACH,qBAAa,OAAO;IACnB,OAAO,CAAC,OAAO,CAAU;gBAEb,OAAO,CAAC,EAAE,OAAO;IAIvB,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC;IAYhD,eAAe,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,YAAY;IAyBxD;;;OAGG;IACH,OAAO,CAAC,cAAc;CAkCtB"}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { Readability } from "@mozilla/readability";
|
|
2
|
+
import { parseHTML } from "linkedom";
|
|
3
|
+
/**
 * Patterns for hostnames that are literal private, loopback or link-local
 * addresses. IPv6 entries are tested against the address with the URL's
 * surrounding brackets already removed.
 */
const PRIVATE_IP_PATTERNS = [
    /^127\./, // IPv4 loopback, 127.0.0.0/8
    /^10\./, // private, 10.0.0.0/8
    /^192\.168\./, // private, 192.168.0.0/16
    /^172\.(1[6-9]|2\d|3[01])\./, // private, 172.16.0.0/12
    /^169\.254\./, // link-local, 169.254.0.0/16
    /^::1$/, // IPv6 loopback
    /^f[cd][0-9a-f]{2}:/i, // IPv6 unique local, fc00::/7 (fc.. and fd..)
    /^fe[89ab][0-9a-f]:/i, // IPv6 link-local, fe80::/10 (fe80..febf)
];
/**
 * Validate a URL against SSRF risks.
 *
 * The check is purely lexical: it inspects the hostname written in the URL and
 * performs no DNS lookup, so a public name that resolves to a private address
 * is not detected here.
 *
 * @param {string} rawUrl URL string to validate.
 * @returns {URL} The parsed URL when every check passes.
 * @throws {Error} If the URL cannot be parsed, uses a scheme other than
 *   http/https, or names a private, loopback or otherwise blocked host.
 */
export function validateUrl(rawUrl) {
    let parsed;
    try {
        parsed = new URL(rawUrl);
    }
    catch {
        throw new Error(`Invalid URL: ${rawUrl}`);
    }
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
        throw new Error(`URL scheme "${parsed.protocol}" is not allowed. Only http and https are permitted.`);
    }
    const hostname = parsed.hostname.toLowerCase();
    // URL#hostname keeps the brackets around IPv6 literals ("[::1]"), and a
    // fully-qualified name may carry a trailing dot ("localhost."). Strip both
    // so the patterns see the bare address or name.
    let address = hostname.replace(/^\[|\]$/g, "").replace(/\.$/, "");
    // IPv4-mapped IPv6 (::ffff:a.b.c.d) reaches the embedded IPv4 host, so it
    // is checked as that IPv4 address. The URL parser normally serializes it in
    // hex ("::ffff:7f00:1"); the dotted form is accepted too for safety.
    const mappedHex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(address);
    const mappedDotted = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/.exec(address);
    if (mappedHex) {
        const high = Number.parseInt(mappedHex[1], 16);
        const low = Number.parseInt(mappedHex[2], 16);
        address = `${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`;
    }
    else if (mappedDotted) {
        address = mappedDotted[1];
    }
    if (PRIVATE_IP_PATTERNS.some((pattern) => pattern.test(address))) {
        throw new Error(`URL hostname "${hostname}" resolves to a private or loopback address, which is not allowed.`);
    }
    // Block "localhost" and the unspecified addresses explicitly
    if (address === "localhost" || address === "0.0.0.0" || address === "::") {
        throw new Error(`URL hostname "${hostname}" is not allowed.`);
    }
    return parsed;
}
|
|
40
|
+
/**
|
|
41
|
+
* Scrapes a URL into readable content.
|
|
42
|
+
* Uses linkedom for DOM parsing (works in all runtimes) + Mozilla Readability for extraction.
|
|
43
|
+
*/
|
|
44
|
+
export class Scraper {
    /** Fetch implementation used to download pages. */
    fetcher;
    /**
     * @param fetcher Optional fetch-compatible function; defaults to the
     *   global fetch bound to globalThis.
     */
    constructor(fetcher) {
        this.fetcher = fetcher ?? globalThis.fetch.bind(globalThis);
    }
    /**
     * Download a page and extract its readable content.
     *
     * NOTE(review): only the URL passed in is validated; if the fetcher
     * follows redirects, the redirect targets are not re-checked here.
     *
     * @param url Page URL; must pass validateUrl.
     * @returns The extraction result produced by extractFromHtml.
     * @throws Error if validation fails, the response is not OK, or no
     *   readable content is found.
     */
    async scrape(url) {
        validateUrl(url);
        const response = await this.fetcher(url, undefined);
        if (!response.ok) {
            throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
        }
        const html = await response.text();
        return this.extractFromHtml(url, html);
    }
    /**
     * Run Readability over an HTML string and build the scrape result.
     *
     * @param url URL the HTML came from; used for error messages and as the
     *   title fallback (its pathname).
     * @param html Raw HTML of the page.
     * @throws Error when Readability yields no article or fewer than 50
     *   characters of trimmed text.
     */
    extractFromHtml(url, html) {
        const { document } = parseHTML(html);
        // The linkedom document is handed to Readability as-is.
        const reader = new Readability(document);
        const article = reader.parse();
        const text = article?.textContent?.trim() ?? "";
        if (!article || text.length < 50) {
            throw new Error(`No readable content found at ${url}`);
        }
        const markdown = this.htmlToMarkdown(article.content);
        return {
            url,
            title: article.title || new URL(url).pathname,
            content: article.textContent,
            markdown,
            byline: article.byline || undefined,
            excerpt: article.excerpt || undefined,
            length: article.textContent.length,
            scraped_at: new Date().toISOString(),
        };
    }
    /**
     * Lightweight HTML to markdown conversion.
     * Handles common elements: headings, paragraphs, links, lists, bold, italic, code.
     *
     * @param html HTML fragment to convert.
     * @returns Markdown text with remaining tags stripped, basic entities
     *   decoded and runs of blank lines collapsed.
     */
    htmlToMarkdown(html) {
        // Applied in order; later rules see the output of earlier ones.
        const tagRules = [
            // Headings
            [/<h1[^>]*>(.*?)<\/h1>/gi, "# $1\n\n"],
            [/<h2[^>]*>(.*?)<\/h2>/gi, "## $1\n\n"],
            [/<h3[^>]*>(.*?)<\/h3>/gi, "### $1\n\n"],
            [/<h4[^>]*>(.*?)<\/h4>/gi, "#### $1\n\n"],
            // Bold and italic
            [/<(strong|b)>(.*?)<\/\1>/gi, "**$2**"],
            [/<(em|i)>(.*?)<\/\1>/gi, "*$2*"],
            // Code
            [/<code>(.*?)<\/code>/gi, "`$1`"],
            [/<pre[^>]*>(.*?)<\/pre>/gis, "```\n$1\n```\n\n"],
            // Links
            [/<a[^>]+href="([^"]*)"[^>]*>(.*?)<\/a>/gi, "[$2]($1)"],
            // Lists
            [/<li[^>]*>(.*?)<\/li>/gi, "- $1\n"],
            [/<\/?[uo]l[^>]*>/gi, "\n"],
            // Paragraphs and line breaks
            [/<br\s*\/?>/gi, "\n"],
            [/<p[^>]*>(.*?)<\/p>/gis, "$1\n\n"],
            // Strip remaining HTML tags
            [/<[^>]+>/g, ""],
        ];
        let md = html;
        for (const [pattern, replacement] of tagRules) {
            md = md.replace(pattern, replacement);
        }
        // Decode basic HTML entities in a single pass, so text produced by one
        // replacement is never decoded again ("&amp;lt;" becomes "&lt;", not "<").
        const entities = {
            "&amp;": "&",
            "&lt;": "<",
            "&gt;": ">",
            "&quot;": '"',
            "&#39;": "'",
            "&nbsp;": " ",
        };
        md = md.replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (entity) => entities[entity]);
        // Clean up whitespace
        return md.replace(/\n{3,}/g, "\n\n").trim();
    }
}
|
|
118
|
+
//# sourceMappingURL=scraper.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scraper.js","sourceRoot":"","sources":["../../src/ingestion/scraper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC,MAAM,mBAAmB,GAAG;IAC3B,QAAQ;IACR,OAAO;IACP,aAAa;IACb,4BAA4B;IAC5B,aAAa;IACb,OAAO;IACP,SAAS;IACT,SAAS;CACT,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,WAAW,CAAC,MAAc;IACzC,IAAI,MAAW,CAAC;IAChB,IAAI,CAAC;QACJ,MAAM,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC;IAC1B,CAAC;IAAC,MAAM,CAAC;QACR,MAAM,IAAI,KAAK,CAAC,gBAAgB,MAAM,EAAE,CAAC,CAAC;IAC3C,CAAC;IAED,IAAI,MAAM,CAAC,QAAQ,KAAK,OAAO,IAAI,MAAM,CAAC,QAAQ,KAAK,QAAQ,EAAE,CAAC;QACjE,MAAM,IAAI,KAAK,CACd,eAAe,MAAM,CAAC,QAAQ,sDAAsD,CACpF,CAAC;IACH,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;IAC/C,KAAK,MAAM,OAAO,IAAI,mBAAmB,EAAE,CAAC;QAC3C,IAAI,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC5B,MAAM,IAAI,KAAK,CACd,iBAAiB,QAAQ,oEAAoE,CAC7F,CAAC;QACH,CAAC;IACF,CAAC;IAED,+BAA+B;IAC/B,IAAI,QAAQ,KAAK,WAAW,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;QACxD,MAAM,IAAI,KAAK,CAAC,iBAAiB,QAAQ,mBAAmB,CAAC,CAAC;IAC/D,CAAC;IAED,OAAO,MAAM,CAAC;AACf,CAAC;AAED;;;GAGG;AACH,MAAM,OAAO,OAAO;IACX,OAAO,CAAU;IAEzB,YAAY,OAAiB;QAC5B,IAAI,CAAC,OAAO,GAAG,OAAO,IAAI,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAC7D,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,GAAW;QACvB,WAAW,CAAC,GAAG,CAAC,CAAC;QAEjB,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC;QACpD,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,KAAK,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACtF,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,OAAO,IAAI,CAAC,eAAe,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;IACxC,CAAC;IAED,eAAe,CAAC,GAAW,EAAE,IAAY;QACxC,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,6FAA6F;QAC7F,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;QAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAE/B,MAAM,IAAI,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAChD,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,gCAAgC,GAAG,EAAE,CAAC,CAAC;QACxD,
CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAEtD,OAAO;YACN,GAAG;YACH,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ;YAC7C,OAAO,EAAE,OAAO,CAAC,WAAW;YAC5B,QAAQ;YACR,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;YACnC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,SAAS;YACrC,MAAM,EAAE,OAAO,CAAC,WAAW,CAAC,MAAM;YAClC,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAC;IACH,CAAC;IAED;;;OAGG;IACK,cAAc,CAAC,IAAY;QAClC,IAAI,EAAE,GAAG,IAAI,CAAC;QACd,WAAW;QACX,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,wBAAwB,EAAE,UAAU,CAAC,CAAC;QACtD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,wBAAwB,EAAE,WAAW,CAAC,CAAC;QACvD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,wBAAwB,EAAE,YAAY,CAAC,CAAC;QACxD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,wBAAwB,EAAE,aAAa,CAAC,CAAC;QACzD,kBAAkB;QAClB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,2BAA2B,EAAE,QAAQ,CAAC,CAAC;QACvD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,uBAAuB,EAAE,MAAM,CAAC,CAAC;QACjD,OAAO;QACP,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,uBAAuB,EAAE,MAAM,CAAC,CAAC;QACjD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,2BAA2B,EAAE,kBAAkB,CAAC,CAAC;QACjE,QAAQ;QACR,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,yCAAyC,EAAE,UAAU,CAAC,CAAC;QACvE,QAAQ;QACR,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,wBAAwB,EAAE,QAAQ,CAAC,CAAC;QACpD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,mBAAmB,EAAE,IAAI,CAAC,CAAC;QAC3C,6BAA6B;QAC7B,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;QACtC,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,uBAAuB,EAAE,QAAQ,CAAC,CAAC;QACnD,4BAA4B;QAC5B,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAChC,6BAA6B;QAC7B,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QAC/B,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QAC9B,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QAC9B,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;QAChC,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QAC/B,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;QAChC,sBAAsB;QACtB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1C,OAAO,EAAE,CAAC;IACX,CAAC;CACD"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import type { Fetcher, SitemapResult, SitemapRunOptions } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Parses XML sitemaps and groups URLs by path pattern for interactive selection.
|
|
4
|
+
*/
|
|
5
|
+
export declare class SitemapParser {
|
|
6
|
+
/** Fetch implementation that parse() calls to download the sitemap. */
private fetcher;
|
|
7
|
+
/** @param fetcher Optional fetch implementation; when omitted the implementation binds globalThis.fetch. */
constructor(fetcher?: Fetcher);
|
|
8
|
+
/**
|
|
9
|
+
* Fetch and parse a sitemap, returning URLs grouped by path section.
|
|
10
|
+
* Rejects when the URL fails validation, the response is not OK, or no URLs are found.
*/
|
|
11
|
+
parse(sitemapUrl: string): Promise<SitemapResult>;
|
|
12
|
+
/**
|
|
13
|
+
* Filter URLs based on include/exclude patterns.
|
|
14
|
+
* Patterns use simple glob matching: "/blog/*" matches "/blog/anything".
|
|
15
|
+
* Include patterns are applied first, then exclude patterns; both match against the URL pathname.
*/
|
|
16
|
+
filterUrls(result: SitemapResult, options: SitemapRunOptions): string[];
|
|
17
|
+
/**
|
|
18
|
+
* Extract <loc> URLs from sitemap XML.
|
|
19
|
+
* Handles both regular sitemaps and sitemap indexes.
|
|
20
|
+
* Every <loc> element is collected, so for a sitemap index the result lists the child sitemap locations.
*/
|
|
21
|
+
private extractUrls;
|
|
22
|
+
/**
|
|
23
|
+
* Group URLs by their first path segment to create browsable sections.
|
|
24
|
+
* URLs that cannot be parsed are skipped; sections are sorted by URL count, largest first.
*/
|
|
25
|
+
private groupByPattern;
|
|
26
|
+
/**
|
|
27
|
+
* Simple glob pattern matching.
|
|
28
|
+
* Supports trailing wildcard: "/blog/*" matches "/blog/my-post".
|
|
29
|
+
* Patterns without the trailing wildcard must equal the path exactly.
*/
|
|
30
|
+
private matchPattern;
|
|
31
|
+
}
|
|
32
|
+
//# sourceMappingURL=sitemap-parser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap-parser.d.ts","sourceRoot":"","sources":["../../src/ingestion/sitemap-parser.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAkB,MAAM,YAAY,CAAC;AAE5F;;GAEG;AACH,qBAAa,aAAa;IACzB,OAAO,CAAC,OAAO,CAAU;gBAEb,OAAO,CAAC,EAAE,OAAO;IAI7B;;OAEG;IACG,KAAK,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC;IAuBvD;;;OAGG;IACH,UAAU,CAAC,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,iBAAiB,GAAG,MAAM,EAAE;IAoBvE;;;OAGG;IACH,OAAO,CAAC,WAAW;IAUnB;;OAEG;IACH,OAAO,CAAC,cAAc;IA2BtB;;;OAGG;IACH,OAAO,CAAC,YAAY;CAOpB"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { validateUrl } from "./scraper.js";
|
|
2
|
+
/**
|
|
3
|
+
* Parses XML sitemaps and groups URLs by path pattern for interactive selection.
|
|
4
|
+
*/
|
|
5
|
+
export class SitemapParser {
    /** Fetch implementation used to download sitemaps. */
    fetcher;
    /**
     * @param fetcher Optional fetch-compatible function; defaults to the
     *   global fetch bound to globalThis.
     */
    constructor(fetcher) {
        this.fetcher = fetcher ?? globalThis.fetch.bind(globalThis);
    }
    /**
     * Fetch and parse a sitemap, returning URLs grouped by path section.
     *
     * @param sitemapUrl Sitemap location; must pass validateUrl.
     * @returns Object with `sitemap_url`, `sections` and `total_urls`.
     * @throws Error if validation fails, the response is not OK, or the
     *   document contains no <loc> entries.
     */
    async parse(sitemapUrl) {
        validateUrl(sitemapUrl);
        const response = await this.fetcher(sitemapUrl);
        if (!response.ok) {
            throw new Error(`Failed to fetch sitemap ${sitemapUrl}: ${response.status}`);
        }
        const xml = await response.text();
        const urls = this.extractUrls(xml);
        if (urls.length === 0) {
            throw new Error(`No URLs found in sitemap ${sitemapUrl}`);
        }
        const sections = this.groupByPattern(urls);
        return {
            sitemap_url: sitemapUrl,
            sections,
            // Counts every extracted <loc>, including entries that
            // groupByPattern skipped because they were not parseable URLs.
            total_urls: urls.length,
        };
    }
    /**
     * Filter URLs based on include/exclude patterns.
     * Patterns use simple glob matching: "/blog/*" matches "/blog/anything".
     *
     * @param result Parsed sitemap whose section URLs are filtered.
     * @param options Optional `include` and `exclude` pattern lists; include
     *   is applied first, and an empty or missing list disables that step.
     * @returns URLs whose pathname passes both filters.
     */
    filterUrls(result, options) {
        let urls = result.sections.flatMap((s) => s.urls);
        if (options.include && options.include.length > 0) {
            urls = urls.filter((url) => {
                const path = new URL(url).pathname;
                return options.include?.some((pattern) => this.matchPattern(path, pattern));
            });
        }
        if (options.exclude && options.exclude.length > 0) {
            urls = urls.filter((url) => {
                const path = new URL(url).pathname;
                return !options.exclude?.some((pattern) => this.matchPattern(path, pattern));
            });
        }
        return urls;
    }
    /**
     * Extract <loc> URLs from sitemap XML.
     * Every <loc> element is collected, so a sitemap index yields the
     * locations of its child sitemaps.
     *
     * Values are unescaped before being returned: the five predefined XML
     * entities are decoded (sitemap URLs must be entity-escaped, e.g. "&amp;"
     * for "&"), and a CDATA wrapper is removed without further decoding.
     */
    extractUrls(xml) {
        const xmlEntities = {
            "&amp;": "&",
            "&lt;": "<",
            "&gt;": ">",
            "&quot;": '"',
            "&apos;": "'",
        };
        const urls = [];
        const locRegex = /<loc>\s*(.*?)\s*<\/loc>/gi;
        for (const match of xml.matchAll(locRegex)) {
            const raw = match[1].trim();
            const cdata = /^<!\[CDATA\[(.*)\]\]>$/.exec(raw);
            // Single-pass decoding so already-decoded text is never decoded twice.
            const url = cdata
                ? cdata[1].trim()
                : raw.replace(/&(?:amp|lt|gt|quot|apos);/g, (entity) => xmlEntities[entity]);
            if (url) {
                urls.push(url);
            }
        }
        return urls;
    }
    /**
     * Group URLs by their first path segment to create browsable sections.
     * Sections are returned sorted by URL count, largest first.
     */
    groupByPattern(urls) {
        const groups = new Map();
        for (const url of urls) {
            try {
                const pathname = new URL(url).pathname;
                const segments = pathname.split("/").filter(Boolean);
                const section = segments.length > 0 ? `/${segments[0]}` : "/";
                const existing = groups.get(section) ?? [];
                existing.push(url);
                groups.set(section, existing);
            }
            catch {
                // Skip invalid URLs
            }
        }
        return Array.from(groups.entries())
            .map(([pattern, sectionUrls]) => ({
                name: pattern === "/" ? "Root" : pattern.slice(1).charAt(0).toUpperCase() + pattern.slice(2),
                // For the root section this yields "//*", which matchPattern
                // only ever matches against the path "/".
                pattern: `${pattern}/*`,
                urls: sectionUrls,
                count: sectionUrls.length,
            }))
            .sort((a, b) => b.count - a.count);
    }
    /**
     * Simple glob pattern matching.
     * Supports trailing wildcard: "/blog/*" matches "/blog/my-post" and
     * "/blog" itself; any other pattern must equal the path exactly.
     */
    matchPattern(path, pattern) {
        if (pattern.endsWith("/*")) {
            const prefix = pattern.slice(0, -2);
            return path === prefix || path.startsWith(`${prefix}/`);
        }
        return path === pattern;
    }
}
|
|
104
|
+
//# sourceMappingURL=sitemap-parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap-parser.js","sourceRoot":"","sources":["../../src/ingestion/sitemap-parser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAG3C;;GAEG;AACH,MAAM,OAAO,aAAa;IACjB,OAAO,CAAU;IAEzB,YAAY,OAAiB;QAC5B,IAAI,CAAC,OAAO,GAAG,OAAO,IAAI,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAC7D,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,UAAkB;QAC7B,WAAW,CAAC,UAAU,CAAC,CAAC;QACxB,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;QAChD,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,2BAA2B,UAAU,KAAK,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;QAC9E,CAAC;QAED,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QAClC,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QAEnC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,MAAM,IAAI,KAAK,CAAC,4BAA4B,UAAU,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;QAE3C,OAAO;YACN,WAAW,EAAE,UAAU;YACvB,QAAQ;YACR,UAAU,EAAE,IAAI,CAAC,MAAM;SACvB,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,UAAU,CAAC,MAAqB,EAAE,OAA0B;QAC3D,IAAI,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAElD,IAAI,OAAO,CAAC,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnD,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE;gBAC1B,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;gBACnC,OAAO,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC;YAC7E,CAAC,CAAC,CAAC;QACJ,CAAC;QAED,IAAI,OAAO,CAAC,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnD,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE;gBAC1B,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;gBACnC,OAAO,CAAC,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC;YAC9E,CAAC,CAAC,CAAC;QACJ,CAAC;QAED,OAAO,IAAI,CAAC;IACb,CAAC;IAED;;;OAGG;IACK,WAAW,CAAC,GAAW;QAC9B,MAAM,IAAI,GAAa,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,2BAA2B,CAAC;QAC7C,KAAK,MAAM,KAAK,IAAI,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC5C,MAAM,GAAG,GAAG,KAA
K,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC5B,IAAI,GAAG;gBAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACzB,CAAC;QACD,OAAO,IAAI,CAAC;IACb,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,IAAc;QACpC,MAAM,MAAM,GAAG,IAAI,GAAG,EAAoB,CAAC;QAE3C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACxB,IAAI,CAAC;gBACJ,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;gBACvC,MAAM,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;gBACrD,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;gBAC9D,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;gBAC3C,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACnB,MAAM,CAAC,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;YAC/B,CAAC;YAAC,MAAM,CAAC;gBACR,oBAAoB;YACrB,CAAC;QACF,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;aACjC,GAAG,CAAC,CAAC,CAAC,OAAO,EAAE,WAAW,CAAC,EAAE,EAAE,CAAC,CAAC;YACjC,IAAI,EACH,OAAO,KAAK,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC;YACvF,OAAO,EAAE,GAAG,OAAO,IAAI;YACvB,IAAI,EAAE,WAAW;YACjB,KAAK,EAAE,WAAW,CAAC,MAAM;SACzB,CAAC,CAAC;aACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IACrC,CAAC;IAED;;;OAGG;IACK,YAAY,CAAC,IAAY,EAAE,OAAe;QACjD,IAAI,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YAC5B,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;YACpC,OAAO,IAAI,KAAK,MAAM,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC;QACzD,CAAC;QACD,OAAO,IAAI,KAAK,OAAO,CAAC;IACzB,CAAC;CACD"}
|