@sourcepress/knowledge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +4 -0
- package/.turbo/turbo-test.log +21 -0
- package/dist/__tests__/graph-builder.test.d.ts +2 -0
- package/dist/__tests__/graph-builder.test.d.ts.map +1 -0
- package/dist/__tests__/graph-builder.test.js +122 -0
- package/dist/__tests__/graph-builder.test.js.map +1 -0
- package/dist/__tests__/graph-ops.test.d.ts +2 -0
- package/dist/__tests__/graph-ops.test.d.ts.map +1 -0
- package/dist/__tests__/graph-ops.test.js +181 -0
- package/dist/__tests__/graph-ops.test.js.map +1 -0
- package/dist/__tests__/ingestion.test.d.ts +2 -0
- package/dist/__tests__/ingestion.test.d.ts.map +1 -0
- package/dist/__tests__/ingestion.test.js +108 -0
- package/dist/__tests__/ingestion.test.js.map +1 -0
- package/dist/__tests__/json-file-store.test.d.ts +2 -0
- package/dist/__tests__/json-file-store.test.d.ts.map +1 -0
- package/dist/__tests__/json-file-store.test.js +180 -0
- package/dist/__tests__/json-file-store.test.js.map +1 -0
- package/dist/__tests__/knowledge-engine.test.d.ts +2 -0
- package/dist/__tests__/knowledge-engine.test.d.ts.map +1 -0
- package/dist/__tests__/knowledge-engine.test.js +152 -0
- package/dist/__tests__/knowledge-engine.test.js.map +1 -0
- package/dist/__tests__/knowledge-store.test.d.ts +2 -0
- package/dist/__tests__/knowledge-store.test.d.ts.map +1 -0
- package/dist/__tests__/knowledge-store.test.js +97 -0
- package/dist/__tests__/knowledge-store.test.js.map +1 -0
- package/dist/__tests__/scraper.test.d.ts +2 -0
- package/dist/__tests__/scraper.test.d.ts.map +1 -0
- package/dist/__tests__/scraper.test.js +66 -0
- package/dist/__tests__/scraper.test.js.map +1 -0
- package/dist/__tests__/sitemap-parser.test.d.ts +2 -0
- package/dist/__tests__/sitemap-parser.test.d.ts.map +1 -0
- package/dist/__tests__/sitemap-parser.test.js +75 -0
- package/dist/__tests__/sitemap-parser.test.js.map +1 -0
- package/dist/graph-builder.d.ts +17 -0
- package/dist/graph-builder.d.ts.map +1 -0
- package/dist/graph-builder.js +98 -0
- package/dist/graph-builder.js.map +1 -0
- package/dist/graph-ops.d.ts +21 -0
- package/dist/graph-ops.d.ts.map +1 -0
- package/dist/graph-ops.js +108 -0
- package/dist/graph-ops.js.map +1 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -0
- package/dist/ingestion/index.d.ts +4 -0
- package/dist/ingestion/index.d.ts.map +1 -0
- package/dist/ingestion/index.js +3 -0
- package/dist/ingestion/index.js.map +1 -0
- package/dist/ingestion/scraper.d.ts +22 -0
- package/dist/ingestion/scraper.d.ts.map +1 -0
- package/dist/ingestion/scraper.js +118 -0
- package/dist/ingestion/scraper.js.map +1 -0
- package/dist/ingestion/sitemap-parser.d.ts +32 -0
- package/dist/ingestion/sitemap-parser.d.ts.map +1 -0
- package/dist/ingestion/sitemap-parser.js +104 -0
- package/dist/ingestion/sitemap-parser.js.map +1 -0
- package/dist/ingestion/types.d.ts +58 -0
- package/dist/ingestion/types.d.ts.map +1 -0
- package/dist/ingestion/types.js +2 -0
- package/dist/ingestion/types.js.map +1 -0
- package/dist/json-file-store.d.ts +19 -0
- package/dist/json-file-store.d.ts.map +1 -0
- package/dist/json-file-store.js +100 -0
- package/dist/json-file-store.js.map +1 -0
- package/dist/knowledge-engine.d.ts +45 -0
- package/dist/knowledge-engine.d.ts.map +1 -0
- package/dist/knowledge-engine.js +160 -0
- package/dist/knowledge-engine.js.map +1 -0
- package/dist/knowledge-store.d.ts +14 -0
- package/dist/knowledge-store.d.ts.map +1 -0
- package/dist/knowledge-store.js +40 -0
- package/dist/knowledge-store.js.map +1 -0
- package/dist/types.d.ts +67 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +26 -0
- package/src/__tests__/graph-builder.test.ts +129 -0
- package/src/__tests__/graph-ops.test.ts +189 -0
- package/src/__tests__/ingestion.test.ts +127 -0
- package/src/__tests__/json-file-store.test.ts +206 -0
- package/src/__tests__/knowledge-engine.test.ts +177 -0
- package/src/__tests__/knowledge-store.test.ts +111 -0
- package/src/__tests__/scraper.test.ts +74 -0
- package/src/__tests__/sitemap-parser.test.ts +85 -0
- package/src/graph-builder.ts +109 -0
- package/src/graph-ops.ts +129 -0
- package/src/index.ts +27 -0
- package/src/ingestion/index.ts +10 -0
- package/src/ingestion/scraper.ts +137 -0
- package/src/ingestion/sitemap-parser.ts +119 -0
- package/src/ingestion/types.ts +57 -0
- package/src/json-file-store.ts +127 -0
- package/src/knowledge-engine.ts +217 -0
- package/src/knowledge-store.ts +49 -0
- package/src/types.ts +76 -0
- package/tsconfig.json +5 -0
- package/vitest.config.ts +2 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import type { BudgetTracker, ResolvedProvider } from "@sourcepress/ai";
|
|
2
|
+
import type { ContentFile } from "@sourcepress/core";
|
|
3
|
+
import { beforeEach, describe, expect, it, vi } from "vitest";
|
|
4
|
+
import { KnowledgeEngine } from "../knowledge-engine.js";
|
|
5
|
+
import { InMemoryKnowledgeStore } from "../knowledge-store.js";
|
|
6
|
+
|
|
7
|
+
// Partially mock "@sourcepress/ai": the real module's exports are kept, and only
// `classify` and `extract` (the two functions the tests below stub) become mock functions.
vi.mock("@sourcepress/ai", async () => {
  const realModule = await vi.importActual("@sourcepress/ai");
  const stubs = { classify: vi.fn(), extract: vi.fn() };
  return { ...realModule, ...stubs };
});
|
|
11
|
+
|
|
12
|
+
import { classify, extract } from "@sourcepress/ai";
|
|
13
|
+
|
|
14
|
+
// Provider/model pair passed to the KnowledgeEngine constructor in every test below.
const mockProvider: ResolvedProvider = {
  provider: "anthropic",
  model: "claude-sonnet-4-5-20250514",
};
|
|
18
|
+
|
|
19
|
+
/**
 * Builds a stand-in BudgetTracker whose methods are all vitest mock functions.
 *
 * `getStatus` reports an untouched budget (nothing spent, limit 5 USD, warning at
 * 3 USD), `canSpend` always returns true and `getHistory` returns an empty list.
 * The object only carries the members listed here, hence the double cast.
 */
function makeMockBudget(): BudgetTracker {
  const untouchedStatus = {
    spent_today_usd: 0,
    limit_usd: 5,
    warn_at_usd: 3,
    remaining_usd: 5,
    is_over_limit: false,
    is_warned: false,
    reset_at: new Date().toISOString(),
  };

  const tracker = {
    record: vi.fn(),
    getStatus: vi.fn().mockReturnValue(untouchedStatus),
    canSpend: vi.fn().mockReturnValue(true),
    getHistory: vi.fn().mockReturnValue([]),
    resetDaily: vi.fn(),
  };

  return tracker as unknown as BudgetTracker;
}
|
|
36
|
+
|
|
37
|
+
// Usage record attached to every mocked `classify` / `extract` response in this file.
const mockUsage = {
  input_tokens: 500,
  output_tokens: 100,
  estimated_cost_usd: 0.01,
  function_name: "test",
  timestamp: new Date().toISOString(),
};
|
|
44
|
+
|
|
45
|
+
describe("KnowledgeEngine", () => {
  let engine: KnowledgeEngine;
  let store: InMemoryKnowledgeStore;
  let budget: BudgetTracker;

  // Every test gets a fresh store, budget and engine; mock call history is cleared last.
  beforeEach(() => {
    store = new InMemoryKnowledgeStore();
    budget = makeMockBudget();
    engine = new KnowledgeEngine(store, mockProvider, budget);
    vi.clearAllMocks();
  });

  /** Puts one structured note about Acme Corp into the store at "knowledge/a.md". */
  const seedAcmeNote = (body: string) =>
    store.store({
      path: "knowledge/a.md",
      type: "notes",
      quality: "structured",
      quality_score: 8,
      entities: [{ type: "client", name: "Acme Corp" }],
      ingested_at: new Date().toISOString(),
      source: "manual",
      body,
    });

  /** Queues a single `extract` result that yields only the Acme Corp entity. */
  const queueAcmeExtraction = (aliases: string[]) => {
    vi.mocked(extract).mockResolvedValueOnce({
      entities: [{ type: "client", name: "Acme Corp", aliases, confidence: 0.95 }],
      relations: [],
      usage: mockUsage,
    });
  };

  describe("ingest", () => {
    it("classifies, extracts, and stores a knowledge file", async () => {
      const targetPath = "knowledge/clients/acme.md";

      vi.mocked(classify).mockResolvedValueOnce({
        quality: "structured",
        quality_score: 8,
        type: "project-notes",
        reasoning: "Well structured",
        usage: mockUsage,
      });
      vi.mocked(extract).mockResolvedValueOnce({
        entities: [{ type: "client", name: "Acme Corp", aliases: ["Acme"], confidence: 0.95 }],
        relations: [
          {
            from_entity: "Acme Corp",
            to_entity: "Next.js",
            relation_type: "uses",
            confidence: 0.9,
            evidence: "Acme uses Next.js",
          },
        ],
        usage: mockUsage,
      });

      const ingested = await engine.ingest(
        targetPath,
        "Meeting with Acme Corp about their Next.js migration.",
        "manual",
      );
      expect(ingested.path).toBe("knowledge/clients/acme.md");
      expect(ingested.quality).toBe("structured");
      expect(ingested.quality_score).toBe(8);
      expect(ingested.entities).toHaveLength(1);

      // The ingested file must also be retrievable from the backing store.
      const persisted = await store.retrieve(targetPath);
      expect(persisted).not.toBeNull();
      // biome-ignore lint/style/noNonNullAssertion: asserted not null on line above
      expect(persisted!.quality).toBe("structured");
    });
  });

  describe("buildGraph", () => {
    it("builds graph from all stored knowledge files", async () => {
      await seedAcmeNote("Acme meeting notes");
      queueAcmeExtraction(["Acme"]);

      const builtGraph = await engine.buildGraph();
      expect(builtGraph.entities.size).toBeGreaterThanOrEqual(1);
      expect(builtGraph.built_at).toBeTruthy();
    });
  });

  describe("query", () => {
    it("queries the graph for an entity", async () => {
      await seedAcmeNote("Acme notes");
      queueAcmeExtraction(["Acme"]);

      await engine.buildGraph();
      const hit = engine.query("Acme Corp");
      expect(hit).not.toBeNull();
      // biome-ignore lint/style/noNonNullAssertion: asserted not null on line above
      expect(hit!.entity.name).toBe("Acme Corp");
    });

    it("returns null before graph is built", () => {
      expect(engine.query("Anything")).toBeNull();
    });
  });

  describe("findGaps", () => {
    it("delegates gap detection to graph ops", async () => {
      await seedAcmeNote("Acme notes");
      queueAcmeExtraction([]);

      await engine.buildGraph();
      // No content files are supplied, so at least one gap is expected.
      const detected = engine.findGaps([]);
      expect(detected.length).toBeGreaterThanOrEqual(1);
    });
  });
});
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import type { KnowledgeFile } from "@sourcepress/core";
|
|
2
|
+
import { beforeEach, describe, expect, it } from "vitest";
|
|
3
|
+
import { InMemoryKnowledgeStore } from "../knowledge-store.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Creates a KnowledgeFile fixture for the Acme Corp client note.
 *
 * @param overrides - Fields that replace the fixture defaults (shallow merge).
 * @returns A new object on every call; the default `entities` array is not shared between calls.
 */
function makeKnowledgeFile(overrides: Partial<KnowledgeFile> = {}): KnowledgeFile {
  const defaults: KnowledgeFile = {
    path: "knowledge/clients/acme.md",
    type: "project-notes",
    quality: "structured",
    quality_score: 8,
    entities: [
      { type: "client", name: "Acme Corp" },
      { type: "technology", name: "Next.js" },
    ],
    ingested_at: "2026-04-04T10:00:00Z",
    source: "manual",
    body: "Meeting with Acme Corp about Next.js migration.",
  };
  return { ...defaults, ...overrides };
}
|
|
21
|
+
|
|
22
|
+
describe("InMemoryKnowledgeStore", () => {
  let store: InMemoryKnowledgeStore;

  beforeEach(() => {
    store = new InMemoryKnowledgeStore();
  });

  /** Stores one fixture per override object, one after another. */
  const seed = async (...variants: Partial<KnowledgeFile>[]): Promise<void> => {
    for (const variant of variants) {
      await store.store(makeKnowledgeFile(variant));
    }
  };

  describe("store and retrieve", () => {
    it("stores and retrieves a knowledge file", async () => {
      const original = makeKnowledgeFile();
      await store.store(original);
      expect(await store.retrieve(original.path)).toEqual(original);
    });

    it("returns null for non-existent file", async () => {
      expect(await store.retrieve("nonexistent.md")).toBeNull();
    });

    it("overwrites existing file at same path", async () => {
      const first = makeKnowledgeFile();
      await store.store(first);
      // Same default path, different score: the second write must win.
      await store.store(makeKnowledgeFile({ quality_score: 9 }));
      const latest = await store.retrieve(first.path);
      expect(latest?.quality_score).toBe(9);
    });
  });

  describe("list", () => {
    it("lists all stored files", async () => {
      await seed({ path: "knowledge/a.md" }, { path: "knowledge/b.md" });
      expect(await store.list()).toHaveLength(2);
    });

    it("filters by type", async () => {
      await seed({ path: "a.md", type: "project-notes" }, { path: "b.md", type: "transcript" });
      const matches = await store.list({ type: "transcript" });
      expect(matches).toHaveLength(1);
      expect(matches[0].type).toBe("transcript");
    });

    it("filters by quality", async () => {
      await seed({ path: "a.md", quality: "structured" }, { path: "b.md", quality: "draft" });
      const matches = await store.list({ quality: "draft" });
      expect(matches).toHaveLength(1);
      expect(matches[0].quality).toBe("draft");
    });

    it("filters by source", async () => {
      await seed({ path: "a.md", source: "manual" }, { path: "b.md", source: "url" });
      const matches = await store.list({ source: "url" });
      expect(matches).toHaveLength(1);
      expect(matches[0].source).toBe("url");
    });
  });

  describe("delete", () => {
    it("deletes an existing file", async () => {
      const defaultPath = "knowledge/clients/acme.md";
      await store.store(makeKnowledgeFile());
      expect(await store.delete(defaultPath)).toBe(true);
      expect(await store.retrieve(defaultPath)).toBeNull();
    });

    it("returns false for non-existent file", async () => {
      expect(await store.delete("nonexistent.md")).toBe(false);
    });
  });

  describe("count", () => {
    it("returns 0 for empty store", async () => {
      expect(await store.count()).toBe(0);
    });

    it("returns correct count", async () => {
      await seed({ path: "a.md" }, { path: "b.md" });
      expect(await store.count()).toBe(2);
    });
  });
});
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { describe, expect, it, vi } from "vitest";
|
|
2
|
+
import { Scraper } from "../ingestion/scraper.js";
|
|
3
|
+
|
|
4
|
+
// HTML fixture for the Scraper tests: an <article> with headings, paragraphs, inline
// formatting, a link and a list, surrounded by header/nav/footer markup.
const SAMPLE_HTML = `
<!DOCTYPE html>
<html>
<head><title>Test Article</title></head>
<body>
<header><nav>Skip me</nav></header>
<article>
<h1>Test Article Title</h1>
<p>This is the first paragraph with <strong>bold text</strong> and a
<a href="https://example.com">link</a>.</p>
<h2>Section Two</h2>
<p>Another paragraph with <em>italic</em> content. It has enough text to
be considered readable content by the Readability algorithm, which typically
requires a minimum amount of textual content to determine that an element
contains the main article body rather than boilerplate navigation.</p>
<ul>
<li>Item one</li>
<li>Item two</li>
</ul>
<p>Final paragraph to ensure sufficient content length for the readability
parser to properly identify the main content area of this test page.</p>
</article>
<footer>Footer stuff</footer>
</body>
</html>`;
|
|
29
|
+
|
|
30
|
+
/**
 * Creates a vitest mock standing in for `fetch` that always resolves to the same
 * minimal response object.
 *
 * @param html - Text returned by the response's `text()` method.
 * @param status - HTTP status to report; `ok` is true for 200-299.
 * @returns The mock, cast to the global `fetch` type (only `ok`, `status`,
 *          `statusText` and `text` exist on the resolved value).
 */
function mockFetcher(html: string, status = 200): typeof globalThis.fetch {
  const cannedResponse = {
    ok: status >= 200 && status < 300,
    status,
    statusText: status === 200 ? "OK" : "Not Found",
    text: async () => html,
  };
  const fetchMock = vi.fn().mockResolvedValue(cannedResponse);
  return fetchMock as unknown as typeof globalThis.fetch;
}
|
|
38
|
+
|
|
39
|
+
describe("Scraper", () => {
  it("scrapes a URL and returns readable content", async () => {
    const articleUrl = "https://example.com/article";
    const fetchMock = mockFetcher(SAMPLE_HTML);
    const page = await new Scraper(fetchMock).scrape(articleUrl);

    // The injected fetcher is called with the URL and no second argument.
    expect(fetchMock).toHaveBeenCalledWith(articleUrl, undefined);
    expect(page.url).toBe("https://example.com/article");
    expect(page.title).toContain("Test Article");
    expect(page.content).toContain("first paragraph");
    expect(page.markdown).toContain("**bold text**");
    expect(page.length).toBeGreaterThan(0);
    expect(page.scraped_at).toBeTruthy();
  });

  it("throws on HTTP error", async () => {
    const scraper = new Scraper(mockFetcher("", 404));
    const attempt = scraper.scrape("https://example.com/missing");
    await expect(attempt).rejects.toThrow("404");
  });

  it("throws when no readable content found", async () => {
    const navigationOnly = "<html><body><nav>Just navigation</nav></body></html>";
    const scraper = new Scraper(mockFetcher(navigationOnly));
    const attempt = scraper.scrape("https://example.com/empty");
    await expect(attempt).rejects.toThrow("No readable content");
  });

  it("extracts markdown from HTML directly", () => {
    // extractFromHtml is synchronous here, so the fetcher's payload is irrelevant.
    const scraper = new Scraper(mockFetcher(""));
    const { markdown } = scraper.extractFromHtml("https://example.com", SAMPLE_HTML);
    expect(markdown).toContain("[link](https://example.com)");
    expect(markdown).toContain("- Item one");
  });
});
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import { describe, expect, it, vi } from "vitest";
|
|
2
|
+
import { SitemapParser } from "../ingestion/sitemap-parser.js";
|
|
3
|
+
|
|
4
|
+
// Sitemap fixture with 8 URLs: the root, 2 under /blog/, 3 under /services/,
// 1 under /cases/ and /about.
const SAMPLE_SITEMAP = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/</loc></url>
<url><loc>https://example.com/blog/post-1</loc></url>
<url><loc>https://example.com/blog/post-2</loc></url>
<url><loc>https://example.com/services/web</loc></url>
<url><loc>https://example.com/services/mobile</loc></url>
<url><loc>https://example.com/services/cloud</loc></url>
<url><loc>https://example.com/cases/acme</loc></url>
<url><loc>https://example.com/about</loc></url>
</urlset>`;
|
|
15
|
+
|
|
16
|
+
/**
 * Creates a vitest mock standing in for `fetch` that always resolves to the same
 * minimal response object.
 *
 * The status text follows the status (it was previously "OK" even for error
 * responses), so failing responses look coherent to the code under test.
 *
 * @param xml - Text returned by the response's `text()` method.
 * @param status - HTTP status to report; `ok` is true for 200-299.
 * @returns The mock, cast to the global `fetch` type (only `ok`, `status`,
 *          `statusText` and `text` exist on the resolved value).
 */
function mockFetcher(xml: string, status = 200): typeof globalThis.fetch {
  const ok = status >= 200 && status < 300;
  return vi.fn().mockResolvedValue({
    ok,
    status,
    statusText: ok ? "OK" : "Not Found",
    text: () => Promise.resolve(xml),
  }) as unknown as typeof globalThis.fetch;
}
|
|
24
|
+
|
|
25
|
+
describe("SitemapParser", () => {
  const SITEMAP_URL = "https://example.com/sitemap.xml";

  /** Parses SAMPLE_SITEMAP with a fresh parser backed by a mocked fetch. */
  async function parseSample() {
    const parser = new SitemapParser(mockFetcher(SAMPLE_SITEMAP));
    const sitemap = await parser.parse(SITEMAP_URL);
    return { parser, sitemap };
  }

  it("parses a sitemap and groups URLs by section", async () => {
    const { sitemap } = await parseSample();

    expect(sitemap.total_urls).toBe(8);
    expect(sitemap.sections.length).toBeGreaterThanOrEqual(3);

    const servicesSection = sitemap.sections.find((section) => section.pattern === "/services/*");
    expect(servicesSection).toBeDefined();
    expect(servicesSection?.count).toBe(3);

    const blogSection = sitemap.sections.find((section) => section.pattern === "/blog/*");
    expect(blogSection).toBeDefined();
    expect(blogSection?.count).toBe(2);
  });

  it("throws on HTTP error", async () => {
    const parser = new SitemapParser(mockFetcher("", 404));
    await expect(parser.parse(SITEMAP_URL)).rejects.toThrow("404");
  });

  it("throws when no URLs found", async () => {
    const parser = new SitemapParser(mockFetcher("<urlset></urlset>"));
    await expect(parser.parse(SITEMAP_URL)).rejects.toThrow("No URLs found");
  });

  it("filters URLs by include patterns", async () => {
    const { parser, sitemap } = await parseSample();
    const kept = parser.filterUrls(sitemap, {
      sitemap_url: SITEMAP_URL,
      include: ["/services/*"],
    });
    expect(kept).toHaveLength(3);
    expect(kept.every((url) => url.includes("/services/"))).toBe(true);
  });

  it("filters URLs by exclude patterns", async () => {
    const { parser, sitemap } = await parseSample();
    const kept = parser.filterUrls(sitemap, {
      sitemap_url: SITEMAP_URL,
      exclude: ["/blog/*"],
    });
    expect(kept.some((url) => url.includes("/blog/"))).toBe(false);
    expect(kept.length).toBe(6); // 8 total - 2 blog
  });

  it("combines include and exclude", async () => {
    const { parser, sitemap } = await parseSample();
    // /blog/* is both included and excluded; only the services URLs should remain.
    const kept = parser.filterUrls(sitemap, {
      sitemap_url: SITEMAP_URL,
      include: ["/services/*", "/blog/*"],
      exclude: ["/blog/*"],
    });
    expect(kept).toHaveLength(3);
    expect(kept.every((url) => url.includes("/services/"))).toBe(true);
  });
});
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import type { EntityCluster, ExtractedEntity, ExtractedRelation, KnowledgeGraph } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/** Construction options for GraphBuilder. */
export interface GraphBuilderOptions {
  /**
   * Relations whose confidence is below this value are left out of the graph
   * produced by `build()`. Defaults to 0 when omitted.
   */
  minConfidence?: number;
}
|
|
6
|
+
|
|
7
|
+
/**
 * Accumulates extracted entities and relations and turns them into a KnowledgeGraph.
 *
 * Entities are keyed by their exact `name`. Relations are kept in insertion order
 * and filtered by `minConfidence` only when `build()` is called; entities are never
 * filtered by confidence.
 */
export class GraphBuilder {
  private readonly entities: Map<string, ExtractedEntity> = new Map();
  private readonly relations: ExtractedRelation[] = [];
  private readonly sourceFiles: Set<string> = new Set();
  private readonly minConfidence: number;

  /**
   * @param options - Optional settings; `minConfidence` defaults to 0.
   */
  constructor(options: GraphBuilderOptions = {}) {
    this.minConfidence = options.minConfidence ?? 0;
  }

  /**
   * Registers entities, merging duplicates by name.
   *
   * When a name is already known, the aliases of both records are unioned
   * (existing aliases first) and the record with the strictly higher confidence
   * supplies all other fields. Every entity's `source_file` is counted as a
   * source file of the graph.
   *
   * @param entities - Entities to add; the input objects are copied, not stored.
   */
  addEntities(entities: ExtractedEntity[]): void {
    for (const entity of entities) {
      this.sourceFiles.add(entity.source_file);

      const existing = this.entities.get(entity.name);
      if (!existing) {
        this.entities.set(entity.name, { ...entity });
        continue;
      }

      const mergedAliases = Array.from(
        new Set([...(existing.aliases || []), ...(entity.aliases || [])]),
      );
      if (entity.confidence > existing.confidence) {
        this.entities.set(entity.name, { ...entity, aliases: mergedAliases });
      } else {
        existing.aliases = mergedAliases;
      }
    }
  }

  /**
   * @returns The builder's live entity map (not a copy), keyed by entity name.
   */
  getEntities(): Map<string, ExtractedEntity> {
    return this.entities;
  }

  /**
   * Appends relations (as shallow copies) and records their source files.
   *
   * @param relations - Relations to add; no de-duplication is performed.
   */
  addRelations(relations: ExtractedRelation[]): void {
    for (const relation of relations) {
      this.sourceFiles.add(relation.source_file);
      this.relations.push({ ...relation });
    }
  }

  /**
   * Produces a graph snapshot from everything added so far.
   *
   * @returns A graph holding a shallow copy of the entity map, the relations with
   *          confidence >= `minConfidence`, the clusters derived from those
   *          relations, the build time as an ISO string, and the number of
   *          distinct source files seen.
   */
  build(): KnowledgeGraph {
    const filteredRelations = this.relations.filter((r) => r.confidence >= this.minConfidence);
    const clusters = this.buildClusters(filteredRelations);
    return {
      entities: new Map(this.entities),
      relations: filteredRelations,
      clusters,
      built_at: new Date().toISOString(),
      file_count: this.sourceFiles.size,
    };
  }

  /**
   * Groups entities into connected components using the given relations.
   *
   * Only relations whose two endpoints are both known entities link anything.
   * Components with fewer than two entities are not reported. A cluster's
   * coherence score is the mean confidence of its linking relations, rounded to
   * two decimals (0 when it has none).
   */
  private buildClusters(relations: ExtractedRelation[]): EntityCluster[] {
    // Union-find over entity names; every entity starts as its own root.
    const parent = new Map<string, string>();
    for (const name of this.entities.keys()) parent.set(name, name);

    const find = (start: string): string => {
      let root = start;
      for (let up = parent.get(root); up !== undefined && up !== root; up = parent.get(root)) {
        root = up;
      }
      // Path compression: point every node on the walked path straight at the root.
      let node = start;
      while (node !== root) {
        const up = parent.get(node) ?? root;
        parent.set(node, root);
        node = up;
      }
      return root;
    };

    const linking = relations.filter(
      (r) => this.entities.has(r.from_entity) && this.entities.has(r.to_entity),
    );
    for (const relation of linking) {
      const rootA = find(relation.from_entity);
      const rootB = find(relation.to_entity);
      if (rootA !== rootB) parent.set(rootB, rootA);
    }

    // Members are collected in entity insertion order, so cluster order and each
    // cluster's first member (used as its name) are stable.
    const membersByRoot = new Map<string, string[]>();
    for (const name of this.entities.keys()) {
      const root = find(name);
      const members = membersByRoot.get(root);
      if (members) members.push(name);
      else membersByRoot.set(root, [name]);
    }

    // One pass over the relations instead of re-scanning them for every cluster:
    // both endpoints of a linking relation share a root, so the source's root
    // identifies the cluster.
    const tallyByRoot = new Map<string, { sum: number; count: number }>();
    for (const relation of linking) {
      const root = find(relation.from_entity);
      const tally = tallyByRoot.get(root) ?? { sum: 0, count: 0 };
      tally.sum += relation.confidence;
      tally.count += 1;
      tallyByRoot.set(root, tally);
    }

    const clusters: EntityCluster[] = [];
    for (const [root, members] of membersByRoot) {
      if (members.length < 2) continue;
      const tally = tallyByRoot.get(root);
      const coherence = tally && tally.count > 0 ? tally.sum / tally.count : 0;
      clusters.push({
        id: `cluster-${clusters.length}`,
        name: members[0],
        entities: members,
        coherence_score: Math.round(coherence * 100) / 100,
      });
    }
    return clusters;
  }
}
|
package/src/graph-ops.ts
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import type { ContentFile } from "@sourcepress/core";
|
|
2
|
+
import type {
|
|
3
|
+
ExtractedEntity,
|
|
4
|
+
GraphQueryResult,
|
|
5
|
+
KnowledgeGap,
|
|
6
|
+
KnowledgeGraph,
|
|
7
|
+
StaleContent,
|
|
8
|
+
} from "./types.js";
|
|
9
|
+
|
|
10
|
+
/** Summary figures for a knowledge graph, as returned by `GraphOps.getStats()`. */
export interface GraphStats {
  /** Number of entries in the graph's entity map. */
  entity_count: number;
  /** Number of relations in the graph. */
  relation_count: number;
  /** Number of clusters in the graph. */
  cluster_count: number;
  /** The graph's `file_count` value. */
  file_count: number;
  /** The graph's `built_at` value. */
  built_at: string;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Read-only queries over a built KnowledgeGraph: entity lookup by name or alias,
 * detection of entities without matching content, detection of content that is
 * older than its sources, and summary statistics.
 */
export class GraphOps {
  private readonly graph: KnowledgeGraph;
  /** Lower-cased name or alias -> canonical entity name. */
  private readonly aliasIndex: Map<string, string>;

  /**
   * @param graph - The graph to query. The alias index is built once, here, so
   *                later changes to the graph's entities are not reflected in it.
   */
  constructor(graph: KnowledgeGraph) {
    this.graph = graph;
    this.aliasIndex = this.buildAliasIndex();
  }

  /**
   * Tells whether `candidate` denotes a later instant than `reference`.
   *
   * Timestamps are compared as parsed instants so that differing ISO spellings
   * ("...:00Z" vs "...:00.000Z", or UTC offsets) order correctly. If either value
   * cannot be parsed, the plain string comparison is used as a fallback.
   */
  private static isLater(candidate: string, reference: string): boolean {
    const candidateMs = Date.parse(candidate);
    const referenceMs = Date.parse(reference);
    if (Number.isNaN(candidateMs) || Number.isNaN(referenceMs)) {
      return candidate > reference;
    }
    return candidateMs > referenceMs;
  }

  /**
   * Builds the case-insensitive lookup table for names and aliases.
   *
   * Aliases are indexed first and canonical names second, so an alias of one
   * entity can never shadow the canonical name of another. Between conflicting
   * aliases the entity that comes later in the graph wins.
   */
  private buildAliasIndex(): Map<string, string> {
    const index = new Map<string, string>();
    for (const [name, entity] of this.graph.entities) {
      for (const alias of entity.aliases ?? []) {
        index.set(alias.toLowerCase(), name);
      }
    }
    for (const name of this.graph.entities.keys()) {
      index.set(name.toLowerCase(), name);
    }
    return index;
  }

  /** Resolves an exact name first, then a case-insensitive name or alias. */
  private resolveEntityName(nameOrAlias: string): string | null {
    if (this.graph.entities.has(nameOrAlias)) return nameOrAlias;
    return this.aliasIndex.get(nameOrAlias.toLowerCase()) ?? null;
  }

  /**
   * Looks up an entity together with its direct neighbourhood.
   *
   * @param nameOrAlias - Canonical name (exact) or any name/alias (case-insensitive).
   * @returns The entity, every relation that has it as source or target, the known
   *          entities at the other end of those relations, and the distinct source
   *          files of the entity and those relations; `null` if nothing matches.
   */
  query(nameOrAlias: string): GraphQueryResult | null {
    const resolvedName = this.resolveEntityName(nameOrAlias);
    if (!resolvedName) return null;
    const entity = this.graph.entities.get(resolvedName);
    if (!entity) return null;

    const relations = this.graph.relations.filter(
      (r) => r.from_entity === resolvedName || r.to_entity === resolvedName,
    );

    const relatedNames = new Set<string>();
    const files = new Set<string>([entity.source_file]);
    for (const r of relations) {
      if (r.from_entity === resolvedName) relatedNames.add(r.to_entity);
      if (r.to_entity === resolvedName) relatedNames.add(r.from_entity);
      files.add(r.source_file);
    }

    // Relation endpoints that are not entities of the graph are skipped.
    const related_entities: ExtractedEntity[] = [];
    for (const name of relatedNames) {
      const related = this.graph.entities.get(name);
      if (related) related_entities.push(related);
    }

    return { entity, relations, related_entities, files: Array.from(files) };
  }

  /**
   * Lists entities that no content file mentions.
   *
   * An entity counts as mentioned when its canonical name occurs,
   * case-insensitively, in a content file's body or in its frontmatter title.
   * Aliases are not consulted.
   *
   * @param contentFiles - Content to search.
   * @returns One gap per unmentioned entity, in graph order. `knowledge_file_count`
   *          is always reported as 1.
   */
  findGaps(contentFiles: ContentFile[]): KnowledgeGap[] {
    // Lower-case each file once instead of once per entity.
    const searchable = contentFiles.map((c) => ({
      body: c.body.toLowerCase(),
      title: String(c.frontmatter.title ?? "").toLowerCase(),
    }));

    const gaps: KnowledgeGap[] = [];
    for (const [name, entity] of this.graph.entities) {
      const needle = name.toLowerCase();
      const mentioned = searchable.some(
        (text) => text.body.includes(needle) || text.title.includes(needle),
      );
      if (mentioned) continue;
      gaps.push({
        entity_name: name,
        entity_type: entity.type,
        knowledge_file_count: 1,
        content_file_count: 0,
        reason: `Entity "${name}" (${entity.type}) exists in knowledge but has no corresponding content`,
      });
    }
    return gaps;
  }

  /**
   * Finds content that was generated before one of its sources last changed.
   *
   * Content without provenance (`generated_at` and `source_files`) is skipped, as
   * are sources that have no entry in `knowledgeTimestamps`.
   *
   * @param contentFiles - Content files to check.
   * @param knowledgeTimestamps - Source path -> last-change timestamp.
   * @returns One record per stale content file, listing the sources that are newer
   *          than the content and the newest of their timestamps (reported in its
   *          original spelling).
   */
  findStale(
    contentFiles: ContentFile[],
    knowledgeTimestamps: Record<string, string>,
  ): StaleContent[] {
    const stale: StaleContent[] = [];
    for (const content of contentFiles) {
      if (!content.provenance?.generated_at || !content.provenance?.source_files) continue;
      const generatedAt = content.provenance.generated_at;
      const staleSources: string[] = [];
      let newestSourceChange = "";
      for (const sourcePath of content.provenance.source_files) {
        const sourceTimestamp = knowledgeTimestamps[sourcePath];
        if (!sourceTimestamp || !GraphOps.isLater(sourceTimestamp, generatedAt)) continue;
        staleSources.push(sourcePath);
        if (
          newestSourceChange === "" ||
          GraphOps.isLater(sourceTimestamp, newestSourceChange)
        ) {
          newestSourceChange = sourceTimestamp;
        }
      }
      if (staleSources.length > 0) {
        stale.push({
          content_path: content.path,
          generated_at: generatedAt,
          newest_source_change: newestSourceChange,
          stale_sources: staleSources,
          reason: `Content generated at ${generatedAt} but source(s) updated at ${newestSourceChange}`,
        });
      }
    }
    return stale;
  }

  /** @returns Entity, relation and cluster counts plus the graph's file count and build time. */
  getStats(): GraphStats {
    return {
      entity_count: this.graph.entities.size,
      relation_count: this.graph.relations.length,
      cluster_count: this.graph.clusters.length,
      file_count: this.graph.file_count,
      built_at: this.graph.built_at,
    };
  }
}
|