@sourcepress/knowledge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +4 -0
- package/.turbo/turbo-test.log +21 -0
- package/dist/__tests__/graph-builder.test.d.ts +2 -0
- package/dist/__tests__/graph-builder.test.d.ts.map +1 -0
- package/dist/__tests__/graph-builder.test.js +122 -0
- package/dist/__tests__/graph-builder.test.js.map +1 -0
- package/dist/__tests__/graph-ops.test.d.ts +2 -0
- package/dist/__tests__/graph-ops.test.d.ts.map +1 -0
- package/dist/__tests__/graph-ops.test.js +181 -0
- package/dist/__tests__/graph-ops.test.js.map +1 -0
- package/dist/__tests__/ingestion.test.d.ts +2 -0
- package/dist/__tests__/ingestion.test.d.ts.map +1 -0
- package/dist/__tests__/ingestion.test.js +108 -0
- package/dist/__tests__/ingestion.test.js.map +1 -0
- package/dist/__tests__/json-file-store.test.d.ts +2 -0
- package/dist/__tests__/json-file-store.test.d.ts.map +1 -0
- package/dist/__tests__/json-file-store.test.js +180 -0
- package/dist/__tests__/json-file-store.test.js.map +1 -0
- package/dist/__tests__/knowledge-engine.test.d.ts +2 -0
- package/dist/__tests__/knowledge-engine.test.d.ts.map +1 -0
- package/dist/__tests__/knowledge-engine.test.js +152 -0
- package/dist/__tests__/knowledge-engine.test.js.map +1 -0
- package/dist/__tests__/knowledge-store.test.d.ts +2 -0
- package/dist/__tests__/knowledge-store.test.d.ts.map +1 -0
- package/dist/__tests__/knowledge-store.test.js +97 -0
- package/dist/__tests__/knowledge-store.test.js.map +1 -0
- package/dist/__tests__/scraper.test.d.ts +2 -0
- package/dist/__tests__/scraper.test.d.ts.map +1 -0
- package/dist/__tests__/scraper.test.js +66 -0
- package/dist/__tests__/scraper.test.js.map +1 -0
- package/dist/__tests__/sitemap-parser.test.d.ts +2 -0
- package/dist/__tests__/sitemap-parser.test.d.ts.map +1 -0
- package/dist/__tests__/sitemap-parser.test.js +75 -0
- package/dist/__tests__/sitemap-parser.test.js.map +1 -0
- package/dist/graph-builder.d.ts +17 -0
- package/dist/graph-builder.d.ts.map +1 -0
- package/dist/graph-builder.js +98 -0
- package/dist/graph-builder.js.map +1 -0
- package/dist/graph-ops.d.ts +21 -0
- package/dist/graph-ops.d.ts.map +1 -0
- package/dist/graph-ops.js +108 -0
- package/dist/graph-ops.js.map +1 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -0
- package/dist/ingestion/index.d.ts +4 -0
- package/dist/ingestion/index.d.ts.map +1 -0
- package/dist/ingestion/index.js +3 -0
- package/dist/ingestion/index.js.map +1 -0
- package/dist/ingestion/scraper.d.ts +22 -0
- package/dist/ingestion/scraper.d.ts.map +1 -0
- package/dist/ingestion/scraper.js +118 -0
- package/dist/ingestion/scraper.js.map +1 -0
- package/dist/ingestion/sitemap-parser.d.ts +32 -0
- package/dist/ingestion/sitemap-parser.d.ts.map +1 -0
- package/dist/ingestion/sitemap-parser.js +104 -0
- package/dist/ingestion/sitemap-parser.js.map +1 -0
- package/dist/ingestion/types.d.ts +58 -0
- package/dist/ingestion/types.d.ts.map +1 -0
- package/dist/ingestion/types.js +2 -0
- package/dist/ingestion/types.js.map +1 -0
- package/dist/json-file-store.d.ts +19 -0
- package/dist/json-file-store.d.ts.map +1 -0
- package/dist/json-file-store.js +100 -0
- package/dist/json-file-store.js.map +1 -0
- package/dist/knowledge-engine.d.ts +45 -0
- package/dist/knowledge-engine.d.ts.map +1 -0
- package/dist/knowledge-engine.js +160 -0
- package/dist/knowledge-engine.js.map +1 -0
- package/dist/knowledge-store.d.ts +14 -0
- package/dist/knowledge-store.d.ts.map +1 -0
- package/dist/knowledge-store.js +40 -0
- package/dist/knowledge-store.js.map +1 -0
- package/dist/types.d.ts +67 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +26 -0
- package/src/__tests__/graph-builder.test.ts +129 -0
- package/src/__tests__/graph-ops.test.ts +189 -0
- package/src/__tests__/ingestion.test.ts +127 -0
- package/src/__tests__/json-file-store.test.ts +206 -0
- package/src/__tests__/knowledge-engine.test.ts +177 -0
- package/src/__tests__/knowledge-store.test.ts +111 -0
- package/src/__tests__/scraper.test.ts +74 -0
- package/src/__tests__/sitemap-parser.test.ts +85 -0
- package/src/graph-builder.ts +109 -0
- package/src/graph-ops.ts +129 -0
- package/src/index.ts +27 -0
- package/src/ingestion/index.ts +10 -0
- package/src/ingestion/scraper.ts +137 -0
- package/src/ingestion/sitemap-parser.ts +119 -0
- package/src/ingestion/types.ts +57 -0
- package/src/json-file-store.ts +127 -0
- package/src/knowledge-engine.ts +217 -0
- package/src/knowledge-store.ts +49 -0
- package/src/types.ts +76 -0
- package/tsconfig.json +5 -0
- package/vitest.config.ts +2 -0
package/src/index.ts
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
// Storage backends and the core engine.
export { InMemoryKnowledgeStore } from "./knowledge-store.js";
export { JsonFileStore } from "./json-file-store.js";
export { KnowledgeEngine } from "./knowledge-engine.js";

// Graph construction and read-side graph operations.
export { GraphBuilder, type GraphBuilderOptions } from "./graph-builder.js";
export { GraphOps, type GraphStats } from "./graph-ops.js";

// Shared knowledge-domain types.
export type {
	ExtractionResult,
	ExtractedEntity,
	ExtractedRelation,
	EntityCluster,
	KnowledgeGraph,
	GraphQueryResult,
	KnowledgeGap,
	StaleContent,
	KnowledgeStoreBackend,
	KnowledgeFileFilter,
} from "./types.js";

// Content ingestion: URL scraping and sitemap parsing.
export { Scraper } from "./ingestion/scraper.js";
export { SitemapParser } from "./ingestion/sitemap-parser.js";
export type {
	Fetcher,
	ScrapeResult,
	SitemapSection,
	SitemapResult,
	SitemapRunOptions,
	BatchProgress,
} from "./ingestion/types.js";
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import { Readability } from "@mozilla/readability";
|
|
2
|
+
import { parseHTML } from "linkedom";
|
|
3
|
+
import type { Fetcher, ScrapeResult } from "./types.js";
|
|
4
|
+
|
|
5
|
+
const PRIVATE_IP_PATTERNS = [
|
|
6
|
+
/^127\./,
|
|
7
|
+
/^10\./,
|
|
8
|
+
/^192\.168\./,
|
|
9
|
+
/^172\.(1[6-9]|2\d|3[01])\./,
|
|
10
|
+
/^169\.254\./,
|
|
11
|
+
/^::1$/,
|
|
12
|
+
/^fc00:/i,
|
|
13
|
+
/^fe80:/i,
|
|
14
|
+
];
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Validate a URL against SSRF risks.
|
|
18
|
+
* Throws if the URL targets a private/loopback address or non-http(s) scheme.
|
|
19
|
+
*/
|
|
20
|
+
export function validateUrl(rawUrl: string): URL {
|
|
21
|
+
let parsed: URL;
|
|
22
|
+
try {
|
|
23
|
+
parsed = new URL(rawUrl);
|
|
24
|
+
} catch {
|
|
25
|
+
throw new Error(`Invalid URL: ${rawUrl}`);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
|
|
29
|
+
throw new Error(
|
|
30
|
+
`URL scheme "${parsed.protocol}" is not allowed. Only http and https are permitted.`,
|
|
31
|
+
);
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
const hostname = parsed.hostname.toLowerCase();
|
|
35
|
+
for (const pattern of PRIVATE_IP_PATTERNS) {
|
|
36
|
+
if (pattern.test(hostname)) {
|
|
37
|
+
throw new Error(
|
|
38
|
+
`URL hostname "${hostname}" resolves to a private or loopback address, which is not allowed.`,
|
|
39
|
+
);
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Block "localhost" explicitly
|
|
44
|
+
if (hostname === "localhost" || hostname === "0.0.0.0") {
|
|
45
|
+
throw new Error(`URL hostname "${hostname}" is not allowed.`);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
return parsed;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Scrapes a URL into readable content.
|
|
53
|
+
* Uses linkedom for DOM parsing (works in all runtimes) + Mozilla Readability for extraction.
|
|
54
|
+
*/
|
|
55
|
+
export class Scraper {
|
|
56
|
+
private fetcher: Fetcher;
|
|
57
|
+
|
|
58
|
+
constructor(fetcher?: Fetcher) {
|
|
59
|
+
this.fetcher = fetcher ?? globalThis.fetch.bind(globalThis);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
async scrape(url: string): Promise<ScrapeResult> {
|
|
63
|
+
validateUrl(url);
|
|
64
|
+
|
|
65
|
+
const response = await this.fetcher(url, undefined);
|
|
66
|
+
if (!response.ok) {
|
|
67
|
+
throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
const html = await response.text();
|
|
71
|
+
return this.extractFromHtml(url, html);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
extractFromHtml(url: string, html: string): ScrapeResult {
|
|
75
|
+
const { document } = parseHTML(html);
|
|
76
|
+
// biome-ignore lint/suspicious/noExplicitAny: linkedom document is not typed as DOM Document
|
|
77
|
+
const reader = new Readability(document as any);
|
|
78
|
+
const article = reader.parse();
|
|
79
|
+
|
|
80
|
+
const text = article?.textContent?.trim() ?? "";
|
|
81
|
+
if (!article || text.length < 50) {
|
|
82
|
+
throw new Error(`No readable content found at ${url}`);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
const markdown = this.htmlToMarkdown(article.content);
|
|
86
|
+
|
|
87
|
+
return {
|
|
88
|
+
url,
|
|
89
|
+
title: article.title || new URL(url).pathname,
|
|
90
|
+
content: article.textContent,
|
|
91
|
+
markdown,
|
|
92
|
+
byline: article.byline || undefined,
|
|
93
|
+
excerpt: article.excerpt || undefined,
|
|
94
|
+
length: article.textContent.length,
|
|
95
|
+
scraped_at: new Date().toISOString(),
|
|
96
|
+
};
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Lightweight HTML to markdown conversion.
|
|
101
|
+
* Handles common elements: headings, paragraphs, links, lists, bold, italic, code.
|
|
102
|
+
*/
|
|
103
|
+
private htmlToMarkdown(html: string): string {
|
|
104
|
+
let md = html;
|
|
105
|
+
// Headings
|
|
106
|
+
md = md.replace(/<h1[^>]*>(.*?)<\/h1>/gi, "# $1\n\n");
|
|
107
|
+
md = md.replace(/<h2[^>]*>(.*?)<\/h2>/gi, "## $1\n\n");
|
|
108
|
+
md = md.replace(/<h3[^>]*>(.*?)<\/h3>/gi, "### $1\n\n");
|
|
109
|
+
md = md.replace(/<h4[^>]*>(.*?)<\/h4>/gi, "#### $1\n\n");
|
|
110
|
+
// Bold and italic
|
|
111
|
+
md = md.replace(/<(strong|b)>(.*?)<\/\1>/gi, "**$2**");
|
|
112
|
+
md = md.replace(/<(em|i)>(.*?)<\/\1>/gi, "*$2*");
|
|
113
|
+
// Code
|
|
114
|
+
md = md.replace(/<code>(.*?)<\/code>/gi, "`$1`");
|
|
115
|
+
md = md.replace(/<pre[^>]*>(.*?)<\/pre>/gis, "```\n$1\n```\n\n");
|
|
116
|
+
// Links
|
|
117
|
+
md = md.replace(/<a[^>]+href="([^"]*)"[^>]*>(.*?)<\/a>/gi, "[$2]($1)");
|
|
118
|
+
// Lists
|
|
119
|
+
md = md.replace(/<li[^>]*>(.*?)<\/li>/gi, "- $1\n");
|
|
120
|
+
md = md.replace(/<\/?[uo]l[^>]*>/gi, "\n");
|
|
121
|
+
// Paragraphs and line breaks
|
|
122
|
+
md = md.replace(/<br\s*\/?>/gi, "\n");
|
|
123
|
+
md = md.replace(/<p[^>]*>(.*?)<\/p>/gis, "$1\n\n");
|
|
124
|
+
// Strip remaining HTML tags
|
|
125
|
+
md = md.replace(/<[^>]+>/g, "");
|
|
126
|
+
// Decode basic HTML entities
|
|
127
|
+
md = md.replace(/&/g, "&");
|
|
128
|
+
md = md.replace(/</g, "<");
|
|
129
|
+
md = md.replace(/>/g, ">");
|
|
130
|
+
md = md.replace(/"/g, '"');
|
|
131
|
+
md = md.replace(/'/g, "'");
|
|
132
|
+
md = md.replace(/ /g, " ");
|
|
133
|
+
// Clean up whitespace
|
|
134
|
+
md = md.replace(/\n{3,}/g, "\n\n").trim();
|
|
135
|
+
return md;
|
|
136
|
+
}
|
|
137
|
+
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { validateUrl } from "./scraper.js";
|
|
2
|
+
import type { Fetcher, SitemapResult, SitemapRunOptions, SitemapSection } from "./types.js";
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Parses XML sitemaps and groups URLs by path pattern for interactive selection.
|
|
6
|
+
*/
|
|
7
|
+
export class SitemapParser {
|
|
8
|
+
private fetcher: Fetcher;
|
|
9
|
+
|
|
10
|
+
constructor(fetcher?: Fetcher) {
|
|
11
|
+
this.fetcher = fetcher ?? globalThis.fetch.bind(globalThis);
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Fetch and parse a sitemap, returning URLs grouped by path section.
|
|
16
|
+
*/
|
|
17
|
+
async parse(sitemapUrl: string): Promise<SitemapResult> {
|
|
18
|
+
validateUrl(sitemapUrl);
|
|
19
|
+
const response = await this.fetcher(sitemapUrl);
|
|
20
|
+
if (!response.ok) {
|
|
21
|
+
throw new Error(`Failed to fetch sitemap ${sitemapUrl}: ${response.status}`);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
const xml = await response.text();
|
|
25
|
+
const urls = this.extractUrls(xml);
|
|
26
|
+
|
|
27
|
+
if (urls.length === 0) {
|
|
28
|
+
throw new Error(`No URLs found in sitemap ${sitemapUrl}`);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
const sections = this.groupByPattern(urls);
|
|
32
|
+
|
|
33
|
+
return {
|
|
34
|
+
sitemap_url: sitemapUrl,
|
|
35
|
+
sections,
|
|
36
|
+
total_urls: urls.length,
|
|
37
|
+
};
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Filter URLs based on include/exclude patterns.
|
|
42
|
+
* Patterns use simple glob matching: "/blog/*" matches "/blog/anything".
|
|
43
|
+
*/
|
|
44
|
+
filterUrls(result: SitemapResult, options: SitemapRunOptions): string[] {
|
|
45
|
+
let urls = result.sections.flatMap((s) => s.urls);
|
|
46
|
+
|
|
47
|
+
if (options.include && options.include.length > 0) {
|
|
48
|
+
urls = urls.filter((url) => {
|
|
49
|
+
const path = new URL(url).pathname;
|
|
50
|
+
return options.include?.some((pattern) => this.matchPattern(path, pattern));
|
|
51
|
+
});
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
if (options.exclude && options.exclude.length > 0) {
|
|
55
|
+
urls = urls.filter((url) => {
|
|
56
|
+
const path = new URL(url).pathname;
|
|
57
|
+
return !options.exclude?.some((pattern) => this.matchPattern(path, pattern));
|
|
58
|
+
});
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
return urls;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Extract <loc> URLs from sitemap XML.
|
|
66
|
+
* Handles both regular sitemaps and sitemap indexes.
|
|
67
|
+
*/
|
|
68
|
+
private extractUrls(xml: string): string[] {
|
|
69
|
+
const urls: string[] = [];
|
|
70
|
+
const locRegex = /<loc>\s*(.*?)\s*<\/loc>/gi;
|
|
71
|
+
for (const match of xml.matchAll(locRegex)) {
|
|
72
|
+
const url = match[1].trim();
|
|
73
|
+
if (url) urls.push(url);
|
|
74
|
+
}
|
|
75
|
+
return urls;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Group URLs by their first path segment to create browsable sections.
|
|
80
|
+
*/
|
|
81
|
+
private groupByPattern(urls: string[]): SitemapSection[] {
|
|
82
|
+
const groups = new Map<string, string[]>();
|
|
83
|
+
|
|
84
|
+
for (const url of urls) {
|
|
85
|
+
try {
|
|
86
|
+
const pathname = new URL(url).pathname;
|
|
87
|
+
const segments = pathname.split("/").filter(Boolean);
|
|
88
|
+
const section = segments.length > 0 ? `/${segments[0]}` : "/";
|
|
89
|
+
const existing = groups.get(section) ?? [];
|
|
90
|
+
existing.push(url);
|
|
91
|
+
groups.set(section, existing);
|
|
92
|
+
} catch {
|
|
93
|
+
// Skip invalid URLs
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
return Array.from(groups.entries())
|
|
98
|
+
.map(([pattern, sectionUrls]) => ({
|
|
99
|
+
name:
|
|
100
|
+
pattern === "/" ? "Root" : pattern.slice(1).charAt(0).toUpperCase() + pattern.slice(2),
|
|
101
|
+
pattern: `${pattern}/*`,
|
|
102
|
+
urls: sectionUrls,
|
|
103
|
+
count: sectionUrls.length,
|
|
104
|
+
}))
|
|
105
|
+
.sort((a, b) => b.count - a.count);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
* Simple glob pattern matching.
|
|
110
|
+
* Supports trailing wildcard: "/blog/*" matches "/blog/my-post".
|
|
111
|
+
*/
|
|
112
|
+
private matchPattern(path: string, pattern: string): boolean {
|
|
113
|
+
if (pattern.endsWith("/*")) {
|
|
114
|
+
const prefix = pattern.slice(0, -2);
|
|
115
|
+
return path === prefix || path.startsWith(`${prefix}/`);
|
|
116
|
+
}
|
|
117
|
+
return path === pattern;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
 * Injectable fetch function for testability.
 * Matches the standard fetch signature (subset).
 */
export type Fetcher = (url: string, init?: RequestInit) => Promise<Response>;

/**
 * Result of scraping a single URL.
 */
export interface ScrapeResult {
	/** The URL that was scraped. */
	url: string;
	/** Page title; falls back to the URL pathname when the page has none. */
	title: string;
	content: string; // Readable text extracted from HTML
	markdown: string; // Content converted to markdown
	/** Author attribution, when the page provides one. */
	byline?: string;
	/** Short summary, when the page provides one. */
	excerpt?: string;
	length: number; // Character count of content
	/** ISO-8601 timestamp of when the scrape ran. */
	scraped_at: string;
}

/**
 * Sitemap section — a group of URLs sharing a path pattern.
 */
export interface SitemapSection {
	/** Display name derived from the first path segment (e.g. "Blog"), or "Root". */
	name: string;
	/** Glob-style pattern for the section, e.g. "/blog/*". */
	pattern: string;
	/** All URLs belonging to this section. */
	urls: string[];
	/** Number of URLs in the section (equals urls.length). */
	count: number;
}

/**
 * Parsed sitemap result — sections grouped by URL pattern.
 */
export interface SitemapResult {
	/** The sitemap URL that was fetched. */
	sitemap_url: string;
	/** Sections sorted by size, largest first. */
	sections: SitemapSection[];
	/** Total URL count across all sections. */
	total_urls: number;
}

/**
 * Options for running a sitemap import.
 */
export interface SitemapRunOptions {
	/** The sitemap URL to fetch and parse. */
	sitemap_url: string;
	/** Glob patterns; when non-empty, only matching paths are kept. */
	include?: string[];
	/** Glob patterns; matching paths are dropped (applied after include). */
	exclude?: string[];
}

/**
 * Progress info for batch operations.
 */
export interface BatchProgress {
	/** URLs processed so far, counting both successes and failures. */
	completed: number;
	/** Total number of URLs in the batch. */
	total: number;
	/** Number of URLs that failed. */
	failed: number;
	/** Per-URL outcome; `path` is the stored file path, or "" on failure. */
	results: Array<{ url: string; path: string; success: boolean; error?: string }>;
}
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import type { KnowledgeFile } from "@sourcepress/core";
|
|
4
|
+
import type {
|
|
5
|
+
ExtractedEntity,
|
|
6
|
+
KnowledgeFileFilter,
|
|
7
|
+
KnowledgeGraph,
|
|
8
|
+
KnowledgeStoreBackend,
|
|
9
|
+
} from "./types.js";
|
|
10
|
+
|
|
11
|
+
interface SerializedGraph {
|
|
12
|
+
entities: Array<[string, ExtractedEntity]>;
|
|
13
|
+
relations: KnowledgeGraph["relations"];
|
|
14
|
+
clusters: KnowledgeGraph["clusters"];
|
|
15
|
+
built_at: string;
|
|
16
|
+
file_count: number;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
export class JsonFileStore implements KnowledgeStoreBackend {
|
|
20
|
+
private readonly knowledgePath: string;
|
|
21
|
+
private readonly graphPath: string;
|
|
22
|
+
|
|
23
|
+
private files: Map<string, KnowledgeFile> | null = null;
|
|
24
|
+
private graph: KnowledgeGraph | null | undefined = undefined;
|
|
25
|
+
|
|
26
|
+
constructor(dir: string) {
|
|
27
|
+
this.knowledgePath = join(dir, "knowledge.json");
|
|
28
|
+
this.graphPath = join(dir, "graph.json");
|
|
29
|
+
|
|
30
|
+
if (!existsSync(dir)) {
|
|
31
|
+
mkdirSync(dir, { recursive: true });
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
async store(file: KnowledgeFile): Promise<void> {
|
|
36
|
+
const files = this.loadFiles();
|
|
37
|
+
files.set(file.path, { ...file });
|
|
38
|
+
this.writeFiles(files);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
async retrieve(path: string): Promise<KnowledgeFile | null> {
|
|
42
|
+
return this.loadFiles().get(path) ?? null;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
async list(filter?: KnowledgeFileFilter): Promise<KnowledgeFile[]> {
|
|
46
|
+
let results = Array.from(this.loadFiles().values());
|
|
47
|
+
if (filter?.type) {
|
|
48
|
+
results = results.filter((f) => f.type === filter.type);
|
|
49
|
+
}
|
|
50
|
+
if (filter?.quality) {
|
|
51
|
+
results = results.filter((f) => f.quality === filter.quality);
|
|
52
|
+
}
|
|
53
|
+
if (filter?.source) {
|
|
54
|
+
results = results.filter((f) => f.source === filter.source);
|
|
55
|
+
}
|
|
56
|
+
if (filter?.since) {
|
|
57
|
+
const since = filter.since;
|
|
58
|
+
results = results.filter((f) => f.ingested_at >= since);
|
|
59
|
+
}
|
|
60
|
+
return results;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
async delete(path: string): Promise<boolean> {
|
|
64
|
+
const files = this.loadFiles();
|
|
65
|
+
const deleted = files.delete(path);
|
|
66
|
+
if (deleted) {
|
|
67
|
+
this.writeFiles(files);
|
|
68
|
+
}
|
|
69
|
+
return deleted;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
async count(): Promise<number> {
|
|
73
|
+
return this.loadFiles().size;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
async saveGraph(graph: KnowledgeGraph): Promise<void> {
|
|
77
|
+
this.graph = graph;
|
|
78
|
+
const serialized: SerializedGraph = {
|
|
79
|
+
entities: Array.from(graph.entities.entries()),
|
|
80
|
+
relations: graph.relations,
|
|
81
|
+
clusters: graph.clusters,
|
|
82
|
+
built_at: graph.built_at,
|
|
83
|
+
file_count: graph.file_count,
|
|
84
|
+
};
|
|
85
|
+
writeFileSync(this.graphPath, JSON.stringify(serialized, null, "\t"), "utf-8");
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
async loadGraph(): Promise<KnowledgeGraph | null> {
|
|
89
|
+
if (this.graph !== undefined) {
|
|
90
|
+
return this.graph;
|
|
91
|
+
}
|
|
92
|
+
if (!existsSync(this.graphPath)) {
|
|
93
|
+
this.graph = null;
|
|
94
|
+
return null;
|
|
95
|
+
}
|
|
96
|
+
const raw = readFileSync(this.graphPath, "utf-8");
|
|
97
|
+
const data = JSON.parse(raw) as SerializedGraph;
|
|
98
|
+
this.graph = {
|
|
99
|
+
entities: new Map(data.entities),
|
|
100
|
+
relations: data.relations,
|
|
101
|
+
clusters: data.clusters,
|
|
102
|
+
built_at: data.built_at,
|
|
103
|
+
file_count: data.file_count,
|
|
104
|
+
};
|
|
105
|
+
return this.graph;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
private loadFiles(): Map<string, KnowledgeFile> {
|
|
109
|
+
if (this.files !== null) {
|
|
110
|
+
return this.files;
|
|
111
|
+
}
|
|
112
|
+
if (!existsSync(this.knowledgePath)) {
|
|
113
|
+
this.files = new Map();
|
|
114
|
+
return this.files;
|
|
115
|
+
}
|
|
116
|
+
const raw = readFileSync(this.knowledgePath, "utf-8");
|
|
117
|
+
const entries = JSON.parse(raw) as Array<[string, KnowledgeFile]>;
|
|
118
|
+
this.files = new Map(entries);
|
|
119
|
+
return this.files;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
private writeFiles(files: Map<string, KnowledgeFile>): void {
|
|
123
|
+
this.files = files;
|
|
124
|
+
const entries = Array.from(files.entries());
|
|
125
|
+
writeFileSync(this.knowledgePath, JSON.stringify(entries, null, "\t"), "utf-8");
|
|
126
|
+
}
|
|
127
|
+
}
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import type { BudgetTracker, ResolvedProvider } from "@sourcepress/ai";
|
|
2
|
+
import { classify, extract } from "@sourcepress/ai";
|
|
3
|
+
import type { ContentFile, KnowledgeFile } from "@sourcepress/core";
|
|
4
|
+
import { GraphBuilder } from "./graph-builder.js";
|
|
5
|
+
import { GraphOps } from "./graph-ops.js";
|
|
6
|
+
import { Scraper } from "./ingestion/scraper.js";
|
|
7
|
+
import { SitemapParser } from "./ingestion/sitemap-parser.js";
|
|
8
|
+
import type {
|
|
9
|
+
BatchProgress,
|
|
10
|
+
Fetcher,
|
|
11
|
+
SitemapResult,
|
|
12
|
+
SitemapRunOptions,
|
|
13
|
+
} from "./ingestion/types.js";
|
|
14
|
+
import type {
|
|
15
|
+
ExtractedEntity,
|
|
16
|
+
GraphQueryResult,
|
|
17
|
+
KnowledgeGap,
|
|
18
|
+
KnowledgeGraph,
|
|
19
|
+
KnowledgeStoreBackend,
|
|
20
|
+
StaleContent,
|
|
21
|
+
} from "./types.js";
|
|
22
|
+
|
|
23
|
+
export class KnowledgeEngine {
|
|
24
|
+
private store: KnowledgeStoreBackend;
|
|
25
|
+
private provider: ResolvedProvider;
|
|
26
|
+
private budget: BudgetTracker;
|
|
27
|
+
private graphOps: GraphOps | null = null;
|
|
28
|
+
private currentGraph: KnowledgeGraph | null = null;
|
|
29
|
+
private scraper: Scraper;
|
|
30
|
+
private sitemapParser: SitemapParser;
|
|
31
|
+
|
|
32
|
+
constructor(
|
|
33
|
+
store: KnowledgeStoreBackend,
|
|
34
|
+
provider: ResolvedProvider,
|
|
35
|
+
budget: BudgetTracker,
|
|
36
|
+
fetcher?: Fetcher,
|
|
37
|
+
) {
|
|
38
|
+
this.store = store;
|
|
39
|
+
this.provider = provider;
|
|
40
|
+
this.budget = budget;
|
|
41
|
+
this.scraper = new Scraper(fetcher);
|
|
42
|
+
this.sitemapParser = new SitemapParser(fetcher);
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
async initialize(): Promise<void> {
|
|
46
|
+
const graph = await this.store.loadGraph();
|
|
47
|
+
if (graph) {
|
|
48
|
+
this.currentGraph = graph;
|
|
49
|
+
this.graphOps = new GraphOps(graph);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
async ingest(
|
|
54
|
+
path: string,
|
|
55
|
+
body: string,
|
|
56
|
+
source: KnowledgeFile["source"],
|
|
57
|
+
sourceUrl?: string,
|
|
58
|
+
): Promise<KnowledgeFile> {
|
|
59
|
+
const classification = await classify({ text: body }, this.provider, this.budget);
|
|
60
|
+
const extraction = await extract(
|
|
61
|
+
{ text: body, file_path: path, existing_entities: this.getExistingEntities() },
|
|
62
|
+
this.provider,
|
|
63
|
+
this.budget,
|
|
64
|
+
);
|
|
65
|
+
|
|
66
|
+
const knowledgeFile: KnowledgeFile = {
|
|
67
|
+
path,
|
|
68
|
+
type: classification.type,
|
|
69
|
+
quality: classification.quality,
|
|
70
|
+
quality_score: classification.quality_score,
|
|
71
|
+
entities: extraction.entities.map((e) => ({ type: e.type, name: e.name })),
|
|
72
|
+
ingested_at: new Date().toISOString(),
|
|
73
|
+
source,
|
|
74
|
+
source_url: sourceUrl,
|
|
75
|
+
body,
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
await this.store.store(knowledgeFile);
|
|
79
|
+
return knowledgeFile;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
async buildGraph(): Promise<KnowledgeGraph> {
|
|
83
|
+
const files = await this.store.list();
|
|
84
|
+
const builder = new GraphBuilder();
|
|
85
|
+
|
|
86
|
+
for (const file of files) {
|
|
87
|
+
const extraction = await extract(
|
|
88
|
+
{
|
|
89
|
+
text: file.body,
|
|
90
|
+
file_path: file.path,
|
|
91
|
+
existing_entities: Array.from(builder.getEntities().values()),
|
|
92
|
+
},
|
|
93
|
+
this.provider,
|
|
94
|
+
this.budget,
|
|
95
|
+
);
|
|
96
|
+
|
|
97
|
+
const entities: ExtractedEntity[] = extraction.entities.map((e) => ({
|
|
98
|
+
type: e.type,
|
|
99
|
+
name: e.name,
|
|
100
|
+
aliases: e.aliases ?? [],
|
|
101
|
+
confidence: e.confidence ?? 0.5,
|
|
102
|
+
source_file: file.path,
|
|
103
|
+
}));
|
|
104
|
+
|
|
105
|
+
builder.addEntities(entities);
|
|
106
|
+
builder.addRelations(extraction.relations.map((r) => ({ ...r, source_file: file.path })));
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
const graph = builder.build();
|
|
110
|
+
this.currentGraph = graph;
|
|
111
|
+
this.graphOps = new GraphOps(graph);
|
|
112
|
+
await this.store.saveGraph(graph);
|
|
113
|
+
return graph;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
query(nameOrAlias: string): GraphQueryResult | null {
|
|
117
|
+
if (!this.graphOps) return null;
|
|
118
|
+
return this.graphOps.query(nameOrAlias);
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
findGaps(contentFiles: ContentFile[]): KnowledgeGap[] {
|
|
122
|
+
if (!this.graphOps) return [];
|
|
123
|
+
return this.graphOps.findGaps(contentFiles);
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
findStale(
|
|
127
|
+
contentFiles: ContentFile[],
|
|
128
|
+
knowledgeTimestamps: Record<string, string>,
|
|
129
|
+
): StaleContent[] {
|
|
130
|
+
if (!this.graphOps) return [];
|
|
131
|
+
return this.graphOps.findStale(contentFiles, knowledgeTimestamps);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
getGraph(): KnowledgeGraph | null {
|
|
135
|
+
return this.currentGraph;
|
|
136
|
+
}
|
|
137
|
+
getStore(): KnowledgeStoreBackend {
|
|
138
|
+
return this.store;
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/**
|
|
142
|
+
* Import a single URL: scrape → classify → extract entities → store.
|
|
143
|
+
*/
|
|
144
|
+
async importUrl(url: string): Promise<KnowledgeFile> {
|
|
145
|
+
const scraped = await this.scraper.scrape(url);
|
|
146
|
+
const path = this.urlToPath(url);
|
|
147
|
+
return this.ingest(path, scraped.markdown, "url", url);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
/**
|
|
151
|
+
* Parse a sitemap and return grouped sections for interactive selection.
|
|
152
|
+
*/
|
|
153
|
+
async parseSitemap(sitemapUrl: string): Promise<SitemapResult> {
|
|
154
|
+
return this.sitemapParser.parse(sitemapUrl);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
/**
|
|
158
|
+
* Filter a parsed sitemap result by include/exclude patterns.
|
|
159
|
+
*/
|
|
160
|
+
filterSitemapUrls(result: SitemapResult, options: SitemapRunOptions): string[] {
|
|
161
|
+
return this.sitemapParser.filterUrls(result, options);
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
/**
|
|
165
|
+
* Import a batch of URLs. Calls progressCallback for each completed URL.
|
|
166
|
+
*/
|
|
167
|
+
async importBatch(
|
|
168
|
+
urls: string[],
|
|
169
|
+
progressCallback?: (completed: number, total: number, failed: number) => void,
|
|
170
|
+
): Promise<BatchProgress> {
|
|
171
|
+
const progress: BatchProgress = {
|
|
172
|
+
completed: 0,
|
|
173
|
+
total: urls.length,
|
|
174
|
+
failed: 0,
|
|
175
|
+
results: [],
|
|
176
|
+
};
|
|
177
|
+
|
|
178
|
+
for (const url of urls) {
|
|
179
|
+
try {
|
|
180
|
+
const file = await this.importUrl(url);
|
|
181
|
+
progress.completed++;
|
|
182
|
+
progress.results.push({ url, path: file.path, success: true });
|
|
183
|
+
} catch (error) {
|
|
184
|
+
progress.failed++;
|
|
185
|
+
progress.completed++;
|
|
186
|
+
progress.results.push({
|
|
187
|
+
url,
|
|
188
|
+
path: "",
|
|
189
|
+
success: false,
|
|
190
|
+
error: error instanceof Error ? error.message : String(error),
|
|
191
|
+
});
|
|
192
|
+
}
|
|
193
|
+
progressCallback?.(progress.completed, progress.total, progress.failed);
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
return progress;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
/**
|
|
200
|
+
* Convert a URL to a knowledge file path.
|
|
201
|
+
* https://example.com/blog/post-1 → knowledge/example.com/blog/post-1.md
|
|
202
|
+
*/
|
|
203
|
+
private urlToPath(url: string): string {
|
|
204
|
+
const parsed = new URL(url);
|
|
205
|
+
const pathname = parsed.pathname.replace(/\/$/, "") || "/index";
|
|
206
|
+
const clean = pathname.replace(/\.[^.]+$/, ""); // Remove file extensions
|
|
207
|
+
return `knowledge/${parsed.hostname}${clean}.md`;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
private getExistingEntities(): Array<{ type: string; name: string }> {
|
|
211
|
+
if (!this.currentGraph) return [];
|
|
212
|
+
return Array.from(this.currentGraph.entities.values()).map((e) => ({
|
|
213
|
+
type: e.type,
|
|
214
|
+
name: e.name,
|
|
215
|
+
}));
|
|
216
|
+
}
|
|
217
|
+
}
|