greptor 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +365 -0
- package/dist/greptor.d.ts +7 -0
- package/dist/greptor.d.ts.map +1 -0
- package/dist/greptor.js +98 -0
- package/dist/greptor.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/llm-factory.d.ts +7 -0
- package/dist/llm/llm-factory.d.ts.map +1 -0
- package/dist/llm/llm-factory.js +53 -0
- package/dist/llm/llm-factory.js.map +1 -0
- package/dist/metadata-schema/generate.d.ts +3 -0
- package/dist/metadata-schema/generate.d.ts.map +1 -0
- package/dist/metadata-schema/generate.js +43 -0
- package/dist/metadata-schema/generate.js.map +1 -0
- package/dist/metadata-schema/initialize.d.ts +5 -0
- package/dist/metadata-schema/initialize.d.ts.map +1 -0
- package/dist/metadata-schema/initialize.js +37 -0
- package/dist/metadata-schema/initialize.js.map +1 -0
- package/dist/metadata-schema/types.d.ts +34 -0
- package/dist/metadata-schema/types.d.ts.map +1 -0
- package/dist/metadata-schema/types.js +30 -0
- package/dist/metadata-schema/types.js.map +1 -0
- package/dist/processing/chunk.d.ts +3 -0
- package/dist/processing/chunk.d.ts.map +1 -0
- package/dist/processing/chunk.js +36 -0
- package/dist/processing/chunk.js.map +1 -0
- package/dist/processing/extract-metadata.d.ts +4 -0
- package/dist/processing/extract-metadata.d.ts.map +1 -0
- package/dist/processing/extract-metadata.js +39 -0
- package/dist/processing/extract-metadata.js.map +1 -0
- package/dist/processing/processor.d.ts +28 -0
- package/dist/processing/processor.d.ts.map +1 -0
- package/dist/processing/processor.js +112 -0
- package/dist/processing/processor.js.map +1 -0
- package/dist/skills/skill-generator.d.ts +16 -0
- package/dist/skills/skill-generator.d.ts.map +1 -0
- package/dist/skills/skill-generator.js +210 -0
- package/dist/skills/skill-generator.js.map +1 -0
- package/dist/storage/file-storage.d.ts +16 -0
- package/dist/storage/file-storage.d.ts.map +1 -0
- package/dist/storage/file-storage.js +162 -0
- package/dist/storage/file-storage.js.map +1 -0
- package/dist/storage/index.d.ts +3 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/index.js +3 -0
- package/dist/storage/index.js.map +1 -0
- package/dist/storage/types.d.ts +16 -0
- package/dist/storage/types.d.ts.map +1 -0
- package/dist/storage/types.js +2 -0
- package/dist/storage/types.js.map +1 -0
- package/dist/types.d.ts +53 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/file.d.ts +2 -0
- package/dist/utils/file.d.ts.map +1 -0
- package/dist/utils/file.js +11 -0
- package/dist/utils/file.js.map +1 -0
- package/dist/utils/hash.d.ts +2 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +5 -0
- package/dist/utils/hash.js.map +1 -0
- package/package.json +63 -0
package/dist/metadata-schema/initialize.js
@@ -0,0 +1,37 @@
import { mkdir, readFile, writeFile } from "node:fs/promises";
import path from "node:path";
import YAML from "yaml";
import { fileExists } from "../utils/file.js";
import { generateMetadataSchema } from "./generate.js";
export const METADATA_SCHEMA_FILENAME = "metadata-schema.yaml";
async function persist(schemaFilePath, metadataSchema, logger) {
    const schemaYaml = YAML.stringify(metadataSchema);
    await mkdir(path.dirname(schemaFilePath), { recursive: true });
    await writeFile(schemaFilePath, schemaYaml, "utf8");
    logger?.debug?.("Metadata schema saved", { path: schemaFilePath });
}
export async function initializeMetadataSchema(baseDir, llmModel, topic, metadataSchema, logger) {
    const schemaFilePath = path.join(baseDir, METADATA_SCHEMA_FILENAME);
    // If a schema is provided, save it to disk and return it straight away
    if (metadataSchema) {
        await persist(schemaFilePath, metadataSchema, logger);
        return metadataSchema;
    }
    // If schema file exists on disk, load and return it
    if (await fileExists(schemaFilePath)) {
        logger?.debug?.("Metadata schema not provided, loading from file", {
            path: schemaFilePath,
        });
        return YAML.parse(await readFile(schemaFilePath, "utf8"));
    }
    // Otherwise, generate a new schema using the LLM
    logger?.info?.("Generating metadata schema", { topic });
    const schema = await generateMetadataSchema(topic, llmModel);
    await persist(schemaFilePath, schema, logger);
    logger?.info?.("Metadata schema generated", {
        path: schemaFilePath,
        fields: schema.length,
    });
    return schema;
}
//# sourceMappingURL=initialize.js.map
package/dist/metadata-schema/initialize.js.map
@@ -0,0 +1 @@
{"version":3,"file":"initialize.js","sourceRoot":"","sources":["../../src/metadata-schema/initialize.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,IAAI,MAAM,MAAM,CAAC;AAGxB,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,sBAAsB,EAAE,MAAM,eAAe,CAAC;AAEvD,MAAM,CAAC,MAAM,wBAAwB,GAAG,sBAAsB,CAAC;AAE/D,KAAK,UAAU,OAAO,CACrB,cAAsB,EACtB,cAA8B,EAC9B,MAAe;IAEf,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;IAClD,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC/D,MAAM,SAAS,CAAC,cAAc,EAAE,UAAU,EAAE,MAAM,CAAC,CAAC;IAEpD,MAAM,EAAE,KAAK,EAAE,CAAC,uBAAuB,EAAE,EAAE,IAAI,EAAE,cAAc,EAAE,CAAC,CAAC;AACpE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,wBAAwB,CAC7C,OAAe,EACf,QAAgB,EAChB,KAAa,EACb,cAA+B,EAC/B,MAAe;IAEf,MAAM,cAAc,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,wBAAwB,CAAC,CAAC;IAEpE,uEAAuE;IACvE,IAAI,cAAc,EAAE,CAAC;QACpB,MAAM,OAAO,CAAC,cAAc,EAAE,cAAc,EAAE,MAAM,CAAC,CAAC;QACtD,OAAO,cAAc,CAAC;IACvB,CAAC;IAED,oDAAoD;IACpD,IAAI,MAAM,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;QACtC,MAAM,EAAE,KAAK,EAAE,CAAC,iDAAiD,EAAE;YAClE,IAAI,EAAE,cAAc;SACpB,CAAC,CAAC;QAEH,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC,CAAmB,CAAC;IAC7E,CAAC;IAED,iDAAiD;IACjD,MAAM,EAAE,IAAI,EAAE,CAAC,4BAA4B,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;IACxD,MAAM,MAAM,GAAG,MAAM,sBAAsB,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;IAC7D,MAAM,OAAO,CAAC,cAAc,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;IAC9C,MAAM,EAAE,IAAI,EAAE,CAAC,2BAA2B,EAAE;QAC3C,IAAI,EAAE,cAAc;QACpB,MAAM,EAAE,MAAM,CAAC,MAAM;KACrB,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC;AACf,CAAC"}
package/dist/metadata-schema/types.d.ts
@@ -0,0 +1,34 @@
import { z } from "zod";
export declare const MetadataFieldSchema: z.ZodObject<{
    name: z.ZodString;
    type: z.ZodEnum<{
        string: "string";
        number: "number";
        boolean: "boolean";
        "string[]": "string[]";
        "number[]": "number[]";
        enum: "enum";
        "enum[]": "enum[]";
        date: "date";
    }>;
    description: z.ZodString;
    enumValues: z.ZodNullable<z.ZodOptional<z.ZodArray<z.ZodString>>>;
}, z.core.$strip>;
export declare const ResponseSchema: z.ZodObject<{
    metadata_fields: z.ZodArray<z.ZodObject<{
        name: z.ZodString;
        type: z.ZodEnum<{
            string: "string";
            number: "number";
            boolean: "boolean";
            "string[]": "string[]";
            "number[]": "number[]";
            enum: "enum";
            "enum[]": "enum[]";
            date: "date";
        }>;
        description: z.ZodString;
        enumValues: z.ZodNullable<z.ZodOptional<z.ZodArray<z.ZodString>>>;
    }, z.core.$strip>>;
}, z.core.$strip>;
//# sourceMappingURL=types.d.ts.map
package/dist/metadata-schema/types.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/metadata-schema/types.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,mBAAmB;;;;;;;;;;;;;;iBAoB9B,CAAC;AAEH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;iBAMzB,CAAC"}
package/dist/metadata-schema/types.js
@@ -0,0 +1,30 @@
import { z } from "zod";
export const MetadataFieldSchema = z.object({
    name: z.string().describe("Metadata field name in snake_case"),
    type: z
        .enum([
            "string",
            "string[]",
            "number",
            "number[]",
            "boolean",
            "enum",
            "enum[]",
            "date",
        ])
        .describe("Field data type"),
    description: z.string().describe("Purpose and usage of this metadata field"),
    enumValues: z
        .array(z.string())
        .optional()
        .nullable()
        .describe("Full list of enum values for enum types."),
});
export const ResponseSchema = z.object({
    metadata_fields: z
        .array(MetadataFieldSchema)
        .min(5)
        .max(10)
        .describe("List of metadata fields for the given topic"),
});
//# sourceMappingURL=types.js.map
package/dist/metadata-schema/types.js.map
@@ -0,0 +1 @@
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/metadata-schema/types.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC3C,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,mCAAmC,CAAC;IAC9D,IAAI,EAAE,CAAC;SACL,IAAI,CAAC;QACL,QAAQ;QACR,UAAU;QACV,QAAQ;QACR,UAAU;QACV,SAAS;QACT,MAAM;QACN,QAAQ;QACR,MAAM;KACN,CAAC;SACD,QAAQ,CAAC,iBAAiB,CAAC;IAC7B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,0CAA0C,CAAC;IAC5E,UAAU,EAAE,CAAC;SACX,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;SACjB,QAAQ,EAAE;SACV,QAAQ,EAAE;SACV,QAAQ,CAAC,0CAA0C,CAAC;CACtD,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,eAAe,EAAE,CAAC;SAChB,KAAK,CAAC,mBAAmB,CAAC;SAC1B,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,EAAE,CAAC;SACP,QAAQ,CAAC,6CAA6C,CAAC;CACzD,CAAC,CAAC"}
package/dist/processing/chunk.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"chunk.d.ts","sourceRoot":"","sources":["../../src/processing/chunk.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAuBvD,wBAAsB,KAAK,CAC1B,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,GAAG,EAAE,SAAS,GACZ,OAAO,CAAC,MAAM,CAAC,CAiBjB"}
package/dist/processing/chunk.js
@@ -0,0 +1,36 @@
const createCleanPrompt = (text, domain) => `
Clean + structure the raw content into independent semantic chunks for domain: ${domain}

Remove completely (noise/boilerplate): ads/sponsors, greetings/intros/outros, CTAs/promos ("like & subscribe"), duplicates/filler, contact/social/discount codes, sign-offs ("reach out", "visit our website"), anything not primary informational content.

Preserve meaning exactly: no summarizing/paraphrasing. Keep all facts/numbers/names/dates, domain terminology, tables/lists, and URLs only if meaningful.

Normalize: canonicalize entity names; remove formatting garbage/repeated boilerplate; normalize whitespace/tabs/newlines.

Chunking: prefer fewer richer chunks. Target 100–200+ words per chunk where possible. Group related subtopics; split only when topics are truly distinct or context switches.

Output (English only), chunks separated by blank lines:
CHUNK c01: "Short Topic Title"
<cleaned content>

Rules: do not paraphrase, shorten meaning, add interpretation, or merge unrelated topics.

RAW CONTENT:
${text}
`;
export async function chunk(rawContent, domain, llm) {
    const prompt = createCleanPrompt(rawContent, domain);
    const messages = [
        { role: "user", content: prompt },
    ];
    const completion = await llm.client.chat.completions.parse({
        model: llm.model,
        messages,
    });
    const content = completion.choices[0]?.message?.content;
    if (!content) {
        throw new Error("Failed to clean content: empty LLM response");
    }
    return content;
}
//# sourceMappingURL=chunk.js.map
package/dist/processing/chunk.js.map
@@ -0,0 +1 @@
{"version":3,"file":"chunk.js","sourceRoot":"","sources":["../../src/processing/chunk.ts"],"names":[],"mappings":"AAGA,MAAM,iBAAiB,GAAG,CAAC,IAAY,EAAE,MAAc,EAAE,EAAE,CAAC;iFACqB,MAAM;;;;;;;;;;;;;;;;;EAiBrF,IAAI;CACL,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,KAAK,CAC1B,UAAkB,EAClB,MAAc,EACd,GAAc;IAEd,MAAM,MAAM,GAAG,iBAAiB,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;IACrD,MAAM,QAAQ,GAAiC;QAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE;KACjC,CAAC;IAEF,MAAM,UAAU,GAAG,MAAM,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC;QAC1D,KAAK,EAAE,GAAG,CAAC,KAAK;QAChB,QAAQ;KACR,CAAC,CAAC;IAEH,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;IACxD,IAAI,CAAC,OAAO,EAAE,CAAC;QACd,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IAChE,CAAC;IAED,OAAO,OAAO,CAAC;AAChB,CAAC"}
package/dist/processing/extract-metadata.d.ts
@@ -0,0 +1,4 @@
import type { LlmClient } from "../llm/llm-factory.js";
import type { Metadata } from "../types.js";
export declare function extractMetadata(chunkedContent: string, domain: string, metadataSchema: string, llm: LlmClient): Promise<Metadata[]>;
//# sourceMappingURL=extract-metadata.d.ts.map
package/dist/processing/extract-metadata.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"extract-metadata.d.ts","sourceRoot":"","sources":["../../src/processing/extract-metadata.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AA6B5C,wBAAsB,eAAe,CACpC,cAAc,EAAE,MAAM,EACtB,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,MAAM,EACtB,GAAG,EAAE,SAAS,GACZ,OAAO,CAAC,QAAQ,EAAE,CAAC,CAqBrB"}
package/dist/processing/extract-metadata.js
@@ -0,0 +1,39 @@
import YAML from "yaml";
const createExtractMetadataPrompt = (text, domain, metadataSchema) => `
Extract per-chunk metadata for domain: ${domain}. Optimize for grep-based search/filtering.

Input format:
CHUNK c01: "Title"\n...\n\nCHUNK c02: "Title"\n...

Output: YAML ONLY (no fences/comments). YAML list like:
- id: c01
  title: "Title"
  key1: value
  key2: [v1, v2]

Value formats: strings snake_case (except title); numbers put unit in key (price_aud: 1250000); percentages use _percent; dates ISO-8601; ranges as [min, max]; arrays MUST be single-line YAML (e.g., [a, b]).

Rules: extract each chunk separately; do not output null/empty fields; for enum fields, only use values from schema; include additional useful numeric/date metrics for grep filtering.

SCHEMA:
${metadataSchema}

INPUT:
${text}
`;
export async function extractMetadata(chunkedContent, domain, metadataSchema, llm) {
    const prompt = createExtractMetadataPrompt(chunkedContent, domain, metadataSchema);
    const messages = [
        { role: "user", content: prompt },
    ];
    const completion = await llm.client.chat.completions.parse({
        model: llm.model,
        messages,
    });
    const content = completion.choices[0]?.message?.content;
    if (!content) {
        throw new Error("Failed to extract metadata: empty LLM response");
    }
    return YAML.parse(content);
}
//# sourceMappingURL=extract-metadata.js.map
package/dist/processing/extract-metadata.js.map
@@ -0,0 +1 @@
{"version":3,"file":"extract-metadata.js","sourceRoot":"","sources":["../../src/processing/extract-metadata.ts"],"names":[],"mappings":"AACA,OAAO,IAAI,MAAM,MAAM,CAAC;AAIxB,MAAM,2BAA2B,GAAG,CACnC,IAAY,EACZ,MAAc,EACd,cAAsB,EACrB,EAAE,CAAC;yCACoC,MAAM;;;;;;;;;;;;;;;;EAgB7C,cAAc;;;EAGd,IAAI;CACL,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,eAAe,CACpC,cAAsB,EACtB,MAAc,EACd,cAAsB,EACtB,GAAc;IAEd,MAAM,MAAM,GAAG,2BAA2B,CACzC,cAAc,EACd,MAAM,EACN,cAAc,CACd,CAAC;IACF,MAAM,QAAQ,GAAiC;QAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE;KACjC,CAAC;IAEF,MAAM,UAAU,GAAG,MAAM,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC;QAC1D,KAAK,EAAE,GAAG,CAAC,KAAK;QAChB,QAAQ;KACR,CAAC,CAAC;IAEH,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;IACxD,IAAI,CAAC,OAAO,EAAE,CAAC;QACd,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAC;IACnE,CAAC;IAED,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAe,CAAC;AAC1C,CAAC"}
package/dist/processing/processor.d.ts
@@ -0,0 +1,28 @@
import type { LlmClient } from "../llm/llm-factory.js";
import type { DocumentRef, FileStorage } from "../storage/index.js";
import type { Logger } from "../types.js";
export interface ProcessorContext {
    domain: string;
    metadataSchema: string;
    llm: LlmClient;
    storage: FileStorage;
    logger?: Logger;
}
export interface ProcessingQueue {
    enqueue: (ref: DocumentRef) => void;
    dequeue: () => DocumentRef | undefined;
    size: () => number;
}
export declare function createProcessingQueue(): ProcessingQueue;
export declare function startBackgroundWorkers(args: {
    ctx: ProcessorContext;
    queue: ProcessingQueue;
    concurrency?: number;
    idleSleepMs?: number;
}): void;
export declare function enqueueUnprocessedDocuments(args: {
    storage: FileStorage;
    queue: ProcessingQueue;
    logger?: Logger;
}): Promise<number>;
//# sourceMappingURL=processor.d.ts.map
package/dist/processing/processor.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"processor.d.ts","sourceRoot":"","sources":["../../src/processing/processor.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,KAAK,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AACpE,OAAO,KAAK,EAAE,MAAM,EAAY,MAAM,aAAa,CAAC;AAMpD,MAAM,WAAW,gBAAgB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,cAAc,EAAE,MAAM,CAAC;IACvB,GAAG,EAAE,SAAS,CAAC;IACf,OAAO,EAAE,WAAW,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,eAAe;IAC/B,OAAO,EAAE,CAAC,GAAG,EAAE,WAAW,KAAK,IAAI,CAAC;IACpC,OAAO,EAAE,MAAM,WAAW,GAAG,SAAS,CAAC;IACvC,IAAI,EAAE,MAAM,MAAM,CAAC;CACnB;AAED,wBAAgB,qBAAqB,IAAI,eAAe,CAgBvD;AAgFD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE;IAC5C,GAAG,EAAE,gBAAgB,CAAC;IACtB,KAAK,EAAE,eAAe,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB,GAAG,IAAI,CAkCP;AAED,wBAAsB,2BAA2B,CAAC,IAAI,EAAE;IACvD,OAAO,EAAE,WAAW,CAAC;IACrB,KAAK,EAAE,eAAe,CAAC;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB,GAAG,OAAO,CAAC,MAAM,CAAC,CAalB"}
package/dist/processing/processor.js
@@ -0,0 +1,112 @@
import YAML from "yaml";
import { chunk as chunkDocument } from "./chunk.js";
import { extractMetadata } from "./extract-metadata.js";
const DEFAULT_IDLE_SLEEP_MS = 750;
export function createProcessingQueue() {
    const items = [];
    return {
        enqueue(ref) {
            items.push(ref);
        },
        size() {
            return items.length;
        },
        dequeue() {
            return items.shift();
        },
    };
}
function renderProcessedDocument(metadata, chunkMetadata, chunkContent) {
    const combinedMetadata = {
        ...metadata,
        chunks: chunkMetadata,
    };
    const doc = new YAML.Document(combinedMetadata);
    YAML.visit(doc, {
        Seq(_, node) {
            const allScalars = node.items.every((item) => YAML.isScalar(item));
            if (allScalars) {
                node.flow = true;
            }
        },
    });
    const renderedMetadata = doc.toString({ lineWidth: 200 });
    return [
        "---",
        renderedMetadata.trimEnd(),
        "---",
        "",
        chunkContent.trim(),
    ].join("\n");
}
async function processDocument(ref, ctx) {
    // 1. Read raw content
    const { metadata, content } = await ctx.storage.readRawContent(ref);
    const contentLength = content.length;
    // 2. Chunk content with LLM
    ctx.logger?.debug?.("Chunking document", { ref, step: "chunk" });
    const chunkContent = await chunkDocument(content, ctx.domain, ctx.llm);
    // 3. Extract metadata with LLM
    ctx.logger?.debug?.("Extracting metadata", { ref, step: "metadata" });
    const chunkMetadata = await extractMetadata(chunkContent, ctx.domain, ctx.metadataSchema, ctx.llm);
    // 4. Parse chunk metadata and render final content
    const rendered = renderProcessedDocument(metadata, chunkMetadata, chunkContent);
    // 5. Save processed content
    await ctx.storage.saveProcessedContent(ref, rendered);
    ctx.logger?.info?.("Document processed", {
        ref,
        chunks: chunkMetadata.length,
        bytes: contentLength,
    });
}
function sleep(ms) {
    return new Promise((resolve) => {
        const t = setTimeout(resolve, ms);
        // If nothing else is keeping the process alive, don't block exit.
        t.unref?.();
    });
}
export function startBackgroundWorkers(args) {
    const concurrency = Math.max(1, args.concurrency ?? 1);
    const idleSleepMs = Math.max(50, args.idleSleepMs ?? DEFAULT_IDLE_SLEEP_MS);
    const { ctx, queue } = args;
    async function workerLoop(workerIndex) {
        while (true) {
            const docRef = queue.dequeue();
            if (!docRef) {
                await sleep(idleSleepMs);
                continue;
            }
            ctx.logger?.debug?.("Processing started", {
                worker: workerIndex,
                ref: docRef,
            });
            try {
                await processDocument(docRef, ctx);
            }
            catch (error) {
                ctx.logger?.error?.("Processing failed", {
                    err: error,
                    ref: docRef,
                    worker: workerIndex,
                });
            }
        }
    }
    for (let i = 0; i < concurrency; i++) {
        workerLoop(i + 1);
    }
    ctx.logger?.debug?.("Background workers started", { concurrency });
}
export async function enqueueUnprocessedDocuments(args) {
    const refs = await args.storage.getUnprocessedContents();
    for (const ref of refs) {
        args.logger?.debug?.("Queued unprocessed document", { ref });
        args.queue.enqueue(ref);
    }
    if (refs.length > 0) {
        args.logger?.debug?.("Found unprocessed documents", { count: refs.length });
    }
    return refs.length;
}
//# sourceMappingURL=processor.js.map
package/dist/processing/processor.js.map
@@ -0,0 +1 @@
{"version":3,"file":"processor.js","sourceRoot":"","sources":["../../src/processing/processor.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,MAAM,CAAC;AAIxB,OAAO,EAAE,KAAK,IAAI,aAAa,EAAE,MAAM,YAAY,CAAC;AACpD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAExD,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAgBlC,MAAM,UAAU,qBAAqB;IACpC,MAAM,KAAK,GAAkB,EAAE,CAAC;IAEhC,OAAO;QACN,OAAO,CAAC,GAAG;YACV,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjB,CAAC;QAED,IAAI;YACH,OAAO,KAAK,CAAC,MAAM,CAAC;QACrB,CAAC;QAED,OAAO;YACN,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;QACtB,CAAC;KACD,CAAC;AACH,CAAC;AAED,SAAS,uBAAuB,CAC/B,QAAkB,EAClB,aAAyB,EACzB,YAAoB;IAEpB,MAAM,gBAAgB,GAAG;QACxB,GAAG,QAAQ;QACX,MAAM,EAAE,aAAa;KACrB,CAAC;IAEF,MAAM,GAAG,GAAG,IAAI,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC;IAEhD,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE;QACf,GAAG,CAAC,CAAC,EAAE,IAAI;YACV,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;YACnE,IAAI,UAAU,EAAE,CAAC;gBAChB,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;YAClB,CAAC;QACF,CAAC;KACD,CAAC,CAAC;IAEH,MAAM,gBAAgB,GAAG,GAAG,CAAC,QAAQ,CAAC,EAAE,SAAS,EAAE,GAAG,EAAE,CAAC,CAAC;IAE1D,OAAO;QACN,KAAK;QACL,gBAAgB,CAAC,OAAO,EAAE;QAC1B,KAAK;QACL,EAAE;QACF,YAAY,CAAC,IAAI,EAAE;KACnB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACd,CAAC;AAED,KAAK,UAAU,eAAe,CAC7B,GAAgB,EAChB,GAAqB;IAErB,sBAAsB;IACtB,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;IACpE,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC;IAErC,4BAA4B;IAC5B,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,mBAAmB,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;IACjE,MAAM,YAAY,GAAG,MAAM,aAAa,CAAC,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;IAEvE,+BAA+B;IAC/B,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,qBAAqB,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,CAAC;IACtE,MAAM,aAAa,GAAG,MAAM,eAAe,CAC1C,YAAY,EACZ,GAAG,CAAC,MAAM,EACV,GAAG,CAAC,cAAc,EAClB,GAAG,CAAC,GAAG,CACP,CAAC;IAEF,mDAAmD;IACnD,MAAM,QAAQ,GAAG,uBAAuB,CACvC,QAAQ,EACR,aAAa,EACb,YAAY,CACZ,CAAC;IAEF,4BAA4B;IAC5B,MAAM,GAAG,CAAC,OAAO,CAAC,oBAAoB,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IAEtD,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,oBAAoB,EAAE;QACxC,GAAG;QACH,MAAM,EAAE,aAAa,CAAC,MAAM;QAC5B,KAAK,EAAE,aAAa;KACpB,CAAC,CAAC;AACJ,CAAC;AAED,SAAS,KAAK,CAAC,EAAU;IACxB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC9B,MAAM,CAAC,GAAG,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAClC,kEAAkE;QACjE,CAAuC,CAAC,KAAK,EAAE,EAAE,CAAC;IACpD,CAAC,CAAC,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,sBAAsB,CAAC,IAKtC;IACA,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,IAAI,CAAC,CAAC,CAAC;IACvD,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,CAAC,WAAW,IAAI,qBAAqB,CAAC,CAAC;IAC5E,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC;IAE5B,KAAK,UAAU,UAAU,CAAC,WAAmB;QAC5C,OAAO,IAAI,EAAE,CAAC;YACb,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC;YAC/B,IAAI,CAAC,MAAM,EAAE,CAAC;gBACb,MAAM,KAAK,CAAC,WAAW,CAAC,CAAC;gBACzB,SAAS;YACV,CAAC;YAED,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,oBAAoB,EAAE;gBACzC,MAAM,EAAE,WAAW;gBACnB,GAAG,EAAE,MAAM;aACX,CAAC,CAAC;YACH,IAAI,CAAC;gBACJ,MAAM,eAAe,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;YACpC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBAChB,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,mBAAmB,EAAE;oBACxC,GAAG,EAAE,KAAK;oBACV,GAAG,EAAE,MAAM;oBACX,MAAM,EAAE,WAAW;iBACnB,CAAC,CAAC;YACJ,CAAC;QACF,CAAC;IACF,CAAC;IAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACnB,CAAC;IAED,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,4BAA4B,EAAE,EAAE,WAAW,EAAE,CAAC,CAAC;AACpE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,2BAA2B,CAAC,IAIjD;IACA,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,sBAAsB,EAAE,CAAC;IAEzD,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACxB,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,6BAA6B,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;QAC7D,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IACzB,CAAC;IAED,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrB,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,6BAA6B,EAAE,EAAE,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;IAC7E,CAAC;IAED,OAAO,IAAI,CAAC,MAAM,CAAC;AACpB,CAAC"}
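Taken together, these pieces form a simple poll-based pipeline: workers loop forever pulling refs off an in-memory FIFO, idle workers sleep between polls, and failed documents are logged but not re-queued. A wiring sketch (hedged: import paths assumed; `llm` and `schemaYaml` reused from the earlier sketches):

import { createProcessingQueue, startBackgroundWorkers, enqueueUnprocessedDocuments } from "greptor/dist/processing/processor.js"; // assumed import path
import { createFileStorage } from "greptor/dist/storage/file-storage.js"; // assumed import path

const storage = createFileStorage("./data");
const queue = createProcessingQueue();
// Two workers poll the queue; an idle worker sleeps 750 ms between polls by default.
startBackgroundWorkers({
    ctx: { domain: "real estate", metadataSchema: schemaYaml, llm, storage, logger: console },
    queue,
    concurrency: 2,
});
// Backfill: queue every raw document that has no processed counterpart yet.
const queued = await enqueueUnprocessedDocuments({ storage, queue, logger: console });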
package/dist/skills/skill-generator.d.ts
@@ -0,0 +1,16 @@
import type { FileStorage } from "../storage/file-storage.js";
import type { MetadataSchemaItem } from "../types.js";
export interface SkillGeneratorOptions {
    domain: string;
    sources: string[];
    baseDir: string;
    metadataSchema: MetadataSchemaItem[];
    overwrite: boolean;
}
/**
 * Generate a Claude Code skill file for searching indexed data
 */
export declare function generateSkill(options: SkillGeneratorOptions, fileStorage: FileStorage): Promise<{
    skillPath: string;
}>;
//# sourceMappingURL=skill-generator.d.ts.map
package/dist/skills/skill-generator.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"skill-generator.d.ts","sourceRoot":"","sources":["../../src/skills/skill-generator.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,4BAA4B,CAAC;AAC9D,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAGtD,MAAM,WAAW,qBAAqB;IACrC,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,cAAc,EAAE,kBAAkB,EAAE,CAAC;IACrC,SAAS,EAAE,OAAO,CAAC;CACnB;AAqND;;GAEG;AACH,wBAAsB,aAAa,CAClC,OAAO,EAAE,qBAAqB,EAC9B,WAAW,EAAE,WAAW,GACtB,OAAO,CAAC;IAAE,SAAS,EAAE,MAAM,CAAA;CAAE,CAAC,CAsBhC"}
package/dist/skills/skill-generator.js
@@ -0,0 +1,210 @@
import { mkdir, writeFile } from "node:fs/promises";
import path from "node:path";
import { fileExists } from "../utils/file.js";
/**
 * Generate ripgrep example patterns for a metadata field
 */
function generateRgPattern(field, sampleValue, isArray) {
    if (isArray) {
        // For array fields, search within YAML array syntax
        return `rg "${field}:\\s*\\[.*${sampleValue}.*\\]" processed`;
    }
    // For single-value fields, search for exact value
    return `rg "${field}:\\s*${sampleValue}" processed`;
}
function generateSkillName(sources, maxLength = 30) {
    let sourcesStr = sources.join("-");
    sourcesStr = sourcesStr.trim();
    sourcesStr = sourcesStr.toLowerCase().replace(/[^a-z0-9]+/g, "-");
    sourcesStr = sourcesStr.replace(/^-+/, "").replace(/-+$/, "");
    sourcesStr = sourcesStr.replace(/-+/g, "-");
    let skillName = `search-${sourcesStr}`;
    if (skillName.length > maxLength) {
        skillName = skillName.slice(0, maxLength);
    }
    return skillName;
}
function sanitizePathSegment(name, maxLength = 50) {
    let sanitized = name
        .trim()
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "-");
    sanitized = sanitized.replace(/^-+/, "").replace(/-+$/, "");
    sanitized = sanitized.replace(/-+/g, "-");
    if (sanitized.length > maxLength) {
        sanitized = sanitized.slice(0, maxLength);
    }
    return sanitized || "unknown";
}
/**
 * Generate the skill content from a template
 */
function generateSkillContent(domain, sources, metadataSchema, fileStorage) {
    const skillName = generateSkillName(sources);
    // Generate metadata list from schema
    const metadataList = metadataSchema
        .map((field) => {
            const typeSuffix = field.type.startsWith("enum") && field.enumValues
                ? ` (values: ${field.enumValues.join(", ")})`
                : "";
            return `- \`${field.name}\` - *${field.type}*${typeSuffix}`;
        })
        .join("\n");
    // Generate ripgrep examples for first 3-4 fields, showing both array and single-value patterns
    const exampleFields = metadataSchema.slice(0, 4);
    const rgExamples = exampleFields
        .map((field) => {
            // Check if field type indicates an array (e.g., "string[]", "enum[]")
            const typeStr = String(field.type);
            const isArray = typeStr.includes("[]");
            const sampleValue = field.enumValues?.[0] ?? "VALUE";
            const example = generateRgPattern(field.name, sampleValue, isArray);
            return `# By ${field.name}\n${example}`;
        })
        .join("\n\n");
    const exampleSource = sanitizePathSegment(sources[0] ?? "source");
    return `---
name: ${skillName}
description: Guide for searching and analyzing indexed content from ${sources.join(", ")} in the '${domain}' domain. This skill should be used when you need information from ${sources.join(", ")} sources to answer questions or conduct research.
---

# Skill Overview

This skill provides guidance for efficient search over indexed content from ${sources.join(", ")} in the \`${domain}\` domain. It leverages grep-friendly metadata and chunked content storage to enable precise filtering and retrieval.

## About the Content

Content from ${sources.join(", ")} has been fetched, chunked, enriched with searchable metadata, and stored in the '${fileStorage.baseDir}' directory.

### Directory Structure

\`\`\`
${fileStorage.baseDir}/
├── processed/                  # Cleaned, search-optimized content with metadata
│   └── {source}/               # ${sources.join(", ")}, etc.
│       └── {publisher?}/
│           └── YYYY-MM/
│               └── YYYY-MM-DD-label.md
└── raw/                        # Original raw content as ingested (mirrors processed/)
\`\`\`

### File Format

Each processed file contains YAML frontmatter with document metadata and a \`chunks\` array. Each chunk includes:
- \`id\`: Unique chunk identifier (e.g., c01, c02)
- \`title\`: Chunk title
- Domain-specific metadata fields

Chunked content follows the frontmatter, with chunk IDs as section headers.

\`\`\`yaml
---
title: "Document Title"
source: "Source Name"
publisher: "Publisher Name"
<other metadata fields>
chunks:
  - id: c01
    title: "First Chunk Title"
    # Domain-specific metadata fields related to this chunk
  - id: c02
    title: "Second Chunk Title"
    # Domain-specific metadata fields related to this chunk
---

CHUNK c01: "First Chunk Title"
<chunk content here>

CHUNK c02: "Second Chunk Title"
<chunk content here>
\`\`\`

The YAML frontmatter serves as an index for the entire document.

### Key Metadata Fields

Below are the most common metadata fields with sample values. Additional metadata fields and values may exist beyond those listed here.

${metadataList}

## Recommended Search Strategy

0. **ALWAYS use rg (ripgrep)**:
   Ripgrep is optimized for searching large codebases and text files quickly. It supports regex, file path patterns, and context capture.
   It MUST be your primary search tool for this content if installed.

1. **Constrain by time range first**
   Use file path patterns (e.g., \`YYYY/MM/YYYY-MM-DD\`) to limit the search space before inspecting content.

2. **Apply metadata filters**
   Use \`ripgrep\` to match specific YAML frontmatter fields. Note that metadata fields can be either:
   - **Single values**: Match with \`field: value\` (e.g., \`date: 2025-01-15\`)
   - **Arrays**: Match with \`field: [ value1, value2 ]\` or search within arrays using \`field:\\s*\\[.*value.*\\]\`

   Refer to the Key Metadata Fields section below to understand which fields are arrays vs single values.

3. **Leverage YAML frontmatter as a document index**
   Treat frontmatter as a document summary. Read it first to understand:
   - Which chunks exist
   - What each chunk covers
   This avoids unnecessary full-content reads.

4. **Identify relevant chunks**
   From search results and frontmatter, collect IDs of chunks likely to contain relevant information.

5. **Enumerate candidate documents**
   Before reading chunk content, broaden queries slightly (alternative wording, synonyms, metadata variations) to ensure all relevant documents and chunk IDs are discovered.

6. **Refine iteratively**
   Adjust path patterns, metadata filters, and query terms based on findings until no new relevant documents or chunks appear.

7. **Read targeted content only**
   Use collected chunk IDs to read only the necessary sections of each document.

## Ripgrep Search Examples

\`\`\`bash
# Find specific chunk content with context
rg "CHUNK c01:" -A 20 processed

# Search only ${exampleSource} content
rg "search query" processed/${exampleSource}

# Search ${exampleSource} content from December 2025
rg "search query" processed/${exampleSource} --glob "**/2025-12/*.md"

# Combine multiple metadata filters
rg -l "field1:.*value1" processed | xargs rg "field2:.*value2"

# List unique values for a metadata field
rg "field_name:" processed | sort | uniq -c | sort -rn | head -20

${rgExamples}
\`\`\`

## Important Guidelines

- **Primary source**: Always use the \`processed/\` directory for searches; only fall back to \`raw/\` if necessary
- **Metadata first**: Start with metadata filtering before full-text content search
- **Context capture**: Use \`-B\` and \`-A\` flags to capture surrounding lines for context without reading entire chunks
- **Citation style**: When referencing content, cite by source name, publisher, and title—never expose internal structure like chunk IDs or file paths to the user
`;
}
/**
 * Generate a Claude Code skill file for searching indexed data
 */
export async function generateSkill(options, fileStorage) {
    const skillName = generateSkillName(options.sources, 30);
    const skillContent = generateSkillContent(options.domain, options.sources, options.metadataSchema, fileStorage);
    // Create skill directory
    const skillDir = path.join(options.baseDir, ".claude", "skills", skillName);
    await mkdir(skillDir, { recursive: true });
    // Write skill file
    const skillPath = path.join(skillDir, "SKILL.md");
    const skillExists = await fileExists(skillPath);
    if (!skillExists || options.overwrite) {
        await writeFile(skillPath, skillContent, "utf8");
    }
    return { skillPath };
}
//# sourceMappingURL=skill-generator.js.map
package/dist/skills/skill-generator.js.map
@@ -0,0 +1 @@
{"version":3,"file":"skill-generator.js","sourceRoot":"","sources":["../../src/skills/skill-generator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,IAAI,MAAM,WAAW,CAAC;AAG7B,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAU9C;;GAEG;AACH,SAAS,iBAAiB,CACzB,KAAa,EACb,WAAmB,EACnB,OAAgB;IAEhB,IAAI,OAAO,EAAE,CAAC;QACb,oDAAoD;QACpD,OAAO,OAAO,KAAK,aAAa,WAAW,kBAAkB,CAAC;IAC/D,CAAC;IACD,kDAAkD;IAClD,OAAO,OAAO,KAAK,QAAQ,WAAW,aAAa,CAAC;AACrD,CAAC;AAED,SAAS,iBAAiB,CAAC,OAAiB,EAAE,SAAS,GAAG,EAAE;IAC3D,IAAI,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACnC,UAAU,GAAG,UAAU,CAAC,IAAI,EAAE,CAAC;IAC/B,UAAU,GAAG,UAAU,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;IAClE,UAAU,GAAG,UAAU,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAC9D,UAAU,GAAG,UAAU,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;IAE5C,IAAI,SAAS,GAAG,UAAU,UAAU,EAAE,CAAC;IACvC,IAAI,SAAS,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;QAClC,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,SAAS,CAAC;AAClB,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAY,EAAE,SAAS,GAAG,EAAE;IACxD,IAAI,SAAS,GAAG,IAAI;SAClB,IAAI,EAAE;SACN,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;IAC9B,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAC5D,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;IAE1C,IAAI,SAAS,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;QAClC,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,SAAS,IAAI,SAAS,CAAC;AAC/B,CAAC;AAED;;GAEG;AACH,SAAS,oBAAoB,CAC5B,MAAc,EACd,OAAiB,EACjB,cAAoC,EACpC,WAAwB;IAExB,MAAM,SAAS,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;IAE7C,qCAAqC;IACrC,MAAM,YAAY,GAAG,cAAc;SACjC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE;QACd,MAAM,UAAU,GACf,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,UAAU;YAChD,CAAC,CAAC,aAAa,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YAC7C,CAAC,CAAC,EAAE,CAAC;QACP,OAAO,OAAO,KAAK,CAAC,IAAI,SAAS,KAAK,CAAC,IAAI,IAAI,UAAU,EAAE,CAAC;IAC7D,CAAC,CAAC;SACD,IAAI,CAAC,IAAI,CAAC,CAAC;IAEb,+FAA+F;IAC/F,MAAM,aAAa,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACjD,MAAM,UAAU,GAAG,aAAa;SAC9B,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE;QACd,sEAAsE;QACtE,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,OAAO,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACvC,MAAM,WAAW,GAAG,KAAK,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,IAAI,OAAO,CAAC;QACrD,MAAM,OAAO,GAAG,iBAAiB,CAAC,KAAK,CAAC,IAAI,EAAE,WAAW,EAAE,OAAO,CAAC,CAAC;QACpE,OAAO,QAAQ,KAAK,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;IACzC,CAAC,CAAC;SACD,IAAI,CAAC,MAAM,CAAC,CAAC;IAEf,MAAM,aAAa,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,CAAC;IAElE,OAAO;QACA,SAAS;sEACqD,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,MAAM,sEAAsE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;8EAKpH,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,MAAM;;;;eAIpG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,qFAAqF,WAAW,CAAC,OAAO;;;;;EAKvI,WAAW,CAAC,OAAO;;yBAEI,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EA4CzC,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA0CE,aAAa;8BACC,aAAa;;WAEhC,aAAa;8BACM,aAAa;;;;;;;;EAQzC,UAAU;;;;;;;;;CASX,CAAC;AACF,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAClC,OAA8B,EAC9B,WAAwB;IAExB,MAAM,SAAS,GAAG,iBAAiB,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACzD,MAAM,YAAY,GAAG,oBAAoB,CACxC,OAAO,CAAC,MAAM,EACd,OAAO,CAAC,OAAO,EACf,OAAO,CAAC,cAAc,EACtB,WAAW,CACX,CAAC;IAEF,yBAAyB;IACzB,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC5E,MAAM,KAAK,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE3C,mBAAmB;IACnB,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAClD,MAAM,WAAW,GAAG,MAAM,UAAU,CAAC,SAAS,CAAC,CAAC;IAEhD,IAAI,CAAC,WAAW,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;QACvC,MAAM,SAAS,CAAC,SAAS,EAAE,YAAY,EAAE,MAAM,CAAC,CAAC;IAClD,CAAC;IAED,OAAO,EAAE,SAAS,EAAE,CAAC;AACtB,CAAC"}
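A sketch of generating the skill from an already-initialized schema (hedged: import paths assumed; the sample schema entries are invented, and MetadataSchemaItem's shape is assumed to match the Zod schema shown earlier):

import { generateSkill } from "greptor/dist/skills/skill-generator.js"; // assumed import path
import { createFileStorage } from "greptor/dist/storage/file-storage.js"; // assumed import path

const { skillPath } = await generateSkill(
    {
        domain: "real estate",
        sources: ["Property Podcast"],
        baseDir: process.cwd(),
        metadataSchema: [
            { name: "suburb", type: "string", description: "Suburb discussed" },
            { name: "property_type", type: "enum", description: "Listing kind", enumValues: ["house", "apartment"] },
        ],
        overwrite: false, // an existing SKILL.md is left untouched
    },
    createFileStorage("./data"),
);
// -> <cwd>/.claude/skills/search-property-podcast/SKILL.md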
package/dist/storage/file-storage.d.ts
@@ -0,0 +1,16 @@
import type { DocumentRef, DocumentAddResult as DocumentSaveResult } from "../storage/types.js";
import type { GreptorEatInput, Metadata } from "../types.js";
export interface FileStorage {
    readonly baseDir: string;
    readonly rawContentPath: string;
    readonly processedContentPath: string;
    saveRawContent(input: GreptorEatInput): Promise<DocumentSaveResult>;
    readRawContent(ref: DocumentRef): Promise<{
        metadata: Metadata;
        content: string;
    }>;
    getUnprocessedContents(): Promise<DocumentRef[]>;
    saveProcessedContent(ref: DocumentRef, content: string): Promise<void>;
}
export declare function createFileStorage(baseDir: string): FileStorage;
//# sourceMappingURL=file-storage.d.ts.map
package/dist/storage/file-storage.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"file-storage.d.ts","sourceRoot":"","sources":["../../src/storage/file-storage.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EACX,WAAW,EACX,iBAAiB,IAAI,kBAAkB,EACvC,MAAM,qBAAqB,CAAC;AAC7B,OAAO,KAAK,EAAE,eAAe,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAG7D,MAAM,WAAW,WAAW;IAC3B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,oBAAoB,EAAE,MAAM,CAAC;IAEtC,cAAc,CAAC,KAAK,EAAE,eAAe,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAAC;IACpE,cAAc,CACb,GAAG,EAAE,WAAW,GACd,OAAO,CAAC;QAAE,QAAQ,EAAE,QAAQ,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACpD,sBAAsB,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC;IACjD,oBAAoB,CAAC,GAAG,EAAE,WAAW,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CACvE;AAED,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,MAAM,GAAG,WAAW,CAyN9D"}
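This FileStorage interface is the seam the processor depends on: raw documents come in via saveRawContent, getUnprocessedContents appears to report documents that do not yet have a processed counterpart, and saveProcessedContent persists the rendered output. A read-side sketch using only the declared members (hedged: import path assumed):

import { createFileStorage } from "greptor/dist/storage/file-storage.js"; // assumed import path

const storage = createFileStorage("./data");
console.log(storage.rawContentPath, storage.processedContentPath);
// Walk everything still awaiting processing and inspect it.
for (const ref of await storage.getUnprocessedContents()) {
    const { metadata, content } = await storage.readRawContent(ref);
    console.log(ref, metadata, content.length);
}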