greptor 0.2.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.
Files changed (67)
  1. package/LICENSE +22 -0
  2. package/README.md +365 -0
  3. package/dist/greptor.d.ts +7 -0
  4. package/dist/greptor.d.ts.map +1 -0
  5. package/dist/greptor.js +98 -0
  6. package/dist/greptor.js.map +1 -0
  7. package/dist/index.d.ts +4 -0
  8. package/dist/index.d.ts.map +1 -0
  9. package/dist/index.js +3 -0
  10. package/dist/index.js.map +1 -0
  11. package/dist/llm/llm-factory.d.ts +7 -0
  12. package/dist/llm/llm-factory.d.ts.map +1 -0
  13. package/dist/llm/llm-factory.js +53 -0
  14. package/dist/llm/llm-factory.js.map +1 -0
  15. package/dist/metadata-schema/generate.d.ts +3 -0
  16. package/dist/metadata-schema/generate.d.ts.map +1 -0
  17. package/dist/metadata-schema/generate.js +43 -0
  18. package/dist/metadata-schema/generate.js.map +1 -0
  19. package/dist/metadata-schema/initialize.d.ts +5 -0
  20. package/dist/metadata-schema/initialize.d.ts.map +1 -0
  21. package/dist/metadata-schema/initialize.js +37 -0
  22. package/dist/metadata-schema/initialize.js.map +1 -0
  23. package/dist/metadata-schema/types.d.ts +34 -0
  24. package/dist/metadata-schema/types.d.ts.map +1 -0
  25. package/dist/metadata-schema/types.js +30 -0
  26. package/dist/metadata-schema/types.js.map +1 -0
  27. package/dist/processing/chunk.d.ts +3 -0
  28. package/dist/processing/chunk.d.ts.map +1 -0
  29. package/dist/processing/chunk.js +36 -0
  30. package/dist/processing/chunk.js.map +1 -0
  31. package/dist/processing/extract-metadata.d.ts +4 -0
  32. package/dist/processing/extract-metadata.d.ts.map +1 -0
  33. package/dist/processing/extract-metadata.js +39 -0
  34. package/dist/processing/extract-metadata.js.map +1 -0
  35. package/dist/processing/processor.d.ts +28 -0
  36. package/dist/processing/processor.d.ts.map +1 -0
  37. package/dist/processing/processor.js +112 -0
  38. package/dist/processing/processor.js.map +1 -0
  39. package/dist/skills/skill-generator.d.ts +16 -0
  40. package/dist/skills/skill-generator.d.ts.map +1 -0
  41. package/dist/skills/skill-generator.js +210 -0
  42. package/dist/skills/skill-generator.js.map +1 -0
  43. package/dist/storage/file-storage.d.ts +16 -0
  44. package/dist/storage/file-storage.d.ts.map +1 -0
  45. package/dist/storage/file-storage.js +162 -0
  46. package/dist/storage/file-storage.js.map +1 -0
  47. package/dist/storage/index.d.ts +3 -0
  48. package/dist/storage/index.d.ts.map +1 -0
  49. package/dist/storage/index.js +3 -0
  50. package/dist/storage/index.js.map +1 -0
  51. package/dist/storage/types.d.ts +16 -0
  52. package/dist/storage/types.d.ts.map +1 -0
  53. package/dist/storage/types.js +2 -0
  54. package/dist/storage/types.js.map +1 -0
  55. package/dist/types.d.ts +53 -0
  56. package/dist/types.d.ts.map +1 -0
  57. package/dist/types.js +2 -0
  58. package/dist/types.js.map +1 -0
  59. package/dist/utils/file.d.ts +2 -0
  60. package/dist/utils/file.d.ts.map +1 -0
  61. package/dist/utils/file.js +11 -0
  62. package/dist/utils/file.js.map +1 -0
  63. package/dist/utils/hash.d.ts +2 -0
  64. package/dist/utils/hash.d.ts.map +1 -0
  65. package/dist/utils/hash.js +5 -0
  66. package/dist/utils/hash.js.map +1 -0
  67. package/package.json +63 -0
package/dist/metadata-schema/initialize.js
@@ -0,0 +1,37 @@
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
+ import path from "node:path";
+ import YAML from "yaml";
+ import { fileExists } from "../utils/file.js";
+ import { generateMetadataSchema } from "./generate.js";
+ export const METADATA_SCHEMA_FILENAME = "metadata-schema.yaml";
+ async function persist(schemaFilePath, metadataSchema, logger) {
+     const schemaYaml = YAML.stringify(metadataSchema);
+     await mkdir(path.dirname(schemaFilePath), { recursive: true });
+     await writeFile(schemaFilePath, schemaYaml, "utf8");
+     logger?.debug?.("Metadata schema saved", { path: schemaFilePath });
+ }
+ export async function initializeMetadataSchema(baseDir, llmModel, topic, metadataSchema, logger) {
+     const schemaFilePath = path.join(baseDir, METADATA_SCHEMA_FILENAME);
+     // If a schema is provided, save it to disk and return it straight away
+     if (metadataSchema) {
+         await persist(schemaFilePath, metadataSchema, logger);
+         return metadataSchema;
+     }
+     // If schema file exists on disk, load and return it
+     if (await fileExists(schemaFilePath)) {
+         logger?.debug?.("Metadata schema not provided, loading from file", {
+             path: schemaFilePath,
+         });
+         return YAML.parse(await readFile(schemaFilePath, "utf8"));
+     }
+     // Otherwise, generate a new schema using the LLM
+     logger?.info?.("Generating metadata schema", { topic });
+     const schema = await generateMetadataSchema(topic, llmModel);
+     await persist(schemaFilePath, schema, logger);
+     logger?.info?.("Metadata schema generated", {
+         path: schemaFilePath,
+         fields: schema.length,
+     });
+     return schema;
+ }
+ //# sourceMappingURL=initialize.js.map
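For orientation, a minimal caller sketch in TypeScript. It is not part of the package; the `greptor` import specifier, model identifier, and directory are assumptions, while the parameter order comes straight from the compiled signature above.

```ts
import { initializeMetadataSchema } from "greptor"; // assumed export; adjust to the package's actual entry point

// Resolution order, per the code above: explicit schema, then
// <baseDir>/metadata-schema.yaml on disk, then LLM generation.
const schema = await initializeMetadataSchema(
  "./data",        // baseDir: the schema is persisted to ./data/metadata-schema.yaml
  "gpt-4o-mini",   // llmModel: hypothetical model identifier
  "real estate",   // topic: only used when a fresh schema has to be generated
  undefined,       // metadataSchema: none provided, so the disk copy is checked first
  console,         // logger: anything with optional debug/info methods works
);
console.log(`schema has ${schema.length} fields`);
```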
package/dist/metadata-schema/initialize.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"initialize.js","sourceRoot":"","sources":["../../src/metadata-schema/initialize.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,IAAI,MAAM,MAAM,CAAC;AAGxB,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,sBAAsB,EAAE,MAAM,eAAe,CAAC;AAEvD,MAAM,CAAC,MAAM,wBAAwB,GAAG,sBAAsB,CAAC;AAE/D,KAAK,UAAU,OAAO,CACrB,cAAsB,EACtB,cAA8B,EAC9B,MAAe;IAEf,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;IAClD,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC/D,MAAM,SAAS,CAAC,cAAc,EAAE,UAAU,EAAE,MAAM,CAAC,CAAC;IAEpD,MAAM,EAAE,KAAK,EAAE,CAAC,uBAAuB,EAAE,EAAE,IAAI,EAAE,cAAc,EAAE,CAAC,CAAC;AACpE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,wBAAwB,CAC7C,OAAe,EACf,QAAgB,EAChB,KAAa,EACb,cAA+B,EAC/B,MAAe;IAEf,MAAM,cAAc,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,wBAAwB,CAAC,CAAC;IAEpE,uEAAuE;IACvE,IAAI,cAAc,EAAE,CAAC;QACpB,MAAM,OAAO,CAAC,cAAc,EAAE,cAAc,EAAE,MAAM,CAAC,CAAC;QACtD,OAAO,cAAc,CAAC;IACvB,CAAC;IAED,oDAAoD;IACpD,IAAI,MAAM,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;QACtC,MAAM,EAAE,KAAK,EAAE,CAAC,iDAAiD,EAAE;YAClE,IAAI,EAAE,cAAc;SACpB,CAAC,CAAC;QAEH,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC,CAAmB,CAAC;IAC7E,CAAC;IAED,iDAAiD;IACjD,MAAM,EAAE,IAAI,EAAE,CAAC,4BAA4B,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;IACxD,MAAM,MAAM,GAAG,MAAM,sBAAsB,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;IAC7D,MAAM,OAAO,CAAC,cAAc,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;IAC9C,MAAM,EAAE,IAAI,EAAE,CAAC,2BAA2B,EAAE;QAC3C,IAAI,EAAE,cAAc;QACpB,MAAM,EAAE,MAAM,CAAC,MAAM;KACrB,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC;AACf,CAAC"}
package/dist/metadata-schema/types.d.ts
@@ -0,0 +1,34 @@
+ import { z } from "zod";
+ export declare const MetadataFieldSchema: z.ZodObject<{
+     name: z.ZodString;
+     type: z.ZodEnum<{
+         string: "string";
+         number: "number";
+         boolean: "boolean";
+         "string[]": "string[]";
+         "number[]": "number[]";
+         enum: "enum";
+         "enum[]": "enum[]";
+         date: "date";
+     }>;
+     description: z.ZodString;
+     enumValues: z.ZodNullable<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+ }, z.core.$strip>;
+ export declare const ResponseSchema: z.ZodObject<{
+     metadata_fields: z.ZodArray<z.ZodObject<{
+         name: z.ZodString;
+         type: z.ZodEnum<{
+             string: "string";
+             number: "number";
+             boolean: "boolean";
+             "string[]": "string[]";
+             "number[]": "number[]";
+             enum: "enum";
+             "enum[]": "enum[]";
+             date: "date";
+         }>;
+         description: z.ZodString;
+         enumValues: z.ZodNullable<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+     }, z.core.$strip>>;
+ }, z.core.$strip>;
+ //# sourceMappingURL=types.d.ts.map
package/dist/metadata-schema/types.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/metadata-schema/types.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,mBAAmB;;;;;;;;;;;;;;iBAoB9B,CAAC;AAEH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;iBAMzB,CAAC"}
package/dist/metadata-schema/types.js
@@ -0,0 +1,30 @@
+ import { z } from "zod";
+ export const MetadataFieldSchema = z.object({
+     name: z.string().describe("Metadata field name in snake_case"),
+     type: z
+         .enum([
+             "string",
+             "string[]",
+             "number",
+             "number[]",
+             "boolean",
+             "enum",
+             "enum[]",
+             "date",
+         ])
+         .describe("Field data type"),
+     description: z.string().describe("Purpose and usage of this metadata field"),
+     enumValues: z
+         .array(z.string())
+         .optional()
+         .nullable()
+         .describe("Full list of enum values for enum types."),
+ });
+ export const ResponseSchema = z.object({
+     metadata_fields: z
+         .array(MetadataFieldSchema)
+         .min(5)
+         .max(10)
+         .describe("List of metadata fields for the given topic"),
+ });
+ //# sourceMappingURL=types.js.map
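These schemas validate the LLM's schema-generation reply. A short sketch of that validation step (the import specifier and the sample field values are illustrative assumptions; the zod calls match the schema definitions above):

```ts
import { z } from "zod";
import { ResponseSchema } from "greptor"; // assumed export path

type MetadataSchema = z.infer<typeof ResponseSchema>["metadata_fields"];

// Hypothetical LLM reply; it must contain 5-10 fields to satisfy .min(5).max(10).
const reply = {
  metadata_fields: [
    { name: "property_type", type: "enum", description: "Kind of listing", enumValues: ["house", "apartment"] },
    { name: "price_aud", type: "number", description: "Asking price in AUD" },
    { name: "suburb", type: "string", description: "Suburb of the listing" },
    { name: "features", type: "string[]", description: "Notable property features" },
    { name: "listed_date", type: "date", description: "Date the listing went live" },
  ],
};

const parsed = ResponseSchema.safeParse(reply);
if (!parsed.success) {
  console.error(parsed.error.issues); // e.g., fewer than 5 fields, or an unknown type value
}
const fields: MetadataSchema = parsed.success ? parsed.data.metadata_fields : [];
```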
package/dist/metadata-schema/types.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/metadata-schema/types.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC3C,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,mCAAmC,CAAC;IAC9D,IAAI,EAAE,CAAC;SACL,IAAI,CAAC;QACL,QAAQ;QACR,UAAU;QACV,QAAQ;QACR,UAAU;QACV,SAAS;QACT,MAAM;QACN,QAAQ;QACR,MAAM;KACN,CAAC;SACD,QAAQ,CAAC,iBAAiB,CAAC;IAC7B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,0CAA0C,CAAC;IAC5E,UAAU,EAAE,CAAC;SACX,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;SACjB,QAAQ,EAAE;SACV,QAAQ,EAAE;SACV,QAAQ,CAAC,0CAA0C,CAAC;CACtD,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,eAAe,EAAE,CAAC;SAChB,KAAK,CAAC,mBAAmB,CAAC;SAC1B,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,EAAE,CAAC;SACP,QAAQ,CAAC,6CAA6C,CAAC;CACzD,CAAC,CAAC"}
package/dist/processing/chunk.d.ts
@@ -0,0 +1,3 @@
+ import type { LlmClient } from "../llm/llm-factory.js";
+ export declare function chunk(rawContent: string, domain: string, llm: LlmClient): Promise<string>;
+ //# sourceMappingURL=chunk.d.ts.map
package/dist/processing/chunk.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"chunk.d.ts","sourceRoot":"","sources":["../../src/processing/chunk.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAuBvD,wBAAsB,KAAK,CAC1B,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,GAAG,EAAE,SAAS,GACZ,OAAO,CAAC,MAAM,CAAC,CAiBjB"}
package/dist/processing/chunk.js
@@ -0,0 +1,36 @@
+ const createCleanPrompt = (text, domain) => `
+ Clean + structure the raw content into independent semantic chunks for domain: ${domain}
+
+ Remove completely (noise/boilerplate): ads/sponsors, greetings/intros/outros, CTAs/promos ("like & subscribe"), duplicates/filler, contact/social/discount codes, sign-offs ("reach out", "visit our website"), anything not primary informational content.
+
+ Preserve meaning exactly: no summarizing/paraphrasing. Keep all facts/numbers/names/dates, domain terminology, tables/lists, and URLs only if meaningful.
+
+ Normalize: canonicalize entity names; remove formatting garbage/repeated boilerplate; normalize whitespace/tabs/newlines.
+
+ Chunking: prefer fewer richer chunks. Target 100–200+ words per chunk where possible. Group related subtopics; split only when topics are truly distinct or context switches.
+
+ Output (English only), chunks separated by blank lines:
+ CHUNK c01: "Short Topic Title"
+ <cleaned content>
+
+ Rules: do not paraphrase, shorten meaning, add interpretation, or merge unrelated topics.
+
+ RAW CONTENT:
+ ${text}
+ `;
+ export async function chunk(rawContent, domain, llm) {
+     const prompt = createCleanPrompt(rawContent, domain);
+     const messages = [
+         { role: "user", content: prompt },
+     ];
+     const completion = await llm.client.chat.completions.parse({
+         model: llm.model,
+         messages,
+     });
+     const content = completion.choices[0]?.message?.content;
+     if (!content) {
+         throw new Error("Failed to clean content: empty LLM response");
+     }
+     return content;
+ }
+ //# sourceMappingURL=chunk.js.map
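A usage sketch, not from the package: `LlmClient` is evidently just `{ client, model }`, where the client must expose `chat.completions.parse` (that is what the compiled code calls; recent OpenAI SDK clients provide it). The model name and transcript are placeholder assumptions.

```ts
import OpenAI from "openai";
import { chunk } from "greptor"; // assumed export path

const llm = { client: new OpenAI(), model: "gpt-4o-mini" }; // assumed model

const transcript = "...raw podcast transcript...";
const chunked = await chunk(transcript, "real estate", llm);
// On success, plain text: 'CHUNK c01: "Title"\n<content>\n\nCHUNK c02: ...'
// Throws if the completion comes back empty.
```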
package/dist/processing/chunk.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"chunk.js","sourceRoot":"","sources":["../../src/processing/chunk.ts"],"names":[],"mappings":"AAGA,MAAM,iBAAiB,GAAG,CAAC,IAAY,EAAE,MAAc,EAAE,EAAE,CAAC;iFACqB,MAAM;;;;;;;;;;;;;;;;;EAiBrF,IAAI;CACL,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,KAAK,CAC1B,UAAkB,EAClB,MAAc,EACd,GAAc;IAEd,MAAM,MAAM,GAAG,iBAAiB,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;IACrD,MAAM,QAAQ,GAAiC;QAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE;KACjC,CAAC;IAEF,MAAM,UAAU,GAAG,MAAM,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC;QAC1D,KAAK,EAAE,GAAG,CAAC,KAAK;QAChB,QAAQ;KACR,CAAC,CAAC;IAEH,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;IACxD,IAAI,CAAC,OAAO,EAAE,CAAC;QACd,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IAChE,CAAC;IAED,OAAO,OAAO,CAAC;AAChB,CAAC"}
package/dist/processing/extract-metadata.d.ts
@@ -0,0 +1,4 @@
+ import type { LlmClient } from "../llm/llm-factory.js";
+ import type { Metadata } from "../types.js";
+ export declare function extractMetadata(chunkedContent: string, domain: string, metadataSchema: string, llm: LlmClient): Promise<Metadata[]>;
+ //# sourceMappingURL=extract-metadata.d.ts.map
package/dist/processing/extract-metadata.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"extract-metadata.d.ts","sourceRoot":"","sources":["../../src/processing/extract-metadata.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AA6B5C,wBAAsB,eAAe,CACpC,cAAc,EAAE,MAAM,EACtB,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,MAAM,EACtB,GAAG,EAAE,SAAS,GACZ,OAAO,CAAC,QAAQ,EAAE,CAAC,CAqBrB"}
package/dist/processing/extract-metadata.js
@@ -0,0 +1,39 @@
+ import YAML from "yaml";
+ const createExtractMetadataPrompt = (text, domain, metadataSchema) => `
+ Extract per-chunk metadata for domain: ${domain}. Optimize for grep-based search/filtering.
+
+ Input format:
+ CHUNK c01: "Title"\n...\n\nCHUNK c02: "Title"\n...
+
+ Output: YAML ONLY (no fences/comments). YAML list like:
+ - id: c01
+   title: "Title"
+   key1: value
+   key2: [v1, v2]
+
+ Value formats: strings snake_case (except title); numbers put unit in key (price_aud: 1250000); percentages use _percent; dates ISO-8601; ranges as [min, max]; arrays MUST be single-line YAML (e.g., [a, b]).
+
+ Rules: extract each chunk separately; do not output null/empty fields; for enum fields, only use values from schema; include additional useful numeric/date metrics for grep filtering.
+
+ SCHEMA:
+ ${metadataSchema}
+
+ INPUT:
+ ${text}
+ `;
+ export async function extractMetadata(chunkedContent, domain, metadataSchema, llm) {
+     const prompt = createExtractMetadataPrompt(chunkedContent, domain, metadataSchema);
+     const messages = [
+         { role: "user", content: prompt },
+     ];
+     const completion = await llm.client.chat.completions.parse({
+         model: llm.model,
+         messages,
+     });
+     const content = completion.choices[0]?.message?.content;
+     if (!content) {
+         throw new Error("Failed to extract metadata: empty LLM response");
+     }
+     return YAML.parse(content);
+ }
+ //# sourceMappingURL=extract-metadata.js.map
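Note that `metadataSchema` is passed in as a string here, so callers serialize the field list first. A sketch under the same assumptions as above (import path, model name, and sample values are hypothetical):

```ts
import OpenAI from "openai";
import YAML from "yaml";
import { extractMetadata } from "greptor"; // assumed export path

const llm = { client: new OpenAI(), model: "gpt-4o-mini" }; // assumed model

// The schema parameter is a string, so serialize the field list before passing it in.
const schemaYaml = YAML.stringify([
  { name: "price_aud", type: "number", description: "Asking price in AUD" },
]);

const chunked = 'CHUNK c01: "Auction results"\nMedian price rose to 1.2M AUD...';
const metadata = await extractMetadata(chunked, "real estate", schemaYaml, llm);
// metadata is whatever YAML.parse returns; a malformed (e.g., fenced) reply throws here.
```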
package/dist/processing/extract-metadata.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"extract-metadata.js","sourceRoot":"","sources":["../../src/processing/extract-metadata.ts"],"names":[],"mappings":"AACA,OAAO,IAAI,MAAM,MAAM,CAAC;AAIxB,MAAM,2BAA2B,GAAG,CACnC,IAAY,EACZ,MAAc,EACd,cAAsB,EACrB,EAAE,CAAC;yCACoC,MAAM;;;;;;;;;;;;;;;;EAgB7C,cAAc;;;EAGd,IAAI;CACL,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,eAAe,CACpC,cAAsB,EACtB,MAAc,EACd,cAAsB,EACtB,GAAc;IAEd,MAAM,MAAM,GAAG,2BAA2B,CACzC,cAAc,EACd,MAAM,EACN,cAAc,CACd,CAAC;IACF,MAAM,QAAQ,GAAiC;QAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE;KACjC,CAAC;IAEF,MAAM,UAAU,GAAG,MAAM,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC;QAC1D,KAAK,EAAE,GAAG,CAAC,KAAK;QAChB,QAAQ;KACR,CAAC,CAAC;IAEH,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;IACxD,IAAI,CAAC,OAAO,EAAE,CAAC;QACd,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAC;IACnE,CAAC;IAED,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAe,CAAC;AAC1C,CAAC"}
package/dist/processing/processor.d.ts
@@ -0,0 +1,28 @@
+ import type { LlmClient } from "../llm/llm-factory.js";
+ import type { DocumentRef, FileStorage } from "../storage/index.js";
+ import type { Logger } from "../types.js";
+ export interface ProcessorContext {
+     domain: string;
+     metadataSchema: string;
+     llm: LlmClient;
+     storage: FileStorage;
+     logger?: Logger;
+ }
+ export interface ProcessingQueue {
+     enqueue: (ref: DocumentRef) => void;
+     dequeue: () => DocumentRef | undefined;
+     size: () => number;
+ }
+ export declare function createProcessingQueue(): ProcessingQueue;
+ export declare function startBackgroundWorkers(args: {
+     ctx: ProcessorContext;
+     queue: ProcessingQueue;
+     concurrency?: number;
+     idleSleepMs?: number;
+ }): void;
+ export declare function enqueueUnprocessedDocuments(args: {
+     storage: FileStorage;
+     queue: ProcessingQueue;
+     logger?: Logger;
+ }): Promise<number>;
+ //# sourceMappingURL=processor.d.ts.map
package/dist/processing/processor.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"processor.d.ts","sourceRoot":"","sources":["../../src/processing/processor.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,KAAK,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AACpE,OAAO,KAAK,EAAE,MAAM,EAAY,MAAM,aAAa,CAAC;AAMpD,MAAM,WAAW,gBAAgB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,cAAc,EAAE,MAAM,CAAC;IACvB,GAAG,EAAE,SAAS,CAAC;IACf,OAAO,EAAE,WAAW,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,eAAe;IAC/B,OAAO,EAAE,CAAC,GAAG,EAAE,WAAW,KAAK,IAAI,CAAC;IACpC,OAAO,EAAE,MAAM,WAAW,GAAG,SAAS,CAAC;IACvC,IAAI,EAAE,MAAM,MAAM,CAAC;CACnB;AAED,wBAAgB,qBAAqB,IAAI,eAAe,CAgBvD;AAgFD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE;IAC5C,GAAG,EAAE,gBAAgB,CAAC;IACtB,KAAK,EAAE,eAAe,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB,GAAG,IAAI,CAkCP;AAED,wBAAsB,2BAA2B,CAAC,IAAI,EAAE;IACvD,OAAO,EAAE,WAAW,CAAC;IACrB,KAAK,EAAE,eAAe,CAAC;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB,GAAG,OAAO,CAAC,MAAM,CAAC,CAalB"}
package/dist/processing/processor.js
@@ -0,0 +1,112 @@
+ import YAML from "yaml";
+ import { chunk as chunkDocument } from "./chunk.js";
+ import { extractMetadata } from "./extract-metadata.js";
+ const DEFAULT_IDLE_SLEEP_MS = 750;
+ export function createProcessingQueue() {
+     const items = [];
+     return {
+         enqueue(ref) {
+             items.push(ref);
+         },
+         size() {
+             return items.length;
+         },
+         dequeue() {
+             return items.shift();
+         },
+     };
+ }
+ function renderProcessedDocument(metadata, chunkMetadata, chunkContent) {
+     const combinedMetadata = {
+         ...metadata,
+         chunks: chunkMetadata,
+     };
+     const doc = new YAML.Document(combinedMetadata);
+     YAML.visit(doc, {
+         Seq(_, node) {
+             const allScalars = node.items.every((item) => YAML.isScalar(item));
+             if (allScalars) {
+                 node.flow = true;
+             }
+         },
+     });
+     const renderedMetadata = doc.toString({ lineWidth: 200 });
+     return [
+         "---",
+         renderedMetadata.trimEnd(),
+         "---",
+         "",
+         chunkContent.trim(),
+     ].join("\n");
+ }
+ async function processDocument(ref, ctx) {
+     // 1. Read raw content
+     const { metadata, content } = await ctx.storage.readRawContent(ref);
+     const contentLength = content.length;
+     // 2. Chunk content with LLM
+     ctx.logger?.debug?.("Chunking document", { ref, step: "chunk" });
+     const chunkContent = await chunkDocument(content, ctx.domain, ctx.llm);
+     // 3. Extract metadata with LLM
+     ctx.logger?.debug?.("Extracting metadata", { ref, step: "metadata" });
+     const chunkMetadata = await extractMetadata(chunkContent, ctx.domain, ctx.metadataSchema, ctx.llm);
+     // 4. Parse chunk metadata and render final content
+     const rendered = renderProcessedDocument(metadata, chunkMetadata, chunkContent);
+     // 5. Save processed content
+     await ctx.storage.saveProcessedContent(ref, rendered);
+     ctx.logger?.info?.("Document processed", {
+         ref,
+         chunks: chunkMetadata.length,
+         bytes: contentLength,
+     });
+ }
+ function sleep(ms) {
+     return new Promise((resolve) => {
+         const t = setTimeout(resolve, ms);
+         // If nothing else is keeping the process alive, don't block exit.
+         t.unref?.();
+     });
+ }
+ export function startBackgroundWorkers(args) {
+     const concurrency = Math.max(1, args.concurrency ?? 1);
+     const idleSleepMs = Math.max(50, args.idleSleepMs ?? DEFAULT_IDLE_SLEEP_MS);
+     const { ctx, queue } = args;
+     async function workerLoop(workerIndex) {
+         while (true) {
+             const docRef = queue.dequeue();
+             if (!docRef) {
+                 await sleep(idleSleepMs);
+                 continue;
+             }
+             ctx.logger?.debug?.("Processing started", {
+                 worker: workerIndex,
+                 ref: docRef,
+             });
+             try {
+                 await processDocument(docRef, ctx);
+             }
+             catch (error) {
+                 ctx.logger?.error?.("Processing failed", {
+                     err: error,
+                     ref: docRef,
+                     worker: workerIndex,
+                 });
+             }
+         }
+     }
+     for (let i = 0; i < concurrency; i++) {
+         workerLoop(i + 1);
+     }
+     ctx.logger?.debug?.("Background workers started", { concurrency });
+ }
+ export async function enqueueUnprocessedDocuments(args) {
+     const refs = await args.storage.getUnprocessedContents();
+     for (const ref of refs) {
+         args.logger?.debug?.("Queued unprocessed document", { ref });
+         args.queue.enqueue(ref);
+     }
+     if (refs.length > 0) {
+         args.logger?.debug?.("Found unprocessed documents", { count: refs.length });
+     }
+     return refs.length;
+ }
+ //# sourceMappingURL=processor.js.map
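A wiring sketch for the queue and workers, again with assumed import paths, model name, and schema contents:

```ts
import OpenAI from "openai";
import YAML from "yaml";
import {
  createFileStorage,
  createProcessingQueue,
  enqueueUnprocessedDocuments,
  startBackgroundWorkers,
} from "greptor"; // assumed exports

const storage = createFileStorage("./data");
const queue = createProcessingQueue();

startBackgroundWorkers({
  ctx: {
    domain: "real estate",
    // Schema string, as produced by initializeMetadataSchema; contents hypothetical.
    metadataSchema: YAML.stringify([
      { name: "price_aud", type: "number", description: "Asking price in AUD" },
    ]),
    llm: { client: new OpenAI(), model: "gpt-4o-mini" }, // assumed model
    storage,
    logger: console,
  },
  queue,
  concurrency: 2,   // two polling worker loops share one queue
  idleSleepMs: 500, // clamped to >= 50 by the implementation
});

// Re-queue anything in raw/ that has no processed/ counterpart yet.
const pending = await enqueueUnprocessedDocuments({ storage, queue, logger: console });
console.log(`${pending} document(s) queued`);
```

The worker loops never return; because `sleep()` unrefs its timer, idle workers do not keep the process alive once everything else has finished.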
package/dist/processing/processor.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"processor.js","sourceRoot":"","sources":["../../src/processing/processor.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,MAAM,CAAC;AAIxB,OAAO,EAAE,KAAK,IAAI,aAAa,EAAE,MAAM,YAAY,CAAC;AACpD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAExD,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAgBlC,MAAM,UAAU,qBAAqB;IACpC,MAAM,KAAK,GAAkB,EAAE,CAAC;IAEhC,OAAO;QACN,OAAO,CAAC,GAAG;YACV,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjB,CAAC;QAED,IAAI;YACH,OAAO,KAAK,CAAC,MAAM,CAAC;QACrB,CAAC;QAED,OAAO;YACN,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;QACtB,CAAC;KACD,CAAC;AACH,CAAC;AAED,SAAS,uBAAuB,CAC/B,QAAkB,EAClB,aAAyB,EACzB,YAAoB;IAEpB,MAAM,gBAAgB,GAAG;QACxB,GAAG,QAAQ;QACX,MAAM,EAAE,aAAa;KACrB,CAAC;IAEF,MAAM,GAAG,GAAG,IAAI,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC;IAEhD,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE;QACf,GAAG,CAAC,CAAC,EAAE,IAAI;YACV,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;YACnE,IAAI,UAAU,EAAE,CAAC;gBAChB,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;YAClB,CAAC;QACF,CAAC;KACD,CAAC,CAAC;IAEH,MAAM,gBAAgB,GAAG,GAAG,CAAC,QAAQ,CAAC,EAAE,SAAS,EAAE,GAAG,EAAE,CAAC,CAAC;IAE1D,OAAO;QACN,KAAK;QACL,gBAAgB,CAAC,OAAO,EAAE;QAC1B,KAAK;QACL,EAAE;QACF,YAAY,CAAC,IAAI,EAAE;KACnB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACd,CAAC;AAED,KAAK,UAAU,eAAe,CAC7B,GAAgB,EAChB,GAAqB;IAErB,sBAAsB;IACtB,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;IACpE,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC;IAErC,4BAA4B;IAC5B,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,mBAAmB,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;IACjE,MAAM,YAAY,GAAG,MAAM,aAAa,CAAC,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;IAEvE,+BAA+B;IAC/B,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,qBAAqB,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,CAAC;IACtE,MAAM,aAAa,GAAG,MAAM,eAAe,CAC1C,YAAY,EACZ,GAAG,CAAC,MAAM,EACV,GAAG,CAAC,cAAc,EAClB,GAAG,CAAC,GAAG,CACP,CAAC;IAEF,mDAAmD;IACnD,MAAM,QAAQ,GAAG,uBAAuB,CACvC,QAAQ,EACR,aAAa,EACb,YAAY,CACZ,CAAC;IAEF,4BAA4B;IAC5B,MAAM,GAAG,CAAC,OAAO,CAAC,oBAAoB,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IAEtD,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,oBAAoB,EAAE;QACxC,GAAG;QACH,MAAM,EAAE,aAAa,CAAC,MAAM;QAC5B,KAAK,EAAE,aAAa;KACpB,CAAC,CAAC;AACJ,CAAC;AAED,SAAS,KAAK,CAAC,EAAU;IACxB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC9B,MAAM,CAAC,GAAG,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAClC,kEAAkE;QACjE,CAAuC,CAAC,KAAK,EAAE,EAAE,CAAC;IACpD,CAAC,CAAC,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,sBAAsB,CAAC,IAKtC;IACA,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,IAAI,CAAC,CAAC,CAAC;IACvD,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,CAAC,WAAW,IAAI,qBAAqB,CAAC,CAAC;IAC5E,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC;IAE5B,KAAK,UAAU,UAAU,CAAC,WAAmB;QAC5C,OAAO,IAAI,EAAE,CAAC;YACb,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC;YAC/B,IAAI,CAAC,MAAM,EAAE,CAAC;gBACb,MAAM,KAAK,CAAC,WAAW,CAAC,CAAC;gBACzB,SAAS;YACV,CAAC;YAED,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,oBAAoB,EAAE;gBACzC,MAAM,EAAE,WAAW;gBACnB,GAAG,EAAE,MAAM;aACX,CAAC,CAAC;YACH,IAAI,CAAC;gBACJ,MAAM,eAAe,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;YACpC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBAChB,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,mBAAmB,EAAE;oBACxC,GAAG,EAAE,KAAK;oBACV,GAAG,EAAE,MAAM;oBACX,MAAM,EAAE,WAAW;iBACnB,CAAC,CAAC;YACJ,CAAC;QACF,CAAC;IACF,CAAC;IAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACnB,CAAC;IAED,GAAG,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,4BAA4B,EAAE,EAAE,WAAW,EAAE,CAAC,CAAC;AACpE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,2BAA2B,CAAC,IAIjD;IACA,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,sBAAsB,EAAE,CAAC;IAEzD,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACxB,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,6BAA6B,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;QAC7D,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IACzB,CAAC;IAED,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrB,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,6BAA6B,EAAE,EAAE,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;IAC7E,CAAC;IAED,OAAO,IAAI,CAAC,MAAM,CAAC;AACpB,CAAC"}
package/dist/skills/skill-generator.d.ts
@@ -0,0 +1,16 @@
+ import type { FileStorage } from "../storage/file-storage.js";
+ import type { MetadataSchemaItem } from "../types.js";
+ export interface SkillGeneratorOptions {
+     domain: string;
+     sources: string[];
+     baseDir: string;
+     metadataSchema: MetadataSchemaItem[];
+     overwrite: boolean;
+ }
+ /**
+  * Generate a Claude Code skill file for searching indexed data
+  */
+ export declare function generateSkill(options: SkillGeneratorOptions, fileStorage: FileStorage): Promise<{
+     skillPath: string;
+ }>;
+ //# sourceMappingURL=skill-generator.d.ts.map
package/dist/skills/skill-generator.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"skill-generator.d.ts","sourceRoot":"","sources":["../../src/skills/skill-generator.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,4BAA4B,CAAC;AAC9D,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAGtD,MAAM,WAAW,qBAAqB;IACrC,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,cAAc,EAAE,kBAAkB,EAAE,CAAC;IACrC,SAAS,EAAE,OAAO,CAAC;CACnB;AAqND;;GAEG;AACH,wBAAsB,aAAa,CAClC,OAAO,EAAE,qBAAqB,EAC9B,WAAW,EAAE,WAAW,GACtB,OAAO,CAAC;IAAE,SAAS,EAAE,MAAM,CAAA;CAAE,CAAC,CAsBhC"}
package/dist/skills/skill-generator.js
@@ -0,0 +1,210 @@
+ import { mkdir, writeFile } from "node:fs/promises";
+ import path from "node:path";
+ import { fileExists } from "../utils/file.js";
+ /**
+  * Generate ripgrep example patterns for a metadata field
+  */
+ function generateRgPattern(field, sampleValue, isArray) {
+     if (isArray) {
+         // For array fields, search within YAML array syntax
+         return `rg "${field}:\\s*\\[.*${sampleValue}.*\\]" processed`;
+     }
+     // For single-value fields, search for exact value
+     return `rg "${field}:\\s*${sampleValue}" processed`;
+ }
+ function generateSkillName(sources, maxLength = 30) {
+     let sourcesStr = sources.join("-");
+     sourcesStr = sourcesStr.trim();
+     sourcesStr = sourcesStr.toLowerCase().replace(/[^a-z0-9]+/g, "-");
+     sourcesStr = sourcesStr.replace(/^-+/, "").replace(/-+$/, "");
+     sourcesStr = sourcesStr.replace(/-+/g, "-");
+     let skillName = `search-${sourcesStr}`;
+     if (skillName.length > maxLength) {
+         skillName = skillName.slice(0, maxLength);
+     }
+     return skillName;
+ }
+ function sanitizePathSegment(name, maxLength = 50) {
+     let sanitized = name
+         .trim()
+         .toLowerCase()
+         .replace(/[^a-z0-9]+/g, "-");
+     sanitized = sanitized.replace(/^-+/, "").replace(/-+$/, "");
+     sanitized = sanitized.replace(/-+/g, "-");
+     if (sanitized.length > maxLength) {
+         sanitized = sanitized.slice(0, maxLength);
+     }
+     return sanitized || "unknown";
+ }
+ /**
+  * Generate the skill content from a template
+  */
+ function generateSkillContent(domain, sources, metadataSchema, fileStorage) {
+     const skillName = generateSkillName(sources);
+     // Generate metadata list from schema
+     const metadataList = metadataSchema
+         .map((field) => {
+             const typeSuffix = field.type.startsWith("enum") && field.enumValues
+                 ? ` (values: ${field.enumValues.join(", ")})`
+                 : "";
+             return `- \`${field.name}\` - *${field.type}*${typeSuffix}`;
+         })
+         .join("\n");
+     // Generate ripgrep examples for first 3-4 fields, showing both array and single-value patterns
+     const exampleFields = metadataSchema.slice(0, 4);
+     const rgExamples = exampleFields
+         .map((field) => {
+             // Check if field type indicates an array (e.g., "string[]", "enum[]")
+             const typeStr = String(field.type);
+             const isArray = typeStr.includes("[]");
+             const sampleValue = field.enumValues?.[0] ?? "VALUE";
+             const example = generateRgPattern(field.name, sampleValue, isArray);
+             return `# By ${field.name}\n${example}`;
+         })
+         .join("\n\n");
+     const exampleSource = sanitizePathSegment(sources[0] ?? "source");
+     return `---
+ name: ${skillName}
+ description: Guide for searching and analyzing indexed content from ${sources.join(", ")} in the '${domain}' domain. This skill should be used when you need information from ${sources.join(", ")} sources to answer questions or conduct research.
+ ---
+
+ # Skill Overview
+
+ This skill provides guidance for efficient search over indexed content from ${sources.join(", ")} in the \`${domain}\` domain. It leverages grep-friendly metadata and chunked content storage to enable precise filtering and retrieval.
+
+ ## About the Content
+
+ Content from ${sources.join(", ")} has been fetched, chunked, enriched with searchable metadata, and stored in the '${fileStorage.baseDir}' directory.
+
+ ### Directory Structure
+
+ \`\`\`
+ ${fileStorage.baseDir}/
+ ├── processed/                  # Cleaned, search-optimized content with metadata
+ │   └── {source}/               # ${sources.join(", ")}, etc.
+ │       └── {publisher?}/
+ │           └── YYYY-MM/
+ │               └── YYYY-MM-DD-label.md
+ └── raw/                        # Original raw content as ingested (mirrors processed/)
+ \`\`\`
+
+ ### File Format
+
+ Each processed file contains YAML frontmatter with document metadata and a \`chunks\` array. Each chunk includes:
+ - \`id\`: Unique chunk identifier (e.g., c01, c02)
+ - \`title\`: Chunk title
+ - Domain-specific metadata fields
+
+ Chunked content follows the frontmatter, with chunk IDs as section headers.
+
+ \`\`\`yaml
+ ---
+ title: "Document Title"
+ source: "Source Name"
+ publisher: "Publisher Name"
+ <other metadata fields>
+ chunks:
+   - id: c01
+     title: "First Chunk Title"
+     # Domain-specific metadata fields related to this chunk
+   - id: c02
+     title: "Second Chunk Title"
+     # Domain-specific metadata fields related to this chunk
+ ---
+
+ CHUNK c01: "First Chunk Title"
+ <chunk content here>
+
+ CHUNK c02: "Second Chunk Title"
+ <chunk content here>
+ \`\`\`
+
+ The YAML frontmatter serves as an index for the entire document.
+
+ ### Key Metadata Fields
+
+ Below are the most common metadata fields with sample values. Additional metadata fields and values may exist beyond those listed here.
+
+ ${metadataList}
+
+ ## Recommended Search Strategy
+
+ 0. **ALWAYS use rg (ripgrep)**:
+    Ripgrep is optimized for searching large codebases and text files quickly. It supports regex, file path patterns, and context capture.
+    It MUST be your primary search tool for this content if installed.
+
+ 1. **Constrain by time range first**
+    Use file path patterns (e.g., \`YYYY/MM/YYYY-MM-DD\`) to limit the search space before inspecting content.
+
+ 2. **Apply metadata filters**
+    Use \`ripgrep\` to match specific YAML frontmatter fields. Note that metadata fields can be either:
+    - **Single values**: Match with \`field: value\` (e.g., \`date: 2025-01-15\`)
+    - **Arrays**: Match with \`field: [ value1, value2 ]\` or search within arrays using \`field:\\s*\\[.*value.*\\]\`
+
+    Refer to the Key Metadata Fields section below to understand which fields are arrays vs single values.
+
+ 3. **Leverage YAML frontmatter as a document index**
+    Treat frontmatter as a document summary. Read it first to understand:
+    - Which chunks exist
+    - What each chunk covers
+    This avoids unnecessary full-content reads.
+
+ 4. **Identify relevant chunks**
+    From search results and frontmatter, collect IDs of chunks likely to contain relevant information.
+
+ 5. **Enumerate candidate documents**
+    Before reading chunk content, broaden queries slightly (alternative wording, synonyms, metadata variations) to ensure all relevant documents and chunk IDs are discovered.
+
+ 6. **Refine iteratively**
+    Adjust path patterns, metadata filters, and query terms based on findings until no new relevant documents or chunks appear.
+
+ 7. **Read targeted content only**
+    Use collected chunk IDs to read only the necessary sections of each document.
+
+ ## Ripgrep Search Examples
+
+ \`\`\`bash
+ # Find specific chunk content with context
+ rg "CHUNK c01:" -A 20 processed
+
+ # Search only ${exampleSource} content
+ rg "search query" processed/${exampleSource}
+
+ # Search ${exampleSource} content from December 2025
+ rg "search query" processed/${exampleSource} --glob "**/2025-12/*.md"
+
+ # Combine multiple metadata filters
+ rg -l "field1:.*value1" processed | xargs rg "field2:.*value2"
+
+ # List unique values for a metadata field
+ rg "field_name:" processed | sort | uniq -c | sort -rn | head -20
+
+ ${rgExamples}
+ \`\`\`
+
+ ## Important Guidelines
+
+ - **Primary source**: Always use the \`processed/\` directory for searches; only fall back to \`raw/\` if necessary
+ - **Metadata first**: Start with metadata filtering before full-text content search
+ - **Context capture**: Use \`-B\` and \`-A\` flags to capture surrounding lines for context without reading entire chunks
+ - **Citation style**: When referencing content, cite by source name, publisher, and title—never expose internal structure like chunk IDs or file paths to the user
+ `;
+ }
+ /**
+  * Generate a Claude Code skill file for searching indexed data
+  */
+ export async function generateSkill(options, fileStorage) {
+     const skillName = generateSkillName(options.sources, 30);
+     const skillContent = generateSkillContent(options.domain, options.sources, options.metadataSchema, fileStorage);
+     // Create skill directory
+     const skillDir = path.join(options.baseDir, ".claude", "skills", skillName);
+     await mkdir(skillDir, { recursive: true });
+     // Write skill file
+     const skillPath = path.join(skillDir, "SKILL.md");
+     const skillExists = await fileExists(skillPath);
+     if (!skillExists || options.overwrite) {
+         await writeFile(skillPath, skillContent, "utf8");
+     }
+     return { skillPath };
+ }
+ //# sourceMappingURL=skill-generator.js.map
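A caller sketch for the skill generator. The options shape matches SkillGeneratorOptions above; the import specifier, source names, and field values are assumptions.

```ts
import { createFileStorage, generateSkill } from "greptor"; // assumed exports

const storage = createFileStorage("./data");

const { skillPath } = await generateSkill(
  {
    domain: "real estate",
    sources: ["Property Podcast"], // skill name becomes search-property-podcast (truncated to 30 chars)
    baseDir: "./data",
    metadataSchema: [
      { name: "price_aud", type: "number", description: "Asking price in AUD" },
    ],
    overwrite: false, // an existing SKILL.md is left untouched
  },
  storage,
);
// skillPath: ./data/.claude/skills/search-property-podcast/SKILL.md
```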
package/dist/skills/skill-generator.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"skill-generator.js","sourceRoot":"","sources":["../../src/skills/skill-generator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,IAAI,MAAM,WAAW,CAAC;AAG7B,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAU9C;;GAEG;AACH,SAAS,iBAAiB,CACzB,KAAa,EACb,WAAmB,EACnB,OAAgB;IAEhB,IAAI,OAAO,EAAE,CAAC;QACb,oDAAoD;QACpD,OAAO,OAAO,KAAK,aAAa,WAAW,kBAAkB,CAAC;IAC/D,CAAC;IACD,kDAAkD;IAClD,OAAO,OAAO,KAAK,QAAQ,WAAW,aAAa,CAAC;AACrD,CAAC;AAED,SAAS,iBAAiB,CAAC,OAAiB,EAAE,SAAS,GAAG,EAAE;IAC3D,IAAI,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACnC,UAAU,GAAG,UAAU,CAAC,IAAI,EAAE,CAAC;IAC/B,UAAU,GAAG,UAAU,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;IAClE,UAAU,GAAG,UAAU,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAC9D,UAAU,GAAG,UAAU,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;IAE5C,IAAI,SAAS,GAAG,UAAU,UAAU,EAAE,CAAC;IACvC,IAAI,SAAS,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;QAClC,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,SAAS,CAAC;AAClB,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAY,EAAE,SAAS,GAAG,EAAE;IACxD,IAAI,SAAS,GAAG,IAAI;SAClB,IAAI,EAAE;SACN,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;IAC9B,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAC5D,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;IAE1C,IAAI,SAAS,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;QAClC,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,SAAS,IAAI,SAAS,CAAC;AAC/B,CAAC;AAED;;GAEG;AACH,SAAS,oBAAoB,CAC5B,MAAc,EACd,OAAiB,EACjB,cAAoC,EACpC,WAAwB;IAExB,MAAM,SAAS,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;IAE7C,qCAAqC;IACrC,MAAM,YAAY,GAAG,cAAc;SACjC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE;QACd,MAAM,UAAU,GACf,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,UAAU;YAChD,CAAC,CAAC,aAAa,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YAC7C,CAAC,CAAC,EAAE,CAAC;QACP,OAAO,OAAO,KAAK,CAAC,IAAI,SAAS,KAAK,CAAC,IAAI,IAAI,UAAU,EAAE,CAAC;IAC7D,CAAC,CAAC;SACD,IAAI,CAAC,IAAI,CAAC,CAAC;IAEb,+FAA+F;IAC/F,MAAM,aAAa,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACjD,MAAM,UAAU,GAAG,aAAa;SAC9B,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE;QACd,sEAAsE;QACtE,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,OAAO,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACvC,MAAM,WAAW,GAAG,KAAK,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,IAAI,OAAO,CAAC;QACrD,MAAM,OAAO,GAAG,iBAAiB,CAAC,KAAK,CAAC,IAAI,EAAE,WAAW,EAAE,OAAO,CAAC,CAAC;QACpE,OAAO,QAAQ,KAAK,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;IACzC,CAAC,CAAC;SACD,IAAI,CAAC,MAAM,CAAC,CAAC;IAEf,MAAM,aAAa,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,CAAC;IAElE,OAAO;QACA,SAAS;sEACqD,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,MAAM,sEAAsE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;8EAKpH,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,MAAM;;;;eAIpG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,qFAAqF,WAAW,CAAC,OAAO;;;;;EAKvI,WAAW,CAAC,OAAO;;yBAEI,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EA4CzC,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA0CE,aAAa;8BACC,aAAa;;WAEhC,aAAa;8BACM,aAAa;;;;;;;;EAQzC,UAAU;;;;;;;;;CASX,CAAC;AACF,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAClC,OAA8B,EAC9B,WAAwB;IAExB,MAAM,SAAS,GAAG,iBAAiB,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACzD,MAAM,YAAY,GAAG,oBAAoB,CACxC,OAAO,CAAC,MAAM,EACd,OAAO,CAAC,OAAO,EACf,OAAO,CAAC,cAAc,EACtB,WAAW,CACX,CAAC;IAEF,yBAAyB;IACzB,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC5E,MAAM,KAAK,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE3C,mBAAmB;IACnB,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAClD,MAAM,WAAW,GAAG,MAAM,UAAU,CAAC,SAAS,CAAC,CAAC;IAEhD,IAAI,CAAC,WAAW,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;QACvC,MAAM,SAAS,CAAC,SAAS,EAAE,YAAY,EAAE,MAAM,CAAC,CAAC;IAClD,CAAC;IAED,OAAO,EAAE,SAAS,EAAE,CAAC;AACtB,CAAC"}
package/dist/storage/file-storage.d.ts
@@ -0,0 +1,16 @@
+ import type { DocumentRef, DocumentAddResult as DocumentSaveResult } from "../storage/types.js";
+ import type { GreptorEatInput, Metadata } from "../types.js";
+ export interface FileStorage {
+     readonly baseDir: string;
+     readonly rawContentPath: string;
+     readonly processedContentPath: string;
+     saveRawContent(input: GreptorEatInput): Promise<DocumentSaveResult>;
+     readRawContent(ref: DocumentRef): Promise<{
+         metadata: Metadata;
+         content: string;
+     }>;
+     getUnprocessedContents(): Promise<DocumentRef[]>;
+     saveProcessedContent(ref: DocumentRef, content: string): Promise<void>;
+ }
+ export declare function createFileStorage(baseDir: string): FileStorage;
+ //# sourceMappingURL=file-storage.d.ts.map
package/dist/storage/file-storage.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"file-storage.d.ts","sourceRoot":"","sources":["../../src/storage/file-storage.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EACX,WAAW,EACX,iBAAiB,IAAI,kBAAkB,EACvC,MAAM,qBAAqB,CAAC;AAC7B,OAAO,KAAK,EAAE,eAAe,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAG7D,MAAM,WAAW,WAAW;IAC3B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,oBAAoB,EAAE,MAAM,CAAC;IAEtC,cAAc,CAAC,KAAK,EAAE,eAAe,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAAC;IACpE,cAAc,CACb,GAAG,EAAE,WAAW,GACd,OAAO,CAAC;QAAE,QAAQ,EAAE,QAAQ,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACpD,sBAAsB,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC;IACjD,oBAAoB,CAAC,GAAG,EAAE,WAAW,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CACvE;AAED,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,MAAM,GAAG,WAAW,CAyN9D"}