botholomew 0.8.10 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import type { McpxClient } from "@evantahler/mcpx";
2
2
  import type { BotholomewConfig } from "../config/schemas.ts";
3
3
  import type { DbConnection } from "../db/connection.ts";
4
4
  import { type ContextItem, updateContextItem } from "../db/context.ts";
5
+ import { formatDriveRef } from "./drives.ts";
5
6
  import { fetchUrl } from "./fetcher.ts";
6
7
  import {
7
8
  type PreparedIngestion,
@@ -13,9 +14,9 @@ export type RefreshItemStatus = "updated" | "unchanged" | "missing" | "error";
13
14
 
14
15
  export interface RefreshItemResult {
15
16
  id: string;
16
- context_path: string;
17
- source_path: string;
18
- source_type: "file" | "url";
17
+ drive: string;
18
+ path: string;
19
+ ref: string;
19
20
  status: RefreshItemStatus;
20
21
  error?: string;
21
22
  }
@@ -40,9 +41,16 @@ export interface RefreshOptions {
40
41
  type IngestEmbedFn = (texts: string[]) => Promise<number[][]>;
41
42
 
42
43
  /**
43
- * Refresh a batch of context items: re-read source (file or URL), diff, update
44
- * content, and re-embed only the items that changed. Side-effect free on the
45
- * outside world — the caller owns logging and spinners.
44
+ * Refresh a batch of context items: re-read from origin, diff, update
45
+ * content, and re-embed only the items that changed.
46
+ *
47
+ * Dispatches on `drive`:
48
+ * disk → read from filesystem
49
+ * agent → skip (no external origin)
50
+ * other → re-fetch as a URL (the path is either a full URL for `url` drive
51
+ * or an origin-specific identifier that fetchUrl can re-derive via
52
+ * the MCP agent; for now this only refreshes items stored under
53
+ * `url:/<full-url>`)
46
54
  */
47
55
  export async function refreshContextItems(
48
56
  conn: DbConnection,
@@ -52,36 +60,44 @@ export async function refreshContextItems(
52
60
  opts: RefreshOptions = {},
53
61
  embedFn?: IngestEmbedFn,
54
62
  ): Promise<RefreshResult> {
55
- const sourced = items.filter(
56
- (i): i is ContextItem & { source_path: string } => !!i.source_path,
57
- );
63
+ const refreshable = items.filter((i) => i.drive !== "agent");
58
64
 
59
65
  const results: RefreshItemResult[] = [];
60
66
  const toReembed: string[] = [];
61
67
 
62
- // Phase 1: read each source, diff against stored content, update when changed.
63
- for (const [idx, item] of sourced.entries()) {
64
- opts.onItemProgress?.(idx, sourced.length);
68
+ for (const [idx, item] of refreshable.entries()) {
69
+ opts.onItemProgress?.(idx, refreshable.length);
65
70
  const base = {
66
71
  id: item.id,
67
- context_path: item.context_path,
68
- source_path: item.source_path,
69
- source_type: item.source_type,
72
+ drive: item.drive,
73
+ path: item.path,
74
+ ref: formatDriveRef(item),
70
75
  };
71
76
 
72
77
  try {
73
78
  let content: string;
74
79
 
75
- if (item.source_type === "url") {
76
- const fetched = await fetchUrl(item.source_path, config, mcpxClient);
77
- content = fetched.content;
78
- } else {
79
- const bunFile = Bun.file(item.source_path);
80
+ if (item.drive === "disk") {
81
+ const bunFile = Bun.file(item.path);
80
82
  if (!(await bunFile.exists())) {
81
83
  results.push({ ...base, status: "missing" });
82
84
  continue;
83
85
  }
84
86
  content = await bunFile.text();
87
+ } else if (item.drive === "url") {
88
+ const url = item.path.startsWith("/") ? item.path.slice(1) : item.path;
89
+ const fetched = await fetchUrl(url, config, mcpxClient);
90
+ content = fetched.content;
91
+ } else {
92
+ // Service-specific drives (google-docs, github, etc.) — only
93
+ // refreshable when the original URL can be reconstructed. For now,
94
+ // we punt: mark as error so the user knows to re-add from URL.
95
+ results.push({
96
+ ...base,
97
+ status: "error",
98
+ error: `Refresh not implemented for drive '${item.drive}' — re-add from the original URL.`,
99
+ });
100
+ continue;
85
101
  }
86
102
 
87
103
  if (content === item.content) {
@@ -100,17 +116,16 @@ export async function refreshContextItems(
100
116
  });
101
117
  }
102
118
  }
103
- opts.onItemProgress?.(sourced.length, sourced.length);
119
+ opts.onItemProgress?.(refreshable.length, refreshable.length);
104
120
 
105
121
  const updated = results.filter((r) => r.status === "updated").length;
106
122
  const unchanged = results.filter((r) => r.status === "unchanged").length;
107
123
  const missing = results.filter((r) => r.status === "missing").length;
108
124
 
109
- // Phase 2: re-embed changed items. Skip cleanly if no OpenAI key.
110
125
  const hasEmbedder = !!embedFn || !!config.openai_api_key;
111
126
  if (toReembed.length === 0 || !hasEmbedder) {
112
127
  return {
113
- checked: sourced.length,
128
+ checked: refreshable.length,
114
129
  updated,
115
130
  unchanged,
116
131
  missing,
@@ -147,7 +162,7 @@ export async function refreshContextItems(
147
162
  }
148
163
 
149
164
  return {
150
- checked: sourced.length,
165
+ checked: refreshable.length,
151
166
  updated,
152
167
  unchanged,
153
168
  missing,
@@ -10,29 +10,6 @@ export function isUrl(input: string): boolean {
10
10
  }
11
11
  }
12
12
 
13
- /**
14
- * Derives a virtual context path from a URL.
15
- * Example: `https://docs.google.com/document/d/abc123/edit` → `/{prefix}/docs.google.com/document-d-abc123.md`
16
- */
17
- export function urlToContextPath(url: string, prefix: string): string {
18
- const parsed = new URL(url);
19
- const hostname = parsed.hostname;
20
- const pathname = parsed.pathname
21
- .replace(/\/+$/, "") // strip trailing slashes
22
- .replace(/^\/+/, "") // strip leading slashes
23
- .replace(/[^a-zA-Z0-9\-_.]/g, "-") // slugify
24
- .replace(/-{2,}/g, "-"); // collapse repeated dashes
25
-
26
- const slug = pathname ? `${hostname}/${pathname}` : hostname;
27
- const full = `${prefix.replace(/\/+$/, "")}/${slug}.md`;
28
-
29
- if (full.length > 120) {
30
- return `${full.slice(0, 117 - 3)}.md`;
31
- }
32
-
33
- return full;
34
- }
35
-
36
13
  /**
37
14
  * Strips HTML tags from a string, removing script/style blocks first,
38
15
  * then all remaining tags, and collapsing whitespace.