npm - @pruddiman/hem - Versions diffs - 0.0.1-beta-9f44128 → 0.0.1-beta-6f925fe - Mend

@pruddiman/hem 0.0.1-beta-9f44128 → 0.0.1-beta-6f925fe

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/dist/agents/documentation-agent.d.ts +15 -0
package/dist/agents/documentation-agent.js +58 -5
package/dist/discovery.js +13 -0
package/dist/grouping.js +77 -7
package/dist/import-graph.d.ts +2 -1
package/dist/import-graph.js +18 -5
package/dist/index.js +6 -1
package/package.json +1 -1

package/dist/agents/documentation-agent.d.ts CHANGED Viewed

@@ -14,6 +14,21 @@
 import type { Provider } from "../providers/types.js";
 import type { FileGroup, GenerationContext, ExplorationFindings } from "../types.js";
 import { BaseAgent } from "./base-agent.js";
+/** Upper bound on the final prompt size before progressive section drops. */
+export declare const MAX_PROMPT_CHARS = 500000;
+/** Cap on per-group exploration findings text. */
+export declare const MAX_FINDINGS_CHARS = 60000;
+/** Cap on each individual existing-doc's embedded content. */
+export declare const MAX_EXISTING_DOC_CHARS = 8000;
+/** Cap on the aggregate existing-docs section (drops trailing docs beyond this). */
+export declare const MAX_EXISTING_DOCS_SECTION_CHARS = 40000;
+/** Cap on the cross-group findings summary. */
+export declare const MAX_CROSS_GROUP_CHARS = 20000;
+/**
+ * Truncate `text` to `max` chars, appending a short note explaining how to
+ * retrieve the rest. Returns the original text if already within budget.
+ */
+export declare function truncateForPrompt(text: string, max: number, tailNote: string): string;
 /**
  * An agent that uses an LLM to generate documentation for a single
  * file group. The agent writes files directly via the edit tool.

package/dist/agents/documentation-agent.js CHANGED Viewed

@@ -12,6 +12,33 @@
  *   - If existing docs are provided, the agent merges inline.
  */
 import { BaseAgent } from "./base-agent.js";
+// ── Prompt-size budgets ─────────────────────────────────────────────────
+//
+// Copilot Sonnet has a ~168k token context window. Claude 1M is larger but
+// we pick the smaller envelope for a one-size-fits-all cap. At ~4 chars per
+// token the hard ceiling is ~670k chars; we target ~500k to leave headroom
+// for the model's output tokens. Per-section caps keep any single section
+// from dominating the prompt.
+/** Upper bound on the final prompt size before progressive section drops. */
+export const MAX_PROMPT_CHARS = 500_000;
+/** Cap on per-group exploration findings text. */
+export const MAX_FINDINGS_CHARS = 60_000;
+/** Cap on each individual existing-doc's embedded content. */
+export const MAX_EXISTING_DOC_CHARS = 8_000;
+/** Cap on the aggregate existing-docs section (drops trailing docs beyond this). */
+export const MAX_EXISTING_DOCS_SECTION_CHARS = 40_000;
+/** Cap on the cross-group findings summary. */
+export const MAX_CROSS_GROUP_CHARS = 20_000;
+/**
+ * Truncate `text` to `max` chars, appending a short note explaining how to
+ * retrieve the rest. Returns the original text if already within budget.
+ */
+export function truncateForPrompt(text, max, tailNote) {
+    if (text.length <= max)
+        return text;
+    return (text.slice(0, max).trimEnd() +
+        `\n\n[… truncated ${text.length - max} chars — ${tailNote}]`);
+}
 // ── Agent ───────────────────────────────────────────────────────────────
 /**
  * An agent that uses an LLM to generate documentation for a single
@@ -37,6 +64,14 @@ export class DocumentationAgent extends BaseAgent {
         if (verbose) {
             verbose(`[${tag}] Prompt: ${prompt.length.toLocaleString()} chars`);
         }
+        if (prompt.length > MAX_PROMPT_CHARS) {
+            // Per-section caps should have prevented this; log loudly so we can
+            // investigate what's overflowing rather than silently failing at the
+            // provider's 168k-token barrier.
+            process.stderr.write(`[${tag}] WARNING: prompt ${prompt.length.toLocaleString()} chars exceeds ` +
+                `MAX_PROMPT_CHARS=${MAX_PROMPT_CHARS.toLocaleString()}. ` +
+                `Provider may reject. Investigate per-section truncation.\n`);
+        }
         // 2. Create a new session
         const sessionId = await this.createSession(`Hem: doc — ${group.label}`);
         if (verbose) {
@@ -84,7 +119,8 @@ export class DocumentationAgent extends BaseAgent {
         // 1. System-level instructions
         parts.push(`Generate documentation files that answer the questions the code asks — not`, `merely describe what the code does.`, "", `**You write files directly using the edit tool.** Do NOT return Markdown content`, `in your response text. Instead, use the edit tool to create and write files in`, `the destination directory. When you are done writing all files, stop.`, "");
         // 2. Where to write files
-        parts.push("## Destination directory", "", `Write all documentation files under: \`${context.destinationPath}\``, "", `You have full autonomy over:`, `- **File naming**: Choose descriptive kebab-case filenames (e.g., \`user-authentication.md\`)`, `- **Directory structure**: Create subdirectories that make sense for this codebase`, `- **Number of files**: Create as many files as needed to properly document the group`, `- **File structure**: Design each document's heading hierarchy and sections`, "", `Guidelines for file organization:`, `- Choose a directory layout that reflects how the codebase is actually organized`, `- You may use flat files, subdirectories, or nested structures — whatever fits best`, `- Use \`.md\` extension for all files`, "");
+        const groupSubfolder = `${context.destinationPath}/${group.id}`;
+        parts.push("## Destination directory", "", `Write ALL documentation files for this group under this exact subfolder:`, "", `  \`${groupSubfolder}/\``, "", `**Do NOT** write outside this subfolder. **Do NOT** write to the root`, `\`${context.destinationPath}\` directory or any sibling group's subfolder.`, `This keeps the docs tree organized one-subfolder-per-group.`, "", `Within \`${groupSubfolder}/\` you have full autonomy over:`, `- **File naming**: Choose descriptive kebab-case filenames (e.g., \`overview.md\`, \`api-reference.md\`).`, `- **Nested subdirectories**: Create child folders inside the group subfolder if it helps (e.g., \`${groupSubfolder}/guides/getting-started.md\`).`, `- **Number of files**: Create as many files as needed to properly document the group.`, `- **File structure**: Design each document's heading hierarchy and sections.`, "", `Use \`.md\` extension for all files.`, "");
         // 3. Quality standards
         parts.push("## Quality Standard: Answer Every Question", "", "Your primary quality standard is: **every question from the exploration findings", "MUST be answered in the generated documentation.** Do NOT leave any question", "unanswered. If you cannot find a definitive answer, state what is known and what", "requires further investigation — but NEVER silently skip a question.", "", "For each integration discovered in the exploration findings:", "", "1. **Use `webfetch` to research answers**: When an integration has an", "   `officialDocsUrl`, fetch that URL to find authoritative answers.", "2. **Address HOW, not just WHAT**: Explain HOW to access, query, monitor,", "   and troubleshoot each integration.", "3. **Answer operational questions**: Every `operationalQuestions` entry for each", "   integration MUST be answered in the documentation.", "");
         parts.push("## Documentation quality standards", "");
@@ -103,7 +139,7 @@ export class DocumentationAgent extends BaseAgent {
         parts.push("## Exploration findings for this group", "");
         if (groupFindings) {
             parts.push("The exploration phase discovered these findings for your group:", "");
-            parts.push(DocumentationAgent.formatFindings(groupFindings));
+            parts.push(truncateForPrompt(DocumentationAgent.formatFindings(groupFindings), MAX_FINDINGS_CHARS, "re-run hem exploration or inspect the source files directly"));
         }
         else {
             parts.push("No exploration findings available for this group. Use tools to read and analyze the source files directly.", "");
@@ -111,16 +147,33 @@ export class DocumentationAgent extends BaseAgent {
         // 6. Cross-group context
         parts.push("## Cross-group context", "");
         parts.push("Other groups discovered these integrations and dependencies that may relate to", "your group:", "");
-        parts.push(DocumentationAgent.summarizeCrossGroupFindings(allFindings, group.id));
+        parts.push(truncateForPrompt(DocumentationAgent.summarizeCrossGroupFindings(allFindings, group.id), MAX_CROSS_GROUP_CHARS, "inspect sibling group docs directly with `cat`"));
         parts.push("");
         // 7. Existing docs — search-before-write with skip/update/create decisions
         if (context.existingDocs.length > 0 || (context.mentionedDocPaths && context.mentionedDocPaths.length > 0)) {
             parts.push("## Existing documentation in destination", "");
             parts.push("The destination directory already contains documentation files.", "**Before writing ANY file, you MUST search for related existing docs.**", "", "### Decision criteria for each topic you plan to document:", "", "1. **SKIP** — if an existing doc already covers the topic accurately and", "   completely. Do NOT rewrite content that is already correct.", "2. **UPDATE** — if an existing doc covers the topic but is stale, incomplete,", "   or missing sections. Update it **in place** using the edit tool. Preserve", "   accurate content; fix or expand what is stale or missing.", "3. **CREATE** — if no existing doc covers the topic. Write a new file.", "", "**Content-only changes**: Do NOT rename, move, or delete existing files.", "Only modify file content.", "");
-            // Full content for the most relevant docs
+            // Full content for the most relevant docs — with per-file + total
+            // caps so a single group's docs can't blow out the prompt window.
+            // `context.existingDocs` is already ranked by the search index, so
+            // when we hit the total budget we drop the trailing (least relevant)
+            // entries rather than truncating across the board.
             if (context.existingDocs.length > 0) {
-                parts.push(`### Most relevant existing docs (${context.existingDocs.length} file${context.existingDocs.length === 1 ? "" : "s"}, full content)`, "");
+                const includedDocs = [];
+                let includedBytes = 0;
+                let droppedDocs = 0;
                 for (const doc of context.existingDocs) {
+                    const truncated = truncateForPrompt(doc.content, MAX_EXISTING_DOC_CHARS, `run \`cat ${doc.path}\` to read the full content`);
+                    if (includedBytes + truncated.length > MAX_EXISTING_DOCS_SECTION_CHARS &&
+                        includedDocs.length > 0) {
+                        droppedDocs = context.existingDocs.length - includedDocs.length;
+                        break;
+                    }
+                    includedDocs.push({ path: doc.path, content: truncated });
+                    includedBytes += truncated.length;
+                }
+                parts.push(`### Most relevant existing docs (${includedDocs.length} of ${context.existingDocs.length} file${context.existingDocs.length === 1 ? "" : "s"}, full content${droppedDocs > 0 ? `; ${droppedDocs} omitted — read with \`cat\` as needed` : ""})`, "");
+                for (const doc of includedDocs) {
                     parts.push(`#### \`${doc.path}\``);
                     parts.push("```markdown");
                     parts.push(doc.content);

package/dist/discovery.js CHANGED Viewed

@@ -26,6 +26,19 @@ const DEFAULT_IGNORE_PATTERNS = [
     "**/coverage/**",
     "**/.cache/**",
     "**/.tmp/**",
+    // Framework build output and on-disk caches. These produce noisy,
+    // non-source files that leaked into grouping when a project's own
+    // .gitignore wasn't picked up (e.g. monorepo inner packages).
+    "**/.next/**",
+    "**/.turbo/**",
+    "**/.vercel/**",
+    "**/.nuxt/**",
+    "**/.svelte-kit/**",
+    "**/.astro/**",
+    "**/.parcel-cache/**",
+    "**/.vite/**",
+    "**/out/**",
+    "**/storybook-static/**",
 ];
 /**
  * Known binary file extensions.

package/dist/grouping.js CHANGED Viewed

@@ -225,12 +225,25 @@ function toDisplayLabel(name) {
         .replace(/\b\w/g, (ch) => ch.toUpperCase());
 }
 // ── Main ────────────────────────────────────────────────────────────────
-/** Minimum files a top-level src directory needs before it's promoted. */
-const TOP_LEVEL_PROMOTION_THRESHOLD = 3;
+/**
+ * Minimum files a top-level src directory needs before it's promoted.
+ *
+ * Raised from 3 to 6 after a real-world run produced 71 groups on a
+ * Next.js monorepo — every tiny infrastructure directory (heartbeat/,
+ * ngrok/, caddy/) was auto-promoted. A higher bar keeps those files
+ * flowing through to layer/component passes or "Other".
+ */
+const TOP_LEVEL_PROMOTION_THRESHOLD = 6;
 /** Minimum size of an import-graph connected component to become a group. */
 const MIN_COMPONENT_SIZE = 2;
 /** Components larger than this split along directory boundaries. */
 const MAX_COMPONENT_SIZE = 6;
+/**
+ * Maximum number of vertical groups before consolidation kicks in. When
+ * exceeded, the smallest non-pinned groups are merged into "Other" until
+ * the count drops back under this cap.
+ */
+const MAX_VERTICAL_GROUPS = 20;
 /**
  * Groups discovered files. See module docstring for the priority order.
  *
@@ -269,7 +282,7 @@ export function groupFiles(files, options = {}) {
         addFeature(match.key, toDisplayLabel(match.name), file);
         pinnedKeys.add(match.key);
     }
-    // ── Pass 2: src top-level promotion (≥3 files) ──
+    // ── Pass 2: src top-level promotion (≥TOP_LEVEL_PROMOTION_THRESHOLD files) ──
     const topLevelCounts = countTopLevelDirs(textFiles);
     for (const file of textFiles) {
         if (assigned.has(file.path))
@@ -279,6 +292,8 @@ export function groupFiles(files, options = {}) {
             continue;
         if (LAYER_DIRECTORIES.has(top.toLowerCase()))
             continue;
+        if (!isValidLabelCandidate(top))
+            continue;
         const count = topLevelCounts.get(top) ?? 0;
         if (count < TOP_LEVEL_PROMOTION_THRESHOLD)
             continue;
@@ -291,6 +306,8 @@ export function groupFiles(files, options = {}) {
         const feature = extractFeatureName(file.path);
         if (!feature)
             continue;
+        if (!isValidLabelCandidate(feature))
+            continue;
         addFeature(feature.toLowerCase(), toDisplayLabel(feature), file);
     }
     // Demote single-file feature buckets (unless pinned by a prior).
@@ -345,6 +362,26 @@ export function groupFiles(files, options = {}) {
     }
     // ── Pass 6: catch-all "Other" ──
     const ungrouped = textFiles.filter((f) => !assigned.has(f.path));
+    // ── Pass 7: consolidate if too many vertical groups ──
+    // Monorepos with many top-level feature dirs produce too many verticals
+    // to be useful. Fold the smallest non-pinned ones into "Other" until
+    // we're back under the cap. Priors stay pinned regardless.
+    // If consolidation runs, "Other" will exist afterward, so we target
+    // one fewer bucket to keep the total (buckets + Other) at the cap.
+    if (featureBuckets.size > MAX_VERTICAL_GROUPS - 1) {
+        const candidates = [...featureBuckets.entries()]
+            .filter(([key]) => !pinnedKeys.has(key))
+            .sort(([, a], [, b]) => a.length - b.length);
+        let excess = featureBuckets.size - (MAX_VERTICAL_GROUPS - 1);
+        for (const [key, bucket] of candidates) {
+            if (excess <= 0)
+                break;
+            ungrouped.push(...bucket);
+            featureBuckets.delete(key);
+            featureLabels.delete(key);
+            excess--;
+        }
+    }
     // ── Build FileGroup objects ──
     const groups = [];
     for (const [key, bucket] of featureBuckets) {
@@ -424,25 +461,58 @@ function buildComponentGroups(components, byPath) {
         if (files.length < MIN_COMPONENT_SIZE)
             continue;
         if (files.length <= MAX_COMPONENT_SIZE) {
-            out.push(componentToGroup(files));
+            const group = componentToGroup(files);
+            if (group)
+                out.push(group);
             continue;
         }
         for (const sub of bisectByDirectory(files)) {
             if (sub.length < MIN_COMPONENT_SIZE)
                 continue;
-            out.push(componentToGroup(sub));
+            const group = componentToGroup(sub);
+            if (group)
+                out.push(group);
         }
     }
     return out;
 }
+/**
+ * Build a vertical group from an import-graph component.
+ *
+ * Returns `null` when the component has no meaningful shared directory
+ * (root-level files) or when the derived basename would produce a
+ * degenerate label (leading dot, hash-only, empty). Rejected components
+ * fall through to the "Other" bucket rather than inventing a group name
+ * from a filename.
+ */
 function componentToGroup(files) {
     const commonDir = commonDirectory(files);
-    const basename = commonDir.split("/").filter((s) => s.length > 0).pop() ?? "cluster";
+    if (commonDir === "." || commonDir === "")
+        return null;
+    const basename = commonDir.split("/").filter((s) => s.length > 0).pop() ?? "";
+    if (!isValidLabelCandidate(basename))
+        return null;
     const label = toDisplayLabel(basename);
-    // Append a stable short hash of paths to avoid collisions with other buckets.
     const key = basename.toLowerCase() + "-cluster";
     return { key, label, files };
 }
+/**
+ * Reject directory/label candidates that would produce junk group IDs:
+ *   - leading dot (`.next`, `.turbo`, `.vite`) — build-output leaks.
+ *   - empty string.
+ *   - hash-like strings (≥12 consecutive lowercase-alnum chars with no
+ *     vowels or separators) — Next.js dev-cache hash dirs.
+ */
+function isValidLabelCandidate(name) {
+    if (!name)
+        return false;
+    if (name.startsWith("."))
+        return false;
+    const lower = name.toLowerCase();
+    if (/^[a-z0-9]{12,}$/.test(lower) && !/[aeiou]/.test(lower))
+        return false;
+    return true;
+}
 /**
  * Split a large component into sub-groups by directory prefix. Files are
  * bucketed by their first directory segment; singletons collapse back into

package/dist/import-graph.d.ts CHANGED Viewed

@@ -35,7 +35,8 @@ export interface ImportAnalysis {
 }
 /**
  * Build the import graph for a set of files. Files that fail to read are
- * silently skipped (they contribute no edges).
+ * silently skipped (they contribute no edges). Reads run in parallel with
+ * a bounded concurrency to keep wall-clock time low on large projects.
  */
 export declare function buildImportGraph(files: FileInfo[]): Promise<ImportAnalysis>;
 /**

package/dist/import-graph.js CHANGED Viewed

@@ -11,6 +11,15 @@
  *     integration catalog with file:line citations.
  */
 import { readFile } from "node:fs/promises";
+import pLimit from "p-limit";
+/**
+ * Files larger than this byte count are skipped when building the import
+ * graph. Huge generated files (lockfiles, bundled output) rarely contain
+ * useful import edges and reading them can stall the pipeline for minutes.
+ */
+const MAX_FILE_BYTES = 2 * 1024 * 1024; // 2 MB
+/** Parallel file reads when building the graph. */
+const READ_CONCURRENCY = 32;
 // ── Regexes ─────────────────────────────────────────────────────────────
 // Static: `import ... from "x"` or `export ... from "x"`
 const STATIC_RE = /(?:import|export)\s+[^;'"`]*?\s+from\s+["']([^"']+)["']/g;
@@ -19,21 +28,25 @@ const DYNAMIC_RE = /(?:import|require)\s*\(\s*["']([^"']+)["']\s*\)/g;
 // ── Public API ──────────────────────────────────────────────────────────
 /**
  * Build the import graph for a set of files. Files that fail to read are
- * silently skipped (they contribute no edges).
+ * silently skipped (they contribute no edges). Reads run in parallel with
+ * a bounded concurrency to keep wall-clock time low on large projects.
  */
 export async function buildImportGraph(files) {
     const known = new Set(files.map((f) => f.path));
     const localEdges = new Map();
     const externalImports = new Map();
-    for (const file of files) {
+    const limit = pLimit(READ_CONCURRENCY);
+    await Promise.all(files.map((file) => limit(async () => {
         if (file.isBinary)
-            continue;
+            return;
+        if (file.size > MAX_FILE_BYTES)
+            return;
         let content;
         try {
             content = await readFile(file.absolutePath, "utf-8");
         }
         catch {
-            continue;
+            return;
         }
         const local = [];
         const external = [];
@@ -55,7 +68,7 @@ export async function buildImportGraph(files) {
         if (external.length > 0) {
             externalImports.set(file.path, external);
         }
-    }
+    })));
     return { localEdges, externalImports };
 }
 /**

package/dist/index.js CHANGED Viewed

@@ -519,9 +519,14 @@ export async function handleGenerate(opts, deps = defaultDeps) {
                 .map((p) => p.name)
                 .join(", ")}`);
         }
+        if (cliOptions.verbose) {
+            verboseLog(`[grouping] building import graph from ${textFiles.length} files...`);
+        }
+        const importGraphStart = Date.now();
         const importAnalysis = await buildImportGraph(textFiles);
         if (cliOptions.verbose) {
-            verboseLog(`[grouping] import graph: ${importAnalysis.localEdges.size} files with local edges, ` +
+            const elapsed = ((Date.now() - importGraphStart) / 1000).toFixed(1);
+            verboseLog(`[grouping] import graph built in ${elapsed}s: ${importAnalysis.localEdges.size} files with local edges, ` +
                 `${importAnalysis.externalImports.size} with external imports`);
         }
         const groups = deps.groupFiles(textFiles, {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pruddiman/hem",
-  "version": "0.0.1-beta-9f44128",
+  "version": "0.0.1-beta-6f925fe",
   "type": "module",
   "bin": {
     "hem": "./dist/index.js"