amalfa 1.5.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +133 -1
  3. package/package.json +4 -1
  4. package/src/README.md +4 -0
  5. package/src/cli/commands/dashboard.ts +110 -0
  6. package/src/cli/commands/doctor.ts +11 -1
  7. package/src/cli/commands/find-gaps.ts +85 -108
  8. package/src/cli/commands/harvest-lexicon.ts +28 -0
  9. package/src/cli/commands/harvest.ts +312 -0
  10. package/src/cli/commands/search.ts +74 -22
  11. package/src/cli/commands/server.ts +13 -0
  12. package/src/cli/commands/services.ts +100 -29
  13. package/src/cli/commands/squash.ts +37 -0
  14. package/src/cli.ts +24 -0
  15. package/src/config/defaults.ts +81 -280
  16. package/src/config/schema.ts +297 -0
  17. package/src/core/EdgeWeaver.ts +14 -20
  18. package/src/core/GraphEngine.ts.test-output.json +159 -0
  19. package/src/core/GraphGardener.ts +26 -7
  20. package/src/core/GrepEngine.ts +191 -0
  21. package/src/core/HarvesterCache.ts +68 -0
  22. package/src/core/LexiconHarvester.ts +171 -0
  23. package/src/core/README.md +4 -0
  24. package/src/core/SidecarSquasher.ts +188 -0
  25. package/src/daemon/index.ts +28 -0
  26. package/src/ember/analyzer.ts +12 -2
  27. package/src/ember/squasher.ts +25 -3
  28. package/src/ember/types.ts +3 -0
  29. package/src/mcp/index.ts +483 -390
  30. package/src/pipeline/AmalfaIngestor.ts +7 -10
  31. package/src/pipeline/cross-domain/01-generate-edges.ts +196 -0
  32. package/src/pipeline/cross-domain/02-ingest.ts +83 -0
  33. package/src/pipeline/cross-domain/03-fafcas-fix.ts +125 -0
  34. package/src/pipeline/lexicon/01-harvest.ts +48 -0
  35. package/src/pipeline/lexicon/02-refine.ts +152 -0
  36. package/src/pipeline/lexicon/03-enrich.ts +131 -0
  37. package/src/pipeline/lexicon/04-embed.ts +67 -0
  38. package/src/pipeline/lexicon/05-survey-edges.ts +102 -0
  39. package/src/pipeline/lexicon/06-ingest.ts +141 -0
  40. package/src/pipeline/lexicon/07-classify-relevance.ts +252 -0
  41. package/src/pipeline/lexicon/README.md +51 -0
  42. package/src/pipeline/lexicon/dashboard.ts +315 -0
  43. package/src/pipeline/lexicon/lib/client.ts +43 -0
  44. package/src/pipeline/lexicon/pipeline.dot +63 -0
  45. package/src/pipeline/lexicon/tests/harness.ts +135 -0
  46. package/src/resonance/DatabaseFactory.ts +2 -2
  47. package/src/resonance/db.ts +49 -10
  48. package/src/resonance/drizzle/migrations/0002_curly_fat_cobra.sql +1 -0
  49. package/src/resonance/drizzle/migrations/meta/0002_snapshot.json +266 -0
  50. package/src/resonance/drizzle/migrations/meta/_journal.json +7 -0
  51. package/src/resonance/drizzle/schema.ts +1 -0
  52. package/src/resonance/services/reranker-daemon.ts +3 -3
  53. package/src/resonance/services/vector-daemon.ts +3 -3
  54. package/src/services/LangExtractClient.ts +232 -15
  55. package/src/services/README.md +8 -0
  56. package/src/services/dashboard-daemon.ts +301 -0
  57. package/src/sidecars/README.md +6 -0
  58. package/src/sidecars/lang-extract/README.md +164 -1
  59. package/src/sidecars/lang-extract/__pycache__/server.cpython-313.pyc +0 -0
  60. package/src/sidecars/lang-extract/pyproject.toml +1 -0
  61. package/src/sidecars/lang-extract/server.py +181 -28
  62. package/src/sidecars/lang-extract/uv.lock +2 -0
  63. package/src/tools/EmberExtractTool.ts +139 -0
  64. package/src/tools/README.md +6 -0
  65. package/src/tools/index.ts +6 -0
  66. package/src/types/sidecar.ts +29 -0
  67. package/src/types/tools.ts +20 -0
  68. package/src/utils/DaemonManager.ts +48 -2
  69. package/src/utils/Historian.ts +135 -0
  70. package/src/utils/JsonlUtils.ts +83 -0
  71. package/src/utils/Scratchpad.ts +27 -28
  72. package/src/utils/ServiceLifecycle.ts +7 -4
  73. package/src/utils/StatsLogger.ts +83 -0
  74. package/src/utils/ToolRegistry.ts +29 -0
  75. package/src/utils/ghost.ts +69 -0
  76. package/src/utils/ollama-discovery.ts +1 -14
  77. package/src/utils/reranker-client.ts +78 -0
  78. package/src/utils/sonar-client.ts +3 -4
  79. package/tsconfig.json +2 -8
  80. package/src/services/reranker.ts +0 -109
package/CHANGELOG.md CHANGED
@@ -5,6 +5,22 @@ All notable changes to AMALFA will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [Unreleased]
9
+
10
+ ### Added
11
+ - **Dashboard Service Integration**: Complete monitoring for 6 system services (Vector, Reranker, Sonar, Dashboard, Ingest, Enrich).
12
+ - **Architecture Visualization**: New `/architecture` page with interactive state machine diagrams (Viz.js).
13
+ - **Resilience**: Added PID tracking for Ingest and Enrich services. E2E tested lifecycle.
14
+ - **PolyVis Integration Prep**: Paved the way for migrating Dashboard frontend to PolyVis assets.
15
+ - **Lexicon Harvester:** New `amalfa harvest-lexicon` command to generate "Golden Lexicon" candidates from cached sidecars.
16
+ - **JSONL Utilities:** `JsonlUtils.ts` for efficient streaming file I/O.
17
+ - **Package Manager Cleanup**: Consolidated global packages to Bun, reducing npm globals to just `npm`. Removed duplicate tools and standardized on Bun-first workflow.
18
+
19
+ ### Fixed
20
+ - **PID Path Resolution**: Fixed dashboard looking in legacy `.amalfa/pids` instead of `.amalfa/runtime`.
21
+ - **Harvest Command**: Added fallback for missing API keys (OpenRouter env var support).
22
+ - **CLI:** Fixed duplicate `main` function implementation in `src/cli.ts`.
23
+
8
24
  ## [1.5.0] - 2026-01-26
9
25
 
10
26
  ### Added
@@ -12,6 +28,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
12
28
  - New robust Node.js client (`src/services/LangExtractClient.ts`) with Zod validation and Pino logging.
13
29
  - Automatically enriches documents >200 chars during `amalfa ember scan`.
14
30
  - Handles API rate limits (429) gracefully.
31
+ - **Ollama Integration**: Full support for local and cloud Ollama providers in LangExtract.
32
+ - Automatic Ollama discovery and health checking via `ollama-discovery.ts` utility.
33
+ - Intelligent provider selection with fallback chain (local → cloud → Gemini → OpenRouter).
34
+ - Model priority selection based on availability (qwen2.5:1.5b → phi3:mini → tinyllama, etc.).
35
+ - Comprehensive Ollama setup documentation in `src/sidecars/lang-extract/README.md`.
36
+ - Privacy-preserving local LLM inference with automatic cloud fallback.
37
+ - **Service Management Commands**: Added documentation for service lifecycle commands.
38
+ - `amalfa watcher <action>` - Manage file watcher daemon
39
+ - `amalfa setup-python` - Initialize Python sidecar environment
40
+ - `amalfa kill` - Stop all running AMALFA services
41
+ - `amalfa squash` - Ingest sidecar JSON files into the graph
15
42
  - **Reranker Integration**: Finalized BGE-M3 cross-encoder support.
16
43
  - Added `--rerank` flag to `amalfa search` CLI command.
17
44
  - Integrated `ContentHydrator` for retrieving document content for reranking.
@@ -22,6 +49,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
22
49
  - **Service Naming**: Renamed `amalfa daemon` to `amalfa watcher` to align with internal naming and reduce confusion.
23
50
  - `amalfa daemon` is now deprecated but still works (with warning).
24
51
  - Updated `package.json` scripts to use `watcher`.
52
+ - Updated all documentation references (README.md, WARP.md) to use new command name.
25
53
  - **Ember Hardening**: Fixed critical bug in tag parsing logic that caused garbage tags (single characters).
26
54
  - Implemented strict array checking for tags.
27
55
  - Added hygiene filters to remove numeric-only and short tags.
@@ -30,6 +58,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
30
58
  ### Fixed
31
59
  - **Tag Corruption**: Identified and fixed corrupted metadata in documentation files (`newbie-onboarding.md`, etc.) caused by previous buggy runs.
32
60
  - **Git Hygiene**: Added `*.ember.json` to `.gitignore` to treat sidecars as ephemeral artifacts.
61
+ - **Import Paths**: Fixed incorrect relative import paths in `scripts/verify/e2e-historian.ts` (../src/ → ../../src/).
62
+ - **Code Hygiene**: Removed YAML frontmatter tags from `src/services/reranker.ts`.
63
+ - **Documentation Precision**: Updated WARP.md to use precise database reset command (`rm .amalfa/resonance.db*` instead of `rm -rf .amalfa/`).
64
+ - **Consistency**: Achieved 100% consistency score (101/101 checks passing) across all documentation and code alignment checks.
33
65
 
34
66
  ## [1.4.4] - 2026-01-17
35
67
  ### Added
package/README.md CHANGED
@@ -1,5 +1,115 @@
1
1
  # AMALFA
2
2
 
3
+ ## Environment Configuration
4
+
5
+ AMALFA uses environment variables for configuration. Copy `.env.example` to `.env` and fill in your API keys:
6
+
7
+ ```bash
8
+ cp .env.example .env
9
+ ```
10
+
11
+ ### API Keys
12
+
13
+ **Important:** `.env` is the single source of truth for all API key secrets. Never commit `.env` to version control.
14
+
15
+ #### Required API Keys
16
+
17
+ - **GEMINI_API_KEY** - Google Gemini API key for LangExtract
18
+ - Get from: https://makersuite.google.com/app/apikey
19
+
20
+ - **OPENROUTER_API_KEY** - OpenRouter API key for alternative LLM access
21
+ - Get from: https://openrouter.ai/keys
22
+
23
+ - **MISTRAL_API_KEY** - Mistral AI API key
24
+ - Get from: https://console.mistral.ai/
25
+
26
+ **Note:** Ollama uses Device Keys for authentication, not API keys. Device keys are SSH keys automatically managed by the Ollama CLI/daemon. Sign in to Ollama once with `ollama signin` to enable remote model access.
27
+
28
+ #### API Key Types
29
+
30
+ **SSH Keys (NOT for LLM APIs):**
31
+ - Format: `ssh-ed25519 AAAAC3NzaC1lZDI1NTE5...`
32
+ - Used for: Git authentication, SSH access
33
+ ❌ DO NOT use for LLM API calls
34
+
35
+ **API Keys (for LLM APIs):**
36
+ - Format: `sk-or-v1-...` or alphanumeric string
37
+ - Used for: Gemini, OpenRouter, Mistral
38
+ ✅ MUST use for LLM API calls
39
+
40
+ **Device Keys (for Ollama):**
41
+ - Format: `ssh-ed25519 AAAAC3NzaC1lZDI1NTE5...`
42
+ - Used for: Ollama CLI/daemon authentication
43
+ ✅ Automatically managed by Ollama, not stored in `.env`
44
+ ✅ Enable remote model access via `localhost:11434`
45
+
46
+ **Example of WRONG usage:**
47
+ ```bash
48
+ # ❌ WRONG - Don't use Ollama device keys for LLM APIs
49
+ GEMINI_API_KEY=ssh-ed25519 AAAAC3NzaC1lZDI1NTE5...
50
+ ```
51
+
52
+ **Example of CORRECT usage:**
53
+ ```bash
54
+ # ✅ CORRECT - Use proper API keys for LLM providers
55
+ GEMINI_API_KEY=AIzaSyXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
56
+ OPENROUTER_API_KEY=sk-or-v1-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
57
+
58
+ # ✅ CORRECT - Ollama device keys are managed by Ollama CLI
59
+ # Sign in once: ollama signin
60
+ # Device keys are automatically added to your Ollama account
61
+ ```
62
+
63
+ ### Security Best Practices
64
+
65
+ 1. Never commit `.env` to version control
66
+ 2. Use strong, unique API keys for each service
67
+ 3. Rotate API keys regularly
68
+ 4. Use different keys for dev/staging/production
69
+ 5. Monitor API usage and costs
70
+
71
+ ### Ollama Configuration
72
+
73
+ AMALFA uses Ollama for local and remote model access via `localhost:11434`. No API key is required - Ollama uses device keys automatically.
74
+
75
+ **Ollama Device Keys:**
76
+ - Device keys are SSH keys that allow Ollama CLI/daemon to access cloud models
77
+ - Automatically added when you sign in to Ollama
78
+ - Managed by Ollama, not stored in `.env`
79
+ - Enable remote model access without API configuration
80
+
81
+ **Setup:**
82
+ ```bash
83
+ # Sign in to Ollama (adds device key automatically)
84
+ ollama signin
85
+
86
+ # View your device keys in Ollama account settings
87
+ # https://ollama.com/account
88
+ ```
89
+
90
+ **Local Models:** Run entirely on your machine (private, slow)
91
+ - Example: `mistral-nemo:latest` (7.1 GB)
92
+ - Pull with: `ollama pull mistral-nemo:latest`
93
+
94
+ **Remote Models:** Proxied to ollama.com (fast, requires internet)
95
+ - Example: `nemotron-3-nano:30b-cloud` (30B parameters)
96
+ - Pull with: `ollama pull nemotron-3-nano:30b-cloud`
97
+ - Uses device keys for automatic authentication
98
+
99
+ Configure in `amalfa.config.json`:
100
+ ```json
101
+ {
102
+ "langExtract": {
103
+ "provider": "ollama",
104
+ "ollama": {
105
+ "host": "http://localhost:11434",
106
+ "model": "nemotron-3-nano:30b-cloud" // or "mistral-nemo:latest"
107
+ }
108
+ }
109
+ }
110
+ ```
111
+
112
+
3
113
  **A Memory Layer For Agents**
4
114
 
5
115
  [![npm](https://img.shields.io/npm/v/amalfa?logo=npm)](https://www.npmjs.com/package/amalfa)
@@ -79,6 +189,28 @@ amalfa find-gaps --limit 5 --threshold 0.7
79
189
  amalfa inject-tags docs/auth.md "authentication" "security"
80
190
  ```
81
191
 
192
+ ### Service Management Commands
193
+
194
+ ```bash
195
+ # Manage file watcher daemon (start|stop|status|restart)
196
+ amalfa watcher start
197
+ amalfa watcher stop
198
+ amalfa watcher status
199
+
200
+ # Stop all running AMALFA services
201
+ amalfa kill
202
+
203
+ # Ingest sidecar JSON files into the graph
204
+ amalfa squash
205
+ ```
206
+
207
+ ### Setup Commands
208
+
209
+ ```bash
210
+ # Initialize Python sidecar environment (for LangExtract)
211
+ amalfa setup-python
212
+ ```
213
+
82
214
  ### JSON Output for Scripting
83
215
 
84
216
  All commands support `--json` for programmatic use:
@@ -470,7 +602,7 @@ amalfa servers --dot # Generate DOT diagram
470
602
  amalfa stop-all # Stop all running services (alias: kill)
471
603
 
472
604
  # Individual services (start|stop|status|restart)
473
- amalfa daemon <action> # File watcher daemon
605
+ amalfa watcher <action> # File watcher daemon
474
606
  amalfa vector <action> # Vector embedding daemon
475
607
  amalfa reranker <action> # Reranking daemon
476
608
  amalfa sonar <action> # Sonar AI agent
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "amalfa",
3
- "version": "1.5.0",
3
+ "version": "1.5.1",
4
4
  "description": "Local-first knowledge graph engine for AI agents. Transforms markdown into searchable memory with MCP protocol.",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/pjsvis/amalfa#readme",
@@ -56,6 +56,8 @@
56
56
  "check": "biome check .",
57
57
  "format": "biome format --write .",
58
58
  "validate-config": "bun run scripts/validate-config.ts",
59
+ "build": "tsc --noEmit",
60
+ "dev": "bun run --watch src/cli.ts",
59
61
  "amalfa": "bun run src/cli.ts",
60
62
  "servers": "bun run src/cli.ts servers",
61
63
  "servers:dot": "bun run src/cli.ts servers --dot",
@@ -73,6 +75,7 @@
73
75
  "drizzle-kit": "0.31.8",
74
76
  "drizzle-orm": "0.45.1",
75
77
  "fastembed": "^1.0.0",
78
+ "glob": "^13.0.0",
76
79
  "graphology": "0.26.0",
77
80
  "graphology-library": "0.8.0",
78
81
  "gray-matter": "^4.0.3",
package/src/README.md CHANGED
@@ -14,9 +14,13 @@ This directory contains the core source code for the Amalfa project. Amalfa is a
14
14
  - `config/` - Configuration management and loading
15
15
  - `core/` - Core application logic and services
16
16
  - `daemon/` - Background services (Vector Daemon, Sonar Agent)
17
+ - `ember/` - Ember enrichment service
17
18
  - `mcp/` - Model Context Protocol server implementation
18
19
  - `pipeline/` - Data processing pipelines
19
20
  - `resonance/` - Knowledge graph and semantic services
21
+ - `services/` - Service clients and standalone daemons
22
+ - `sidecars/` - External language runtimes (Python)
23
+ - `tools/` - MCP Tool definitions
20
24
  - `types/` - TypeScript type definitions
21
25
  - `utils/` - Utility functions and helpers
22
26
 
@@ -0,0 +1,110 @@
1
+ import { spawn } from "node:child_process";
2
+ import { existsSync, readFileSync } from "node:fs";
3
+ import { join } from "node:path";
4
+ import { AMALFA_DIRS } from "@src/config/defaults";
5
+ import { getLogger } from "@src/utils/Logger";
6
+
7
+ const _log = getLogger("CLI:Dashboard");
8
+ const PID_FILE = join(AMALFA_DIRS.runtime, "dashboard.pid");
9
+ const PORT = 3013;
10
+
11
+ export async function cmdDashboard(args: string[]) {
12
+ const subcommand = args[0] || "status";
13
+
14
+ switch (subcommand) {
15
+ case "start":
16
+ await startDashboard();
17
+ break;
18
+ case "stop":
19
+ await stopDashboard();
20
+ break;
21
+ case "restart":
22
+ await stopDashboard();
23
+ await new Promise((resolve) => setTimeout(resolve, 1000));
24
+ await startDashboard();
25
+ break;
26
+ case "status":
27
+ await showStatus();
28
+ break;
29
+ case "open":
30
+ await openDashboard();
31
+ break;
32
+ default:
33
+ console.log("Usage: amalfa dashboard [start|stop|restart|status|open]");
34
+ process.exit(1);
35
+ }
36
+ }
37
+
38
+ async function startDashboard() {
39
+ if (existsSync(PID_FILE)) {
40
+ const pid = readFileSync(PID_FILE, "utf-8").trim();
41
+ console.log(`⚠️ Dashboard may already be running (PID: ${pid})`);
42
+ console.log(" Run 'amalfa dashboard stop' first if needed.");
43
+ return;
44
+ }
45
+
46
+ console.log("🚀 Starting dashboard...");
47
+
48
+ const child = spawn("bun", ["run", "src/services/dashboard-daemon.ts"], {
49
+ detached: true,
50
+ stdio: "ignore",
51
+ cwd: process.cwd(),
52
+ });
53
+
54
+ child.unref();
55
+
56
+ // Wait for server to start
57
+ await new Promise((resolve) => setTimeout(resolve, 1000));
58
+
59
+ if (existsSync(PID_FILE)) {
60
+ const pid = readFileSync(PID_FILE, "utf-8").trim();
61
+ console.log(`✅ Dashboard started (PID: ${pid})`);
62
+ console.log(` View at: http://localhost:${PORT}`);
63
+ } else {
64
+ console.log("❌ Failed to start dashboard");
65
+ }
66
+ }
67
+
68
+ async function stopDashboard() {
69
+ if (!existsSync(PID_FILE)) {
70
+ console.log("⚠️ Dashboard is not running");
71
+ return;
72
+ }
73
+
74
+ const pid = readFileSync(PID_FILE, "utf-8").trim();
75
+ console.log(`🛑 Stopping dashboard (PID: ${pid})...`);
76
+
77
+ try {
78
+ process.kill(Number(pid), "SIGTERM");
79
+ await new Promise((resolve) => setTimeout(resolve, 500));
80
+ console.log("✅ Dashboard stopped");
81
+ } catch (err) {
82
+ console.log("❌ Failed to stop dashboard:", err);
83
+ }
84
+ }
85
+
86
+ async function showStatus() {
87
+ if (existsSync(PID_FILE)) {
88
+ const pid = readFileSync(PID_FILE, "utf-8").trim();
89
+ console.log(`✅ Dashboard is running`);
90
+ console.log(` PID: ${pid}`);
91
+ console.log(` URL: http://localhost:${PORT}`);
92
+ } else {
93
+ console.log("⚠️ Dashboard is not running");
94
+ console.log(" Run 'amalfa dashboard start' to start it");
95
+ }
96
+ }
97
+
98
+ async function openDashboard() {
99
+ if (!existsSync(PID_FILE)) {
100
+ console.log("⚠️ Dashboard is not running. Starting it now...");
101
+ await startDashboard();
102
+ await new Promise((resolve) => setTimeout(resolve, 1500));
103
+ }
104
+
105
+ const url = `http://localhost:${PORT}`;
106
+ console.log(`🌐 Opening dashboard: ${url}`);
107
+
108
+ // macOS
109
+ Bun.spawnSync(["open", url]);
110
+ }
@@ -1,6 +1,6 @@
1
1
  import { existsSync, statSync } from "node:fs";
2
2
  import { join } from "node:path";
3
- import { loadConfig } from "@src/config/defaults";
3
+ import { loadConfig, loadSettings } from "@src/config/defaults";
4
4
  import { getDbPath } from "../utils";
5
5
 
6
6
  export async function cmdDoctor(_args: string[]) {
@@ -31,6 +31,16 @@ export async function cmdDoctor(_args: string[]) {
31
31
  issues++;
32
32
  }
33
33
 
34
+ // Check Settings (SSOT) Compliance
35
+ try {
36
+ loadSettings(false);
37
+ console.log("✓ Settings (SSOT): OK");
38
+ } catch (e: any) {
39
+ console.log(`✗ Settings (SSOT) Invalid/Missing`);
40
+ console.log(` Error: ${e.message || e}`);
41
+ issues++;
42
+ }
43
+
34
44
  // Check source directories from config
35
45
  const config = await loadConfig();
36
46
  const sources = config.sources || ["./docs"];
@@ -1,122 +1,99 @@
1
- import { createSonarClient } from "@src/utils/sonar-client";
2
- import { checkDatabase } from "../utils";
1
+ /**
2
+ * Find Gaps Tool
3
+ *
4
+ * Identifies potential missing connections in the knowledge graph
5
+ * using similarity thresholds and graph traversal.
6
+ */
3
7
 
4
- export async function cmdFindGaps(args: string[]) {
5
- // Parse arguments
6
- let limit = 10;
7
- const limitEqIdx = args.findIndex((arg) => arg.startsWith("--limit="));
8
- const limitSpaceIdx = args.indexOf("--limit");
9
-
10
- if (limitEqIdx !== -1) {
11
- limit = Number.parseInt(args[limitEqIdx]?.split("=")[1] || "10", 10);
12
- } else if (limitSpaceIdx !== -1 && args[limitSpaceIdx + 1]) {
13
- limit = Number.parseInt(args[limitSpaceIdx + 1] || "10", 10);
14
- }
8
+ import { Database } from "bun:sqlite";
9
+ import { mkdirSync, writeFileSync } from "node:fs";
10
+ import { join } from "node:path";
11
+ import { getDbPath } from "@src/cli/utils";
15
12
 
16
- let threshold = 0.3;
17
- const thresholdEqIdx = args.findIndex((arg) =>
18
- arg.startsWith("--threshold="),
19
- );
20
- const thresholdSpaceIdx = args.indexOf("--threshold");
13
+ interface GapCandidate {
14
+ source_id: string;
15
+ target_id: string;
16
+ similarity?: number;
17
+ reason?: string;
18
+ suggested_link_type?: string;
19
+ }
21
20
 
22
- if (thresholdEqIdx !== -1) {
23
- threshold = Number.parseFloat(args[thresholdEqIdx]?.split("=")[1] || "0.8");
24
- } else if (thresholdSpaceIdx !== -1 && args[thresholdSpaceIdx + 1]) {
25
- threshold = Number.parseFloat(args[thresholdSpaceIdx + 1] || "0.8");
26
- }
21
+ async function findGaps(options: { limit?: number; threshold?: number }) {
22
+ const dbPath = await getDbPath();
23
+ const db = new Database(dbPath);
27
24
 
28
- const jsonOutput = args.includes("--json");
25
+ // Find similar but unlinked nodes
26
+ const gapsQuery = db.prepare(`
27
+ WITH similar_pairs AS (
28
+ SELECT
29
+ n1.id as source_id,
30
+ n2.id as target_id,
31
+ n1.title as source_title,
32
+ n2.title as target_title,
33
+ (
34
+ SELECT COUNT(*) FROM nodes n3
35
+ WHERE n3.domain = n1.domain
36
+ AND (n3.embedding <=> n1.embedding) > ?
37
+ ) as similarity_count
38
+ FROM nodes n1
39
+ JOIN nodes n2 ON n1.id < n2.id
40
+ WHERE n1.domain = n2.domain
41
+ AND n1.id NOT IN (SELECT source FROM edges WHERE target = n2.id)
42
+ AND n2.id NOT IN (SELECT source FROM edges WHERE target = n1.id)
43
+ ORDER BY similarity_count DESC
44
+ LIMIT ?
45
+ )
46
+ SELECT * FROM similar_pairs
47
+ `);
29
48
 
30
- // Check database
31
- if (!(await checkDatabase())) {
32
- if (jsonOutput) {
33
- console.error(
34
- JSON.stringify({
35
- error: "Database not found",
36
- suggestion: "Run 'amalfa init' first",
37
- }),
38
- );
39
- } else {
40
- console.error("❌ Database not found. Run 'amalfa init' first.");
41
- }
42
- process.exit(1);
43
- }
49
+ const threshold = options.threshold ?? 0.8;
50
+ const limit = options.limit ?? 10;
51
+ const gaps = gapsQuery.all(threshold, limit) as GapCandidate[];
44
52
 
45
- try {
46
- // Create Sonar client
47
- const sonarClient = await createSonarClient();
53
+ // Display results
54
+ console.log(`\n🔍 Found ${gaps.length} potential gaps:\n`);
48
55
 
49
- // Check if Sonar is available
50
- const isAvailable = await sonarClient.isAvailable();
56
+ if (gaps.length === 0) {
57
+ console.log(" No gaps found above threshold.");
58
+ db.close();
59
+ return;
60
+ }
51
61
 
52
- if (!isAvailable) {
53
- if (jsonOutput) {
54
- console.error(
55
- JSON.stringify({
56
- error: "Sonar service not available",
57
- suggestion: "Start Sonar with 'amalfa sonar start'",
58
- }),
59
- );
60
- } else {
61
- console.error("❌ Sonar service not available");
62
- console.error("\nThe find-gaps command requires the Sonar service.");
63
- console.error("Start it with: amalfa sonar start\n");
64
- }
65
- process.exit(1);
62
+ for (let i = 0; i < gaps.length; i++) {
63
+ const gap = gaps[i] as unknown as GapCandidate;
64
+ console.log(`${i + 1}. ${gap.source_id} ↔ ${gap.target_id}`);
65
+ console.log(` Similarity: ${gap.similarity?.toFixed(3) || "N/A"}`);
66
+ if (gap.reason) {
67
+ console.log(` Reason: ${gap.reason}`);
66
68
  }
69
+ if (gap.suggested_link_type) {
70
+ console.log(` Suggested: ${gap.suggested_link_type}`);
71
+ }
72
+ console.log();
73
+ }
67
74
 
68
- // Get gaps
69
- const gaps = await sonarClient.getGaps(limit);
75
+ // Export to JSON for further analysis
76
+ const exportPath = join(dbPath, "..", "gaps.json");
77
+ const exportDir = join(dbPath, "..");
78
+ mkdirSync(exportDir, { recursive: true });
79
+ writeFileSync(exportPath, JSON.stringify(gaps, null, 2));
80
+ console.log(`šŸ“ Gaps exported to: ${exportPath}`);
70
81
 
71
- // Output
72
- if (jsonOutput) {
73
- console.log(JSON.stringify(gaps, null, 2));
74
- } else {
75
- // Human-readable output
76
- if (!gaps || gaps.length === 0) {
77
- console.log("\n🔍 No significant gaps found in knowledge graph\n");
78
- console.log("This means:");
79
- console.log(" - Similar documents are already linked");
80
- console.log(
81
- ` - No document pairs exceed similarity threshold (${threshold})`,
82
- );
83
- console.log("\n💡 Try lowering the threshold with --threshold 0.7\n");
84
- } else {
85
- console.log(
86
- `\n🔍 Found ${gaps.length} potential gaps (threshold: ${threshold}):\n`,
87
- );
82
+ db.close();
83
+ }
88
84
 
89
- for (let i = 0; i < gaps.length; i++) {
90
- const gap = gaps[i] as any;
91
- console.log(`${i + 1}. ${gap.source_id} ↔ ${gap.target_id}`);
92
- console.log(` Similarity: ${gap.similarity?.toFixed(3) || "N/A"}`);
93
- if (gap.reason) {
94
- console.log(` Reason: ${gap.reason}`);
95
- }
96
- if (gap.suggested_link_type) {
97
- console.log(` Suggested: ${gap.suggested_link_type}`);
98
- }
99
- console.log();
100
- }
85
+ export async function cmdFindGaps(args: string[]) {
86
+ const options = {
87
+ limit:
88
+ Number(args.find((a) => a.startsWith("--limit="))?.split("=")[1]) ?? 10,
89
+ threshold:
90
+ Number(args.find((a) => a.startsWith("--threshold="))?.split("=")[1]) ??
91
+ 0.8,
92
+ };
93
+ await findGaps(options);
94
+ }
101
95
 
102
- console.log(
103
- "💡 Tip: Use 'amalfa read <id>' to review documents before linking\n",
104
- );
105
- }
106
- }
107
- } catch (error) {
108
- if (jsonOutput) {
109
- console.error(
110
- JSON.stringify({
111
- error: error instanceof Error ? error.message : String(error),
112
- }),
113
- );
114
- } else {
115
- console.error(
116
- "❌ Gap detection failed:",
117
- error instanceof Error ? error.message : error,
118
- );
119
- }
120
- process.exit(1);
121
- }
96
+ // Run if executed directly
97
+ if (require.main === module) {
98
+ cmdFindGaps(process.argv.slice(2)).catch(console.error);
122
99
  }
@@ -0,0 +1,28 @@
1
+ import { join } from "node:path";
2
+ import { AMALFA_DIRS } from "../../config/defaults";
3
+ import { LexiconHarvester } from "../../core/LexiconHarvester";
4
+
5
+ export async function cmdHarvestLexicon(args: string[]) {
6
+ // Optional argument for output path
7
+ const outputPath =
8
+ args[0] || join(AMALFA_DIRS.base, "lexicon-candidates.jsonl");
9
+
10
+ // We assume sidecars are in the cache/lang-extract dir for now,
11
+ // or maybe the root cache if that's where they are.
12
+ // Based on user interaction, they are in .amalfa/cache/lang-extract
13
+ const cacheDir = join(AMALFA_DIRS.cache, "lang-extract");
14
+ const stopListPath = join(process.cwd(), "stop-list.json");
15
+
16
+ console.log(`🔧 Configuring Harvester:`);
17
+ console.log(` Cache: ${cacheDir}`);
18
+ console.log(` StopList: ${stopListPath}`);
19
+ console.log(` Output: ${outputPath}\n`);
20
+
21
+ const harvester = new LexiconHarvester({
22
+ cacheDir,
23
+ stopListPath,
24
+ outputPath,
25
+ });
26
+
27
+ await harvester.harvest();
28
+ }