bluera-knowledge 0.9.43 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/CHANGELOG.md +75 -0
  3. package/README.md +114 -42
  4. package/commands/sync.md +96 -0
  5. package/dist/{chunk-MQE32YY6.js → chunk-6U45VP5Z.js} +42 -6
  6. package/dist/chunk-6U45VP5Z.js.map +1 -0
  7. package/dist/{chunk-CUHYSPRV.js → chunk-DP5XBPQV.js} +372 -2
  8. package/dist/chunk-DP5XBPQV.js.map +1 -0
  9. package/dist/{chunk-DWAIT2OD.js → chunk-UE4ZIJYA.js} +74 -5
  10. package/dist/{chunk-DWAIT2OD.js.map → chunk-UE4ZIJYA.js.map} +1 -1
  11. package/dist/index.js +216 -7
  12. package/dist/index.js.map +1 -1
  13. package/dist/mcp/server.js +2 -2
  14. package/dist/workers/background-worker-cli.js +4 -3
  15. package/dist/workers/background-worker-cli.js.map +1 -1
  16. package/hooks/check-dependencies.sh +29 -0
  17. package/package.json +1 -1
  18. package/python/crawl_worker.py +6 -1
  19. package/src/cli/commands/crawl.test.ts +43 -3
  20. package/src/cli/commands/crawl.ts +3 -3
  21. package/src/cli/commands/sync.test.ts +54 -0
  22. package/src/cli/commands/sync.ts +264 -0
  23. package/src/cli/index.ts +1 -0
  24. package/src/crawl/claude-client.test.ts +195 -24
  25. package/src/crawl/claude-client.ts +38 -3
  26. package/src/crawl/intelligent-crawler.test.ts +65 -0
  27. package/src/crawl/intelligent-crawler.ts +14 -2
  28. package/src/index.ts +2 -0
  29. package/src/mcp/commands/index.ts +2 -0
  30. package/src/mcp/commands/sync.commands.test.ts +283 -0
  31. package/src/mcp/commands/sync.commands.ts +233 -0
  32. package/src/services/gitignore.service.test.ts +157 -0
  33. package/src/services/gitignore.service.ts +132 -0
  34. package/src/services/store-definition.service.test.ts +440 -0
  35. package/src/services/store-definition.service.ts +198 -0
  36. package/src/services/store.service.test.ts +279 -1
  37. package/src/services/store.service.ts +101 -4
  38. package/src/types/index.ts +18 -0
  39. package/src/types/store-definition.test.ts +492 -0
  40. package/src/types/store-definition.ts +129 -0
  41. package/src/workers/background-worker.ts +1 -1
  42. package/dist/chunk-CUHYSPRV.js.map +0 -1
  43. package/dist/chunk-MQE32YY6.js.map +0 -1
@@ -1,5 +1,5 @@
1
1
  {
2
2
  "name": "bluera-knowledge",
3
- "version": "0.9.43",
3
+ "version": "0.10.1",
4
4
  "description": "Clone repos, crawl docs, search locally. Fast, authoritative answers for AI coding agents."
5
5
  }
package/CHANGELOG.md CHANGED
@@ -2,6 +2,81 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file. See [commit-and-tag-version](https://github.com/absolute-version/commit-and-tag-version) for commit guidelines.
4
4
 
5
+ ## [0.10.1](https://github.com/blueraai/bluera-knowledge/compare/v0.10.0...v0.10.1) (2026-01-09)
6
+
7
+
8
+ ### Features
9
+
10
+ * **sync:** add git-committable store definitions with sync command ([5cfa925](https://github.com/blueraai/bluera-knowledge/commit/5cfa92580397f193fda75ea61197fb4c9d9d4b0a))
11
+
12
+
13
+ ### Bug Fixes
14
+
15
+ * **crawl:** handle Claude CLI structured_output wrapper in intelligent crawl ([54ea74b](https://github.com/blueraai/bluera-knowledge/commit/54ea74bca6d4b7263ef11a8290416e0d66b8d37f))
16
+
17
+ ## [0.10.0](https://github.com/blueraai/bluera-knowledge/compare/v0.9.32...v0.10.0) (2026-01-09)
18
+
19
+
20
+ ### Features
21
+
22
+ * **search:** add contextual/full detail display and use process.exitCode ([3205859](https://github.com/blueraai/bluera-knowledge/commit/32058590f6375b8564a255901333536183aa1bd2))
23
+ * **search:** add raw score exposure, confidence levels, and minRelevance filtering ([dc45e4d](https://github.com/blueraai/bluera-knowledge/commit/dc45e4d760c526ae5f0ad7912adea0528a61ff05))
24
+
25
+
26
+ ### Bug Fixes
27
+
28
+ * **bridge:** kill Python process before nullifying to prevent zombie ([393dab3](https://github.com/blueraai/bluera-knowledge/commit/393dab3e45c75fd87c9ecfc1ca92e67b14526e79))
29
+ * **bridge:** mock kill() emits exit event & attach rejection handlers before stop ([d73c6ca](https://github.com/blueraai/bluera-knowledge/commit/d73c6ca6d640c3d15bd82756cabcda832f9ae245))
30
+ * **bridge:** stop() now waits for process to actually exit ([a92de41](https://github.com/blueraai/bluera-knowledge/commit/a92de41c89318fc106f996568ed88505352d5159))
31
+ * **cli:** ensure destroyServices runs before process.exit ([22e4267](https://github.com/blueraai/bluera-knowledge/commit/22e4267b7b9f698de3985a89b9c2b10759cfd49c))
32
+ * **code-unit:** brace counting now handles strings and comments ([1e857bb](https://github.com/blueraai/bluera-knowledge/commit/1e857bb297f357b97a6c067950e62495b3c8fc99))
33
+ * **code-unit:** support complex return types in signature extraction ([3bd2467](https://github.com/blueraai/bluera-knowledge/commit/3bd24675a67e73cc74a0c718f4b5a9e86cd826fb))
34
+ * **crawl:** improve link discovery for modern documentation sites ([78e1c22](https://github.com/blueraai/bluera-knowledge/commit/78e1c22f9de59131b0ec880f1b5e50b13129d6c0))
35
+ * increase native cleanup delays to prevent mutex crashes ([43566ed](https://github.com/blueraai/bluera-knowledge/commit/43566edc301a5093b9bc2000293c7dc0c538b0f0))
36
+ * **job:** validate PID before process.kill to prevent process group kill ([67c540f](https://github.com/blueraai/bluera-knowledge/commit/67c540fef6f2c55c5dca2c824104a91fe19aeff1))
37
+ * **plugin:** remove redundant hooks reference ([58ee578](https://github.com/blueraai/bluera-knowledge/commit/58ee578a54ae246db68187c4dc06e0a6d2b6c843))
38
+ * **plugin:** use .mcp.json instead of inline mcpServers ([ae2e844](https://github.com/blueraai/bluera-knowledge/commit/ae2e844371e1387bc124f1d0f9aa295f70f23440))
39
+ * **scripts:** preserve test exit codes in piped commands ([865f491](https://github.com/blueraai/bluera-knowledge/commit/865f491858ef518fb74f3d7dfed269109cd62c72))
40
+ * **search:** apply threshold filtering after score normalization ([1ebc78e](https://github.com/blueraai/bluera-knowledge/commit/1ebc78e0e688ffde0fdbaf049f17a35d129ef055))
41
+ * **search:** enable FTS-only search mode ([4a0f371](https://github.com/blueraai/bluera-knowledge/commit/4a0f371f0c42f80bf87e28ae0e609ac95986964d))
42
+ * **services:** fail fast on corrupted config/registry files ([030f63c](https://github.com/blueraai/bluera-knowledge/commit/030f63c10b0a30bddcd8e9b27b291ab0f53263f1))
43
+ * **tests:** increase timeout for exit code test in CI ([a362dcd](https://github.com/blueraai/bluera-knowledge/commit/a362dcdae32b0c19e757270e5009b0c1c5ead4e4))
44
+ * **tests:** increase timeout for flaky store delete test ([738fb49](https://github.com/blueraai/bluera-knowledge/commit/738fb4975653703d800dee802730dedfdf9e85ba))
45
+ * **watch:** clear pending timeouts on unwatch to prevent timer leak ([4dcafc1](https://github.com/blueraai/bluera-knowledge/commit/4dcafc14417442f6eeed0257cf185e04ae9de12b))
46
+ * **worker:** fail fast on PID file write error ([d92ce42](https://github.com/blueraai/bluera-knowledge/commit/d92ce42eff63cee3c97056ef019f5a52ef699edd))
47
+ * **worker:** prevent division by zero and improve cancellation handling ([b7b40ab](https://github.com/blueraai/bluera-knowledge/commit/b7b40ab950b7ad0fbbe84af243be3138b1072a72))
48
+
49
+ ## [0.9.44](https://github.com/blueraai/bluera-knowledge/compare/v0.9.32...v0.9.44) (2026-01-09)
50
+
51
+
52
+ ### Features
53
+
54
+ * **search:** add contextual/full detail display and use process.exitCode ([3205859](https://github.com/blueraai/bluera-knowledge/commit/32058590f6375b8564a255901333536183aa1bd2))
55
+ * **search:** add raw score exposure, confidence levels, and minRelevance filtering ([dc45e4d](https://github.com/blueraai/bluera-knowledge/commit/dc45e4d760c526ae5f0ad7912adea0528a61ff05))
56
+
57
+
58
+ ### Bug Fixes
59
+
60
+ * **bridge:** kill Python process before nullifying to prevent zombie ([393dab3](https://github.com/blueraai/bluera-knowledge/commit/393dab3e45c75fd87c9ecfc1ca92e67b14526e79))
61
+ * **bridge:** mock kill() emits exit event & attach rejection handlers before stop ([d73c6ca](https://github.com/blueraai/bluera-knowledge/commit/d73c6ca6d640c3d15bd82756cabcda832f9ae245))
62
+ * **bridge:** stop() now waits for process to actually exit ([a92de41](https://github.com/blueraai/bluera-knowledge/commit/a92de41c89318fc106f996568ed88505352d5159))
63
+ * **cli:** ensure destroyServices runs before process.exit ([22e4267](https://github.com/blueraai/bluera-knowledge/commit/22e4267b7b9f698de3985a89b9c2b10759cfd49c))
64
+ * **code-unit:** brace counting now handles strings and comments ([1e857bb](https://github.com/blueraai/bluera-knowledge/commit/1e857bb297f357b97a6c067950e62495b3c8fc99))
65
+ * **code-unit:** support complex return types in signature extraction ([3bd2467](https://github.com/blueraai/bluera-knowledge/commit/3bd24675a67e73cc74a0c718f4b5a9e86cd826fb))
66
+ * increase native cleanup delays to prevent mutex crashes ([43566ed](https://github.com/blueraai/bluera-knowledge/commit/43566edc301a5093b9bc2000293c7dc0c538b0f0))
67
+ * **job:** validate PID before process.kill to prevent process group kill ([67c540f](https://github.com/blueraai/bluera-knowledge/commit/67c540fef6f2c55c5dca2c824104a91fe19aeff1))
68
+ * **plugin:** remove redundant hooks reference ([58ee578](https://github.com/blueraai/bluera-knowledge/commit/58ee578a54ae246db68187c4dc06e0a6d2b6c843))
69
+ * **plugin:** use .mcp.json instead of inline mcpServers ([ae2e844](https://github.com/blueraai/bluera-knowledge/commit/ae2e844371e1387bc124f1d0f9aa295f70f23440))
70
+ * **scripts:** preserve test exit codes in piped commands ([865f491](https://github.com/blueraai/bluera-knowledge/commit/865f491858ef518fb74f3d7dfed269109cd62c72))
71
+ * **search:** apply threshold filtering after score normalization ([1ebc78e](https://github.com/blueraai/bluera-knowledge/commit/1ebc78e0e688ffde0fdbaf049f17a35d129ef055))
72
+ * **search:** enable FTS-only search mode ([4a0f371](https://github.com/blueraai/bluera-knowledge/commit/4a0f371f0c42f80bf87e28ae0e609ac95986964d))
73
+ * **services:** fail fast on corrupted config/registry files ([030f63c](https://github.com/blueraai/bluera-knowledge/commit/030f63c10b0a30bddcd8e9b27b291ab0f53263f1))
74
+ * **tests:** increase timeout for exit code test in CI ([a362dcd](https://github.com/blueraai/bluera-knowledge/commit/a362dcdae32b0c19e757270e5009b0c1c5ead4e4))
75
+ * **tests:** increase timeout for flaky store delete test ([738fb49](https://github.com/blueraai/bluera-knowledge/commit/738fb4975653703d800dee802730dedfdf9e85ba))
76
+ * **watch:** clear pending timeouts on unwatch to prevent timer leak ([4dcafc1](https://github.com/blueraai/bluera-knowledge/commit/4dcafc14417442f6eeed0257cf185e04ae9de12b))
77
+ * **worker:** fail fast on PID file write error ([d92ce42](https://github.com/blueraai/bluera-knowledge/commit/d92ce42eff63cee3c97056ef019f5a52ef699edd))
78
+ * **worker:** prevent division by zero and improve cancellation handling ([b7b40ab](https://github.com/blueraai/bluera-knowledge/commit/b7b40ab950b7ad0fbbe84af243be3138b1072a72))
79
+
5
80
  ## [0.9.43](https://github.com/blueraai/bluera-knowledge/compare/v0.9.32...v0.9.43) (2026-01-09)
6
81
 
7
82
 
package/README.md CHANGED
@@ -429,6 +429,7 @@ Background jobs include significant performance optimizations:
429
429
  | 🔄 `/bluera-knowledge:index` | Re-index a store | `<store-name-or-id>` |
430
430
  | 🗑️ `/bluera-knowledge:remove-store` | Delete a store and all data | `<store-name-or-id>` |
431
431
  | 🌐 `/bluera-knowledge:crawl` | Crawl web pages | `<url> <store-name> [--crawl "<instruction>"]` |
432
+ | 🔁 `/bluera-knowledge:sync` | Sync stores from definitions config | `[--dry-run] [--prune]` |
432
433
 
433
434
  ---
434
435
 
@@ -734,7 +735,7 @@ Removed:
734
735
  - `--extract "<instruction>"` - Natural language instruction for what content to extract
735
736
  - `--simple` - Use simple BFS mode instead of intelligent crawling
736
737
  - `--max-pages <n>` - Maximum pages to crawl (default: 50)
737
- - `--headless` - Use headless browser for JavaScript-rendered sites (Next.js, React, Vue)
738
+ - `--fast` - Use fast axios-only mode (may fail on JavaScript-heavy sites)
738
739
 
739
740
  **⚙️ Requirements:**
740
741
  - 🐍 Python 3 with `crawl4ai` package installed
@@ -756,8 +757,11 @@ Removed:
756
757
  --crawl "standard library modules" \
757
758
  --extract "function signatures and examples"
758
759
 
759
- # JavaScript-rendered sites (Next.js, React, etc.)
760
- /bluera-knowledge:crawl https://nextjs.org/docs nextjs-docs --headless --max-pages 30
760
+ # JavaScript-rendered sites work by default (uses headless browser)
761
+ /bluera-knowledge:crawl https://nextjs.org/docs nextjs-docs --max-pages 30
762
+
763
+ # Fast mode for static HTML sites (axios-only, faster but may miss JS content)
764
+ /bluera-knowledge:crawl https://example.com/static static-docs --fast --max-pages 100
761
765
 
762
766
  # Simple BFS mode (no AI guidance)
763
767
  /bluera-knowledge:crawl https://example.com/docs docs --simple --max-pages 100
@@ -767,13 +771,51 @@ The crawler converts pages to markdown and indexes them for semantic search.
767
771
 
768
772
  ---
769
773
 
774
+ ### 🔁 `/bluera-knowledge:sync`
775
+
776
+ **Sync stores from definitions config (bootstrap on fresh clone)**
777
+
778
+ ```bash
779
+ /bluera-knowledge:sync [options]
780
+ ```
781
+
782
+ **Options:**
783
+ - `--dry-run` - Show what would happen without making changes
784
+ - `--prune` - Remove stores not in definitions
785
+ - `--reindex` - Re-index existing stores after sync
786
+
787
+ **Use cases:**
788
+ - **Fresh clone**: Recreate all stores defined by the team
789
+ - **Check status**: See which stores exist vs. defined
790
+ - **Clean up**: Remove orphan stores not in config
791
+
792
+ **Examples:**
793
+ ```bash
794
+ # Preview what would be synced
795
+ /bluera-knowledge:sync --dry-run
796
+
797
+ # Sync all stores from definitions
798
+ /bluera-knowledge:sync
799
+
800
+ # Sync and remove orphan stores
801
+ /bluera-knowledge:sync --prune
802
+ ```
803
+
804
+ **How it works:**
805
+ 1. Reads store definitions from `.bluera/bluera-knowledge/stores.config.json`
806
+ 2. Creates any stores that don't exist locally
807
+ 3. Reports orphan stores (local stores not in definitions)
808
+ 4. Optionally prunes orphans with `--prune`
809
+
810
+ ---
811
+
770
812
  ## 🕷️ Crawler Architecture
771
813
 
772
- The crawler supports two modes: **standard mode** for static sites (fast) and **headless mode** for JavaScript-rendered sites (powerful).
814
+ The crawler defaults to **headless mode** (Playwright) for maximum compatibility with modern JavaScript-rendered sites. Use `--fast` for static HTML sites when speed is critical.
773
815
 
774
- ### Standard Mode (Static Sites)
816
+ ### 🎭 Default Mode (Headless - JavaScript-Rendered Sites)
775
817
 
776
- For static HTML sites, the crawler uses axios for fast HTTP requests:
818
+ By default, the crawler uses Playwright via crawl4ai to render JavaScript content:
777
819
 
778
820
  ```mermaid
779
821
  sequenceDiagram
@@ -784,57 +826,55 @@ sequenceDiagram
784
826
  participant Claude
785
827
 
786
828
  User->>CLI: crawl URL --crawl "instruction"
787
- CLI->>IntelligentCrawler: crawl(url, options)
788
- IntelligentCrawler->>Axios: fetchHtml(url)
789
- Axios-->>IntelligentCrawler: Static HTML
829
+ CLI->>IntelligentCrawler: crawl(url, {useHeadless: true})
830
+ IntelligentCrawler->>PythonBridge: fetchHeadless(url)
831
+ PythonBridge->>crawl4ai: AsyncWebCrawler.arun(url)
832
+ crawl4ai->>Playwright: Launch browser & render JS
833
+ Playwright-->>crawl4ai: Rendered HTML
834
+ crawl4ai-->>PythonBridge: {html, markdown, links}
835
+ PythonBridge-->>IntelligentCrawler: Rendered HTML
790
836
  IntelligentCrawler->>Claude: determineCrawlUrls(html, instruction)
837
+ Note over Claude: Natural language instruction<br/>STILL FULLY ACTIVE
791
838
  Claude-->>IntelligentCrawler: [urls to crawl]
792
839
  loop For each URL
793
- IntelligentCrawler->>Axios: fetchHtml(url)
794
- Axios-->>IntelligentCrawler: HTML
840
+ IntelligentCrawler->>PythonBridge: fetchHeadless(url)
841
+ PythonBridge->>crawl4ai: Render JS
842
+ crawl4ai-->>PythonBridge: HTML
843
+ PythonBridge-->>IntelligentCrawler: HTML
795
844
  IntelligentCrawler->>IntelligentCrawler: Convert to markdown & index
796
845
  end
797
846
  ```
798
847
 
799
- ### 🎭 Headless Mode (JavaScript-Rendered Sites)
848
+ ### Fast Mode (Static Sites - `--fast`)
800
849
 
801
- For JavaScript-rendered sites (Next.js, React, Vue), use `--headless` to render content with Playwright:
850
+ For static HTML sites, use `--fast` for faster crawling with axios:
802
851
 
803
852
  ```mermaid
804
853
  sequenceDiagram
805
854
  participant User
806
855
  participant CLI
807
856
  participant IntelligentCrawler
808
- participant PythonBridge
809
- participant crawl4ai
810
- participant Playwright
857
+ participant Axios
811
858
  participant Claude
812
859
 
813
- User->>CLI: crawl URL --crawl "instruction" --headless
814
- CLI->>IntelligentCrawler: crawl(url, {useHeadless: true})
815
- IntelligentCrawler->>PythonBridge: fetchHeadless(url)
816
- PythonBridge->>crawl4ai: AsyncWebCrawler.arun(url)
817
- crawl4ai->>Playwright: Launch browser & render JS
818
- Playwright-->>crawl4ai: Rendered HTML
819
- crawl4ai-->>PythonBridge: {html, markdown, links}
820
- PythonBridge-->>IntelligentCrawler: Rendered HTML
860
+ User->>CLI: crawl URL --crawl "instruction" --fast
861
+ CLI->>IntelligentCrawler: crawl(url, {useHeadless: false})
862
+ IntelligentCrawler->>Axios: fetchHtml(url)
863
+ Axios-->>IntelligentCrawler: Static HTML
821
864
  IntelligentCrawler->>Claude: determineCrawlUrls(html, instruction)
822
- Note over Claude: Natural language instruction<br/>STILL FULLY ACTIVE
823
865
  Claude-->>IntelligentCrawler: [urls to crawl]
824
866
  loop For each URL
825
- IntelligentCrawler->>PythonBridge: fetchHeadless(url)
826
- PythonBridge->>crawl4ai: Render JS
827
- crawl4ai-->>PythonBridge: HTML
828
- PythonBridge-->>IntelligentCrawler: HTML
867
+ IntelligentCrawler->>Axios: fetchHtml(url)
868
+ Axios-->>IntelligentCrawler: HTML
829
869
  IntelligentCrawler->>IntelligentCrawler: Convert to markdown & index
830
870
  end
831
871
  ```
832
872
 
833
873
  ### 🔑 Key Points
834
874
 
835
- - **🧠 Intelligent crawling preserved** - Claude Code CLI analyzes pages and selects URLs based on natural language instructions in both modes
836
- - **🎭 crawl4ai role** - ONLY renders JavaScript to get HTML - doesn't replace Claude's intelligent URL selection
837
- - **⚡ Hybrid approach** - Fast axios for static sites, Playwright for JS-rendered sites
875
+ - **🎭 Default to headless** - Maximum compatibility with modern JavaScript-rendered sites (React, Vue, Next.js)
876
+ - **⚡ Fast mode available** - Use `--fast` for static HTML sites when speed is critical
877
+ - **🧠 Intelligent crawling preserved** - Claude Code CLI analyzes pages and selects URLs in both modes
838
878
  - **🔄 Automatic fallback** - If headless fetch fails, automatically falls back to axios
839
879
 
840
880
  ### 🤖 Intelligent Mode vs Simple Mode
@@ -1017,7 +1057,7 @@ Combine canonical library code with project-specific patterns:
1017
1057
  >
1018
1058
  > **The `--crawl` instruction isn't marketing**—it actually uses Claude Code CLI to analyze each page and intelligently select which links to follow. I can tell it "crawl all API reference pages but skip blog posts" and it understands the intent.
1019
1059
  >
1020
- > For JavaScript-rendered sites (Next.js, React docs), the `--headless` mode renders pages with Playwright while I still control the crawl strategy with natural language.
1060
+ > For JavaScript-rendered sites (Next.js, React docs), the default headless mode renders pages with Playwright while I still control the crawl strategy with natural language. Use `--fast` when you need speed on static sites.
1021
1061
  >
1022
1062
  > ---
1023
1063
  >
@@ -1056,22 +1096,22 @@ The plugin automatically checks for and attempts to install Python dependencies
1056
1096
 
1057
1097
  **Required:**
1058
1098
  - **🐍 Python 3.8+** - Required for web crawling functionality
1059
- - **🕷️ crawl4ai** - Required for web crawling (auto-installed via SessionStart hook, includes playwright)
1060
- - **🎭 Playwright browser binaries** - Required for `--headless` mode on JavaScript-rendered sites (**manual install required**)
1099
+ - **🕷️ crawl4ai** - Required for web crawling (auto-installed via SessionStart hook)
1100
+ - **🎭 Playwright browser binaries** - Required for default headless mode (auto-installed via SessionStart hook)
1061
1101
 
1062
1102
  **What the SessionStart hook installs:**
1063
1103
  - ✅ crawl4ai Python package (includes playwright as dependency)
1064
- - Playwright browser binaries (you must run `playwright install` manually)
1104
+ - Playwright Chromium browser binaries (auto-installed after crawl4ai)
1065
1105
 
1066
- If auto-installation of crawl4ai fails, install manually:
1106
+ If auto-installation fails, install manually:
1067
1107
 
1068
1108
  ```bash
1069
1109
  pip install crawl4ai
1070
- playwright install # Required for --headless mode (Next.js, React, Vue sites)
1110
+ playwright install chromium
1071
1111
  ```
1072
1112
 
1073
- > [!WARNING]
1074
- > The plugin will work without crawl4ai/playwright, but web crawling features (`/bluera-knowledge:crawl`) will be unavailable. For JavaScript-rendered sites (Next.js, React, Vue), use the `--headless` flag which requires playwright browser binaries.
1113
+ > [!NOTE]
1114
+ > The plugin will work without crawl4ai/playwright, but web crawling features (`/bluera-knowledge:crawl`) will be unavailable. The default mode uses headless browser for maximum compatibility with JavaScript-rendered sites. Use `--fast` for static sites when speed is critical.
1075
1115
 
1076
1116
  **Update Plugin:**
1077
1117
  ```bash
@@ -1163,6 +1203,7 @@ The plugin exposes 3 MCP tools optimized for minimal context overhead:
1163
1203
  | `store:create` | `name`, `type`, `source`, `branch?`, `description?` | Create a new store |
1164
1204
  | `store:index` | `store` | Re-index an existing store |
1165
1205
  | `store:delete` | `store` | Delete a store and all data |
1206
+ | `stores:sync` | `dryRun?`, `prune?`, `reindex?` | Sync stores from definitions config |
1166
1207
  | `jobs` | `activeOnly?`, `status?` | List background jobs |
1167
1208
  | `job:status` | `jobId` | Check specific job status |
1168
1209
  | `job:cancel` | `jobId` | Cancel a running job |
@@ -1350,11 +1391,42 @@ Knowledge stores are stored in your project root:
1350
1391
  │ ├── repos/<store-id>/ # Cloned Git repositories
1351
1392
  │ ├── documents_*.lance/ # Vector indices (Lance DB)
1352
1393
  │ └── stores.json # Store registry
1394
+ ├── stores.config.json # Store definitions (git-committable!)
1353
1395
  └── config.json # Configuration
1354
1396
  ```
1355
1397
 
1356
- > [!CAUTION]
1357
- > **Important**: Add `.bluera/` to your `.gitignore` to avoid committing large repositories and vector indices to version control.
1398
+ ### 📋 Store Definitions (Team Sharing)
1399
+
1400
+ Store definitions are automatically saved to `.bluera/bluera-knowledge/stores.config.json`. This file is designed to be **committed to git**, allowing teams to share store configurations.
1401
+
1402
+ **Example `stores.config.json`:**
1403
+ ```json
1404
+ {
1405
+ "version": 1,
1406
+ "stores": [
1407
+ { "type": "file", "name": "my-docs", "path": "./docs" },
1408
+ { "type": "repo", "name": "react", "url": "https://github.com/facebook/react" },
1409
+ { "type": "web", "name": "api-docs", "url": "https://api.example.com/docs", "depth": 2 }
1410
+ ]
1411
+ }
1412
+ ```
1413
+
1414
+ When a teammate clones the repo, they can run `/bluera-knowledge:sync` to recreate all stores locally.
1415
+
1416
+ ### 🚫 Recommended `.gitignore` Patterns
1417
+
1418
+ When you first create a store, the plugin automatically updates your `.gitignore` with:
1419
+
1420
+ ```gitignore
1421
+ # Bluera Knowledge - data directory (not committed)
1422
+ .bluera/
1423
+ !.bluera/bluera-knowledge/
1424
+ !.bluera/bluera-knowledge/stores.config.json
1425
+ ```
1426
+
1427
+ This ensures:
1428
+ - Vector indices and cloned repos are **NOT committed** (they're large and can be recreated)
1429
+ - Store definitions **ARE committed** (small JSON file for team sharing)
1358
1430
 
1359
1431
  ---
1360
1432
 
@@ -0,0 +1,96 @@
1
+ ---
2
+ description: Sync stores from definitions config (bootstrap on fresh clone)
3
+ allowed-tools: ["mcp__bluera-knowledge__execute"]
4
+ ---
5
+
6
+ # Sync Stores from Definitions
7
+
8
+ Sync stores from the git-committable definitions config. This is useful when:
9
+ - You've cloned a repo that has `.bluera/bluera-knowledge/stores.config.json`
10
+ - You want to recreate all stores defined by the team
11
+ - You want to check for orphan stores not in the config
12
+
13
+ ## Steps
14
+
15
+ 1. Use the mcp__bluera-knowledge__execute tool with command "stores:sync" to sync stores from definitions
16
+
17
+ Optional arguments:
18
+ - `dryRun: true` - Show what would happen without making changes
19
+ - `prune: true` - Remove stores not in definitions
20
+ - `reindex: true` - Re-index existing stores after sync
21
+
22
+ 2. Present results in a structured format:
23
+
24
+ ```
25
+ ## Sync Results
26
+
27
+ **Created**: 3 stores
28
+ - my-docs (file)
29
+ - react-source (repo)
30
+ - api-docs (web)
31
+
32
+ **Skipped** (already exist): 2 stores
33
+ - lodash
34
+ - typescript-docs
35
+
36
+ **Orphans** (not in definitions): 1 store
37
+ - old-unused-store
38
+
39
+ No errors occurred.
40
+ ```
41
+
42
+ ## Dry Run Mode
43
+
44
+ When using dry run, show what WOULD happen:
45
+
46
+ ```
47
+ ## Sync Preview (Dry Run)
48
+
49
+ **Would create**: 3 stores
50
+ - my-docs (file)
51
+ - react-source (repo)
52
+ - api-docs (web)
53
+
54
+ **Would skip** (already exist): 2 stores
55
+ - lodash
56
+ - typescript-docs
57
+
58
+ **Orphans** (not in definitions): 1 store
59
+ - old-unused-store
60
+
61
+ To apply these changes, run without --dry-run
62
+ ```
63
+
64
+ ## If No Definitions Found
65
+
66
+ If no store definitions config exists:
67
+
68
+ ```
69
+ ## No Store Definitions Found
70
+
71
+ The config file `.bluera/bluera-knowledge/stores.config.json` doesn't exist yet.
72
+
73
+ Store definitions are automatically created when you:
74
+ - Add a repo: `/bluera-knowledge:add-repo <url>`
75
+ - Add a folder: `/bluera-knowledge:add-folder <path>`
76
+ - Crawl a website: `/bluera-knowledge:crawl <url>`
77
+
78
+ The config file will be created automatically and can be committed to git for team sharing.
79
+ ```
80
+
81
+ ## Error Handling
82
+
83
+ If some stores fail to sync, report them individually:
84
+
85
+ ```
86
+ ## Sync Results
87
+
88
+ **Created**: 2 stores
89
+ - my-docs
90
+ - api-docs
91
+
92
+ **Failed**: 1 store
93
+ - react-source: Directory does not exist: /path/to/repo
94
+
95
+ Continue to resolve the errors manually.
96
+ ```
@@ -3,7 +3,7 @@ import {
3
3
  createLogger,
4
4
  summarizePayload,
5
5
  truncateForLog
6
- } from "./chunk-DWAIT2OD.js";
6
+ } from "./chunk-UE4ZIJYA.js";
7
7
 
8
8
  // src/crawl/intelligent-crawler.ts
9
9
  import { EventEmitter } from "events";
@@ -270,24 +270,30 @@ var ClaudeClient = class _ClaudeClient {
270
270
  /**
271
271
  * Determine which URLs to crawl based on natural language instruction
272
272
  *
273
+ * @param seedUrl - The URL of the seed page (for resolving relative URLs)
273
274
  * @param seedHtml - HTML content of the seed page
274
275
  * @param instruction - Natural language crawl instruction (e.g., "scrape all Getting Started pages")
275
276
  * @returns List of URLs to crawl with reasoning
276
277
  */
277
- async determineCrawlUrls(seedHtml, instruction) {
278
+ async determineCrawlUrls(seedUrl, seedHtml, instruction) {
278
279
  const prompt = `You are analyzing a webpage to determine which pages to crawl based on the user's instruction.
279
280
 
281
+ Base URL: ${seedUrl}
282
+
280
283
  Instruction: ${instruction}
281
284
 
282
285
  Webpage HTML (analyze the navigation structure, links, and content):
283
286
  ${this.truncateHtml(seedHtml, 5e4)}
284
287
 
285
- Based on the instruction, extract and return a list of absolute URLs that should be crawled. Look for navigation menus, sidebars, headers, and link structures that match the instruction.
288
+ Based on the instruction, extract and return a list of absolute URLs that should be crawled. When you encounter relative URLs (starting with "/" or without a protocol), resolve them against the Base URL. For example, if Base URL is "https://example.com/docs" and you see href="/docs/hooks", return "https://example.com/docs/hooks".
289
+
290
+ Look for navigation menus, sidebars, headers, and link structures that match the instruction.
286
291
 
287
292
  Return only URLs that are relevant to the instruction. If the instruction mentions specific sections (e.g., "Getting Started"), find links in those sections.`;
288
293
  try {
289
294
  const result = await this.callClaude(prompt, CRAWL_STRATEGY_SCHEMA);
290
- const parsed = JSON.parse(result);
295
+ const rawParsed = JSON.parse(result);
296
+ const parsed = this.extractStructuredOutput(rawParsed);
291
297
  if (typeof parsed !== "object" || parsed === null || !("urls" in parsed) || !("reasoning" in parsed) || !Array.isArray(parsed.urls) || parsed.urls.length === 0 || typeof parsed.reasoning !== "string" || !parsed.urls.every((url) => typeof url === "string")) {
292
298
  throw new Error("Claude returned invalid crawl strategy");
293
299
  }
@@ -393,6 +399,26 @@ ${this.truncateMarkdown(markdown, 1e5)}`;
393
399
 
394
400
  [... content truncated ...]`;
395
401
  }
402
+ /**
403
+ * Type guard to check if value is a record (plain object)
404
+ */
405
+ isRecord(value) {
406
+ return typeof value === "object" && value !== null && !Array.isArray(value);
407
+ }
408
+ /**
409
+ * Extract structured_output from Claude CLI wrapper format if present.
410
+ * Claude CLI with --json-schema returns: {type, result, structured_output: {...}}
411
+ * This method extracts the inner structured_output, or returns the raw value if not wrapped.
412
+ */
413
+ extractStructuredOutput(rawParsed) {
414
+ if (this.isRecord(rawParsed) && "structured_output" in rawParsed) {
415
+ const structuredOutput = rawParsed["structured_output"];
416
+ if (typeof structuredOutput === "object") {
417
+ return structuredOutput;
418
+ }
419
+ }
420
+ return rawParsed;
421
+ }
396
422
  };
397
423
 
398
424
  // src/crawl/intelligent-crawler.ts
@@ -450,6 +476,16 @@ var IntelligentCrawler = class extends EventEmitter {
450
476
  },
451
477
  "Crawl complete"
452
478
  );
479
+ if (this.visited.size === 1 && maxPages > 1) {
480
+ const warningProgress = {
481
+ type: "error",
482
+ pagesVisited: this.visited.size,
483
+ totalPages: maxPages,
484
+ message: `Warning: Only crawled 1 page despite maxPages=${String(maxPages)}. Link discovery may have failed. If using --fast mode, try without it for JavaScript-heavy sites.`,
485
+ error: new Error("Low page discovery")
486
+ };
487
+ this.emit("progress", warningProgress);
488
+ }
453
489
  const completeProgress = {
454
490
  type: "complete",
455
491
  pagesVisited: this.visited.size,
@@ -484,7 +520,7 @@ var IntelligentCrawler = class extends EventEmitter {
484
520
  };
485
521
  this.emit("progress", strategyStartProgress);
486
522
  const seedHtml = await this.fetchHtml(seedUrl, useHeadless);
487
- strategy = await this.claudeClient.determineCrawlUrls(seedHtml, crawlInstruction);
523
+ strategy = await this.claudeClient.determineCrawlUrls(seedUrl, seedHtml, crawlInstruction);
488
524
  const strategyCompleteProgress = {
489
525
  type: "strategy",
490
526
  pagesVisited: 0,
@@ -765,4 +801,4 @@ var IntelligentCrawler = class extends EventEmitter {
765
801
  export {
766
802
  IntelligentCrawler
767
803
  };
768
- //# sourceMappingURL=chunk-MQE32YY6.js.map
804
+ //# sourceMappingURL=chunk-6U45VP5Z.js.map