bluera-knowledge 0.9.43 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/CHANGELOG.md +75 -0
- package/README.md +114 -42
- package/commands/sync.md +96 -0
- package/dist/{chunk-MQE32YY6.js → chunk-6U45VP5Z.js} +42 -6
- package/dist/chunk-6U45VP5Z.js.map +1 -0
- package/dist/{chunk-CUHYSPRV.js → chunk-DP5XBPQV.js} +372 -2
- package/dist/chunk-DP5XBPQV.js.map +1 -0
- package/dist/{chunk-DWAIT2OD.js → chunk-UE4ZIJYA.js} +74 -5
- package/dist/{chunk-DWAIT2OD.js.map → chunk-UE4ZIJYA.js.map} +1 -1
- package/dist/index.js +216 -7
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +4 -3
- package/dist/workers/background-worker-cli.js.map +1 -1
- package/hooks/check-dependencies.sh +29 -0
- package/package.json +1 -1
- package/python/crawl_worker.py +6 -1
- package/src/cli/commands/crawl.test.ts +43 -3
- package/src/cli/commands/crawl.ts +3 -3
- package/src/cli/commands/sync.test.ts +54 -0
- package/src/cli/commands/sync.ts +264 -0
- package/src/cli/index.ts +1 -0
- package/src/crawl/claude-client.test.ts +195 -24
- package/src/crawl/claude-client.ts +38 -3
- package/src/crawl/intelligent-crawler.test.ts +65 -0
- package/src/crawl/intelligent-crawler.ts +14 -2
- package/src/index.ts +2 -0
- package/src/mcp/commands/index.ts +2 -0
- package/src/mcp/commands/sync.commands.test.ts +283 -0
- package/src/mcp/commands/sync.commands.ts +233 -0
- package/src/services/gitignore.service.test.ts +157 -0
- package/src/services/gitignore.service.ts +132 -0
- package/src/services/store-definition.service.test.ts +440 -0
- package/src/services/store-definition.service.ts +198 -0
- package/src/services/store.service.test.ts +279 -1
- package/src/services/store.service.ts +101 -4
- package/src/types/index.ts +18 -0
- package/src/types/store-definition.test.ts +492 -0
- package/src/types/store-definition.ts +129 -0
- package/src/workers/background-worker.ts +1 -1
- package/dist/chunk-CUHYSPRV.js.map +0 -1
- package/dist/chunk-MQE32YY6.js.map +0 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,81 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file. See [commit-and-tag-version](https://github.com/absolute-version/commit-and-tag-version) for commit guidelines.
|
|
4
4
|
|
|
5
|
+
## [0.10.1](https://github.com/blueraai/bluera-knowledge/compare/v0.10.0...v0.10.1) (2026-01-09)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
### Features
|
|
9
|
+
|
|
10
|
+
* **sync:** add git-committable store definitions with sync command ([5cfa925](https://github.com/blueraai/bluera-knowledge/commit/5cfa92580397f193fda75ea61197fb4c9d9d4b0a))
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
### Bug Fixes
|
|
14
|
+
|
|
15
|
+
* **crawl:** handle Claude CLI structured_output wrapper in intelligent crawl ([54ea74b](https://github.com/blueraai/bluera-knowledge/commit/54ea74bca6d4b7263ef11a8290416e0d66b8d37f))
|
|
16
|
+
|
|
17
|
+
## [0.10.0](https://github.com/blueraai/bluera-knowledge/compare/v0.9.32...v0.10.0) (2026-01-09)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
### Features
|
|
21
|
+
|
|
22
|
+
* **search:** add contextual/full detail display and use process.exitCode ([3205859](https://github.com/blueraai/bluera-knowledge/commit/32058590f6375b8564a255901333536183aa1bd2))
|
|
23
|
+
* **search:** add raw score exposure, confidence levels, and minRelevance filtering ([dc45e4d](https://github.com/blueraai/bluera-knowledge/commit/dc45e4d760c526ae5f0ad7912adea0528a61ff05))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
### Bug Fixes
|
|
27
|
+
|
|
28
|
+
* **bridge:** kill Python process before nullifying to prevent zombie ([393dab3](https://github.com/blueraai/bluera-knowledge/commit/393dab3e45c75fd87c9ecfc1ca92e67b14526e79))
|
|
29
|
+
* **bridge:** mock kill() emits exit event & attach rejection handlers before stop ([d73c6ca](https://github.com/blueraai/bluera-knowledge/commit/d73c6ca6d640c3d15bd82756cabcda832f9ae245))
|
|
30
|
+
* **bridge:** stop() now waits for process to actually exit ([a92de41](https://github.com/blueraai/bluera-knowledge/commit/a92de41c89318fc106f996568ed88505352d5159))
|
|
31
|
+
* **cli:** ensure destroyServices runs before process.exit ([22e4267](https://github.com/blueraai/bluera-knowledge/commit/22e4267b7b9f698de3985a89b9c2b10759cfd49c))
|
|
32
|
+
* **code-unit:** brace counting now handles strings and comments ([1e857bb](https://github.com/blueraai/bluera-knowledge/commit/1e857bb297f357b97a6c067950e62495b3c8fc99))
|
|
33
|
+
* **code-unit:** support complex return types in signature extraction ([3bd2467](https://github.com/blueraai/bluera-knowledge/commit/3bd24675a67e73cc74a0c718f4b5a9e86cd826fb))
|
|
34
|
+
* **crawl:** improve link discovery for modern documentation sites ([78e1c22](https://github.com/blueraai/bluera-knowledge/commit/78e1c22f9de59131b0ec880f1b5e50b13129d6c0))
|
|
35
|
+
* increase native cleanup delays to prevent mutex crashes ([43566ed](https://github.com/blueraai/bluera-knowledge/commit/43566edc301a5093b9bc2000293c7dc0c538b0f0))
|
|
36
|
+
* **job:** validate PID before process.kill to prevent process group kill ([67c540f](https://github.com/blueraai/bluera-knowledge/commit/67c540fef6f2c55c5dca2c824104a91fe19aeff1))
|
|
37
|
+
* **plugin:** remove redundant hooks reference ([58ee578](https://github.com/blueraai/bluera-knowledge/commit/58ee578a54ae246db68187c4dc06e0a6d2b6c843))
|
|
38
|
+
* **plugin:** use .mcp.json instead of inline mcpServers ([ae2e844](https://github.com/blueraai/bluera-knowledge/commit/ae2e844371e1387bc124f1d0f9aa295f70f23440))
|
|
39
|
+
* **scripts:** preserve test exit codes in piped commands ([865f491](https://github.com/blueraai/bluera-knowledge/commit/865f491858ef518fb74f3d7dfed269109cd62c72))
|
|
40
|
+
* **search:** apply threshold filtering after score normalization ([1ebc78e](https://github.com/blueraai/bluera-knowledge/commit/1ebc78e0e688ffde0fdbaf049f17a35d129ef055))
|
|
41
|
+
* **search:** enable FTS-only search mode ([4a0f371](https://github.com/blueraai/bluera-knowledge/commit/4a0f371f0c42f80bf87e28ae0e609ac95986964d))
|
|
42
|
+
* **services:** fail fast on corrupted config/registry files ([030f63c](https://github.com/blueraai/bluera-knowledge/commit/030f63c10b0a30bddcd8e9b27b291ab0f53263f1))
|
|
43
|
+
* **tests:** increase timeout for exit code test in CI ([a362dcd](https://github.com/blueraai/bluera-knowledge/commit/a362dcdae32b0c19e757270e5009b0c1c5ead4e4))
|
|
44
|
+
* **tests:** increase timeout for flaky store delete test ([738fb49](https://github.com/blueraai/bluera-knowledge/commit/738fb4975653703d800dee802730dedfdf9e85ba))
|
|
45
|
+
* **watch:** clear pending timeouts on unwatch to prevent timer leak ([4dcafc1](https://github.com/blueraai/bluera-knowledge/commit/4dcafc14417442f6eeed0257cf185e04ae9de12b))
|
|
46
|
+
* **worker:** fail fast on PID file write error ([d92ce42](https://github.com/blueraai/bluera-knowledge/commit/d92ce42eff63cee3c97056ef019f5a52ef699edd))
|
|
47
|
+
* **worker:** prevent division by zero and improve cancellation handling ([b7b40ab](https://github.com/blueraai/bluera-knowledge/commit/b7b40ab950b7ad0fbbe84af243be3138b1072a72))
|
|
48
|
+
|
|
49
|
+
## [0.9.44](https://github.com/blueraai/bluera-knowledge/compare/v0.9.32...v0.9.44) (2026-01-09)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
### Features
|
|
53
|
+
|
|
54
|
+
* **search:** add contextual/full detail display and use process.exitCode ([3205859](https://github.com/blueraai/bluera-knowledge/commit/32058590f6375b8564a255901333536183aa1bd2))
|
|
55
|
+
* **search:** add raw score exposure, confidence levels, and minRelevance filtering ([dc45e4d](https://github.com/blueraai/bluera-knowledge/commit/dc45e4d760c526ae5f0ad7912adea0528a61ff05))
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
### Bug Fixes
|
|
59
|
+
|
|
60
|
+
* **bridge:** kill Python process before nullifying to prevent zombie ([393dab3](https://github.com/blueraai/bluera-knowledge/commit/393dab3e45c75fd87c9ecfc1ca92e67b14526e79))
|
|
61
|
+
* **bridge:** mock kill() emits exit event & attach rejection handlers before stop ([d73c6ca](https://github.com/blueraai/bluera-knowledge/commit/d73c6ca6d640c3d15bd82756cabcda832f9ae245))
|
|
62
|
+
* **bridge:** stop() now waits for process to actually exit ([a92de41](https://github.com/blueraai/bluera-knowledge/commit/a92de41c89318fc106f996568ed88505352d5159))
|
|
63
|
+
* **cli:** ensure destroyServices runs before process.exit ([22e4267](https://github.com/blueraai/bluera-knowledge/commit/22e4267b7b9f698de3985a89b9c2b10759cfd49c))
|
|
64
|
+
* **code-unit:** brace counting now handles strings and comments ([1e857bb](https://github.com/blueraai/bluera-knowledge/commit/1e857bb297f357b97a6c067950e62495b3c8fc99))
|
|
65
|
+
* **code-unit:** support complex return types in signature extraction ([3bd2467](https://github.com/blueraai/bluera-knowledge/commit/3bd24675a67e73cc74a0c718f4b5a9e86cd826fb))
|
|
66
|
+
* increase native cleanup delays to prevent mutex crashes ([43566ed](https://github.com/blueraai/bluera-knowledge/commit/43566edc301a5093b9bc2000293c7dc0c538b0f0))
|
|
67
|
+
* **job:** validate PID before process.kill to prevent process group kill ([67c540f](https://github.com/blueraai/bluera-knowledge/commit/67c540fef6f2c55c5dca2c824104a91fe19aeff1))
|
|
68
|
+
* **plugin:** remove redundant hooks reference ([58ee578](https://github.com/blueraai/bluera-knowledge/commit/58ee578a54ae246db68187c4dc06e0a6d2b6c843))
|
|
69
|
+
* **plugin:** use .mcp.json instead of inline mcpServers ([ae2e844](https://github.com/blueraai/bluera-knowledge/commit/ae2e844371e1387bc124f1d0f9aa295f70f23440))
|
|
70
|
+
* **scripts:** preserve test exit codes in piped commands ([865f491](https://github.com/blueraai/bluera-knowledge/commit/865f491858ef518fb74f3d7dfed269109cd62c72))
|
|
71
|
+
* **search:** apply threshold filtering after score normalization ([1ebc78e](https://github.com/blueraai/bluera-knowledge/commit/1ebc78e0e688ffde0fdbaf049f17a35d129ef055))
|
|
72
|
+
* **search:** enable FTS-only search mode ([4a0f371](https://github.com/blueraai/bluera-knowledge/commit/4a0f371f0c42f80bf87e28ae0e609ac95986964d))
|
|
73
|
+
* **services:** fail fast on corrupted config/registry files ([030f63c](https://github.com/blueraai/bluera-knowledge/commit/030f63c10b0a30bddcd8e9b27b291ab0f53263f1))
|
|
74
|
+
* **tests:** increase timeout for exit code test in CI ([a362dcd](https://github.com/blueraai/bluera-knowledge/commit/a362dcdae32b0c19e757270e5009b0c1c5ead4e4))
|
|
75
|
+
* **tests:** increase timeout for flaky store delete test ([738fb49](https://github.com/blueraai/bluera-knowledge/commit/738fb4975653703d800dee802730dedfdf9e85ba))
|
|
76
|
+
* **watch:** clear pending timeouts on unwatch to prevent timer leak ([4dcafc1](https://github.com/blueraai/bluera-knowledge/commit/4dcafc14417442f6eeed0257cf185e04ae9de12b))
|
|
77
|
+
* **worker:** fail fast on PID file write error ([d92ce42](https://github.com/blueraai/bluera-knowledge/commit/d92ce42eff63cee3c97056ef019f5a52ef699edd))
|
|
78
|
+
* **worker:** prevent division by zero and improve cancellation handling ([b7b40ab](https://github.com/blueraai/bluera-knowledge/commit/b7b40ab950b7ad0fbbe84af243be3138b1072a72))
|
|
79
|
+
|
|
5
80
|
## [0.9.43](https://github.com/blueraai/bluera-knowledge/compare/v0.9.32...v0.9.43) (2026-01-09)
|
|
6
81
|
|
|
7
82
|
|
package/README.md
CHANGED
|
@@ -429,6 +429,7 @@ Background jobs include significant performance optimizations:
|
|
|
429
429
|
| 🔄 `/bluera-knowledge:index` | Re-index a store | `<store-name-or-id>` |
|
|
430
430
|
| 🗑️ `/bluera-knowledge:remove-store` | Delete a store and all data | `<store-name-or-id>` |
|
|
431
431
|
| 🌐 `/bluera-knowledge:crawl` | Crawl web pages | `<url> <store-name> [--crawl "<instruction>"]` |
|
|
432
|
+
| 🔁 `/bluera-knowledge:sync` | Sync stores from definitions config | `[--dry-run] [--prune]` |
|
|
432
433
|
|
|
433
434
|
---
|
|
434
435
|
|
|
@@ -734,7 +735,7 @@ Removed:
|
|
|
734
735
|
- `--extract "<instruction>"` - Natural language instruction for what content to extract
|
|
735
736
|
- `--simple` - Use simple BFS mode instead of intelligent crawling
|
|
736
737
|
- `--max-pages <n>` - Maximum pages to crawl (default: 50)
|
|
737
|
-
- `--
|
|
738
|
+
- `--fast` - Use fast axios-only mode (may fail on JavaScript-heavy sites)
|
|
738
739
|
|
|
739
740
|
**⚙️ Requirements:**
|
|
740
741
|
- 🐍 Python 3 with `crawl4ai` package installed
|
|
@@ -756,8 +757,11 @@ Removed:
|
|
|
756
757
|
--crawl "standard library modules" \
|
|
757
758
|
--extract "function signatures and examples"
|
|
758
759
|
|
|
759
|
-
# JavaScript-rendered sites (
|
|
760
|
-
/bluera-knowledge:crawl https://nextjs.org/docs nextjs-docs --
|
|
760
|
+
# JavaScript-rendered sites work by default (uses headless browser)
|
|
761
|
+
/bluera-knowledge:crawl https://nextjs.org/docs nextjs-docs --max-pages 30
|
|
762
|
+
|
|
763
|
+
# Fast mode for static HTML sites (axios-only, faster but may miss JS content)
|
|
764
|
+
/bluera-knowledge:crawl https://example.com/static static-docs --fast --max-pages 100
|
|
761
765
|
|
|
762
766
|
# Simple BFS mode (no AI guidance)
|
|
763
767
|
/bluera-knowledge:crawl https://example.com/docs docs --simple --max-pages 100
|
|
@@ -767,13 +771,51 @@ The crawler converts pages to markdown and indexes them for semantic search.
|
|
|
767
771
|
|
|
768
772
|
---
|
|
769
773
|
|
|
774
|
+
### 🔁 `/bluera-knowledge:sync`
|
|
775
|
+
|
|
776
|
+
**Sync stores from definitions config (bootstrap on fresh clone)**
|
|
777
|
+
|
|
778
|
+
```bash
|
|
779
|
+
/bluera-knowledge:sync [options]
|
|
780
|
+
```
|
|
781
|
+
|
|
782
|
+
**Options:**
|
|
783
|
+
- `--dry-run` - Show what would happen without making changes
|
|
784
|
+
- `--prune` - Remove stores not in definitions
|
|
785
|
+
- `--reindex` - Re-index existing stores after sync
|
|
786
|
+
|
|
787
|
+
**Use cases:**
|
|
788
|
+
- **Fresh clone**: Recreate all stores defined by the team
|
|
789
|
+
- **Check status**: See which stores exist vs. defined
|
|
790
|
+
- **Clean up**: Remove orphan stores not in config
|
|
791
|
+
|
|
792
|
+
**Examples:**
|
|
793
|
+
```bash
|
|
794
|
+
# Preview what would be synced
|
|
795
|
+
/bluera-knowledge:sync --dry-run
|
|
796
|
+
|
|
797
|
+
# Sync all stores from definitions
|
|
798
|
+
/bluera-knowledge:sync
|
|
799
|
+
|
|
800
|
+
# Sync and remove orphan stores
|
|
801
|
+
/bluera-knowledge:sync --prune
|
|
802
|
+
```
|
|
803
|
+
|
|
804
|
+
**How it works:**
|
|
805
|
+
1. Reads store definitions from `.bluera/bluera-knowledge/stores.config.json`
|
|
806
|
+
2. Creates any stores that don't exist locally
|
|
807
|
+
3. Reports orphan stores (local stores not in definitions)
|
|
808
|
+
4. Optionally prunes orphans with `--prune`
|
|
809
|
+
|
|
810
|
+
---
|
|
811
|
+
|
|
770
812
|
## 🕷️ Crawler Architecture
|
|
771
813
|
|
|
772
|
-
The crawler
|
|
814
|
+
The crawler defaults to **headless mode** (Playwright) for maximum compatibility with modern JavaScript-rendered sites. Use `--fast` for static HTML sites when speed is critical.
|
|
773
815
|
|
|
774
|
-
###
|
|
816
|
+
### 🎭 Default Mode (Headless - JavaScript-Rendered Sites)
|
|
775
817
|
|
|
776
|
-
|
|
818
|
+
By default, the crawler uses Playwright via crawl4ai to render JavaScript content:
|
|
777
819
|
|
|
778
820
|
```mermaid
|
|
779
821
|
sequenceDiagram
|
|
@@ -784,57 +826,55 @@ sequenceDiagram
|
|
|
784
826
|
participant Claude
|
|
785
827
|
|
|
786
828
|
User->>CLI: crawl URL --crawl "instruction"
|
|
787
|
-
CLI->>IntelligentCrawler: crawl(url,
|
|
788
|
-
IntelligentCrawler->>
|
|
789
|
-
|
|
829
|
+
CLI->>IntelligentCrawler: crawl(url, {useHeadless: true})
|
|
830
|
+
IntelligentCrawler->>PythonBridge: fetchHeadless(url)
|
|
831
|
+
PythonBridge->>crawl4ai: AsyncWebCrawler.arun(url)
|
|
832
|
+
crawl4ai->>Playwright: Launch browser & render JS
|
|
833
|
+
Playwright-->>crawl4ai: Rendered HTML
|
|
834
|
+
crawl4ai-->>PythonBridge: {html, markdown, links}
|
|
835
|
+
PythonBridge-->>IntelligentCrawler: Rendered HTML
|
|
790
836
|
IntelligentCrawler->>Claude: determineCrawlUrls(html, instruction)
|
|
837
|
+
Note over Claude: Natural language instruction<br/>STILL FULLY ACTIVE
|
|
791
838
|
Claude-->>IntelligentCrawler: [urls to crawl]
|
|
792
839
|
loop For each URL
|
|
793
|
-
IntelligentCrawler->>
|
|
794
|
-
|
|
840
|
+
IntelligentCrawler->>PythonBridge: fetchHeadless(url)
|
|
841
|
+
PythonBridge->>crawl4ai: Render JS
|
|
842
|
+
crawl4ai-->>PythonBridge: HTML
|
|
843
|
+
PythonBridge-->>IntelligentCrawler: HTML
|
|
795
844
|
IntelligentCrawler->>IntelligentCrawler: Convert to markdown & index
|
|
796
845
|
end
|
|
797
846
|
```
|
|
798
847
|
|
|
799
|
-
###
|
|
848
|
+
### ⚡ Fast Mode (Static Sites - `--fast`)
|
|
800
849
|
|
|
801
|
-
For
|
|
850
|
+
For static HTML sites, use `--fast` for faster crawling with axios:
|
|
802
851
|
|
|
803
852
|
```mermaid
|
|
804
853
|
sequenceDiagram
|
|
805
854
|
participant User
|
|
806
855
|
participant CLI
|
|
807
856
|
participant IntelligentCrawler
|
|
808
|
-
participant
|
|
809
|
-
participant crawl4ai
|
|
810
|
-
participant Playwright
|
|
857
|
+
participant Axios
|
|
811
858
|
participant Claude
|
|
812
859
|
|
|
813
|
-
User->>CLI: crawl URL --crawl "instruction" --
|
|
814
|
-
CLI->>IntelligentCrawler: crawl(url, {useHeadless:
|
|
815
|
-
IntelligentCrawler->>
|
|
816
|
-
|
|
817
|
-
crawl4ai->>Playwright: Launch browser & render JS
|
|
818
|
-
Playwright-->>crawl4ai: Rendered HTML
|
|
819
|
-
crawl4ai-->>PythonBridge: {html, markdown, links}
|
|
820
|
-
PythonBridge-->>IntelligentCrawler: Rendered HTML
|
|
860
|
+
User->>CLI: crawl URL --crawl "instruction" --fast
|
|
861
|
+
CLI->>IntelligentCrawler: crawl(url, {useHeadless: false})
|
|
862
|
+
IntelligentCrawler->>Axios: fetchHtml(url)
|
|
863
|
+
Axios-->>IntelligentCrawler: Static HTML
|
|
821
864
|
IntelligentCrawler->>Claude: determineCrawlUrls(html, instruction)
|
|
822
|
-
Note over Claude: Natural language instruction<br/>STILL FULLY ACTIVE
|
|
823
865
|
Claude-->>IntelligentCrawler: [urls to crawl]
|
|
824
866
|
loop For each URL
|
|
825
|
-
IntelligentCrawler->>
|
|
826
|
-
|
|
827
|
-
crawl4ai-->>PythonBridge: HTML
|
|
828
|
-
PythonBridge-->>IntelligentCrawler: HTML
|
|
867
|
+
IntelligentCrawler->>Axios: fetchHtml(url)
|
|
868
|
+
Axios-->>IntelligentCrawler: HTML
|
|
829
869
|
IntelligentCrawler->>IntelligentCrawler: Convert to markdown & index
|
|
830
870
|
end
|
|
831
871
|
```
|
|
832
872
|
|
|
833
873
|
### 🔑 Key Points
|
|
834
874
|
|
|
835
|
-
-
|
|
836
|
-
-
|
|
837
|
-
-
|
|
875
|
+
- **🎭 Default to headless** - Maximum compatibility with modern JavaScript-rendered sites (React, Vue, Next.js)
|
|
876
|
+
- **⚡ Fast mode available** - Use `--fast` for static HTML sites when speed is critical
|
|
877
|
+
- **🧠 Intelligent crawling preserved** - Claude Code CLI analyzes pages and selects URLs in both modes
|
|
838
878
|
- **🔄 Automatic fallback** - If headless fetch fails, automatically falls back to axios
|
|
839
879
|
|
|
840
880
|
### 🤖 Intelligent Mode vs Simple Mode
|
|
@@ -1017,7 +1057,7 @@ Combine canonical library code with project-specific patterns:
|
|
|
1017
1057
|
>
|
|
1018
1058
|
> **The `--crawl` instruction isn't marketing**—it actually uses Claude Code CLI to analyze each page and intelligently select which links to follow. I can tell it "crawl all API reference pages but skip blog posts" and it understands the intent.
|
|
1019
1059
|
>
|
|
1020
|
-
> For JavaScript-rendered sites (Next.js, React docs), the
|
|
1060
|
+
> For JavaScript-rendered sites (Next.js, React docs), the default headless mode renders pages with Playwright while I still control the crawl strategy with natural language. Use `--fast` when you need speed on static sites.
|
|
1021
1061
|
>
|
|
1022
1062
|
> ---
|
|
1023
1063
|
>
|
|
@@ -1056,22 +1096,22 @@ The plugin automatically checks for and attempts to install Python dependencies
|
|
|
1056
1096
|
|
|
1057
1097
|
**Required:**
|
|
1058
1098
|
- **🐍 Python 3.8+** - Required for web crawling functionality
|
|
1059
|
-
- **🕷️ crawl4ai** - Required for web crawling (auto-installed via SessionStart hook
|
|
1060
|
-
- **🎭 Playwright browser binaries** - Required for
|
|
1099
|
+
- **🕷️ crawl4ai** - Required for web crawling (auto-installed via SessionStart hook)
|
|
1100
|
+
- **🎭 Playwright browser binaries** - Required for default headless mode (auto-installed via SessionStart hook)
|
|
1061
1101
|
|
|
1062
1102
|
**What the SessionStart hook installs:**
|
|
1063
1103
|
- ✅ crawl4ai Python package (includes playwright as dependency)
|
|
1064
|
-
-
|
|
1104
|
+
- ✅ Playwright Chromium browser binaries (auto-installed after crawl4ai)
|
|
1065
1105
|
|
|
1066
|
-
If auto-installation
|
|
1106
|
+
If auto-installation fails, install manually:
|
|
1067
1107
|
|
|
1068
1108
|
```bash
|
|
1069
1109
|
pip install crawl4ai
|
|
1070
|
-
playwright install
|
|
1110
|
+
playwright install chromium
|
|
1071
1111
|
```
|
|
1072
1112
|
|
|
1073
|
-
> [!
|
|
1074
|
-
> The plugin will work without crawl4ai/playwright, but web crawling features (`/bluera-knowledge:crawl`) will be unavailable.
|
|
1113
|
+
> [!NOTE]
|
|
1114
|
+
> The plugin will work without crawl4ai/playwright, but web crawling features (`/bluera-knowledge:crawl`) will be unavailable. The default mode uses headless browser for maximum compatibility with JavaScript-rendered sites. Use `--fast` for static sites when speed is critical.
|
|
1075
1115
|
|
|
1076
1116
|
**Update Plugin:**
|
|
1077
1117
|
```bash
|
|
@@ -1163,6 +1203,7 @@ The plugin exposes 3 MCP tools optimized for minimal context overhead:
|
|
|
1163
1203
|
| `store:create` | `name`, `type`, `source`, `branch?`, `description?` | Create a new store |
|
|
1164
1204
|
| `store:index` | `store` | Re-index an existing store |
|
|
1165
1205
|
| `store:delete` | `store` | Delete a store and all data |
|
|
1206
|
+
| `stores:sync` | `dryRun?`, `prune?`, `reindex?` | Sync stores from definitions config |
|
|
1166
1207
|
| `jobs` | `activeOnly?`, `status?` | List background jobs |
|
|
1167
1208
|
| `job:status` | `jobId` | Check specific job status |
|
|
1168
1209
|
| `job:cancel` | `jobId` | Cancel a running job |
|
|
@@ -1350,11 +1391,42 @@ Knowledge stores are stored in your project root:
|
|
|
1350
1391
|
│ ├── repos/<store-id>/ # Cloned Git repositories
|
|
1351
1392
|
│ ├── documents_*.lance/ # Vector indices (Lance DB)
|
|
1352
1393
|
│ └── stores.json # Store registry
|
|
1394
|
+
├── stores.config.json # Store definitions (git-committable!)
|
|
1353
1395
|
└── config.json # Configuration
|
|
1354
1396
|
```
|
|
1355
1397
|
|
|
1356
|
-
|
|
1357
|
-
|
|
1398
|
+
### 📋 Store Definitions (Team Sharing)
|
|
1399
|
+
|
|
1400
|
+
Store definitions are automatically saved to `.bluera/bluera-knowledge/stores.config.json`. This file is designed to be **committed to git**, allowing teams to share store configurations.
|
|
1401
|
+
|
|
1402
|
+
**Example `stores.config.json`:**
|
|
1403
|
+
```json
|
|
1404
|
+
{
|
|
1405
|
+
"version": 1,
|
|
1406
|
+
"stores": [
|
|
1407
|
+
{ "type": "file", "name": "my-docs", "path": "./docs" },
|
|
1408
|
+
{ "type": "repo", "name": "react", "url": "https://github.com/facebook/react" },
|
|
1409
|
+
{ "type": "web", "name": "api-docs", "url": "https://api.example.com/docs", "depth": 2 }
|
|
1410
|
+
]
|
|
1411
|
+
}
|
|
1412
|
+
```
|
|
1413
|
+
|
|
1414
|
+
When a teammate clones the repo, they can run `/bluera-knowledge:sync` to recreate all stores locally.
|
|
1415
|
+
|
|
1416
|
+
### 🚫 Recommended `.gitignore` Patterns
|
|
1417
|
+
|
|
1418
|
+
When you first create a store, the plugin automatically updates your `.gitignore` with:
|
|
1419
|
+
|
|
1420
|
+
```gitignore
|
|
1421
|
+
# Bluera Knowledge - data directory (not committed)
|
|
1422
|
+
.bluera/
|
|
1423
|
+
!.bluera/bluera-knowledge/
|
|
1424
|
+
!.bluera/bluera-knowledge/stores.config.json
|
|
1425
|
+
```
|
|
1426
|
+
|
|
1427
|
+
This ensures:
|
|
1428
|
+
- Vector indices and cloned repos are **NOT committed** (they're large and can be recreated)
|
|
1429
|
+
- Store definitions **ARE committed** (small JSON file for team sharing)
|
|
1358
1430
|
|
|
1359
1431
|
---
|
|
1360
1432
|
|
package/commands/sync.md
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Sync stores from definitions config (bootstrap on fresh clone)
|
|
3
|
+
allowed-tools: ["mcp__bluera-knowledge__execute"]
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Sync Stores from Definitions
|
|
7
|
+
|
|
8
|
+
Sync stores from the git-committable definitions config. This is useful when:
|
|
9
|
+
- You've cloned a repo that has `.bluera/bluera-knowledge/stores.config.json`
|
|
10
|
+
- You want to recreate all stores defined by the team
|
|
11
|
+
- You want to check for orphan stores not in the config
|
|
12
|
+
|
|
13
|
+
## Steps
|
|
14
|
+
|
|
15
|
+
1. Use the mcp__bluera-knowledge__execute tool with command "stores:sync" to sync stores from definitions
|
|
16
|
+
|
|
17
|
+
Optional arguments:
|
|
18
|
+
- `dryRun: true` - Show what would happen without making changes
|
|
19
|
+
- `prune: true` - Remove stores not in definitions
|
|
20
|
+
- `reindex: true` - Re-index existing stores after sync
|
|
21
|
+
|
|
22
|
+
2. Present results in a structured format:
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
## Sync Results
|
|
26
|
+
|
|
27
|
+
**Created**: 3 stores
|
|
28
|
+
- my-docs (file)
|
|
29
|
+
- react-source (repo)
|
|
30
|
+
- api-docs (web)
|
|
31
|
+
|
|
32
|
+
**Skipped** (already exist): 2 stores
|
|
33
|
+
- lodash
|
|
34
|
+
- typescript-docs
|
|
35
|
+
|
|
36
|
+
**Orphans** (not in definitions): 1 store
|
|
37
|
+
- old-unused-store
|
|
38
|
+
|
|
39
|
+
No errors occurred.
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Dry Run Mode
|
|
43
|
+
|
|
44
|
+
When using dry run, show what WOULD happen:
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
## Sync Preview (Dry Run)
|
|
48
|
+
|
|
49
|
+
**Would create**: 3 stores
|
|
50
|
+
- my-docs (file)
|
|
51
|
+
- react-source (repo)
|
|
52
|
+
- api-docs (web)
|
|
53
|
+
|
|
54
|
+
**Would skip** (already exist): 2 stores
|
|
55
|
+
- lodash
|
|
56
|
+
- typescript-docs
|
|
57
|
+
|
|
58
|
+
**Orphans** (not in definitions): 1 store
|
|
59
|
+
- old-unused-store
|
|
60
|
+
|
|
61
|
+
To apply these changes, run without --dry-run
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## If No Definitions Found
|
|
65
|
+
|
|
66
|
+
If no store definitions config exists:
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
## No Store Definitions Found
|
|
70
|
+
|
|
71
|
+
The config file `.bluera/bluera-knowledge/stores.config.json` doesn't exist yet.
|
|
72
|
+
|
|
73
|
+
Store definitions are automatically created when you:
|
|
74
|
+
- Add a repo: `/bluera-knowledge:add-repo <url>`
|
|
75
|
+
- Add a folder: `/bluera-knowledge:add-folder <path>`
|
|
76
|
+
- Crawl a website: `/bluera-knowledge:crawl <url>`
|
|
77
|
+
|
|
78
|
+
The config file will be created automatically and can be committed to git for team sharing.
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Error Handling
|
|
82
|
+
|
|
83
|
+
If some stores fail to sync, report them individually:
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
## Sync Results
|
|
87
|
+
|
|
88
|
+
**Created**: 2 stores
|
|
89
|
+
- my-docs
|
|
90
|
+
- api-docs
|
|
91
|
+
|
|
92
|
+
**Failed**: 1 store
|
|
93
|
+
- react-source: Directory does not exist: /path/to/repo
|
|
94
|
+
|
|
95
|
+
Resolve the failed stores manually, then re-run the sync.
|
|
96
|
+
```
|
|
@@ -3,7 +3,7 @@ import {
|
|
|
3
3
|
createLogger,
|
|
4
4
|
summarizePayload,
|
|
5
5
|
truncateForLog
|
|
6
|
-
} from "./chunk-
|
|
6
|
+
} from "./chunk-UE4ZIJYA.js";
|
|
7
7
|
|
|
8
8
|
// src/crawl/intelligent-crawler.ts
|
|
9
9
|
import { EventEmitter } from "events";
|
|
@@ -270,24 +270,30 @@ var ClaudeClient = class _ClaudeClient {
|
|
|
270
270
|
/**
|
|
271
271
|
* Determine which URLs to crawl based on natural language instruction
|
|
272
272
|
*
|
|
273
|
+
* @param seedUrl - The URL of the seed page (for resolving relative URLs)
|
|
273
274
|
* @param seedHtml - HTML content of the seed page
|
|
274
275
|
* @param instruction - Natural language crawl instruction (e.g., "scrape all Getting Started pages")
|
|
275
276
|
* @returns List of URLs to crawl with reasoning
|
|
276
277
|
*/
|
|
277
|
-
async determineCrawlUrls(seedHtml, instruction) {
|
|
278
|
+
async determineCrawlUrls(seedUrl, seedHtml, instruction) {
|
|
278
279
|
const prompt = `You are analyzing a webpage to determine which pages to crawl based on the user's instruction.
|
|
279
280
|
|
|
281
|
+
Base URL: ${seedUrl}
|
|
282
|
+
|
|
280
283
|
Instruction: ${instruction}
|
|
281
284
|
|
|
282
285
|
Webpage HTML (analyze the navigation structure, links, and content):
|
|
283
286
|
${this.truncateHtml(seedHtml, 5e4)}
|
|
284
287
|
|
|
285
|
-
Based on the instruction, extract and return a list of absolute URLs that should be crawled.
|
|
288
|
+
Based on the instruction, extract and return a list of absolute URLs that should be crawled. When you encounter relative URLs (starting with "/" or without a protocol), resolve them against the Base URL. For example, if Base URL is "https://example.com/docs" and you see href="/docs/hooks", return "https://example.com/docs/hooks".
|
|
289
|
+
|
|
290
|
+
Look for navigation menus, sidebars, headers, and link structures that match the instruction.
|
|
286
291
|
|
|
287
292
|
Return only URLs that are relevant to the instruction. If the instruction mentions specific sections (e.g., "Getting Started"), find links in those sections.`;
|
|
288
293
|
try {
|
|
289
294
|
const result = await this.callClaude(prompt, CRAWL_STRATEGY_SCHEMA);
|
|
290
|
-
const
|
|
295
|
+
const rawParsed = JSON.parse(result);
|
|
296
|
+
const parsed = this.extractStructuredOutput(rawParsed);
|
|
291
297
|
if (typeof parsed !== "object" || parsed === null || !("urls" in parsed) || !("reasoning" in parsed) || !Array.isArray(parsed.urls) || parsed.urls.length === 0 || typeof parsed.reasoning !== "string" || !parsed.urls.every((url) => typeof url === "string")) {
|
|
292
298
|
throw new Error("Claude returned invalid crawl strategy");
|
|
293
299
|
}
|
|
@@ -393,6 +399,26 @@ ${this.truncateMarkdown(markdown, 1e5)}`;
|
|
|
393
399
|
|
|
394
400
|
[... content truncated ...]`;
|
|
395
401
|
}
|
|
402
|
+
/**
|
|
403
|
+
* Type guard to check if value is a record (plain object)
|
|
404
|
+
*/
|
|
405
|
+
isRecord(value) {
|
|
406
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
407
|
+
}
|
|
408
|
+
/**
|
|
409
|
+
* Extract structured_output from Claude CLI wrapper format if present.
|
|
410
|
+
* Claude CLI with --json-schema returns: {type, result, structured_output: {...}}
|
|
411
|
+
* This method extracts the inner structured_output, or returns the raw value if not wrapped.
|
|
412
|
+
*/
|
|
413
|
+
extractStructuredOutput(rawParsed) {
|
|
414
|
+
if (this.isRecord(rawParsed) && "structured_output" in rawParsed) {
|
|
415
|
+
const structuredOutput = rawParsed["structured_output"];
|
|
416
|
+
if (typeof structuredOutput === "object") {
|
|
417
|
+
return structuredOutput;
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
return rawParsed;
|
|
421
|
+
}
|
|
396
422
|
};
|
|
397
423
|
|
|
398
424
|
// src/crawl/intelligent-crawler.ts
|
|
@@ -450,6 +476,16 @@ var IntelligentCrawler = class extends EventEmitter {
|
|
|
450
476
|
},
|
|
451
477
|
"Crawl complete"
|
|
452
478
|
);
|
|
479
|
+
if (this.visited.size === 1 && maxPages > 1) {
|
|
480
|
+
const warningProgress = {
|
|
481
|
+
type: "error",
|
|
482
|
+
pagesVisited: this.visited.size,
|
|
483
|
+
totalPages: maxPages,
|
|
484
|
+
message: `Warning: Only crawled 1 page despite maxPages=${String(maxPages)}. Link discovery may have failed. If using --fast mode, try without it for JavaScript-heavy sites.`,
|
|
485
|
+
error: new Error("Low page discovery")
|
|
486
|
+
};
|
|
487
|
+
this.emit("progress", warningProgress);
|
|
488
|
+
}
|
|
453
489
|
const completeProgress = {
|
|
454
490
|
type: "complete",
|
|
455
491
|
pagesVisited: this.visited.size,
|
|
@@ -484,7 +520,7 @@ var IntelligentCrawler = class extends EventEmitter {
|
|
|
484
520
|
};
|
|
485
521
|
this.emit("progress", strategyStartProgress);
|
|
486
522
|
const seedHtml = await this.fetchHtml(seedUrl, useHeadless);
|
|
487
|
-
strategy = await this.claudeClient.determineCrawlUrls(seedHtml, crawlInstruction);
|
|
523
|
+
strategy = await this.claudeClient.determineCrawlUrls(seedUrl, seedHtml, crawlInstruction);
|
|
488
524
|
const strategyCompleteProgress = {
|
|
489
525
|
type: "strategy",
|
|
490
526
|
pagesVisited: 0,
|
|
@@ -765,4 +801,4 @@ var IntelligentCrawler = class extends EventEmitter {
|
|
|
765
801
|
export {
|
|
766
802
|
IntelligentCrawler
|
|
767
803
|
};
|
|
768
|
-
//# sourceMappingURL=chunk-
|
|
804
|
+
//# sourceMappingURL=chunk-6U45VP5Z.js.map
|