rag-forge-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rag_forge_core-0.1.0/.gitignore +62 -0
- rag_forge_core-0.1.0/PKG-INFO +72 -0
- rag_forge_core-0.1.0/README.md +33 -0
- rag_forge_core-0.1.0/pyproject.toml +49 -0
- rag_forge_core-0.1.0/src/rag_forge_core/__init__.py +3 -0
- rag_forge_core-0.1.0/src/rag_forge_core/chunking/__init__.py +7 -0
- rag_forge_core-0.1.0/src/rag_forge_core/chunking/base.py +53 -0
- rag_forge_core-0.1.0/src/rag_forge_core/chunking/config.py +50 -0
- rag_forge_core-0.1.0/src/rag_forge_core/chunking/factory.py +71 -0
- rag_forge_core-0.1.0/src/rag_forge_core/chunking/fixed.py +87 -0
- rag_forge_core-0.1.0/src/rag_forge_core/chunking/llm_driven.py +163 -0
- rag_forge_core-0.1.0/src/rag_forge_core/chunking/recursive.py +128 -0
- rag_forge_core-0.1.0/src/rag_forge_core/chunking/semantic.py +143 -0
- rag_forge_core-0.1.0/src/rag_forge_core/chunking/structural.py +128 -0
- rag_forge_core-0.1.0/src/rag_forge_core/cli.py +664 -0
- rag_forge_core-0.1.0/src/rag_forge_core/context/__init__.py +23 -0
- rag_forge_core-0.1.0/src/rag_forge_core/context/cache_store.py +99 -0
- rag_forge_core-0.1.0/src/rag_forge_core/context/enricher.py +94 -0
- rag_forge_core-0.1.0/src/rag_forge_core/context/manager.py +45 -0
- rag_forge_core-0.1.0/src/rag_forge_core/context/semantic_cache.py +129 -0
- rag_forge_core-0.1.0/src/rag_forge_core/embedding/__init__.py +6 -0
- rag_forge_core-0.1.0/src/rag_forge_core/embedding/base.py +20 -0
- rag_forge_core-0.1.0/src/rag_forge_core/embedding/local_embedder.py +38 -0
- rag_forge_core-0.1.0/src/rag_forge_core/embedding/mock_embedder.py +33 -0
- rag_forge_core-0.1.0/src/rag_forge_core/embedding/openai_embedder.py +60 -0
- rag_forge_core-0.1.0/src/rag_forge_core/generation/__init__.py +6 -0
- rag_forge_core-0.1.0/src/rag_forge_core/generation/base.py +9 -0
- rag_forge_core-0.1.0/src/rag_forge_core/generation/claude_generator.py +37 -0
- rag_forge_core-0.1.0/src/rag_forge_core/generation/mock_generator.py +18 -0
- rag_forge_core-0.1.0/src/rag_forge_core/generation/openai_generator.py +38 -0
- rag_forge_core-0.1.0/src/rag_forge_core/ingestion/__init__.py +1 -0
- rag_forge_core-0.1.0/src/rag_forge_core/ingestion/pipeline.py +182 -0
- rag_forge_core-0.1.0/src/rag_forge_core/n8n_export.py +88 -0
- rag_forge_core-0.1.0/src/rag_forge_core/parsing/__init__.py +28 -0
- rag_forge_core-0.1.0/src/rag_forge_core/parsing/base.py +23 -0
- rag_forge_core-0.1.0/src/rag_forge_core/parsing/directory.py +67 -0
- rag_forge_core-0.1.0/src/rag_forge_core/parsing/html.py +56 -0
- rag_forge_core-0.1.0/src/rag_forge_core/parsing/markdown.py +74 -0
- rag_forge_core-0.1.0/src/rag_forge_core/parsing/pdf.py +41 -0
- rag_forge_core-0.1.0/src/rag_forge_core/parsing/plaintext.py +31 -0
- rag_forge_core-0.1.0/src/rag_forge_core/plugins/__init__.py +5 -0
- rag_forge_core-0.1.0/src/rag_forge_core/plugins/registry.py +100 -0
- rag_forge_core-0.1.0/src/rag_forge_core/query/__init__.py +5 -0
- rag_forge_core-0.1.0/src/rag_forge_core/query/agentic.py +162 -0
- rag_forge_core-0.1.0/src/rag_forge_core/query/engine.py +166 -0
- rag_forge_core-0.1.0/src/rag_forge_core/retrieval/__init__.py +28 -0
- rag_forge_core-0.1.0/src/rag_forge_core/retrieval/base.py +28 -0
- rag_forge_core-0.1.0/src/rag_forge_core/retrieval/config.py +43 -0
- rag_forge_core-0.1.0/src/rag_forge_core/retrieval/dense.py +46 -0
- rag_forge_core-0.1.0/src/rag_forge_core/retrieval/hybrid.py +86 -0
- rag_forge_core-0.1.0/src/rag_forge_core/retrieval/reranker.py +131 -0
- rag_forge_core-0.1.0/src/rag_forge_core/retrieval/sparse.py +143 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/__init__.py +50 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/adversarial.py +112 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/adversarial_corpus.json +47 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/citations.py +43 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/faithfulness.py +71 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/injection.py +90 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/input_guard.py +85 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/output_guard.py +102 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/pii.py +88 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/pii_scanner.py +46 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/rate_limiter.py +94 -0
- rag_forge_core-0.1.0/src/rag_forge_core/security/staleness.py +62 -0
- rag_forge_core-0.1.0/src/rag_forge_core/storage/__init__.py +6 -0
- rag_forge_core-0.1.0/src/rag_forge_core/storage/base.py +43 -0
- rag_forge_core-0.1.0/src/rag_forge_core/storage/qdrant.py +109 -0
- rag_forge_core-0.1.0/tests/conftest.py +1 -0
- rag_forge_core-0.1.0/tests/test_adversarial.py +61 -0
- rag_forge_core-0.1.0/tests/test_agentic_query.py +101 -0
- rag_forge_core-0.1.0/tests/test_cache_store.py +74 -0
- rag_forge_core-0.1.0/tests/test_cached_query.py +68 -0
- rag_forge_core-0.1.0/tests/test_chunker_factory.py +56 -0
- rag_forge_core-0.1.0/tests/test_chunking.py +80 -0
- rag_forge_core-0.1.0/tests/test_citations.py +42 -0
- rag_forge_core-0.1.0/tests/test_dense_retriever.py +74 -0
- rag_forge_core-0.1.0/tests/test_embedding.py +33 -0
- rag_forge_core-0.1.0/tests/test_enricher.py +112 -0
- rag_forge_core-0.1.0/tests/test_faithfulness.py +55 -0
- rag_forge_core-0.1.0/tests/test_fixed_chunker.py +62 -0
- rag_forge_core-0.1.0/tests/test_get_by_id.py +42 -0
- rag_forge_core-0.1.0/tests/test_hybrid_pipeline_integration.py +168 -0
- rag_forge_core-0.1.0/tests/test_hybrid_retriever.py +125 -0
- rag_forge_core-0.1.0/tests/test_injection.py +93 -0
- rag_forge_core-0.1.0/tests/test_input_guard.py +77 -0
- rag_forge_core-0.1.0/tests/test_instrumented_pipeline.py +96 -0
- rag_forge_core-0.1.0/tests/test_instrumented_query.py +79 -0
- rag_forge_core-0.1.0/tests/test_llm_driven_chunker.py +85 -0
- rag_forge_core-0.1.0/tests/test_n8n_export.py +37 -0
- rag_forge_core-0.1.0/tests/test_output_guard.py +75 -0
- rag_forge_core-0.1.0/tests/test_parse_chunk_cli.py +41 -0
- rag_forge_core-0.1.0/tests/test_parsing.py +135 -0
- rag_forge_core-0.1.0/tests/test_pii.py +76 -0
- rag_forge_core-0.1.0/tests/test_pii_scanner.py +50 -0
- rag_forge_core-0.1.0/tests/test_pipeline_integration.py +66 -0
- rag_forge_core-0.1.0/tests/test_plugin_registry.py +76 -0
- rag_forge_core-0.1.0/tests/test_query.py +83 -0
- rag_forge_core-0.1.0/tests/test_rate_limiter.py +74 -0
- rag_forge_core-0.1.0/tests/test_reranker.py +46 -0
- rag_forge_core-0.1.0/tests/test_retrieval_config.py +93 -0
- rag_forge_core-0.1.0/tests/test_security_integration.py +101 -0
- rag_forge_core-0.1.0/tests/test_semantic_cache.py +116 -0
- rag_forge_core-0.1.0/tests/test_semantic_chunker.py +93 -0
- rag_forge_core-0.1.0/tests/test_sparse_retriever.py +106 -0
- rag_forge_core-0.1.0/tests/test_staleness.py +66 -0
- rag_forge_core-0.1.0/tests/test_storage.py +48 -0
- rag_forge_core-0.1.0/tests/test_structural_chunker.py +68 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Dependencies
|
|
2
|
+
node_modules/
|
|
3
|
+
.pnpm-store/
|
|
4
|
+
|
|
5
|
+
# Build outputs
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
*.tsbuildinfo
|
|
9
|
+
|
|
10
|
+
# Turborepo
|
|
11
|
+
.turbo/
|
|
12
|
+
|
|
13
|
+
# Python
|
|
14
|
+
__pycache__/
|
|
15
|
+
*.py[cod]
|
|
16
|
+
*$py.class
|
|
17
|
+
*.egg-info/
|
|
18
|
+
*.egg
|
|
19
|
+
.venv/
|
|
20
|
+
.python-version-local
|
|
21
|
+
|
|
22
|
+
# Python tools
|
|
23
|
+
.mypy_cache/
|
|
24
|
+
.ruff_cache/
|
|
25
|
+
.pytest_cache/
|
|
26
|
+
htmlcov/
|
|
27
|
+
.coverage
|
|
28
|
+
.coverage.*
|
|
29
|
+
|
|
30
|
+
# Environment variables
|
|
31
|
+
.env
|
|
32
|
+
.env.local
|
|
33
|
+
.env.*.local
|
|
34
|
+
|
|
35
|
+
# IDE
|
|
36
|
+
.vscode/
|
|
37
|
+
.idea/
|
|
38
|
+
*.swp
|
|
39
|
+
*.swo
|
|
40
|
+
*~
|
|
41
|
+
|
|
42
|
+
# OS
|
|
43
|
+
.DS_Store
|
|
44
|
+
Thumbs.db
|
|
45
|
+
desktop.ini
|
|
46
|
+
|
|
47
|
+
# Test & coverage
|
|
48
|
+
coverage/
|
|
49
|
+
*.lcov
|
|
50
|
+
|
|
51
|
+
# Logs
|
|
52
|
+
*.log
|
|
53
|
+
npm-debug.log*
|
|
54
|
+
pnpm-debug.log*
|
|
55
|
+
|
|
56
|
+
.claude/
|
|
57
|
+
|
|
58
|
+
# Next.js
|
|
59
|
+
apps/*/.next
|
|
60
|
+
apps/*/out
|
|
61
|
+
apps/*/next-env.d.ts
|
|
62
|
+
.vercel
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rag-forge-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: RAG pipeline primitives: ingestion, retrieval, context management, and security
|
|
5
|
+
Project-URL: Homepage, https://github.com/hallengray/rag-forge
|
|
6
|
+
Project-URL: Repository, https://github.com/hallengray/rag-forge
|
|
7
|
+
Project-URL: Issues, https://github.com/hallengray/rag-forge/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/hallengray/rag-forge#readme
|
|
9
|
+
Author: Femi Adedayo
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
Keywords: chunking,embedding,llm,pipeline,rag,retrieval-augmented-generation
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
21
|
+
Requires-Dist: bm25s>=0.2
|
|
22
|
+
Requires-Dist: lxml>=5.0
|
|
23
|
+
Requires-Dist: openai>=1.30
|
|
24
|
+
Requires-Dist: opentelemetry-api>=1.20
|
|
25
|
+
Requires-Dist: pydantic>=2.0
|
|
26
|
+
Requires-Dist: pymupdf>=1.24
|
|
27
|
+
Requires-Dist: qdrant-client>=1.9
|
|
28
|
+
Requires-Dist: rich>=13.0
|
|
29
|
+
Requires-Dist: tiktoken>=0.7
|
|
30
|
+
Provides-Extra: cohere
|
|
31
|
+
Requires-Dist: cohere>=5.0; extra == 'cohere'
|
|
32
|
+
Provides-Extra: local
|
|
33
|
+
Requires-Dist: sentence-transformers>=3.0; extra == 'local'
|
|
34
|
+
Provides-Extra: presidio
|
|
35
|
+
Requires-Dist: presidio-analyzer>=2.2; extra == 'presidio'
|
|
36
|
+
Provides-Extra: redis
|
|
37
|
+
Requires-Dist: redis>=5.0; extra == 'redis'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# rag-forge-core
|
|
41
|
+
|
|
42
|
+
RAG pipeline primitives for the RAG-Forge toolkit: ingestion, chunking, retrieval, context management, and security.
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install rag-forge-core
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
|
|
52
|
+
This package provides the building blocks used by the `rag-forge` CLI. For end-user usage, see the [main RAG-Forge documentation](https://github.com/hallengray/rag-forge#readme).
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from rag_forge_core.chunking.factory import create_chunker
|
|
56
|
+
from rag_forge_core.chunking.config import ChunkConfig
|
|
57
|
+
|
|
58
|
+
chunker = create_chunker(ChunkConfig(strategy="recursive", chunk_size=512))
|
|
59
|
+
chunks = chunker.chunk("Some long document text...", source="doc.md")
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Modules
|
|
63
|
+
|
|
64
|
+
- `rag_forge_core.chunking` — Five chunking strategies (recursive, fixed, semantic, structural, llm-driven)
|
|
65
|
+
- `rag_forge_core.retrieval` — Dense, sparse, and hybrid retrieval with reranking
|
|
66
|
+
- `rag_forge_core.security` — InputGuard, OutputGuard, PII scanning, prompt injection detection
|
|
67
|
+
- `rag_forge_core.context` — Contextual enrichment and semantic caching
|
|
68
|
+
- `rag_forge_core.plugins` — Plugin registry for custom extensions
|
|
69
|
+
|
|
70
|
+
## License
|
|
71
|
+
|
|
72
|
+
MIT
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# rag-forge-core
|
|
2
|
+
|
|
3
|
+
RAG pipeline primitives for the RAG-Forge toolkit: ingestion, chunking, retrieval, context management, and security.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install rag-forge-core
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
This package provides the building blocks used by the `rag-forge` CLI. For end-user usage, see the [main RAG-Forge documentation](https://github.com/hallengray/rag-forge#readme).
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from rag_forge_core.chunking.factory import create_chunker
|
|
17
|
+
from rag_forge_core.chunking.config import ChunkConfig
|
|
18
|
+
|
|
19
|
+
chunker = create_chunker(ChunkConfig(strategy="recursive", chunk_size=512))
|
|
20
|
+
chunks = chunker.chunk("Some long document text...", source="doc.md")
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Modules
|
|
24
|
+
|
|
25
|
+
- `rag_forge_core.chunking` — Five chunking strategies (recursive, fixed, semantic, structural, llm-driven)
|
|
26
|
+
- `rag_forge_core.retrieval` — Dense, sparse, and hybrid retrieval with reranking
|
|
27
|
+
- `rag_forge_core.security` — InputGuard, OutputGuard, PII scanning, prompt injection detection
|
|
28
|
+
- `rag_forge_core.context` — Contextual enrichment and semantic caching
|
|
29
|
+
- `rag_forge_core.plugins` — Plugin registry for custom extensions
|
|
30
|
+
|
|
31
|
+
## License
|
|
32
|
+
|
|
33
|
+
MIT
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "rag-forge-core"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "RAG pipeline primitives: ingestion, retrieval, context management, and security"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
authors = [{ name = "Femi Adedayo" }]
|
|
8
|
+
keywords = ["rag", "retrieval-augmented-generation", "llm", "pipeline", "chunking", "embedding"]
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 3 - Alpha",
|
|
11
|
+
"Intended Audience :: Developers",
|
|
12
|
+
"License :: OSI Approved :: MIT License",
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Programming Language :: Python :: 3.11",
|
|
15
|
+
"Programming Language :: Python :: 3.12",
|
|
16
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
17
|
+
]
|
|
18
|
+
readme = "README.md"
|
|
19
|
+
dependencies = [
|
|
20
|
+
"pydantic>=2.0",
|
|
21
|
+
"rich>=13.0",
|
|
22
|
+
"tiktoken>=0.7",
|
|
23
|
+
"pymupdf>=1.24",
|
|
24
|
+
"beautifulsoup4>=4.12",
|
|
25
|
+
"lxml>=5.0",
|
|
26
|
+
"openai>=1.30",
|
|
27
|
+
"qdrant-client>=1.9",
|
|
28
|
+
"bm25s>=0.2",
|
|
29
|
+
"opentelemetry-api>=1.20",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://github.com/hallengray/rag-forge"
|
|
34
|
+
Repository = "https://github.com/hallengray/rag-forge"
|
|
35
|
+
Issues = "https://github.com/hallengray/rag-forge/issues"
|
|
36
|
+
Documentation = "https://github.com/hallengray/rag-forge#readme"
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
local = ["sentence-transformers>=3.0"]
|
|
40
|
+
cohere = ["cohere>=5.0"]
|
|
41
|
+
presidio = ["presidio-analyzer>=2.2"]
|
|
42
|
+
redis = ["redis>=5.0"]
|
|
43
|
+
|
|
44
|
+
[build-system]
|
|
45
|
+
requires = ["hatchling"]
|
|
46
|
+
build-backend = "hatchling.build"
|
|
47
|
+
|
|
48
|
+
[tool.hatch.build.targets.wheel]
|
|
49
|
+
packages = ["src/rag_forge_core"]
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Chunking strategies for document splitting."""
|
|
2
|
+
|
|
3
|
+
from rag_forge_core.chunking.base import ChunkStrategy
|
|
4
|
+
from rag_forge_core.chunking.config import ChunkConfig
|
|
5
|
+
from rag_forge_core.chunking.factory import UnsupportedStrategyError, create_chunker
|
|
6
|
+
|
|
7
|
+
__all__ = ["ChunkConfig", "ChunkStrategy", "UnsupportedStrategyError", "create_chunker"]
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Abstract base class for all chunking strategies."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
from rag_forge_core.chunking.config import ChunkConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class Chunk:
    """A single chunk of text with metadata.

    The common record type produced by every ChunkStrategy implementation,
    regardless of which splitting algorithm was used.
    """

    text: str  # raw text content of this chunk
    chunk_index: int  # 0-based position of the chunk within its source document
    source_document: str  # identifier of the document the chunk came from
    strategy_used: str  # name of the strategy that produced it (e.g. "fixed", "llm-driven")
    parent_section: str | None = None  # enclosing section, when the strategy tracks one
    overlap_tokens: int = 0  # tokens shared with the previous chunk (0 for the first)
    metadata: dict[str, str | int | float] | None = None  # optional extra key/value data
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class ChunkStats:
    """Statistics about a chunking operation.

    Sizes are measured in tokens; the concrete strategies compute them with
    tiktoken's cl100k_base encoding.
    """

    total_chunks: int  # number of chunks produced
    avg_chunk_size: int  # mean tokens per chunk (integer division)
    min_chunk_size: int  # token count of the smallest chunk
    max_chunk_size: int  # token count of the largest chunk
    total_tokens: int  # sum of token counts across all chunks
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ChunkStrategy(ABC):
    """Abstract base class that all chunking strategies must implement.

    Ensures strategies are interchangeable and the evaluation engine
    can compare performance across strategies on the same dataset.

    Subclasses implement ``chunk``, ``preview``, and ``stats``; the shared
    ``ChunkConfig`` passed at construction carries the tuning knobs
    (chunk_size, overlap_ratio, separators, ...).
    """

    def __init__(self, config: ChunkConfig) -> None:
        # Stored as-is; concrete strategies read chunk_size / overlap_tokens
        # etc. from self.config at chunk time.
        self.config = config

    @abstractmethod
    def chunk(self, text: str, source: str) -> list[Chunk]:
        """Split text into chunks according to the strategy."""

    @abstractmethod
    def preview(self, text: str, source: str) -> list[Chunk]:
        """Dry-run: show chunk boundaries without committing to storage."""

    @abstractmethod
    def stats(self, chunks: list[Chunk]) -> ChunkStats:
        """Compute statistics about the chunking result."""
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Chunk configuration with fail-fast validation."""
|
|
2
|
+
|
|
3
|
+
from typing import Self
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field, model_validator
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ChunkConfig(BaseModel):
    """Configuration for chunking strategies. Validated at init time (fail-fast).

    Constructing an instance with out-of-range values raises a pydantic
    ValidationError immediately, so downstream chunkers never see a bad
    configuration.
    """

    # Which splitting algorithm to use; the pattern restricts it to the
    # five strategies known to the factory.
    strategy: str = Field(
        default="recursive",
        pattern=r"^(fixed|recursive|semantic|structural|llm-driven)$",
        description="Chunking strategy: fixed, recursive, semantic, structural, llm-driven",
    )
    chunk_size: int = Field(
        default=512,
        ge=64,
        le=8192,
        description="Target chunk size in tokens",
    )
    overlap_ratio: float = Field(
        default=0.1,
        ge=0.0,
        le=0.5,
        description="Overlap ratio between consecutive chunks (0.0 to 0.5)",
    )
    separators: list[str] = Field(
        default_factory=lambda: ["\n\n", "\n", ". ", " "],
        description="Separator hierarchy for recursive splitting",
    )
    cosine_threshold: float = Field(
        default=0.75,
        ge=0.0,
        le=1.0,
        description="Cosine similarity threshold for semantic chunking",
    )

    @model_validator(mode="after")
    def validate_overlap(self) -> Self:
        # Defensive cross-field check: the overlap in tokens must leave room
        # for new content in each chunk. NOTE(review): with the field bounds
        # above (overlap_ratio <= 0.5, chunk_size >= 64),
        # int(chunk_size * overlap_ratio) is always < chunk_size, so this can
        # only fire if those bounds are relaxed later.
        overlap_tokens = int(self.chunk_size * self.overlap_ratio)
        if overlap_tokens >= self.chunk_size:
            msg = f"Overlap ({overlap_tokens} tokens) must be less than chunk_size ({self.chunk_size})"
            raise ValueError(msg)
        return self

    @property
    def overlap_tokens(self) -> int:
        """Calculate the overlap in tokens (chunk_size * overlap_ratio, truncated)."""
        return int(self.chunk_size * self.overlap_ratio)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Factory function for creating chunker instances by strategy name."""
|
|
2
|
+
|
|
3
|
+
from rag_forge_core.chunking.base import ChunkStrategy
|
|
4
|
+
from rag_forge_core.chunking.config import ChunkConfig
|
|
5
|
+
from rag_forge_core.embedding.base import EmbeddingProvider
|
|
6
|
+
from rag_forge_core.generation.base import GenerationProvider
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class UnsupportedStrategyError(ValueError):
    """Raised when an unknown chunking strategy is requested.

    Subclasses ValueError so callers that catch ValueError keep working.
    """
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def create_chunker(
    config: ChunkConfig,
    embedder: EmbeddingProvider | None = None,
    generator: GenerationProvider | None = None,
) -> ChunkStrategy:
    """Create a chunker instance for the given strategy.

    Each strategy module is imported lazily inside its builder, so a
    strategy's dependencies are only loaded when that strategy is requested.

    Args:
        config: Chunk configuration with strategy name.
        embedder: Required for "semantic" strategy.
        generator: Required for "llm-driven" strategy.

    Returns:
        A ChunkStrategy instance ready to use.

    Raises:
        ValueError: If a required dependency is missing.
        UnsupportedStrategyError: If the strategy name is unknown.
    """

    def _build_recursive() -> ChunkStrategy:
        from rag_forge_core.chunking.recursive import RecursiveChunker

        return RecursiveChunker(config)

    def _build_fixed() -> ChunkStrategy:
        from rag_forge_core.chunking.fixed import FixedSizeChunker

        return FixedSizeChunker(config)

    def _build_structural() -> ChunkStrategy:
        from rag_forge_core.chunking.structural import StructuralChunker

        return StructuralChunker(config)

    def _build_semantic() -> ChunkStrategy:
        # Fail fast: semantic chunking cannot run without an embedder.
        if embedder is None:
            msg = "Semantic chunking requires an embedder. Pass embedder= to create_chunker()."
            raise ValueError(msg)
        from rag_forge_core.chunking.semantic import SemanticChunker

        return SemanticChunker(config=config, embedder=embedder)

    def _build_llm_driven() -> ChunkStrategy:
        # Fail fast: llm-driven chunking cannot run without a generator.
        if generator is None:
            msg = (
                "LLM-driven chunking requires a generator. "
                "Pass generator= to create_chunker()."
            )
            raise ValueError(msg)
        from rag_forge_core.chunking.llm_driven import LLMDrivenChunker

        return LLMDrivenChunker(config=config, generator=generator)

    builders = {
        "recursive": _build_recursive,
        "fixed": _build_fixed,
        "structural": _build_structural,
        "semantic": _build_semantic,
        "llm-driven": _build_llm_driven,
    }

    builder = builders.get(config.strategy)
    if builder is None:
        raise UnsupportedStrategyError(
            f"Unknown chunking strategy: {config.strategy!r}. "
            "Supported: 'recursive', 'fixed', 'structural', 'semantic', 'llm-driven'."
        )
    return builder()
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Fixed-size chunking strategy.
|
|
2
|
+
|
|
3
|
+
Splits text by token count with configurable overlap.
|
|
4
|
+
Best for structured data and baseline comparisons.
|
|
5
|
+
PRD default: 512 tokens, 10-20% overlap.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import tiktoken
|
|
9
|
+
|
|
10
|
+
from rag_forge_core.chunking.base import Chunk, ChunkStats, ChunkStrategy
|
|
11
|
+
from rag_forge_core.chunking.config import ChunkConfig
|
|
12
|
+
|
|
13
|
+
_ENCODING = tiktoken.get_encoding("cl100k_base")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _token_count(text: str) -> int:
    """Count tokens using tiktoken cl100k_base encoding.

    Uses the module-level ``_ENCODING`` so the encoding is loaded once.
    """
    return len(_ENCODING.encode(text))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class FixedSizeChunker(ChunkStrategy):
    """Split text into fixed-size token windows with overlap.

    Each window is exactly `chunk_size` tokens (or fewer for the last window).
    Consecutive windows share `overlap_tokens` tokens from the end of the
    previous window, giving the model context across chunk boundaries.
    """

    def __init__(self, config: ChunkConfig | None = None) -> None:
        # Default to a config whose strategy field matches this class.
        super().__init__(config if config is not None else ChunkConfig(strategy="fixed"))

    def chunk(self, text: str, source: str) -> list[Chunk]:
        """Split text into fixed-size token windows with overlap."""
        if not text.strip():
            return []

        token_ids = _ENCODING.encode(text)
        total = len(token_ids)
        size = self.config.chunk_size
        shared = self.config.overlap_tokens
        # Advance by size-minus-overlap each step; at least 1 so we never stall.
        stride = max(1, size - shared)

        # First pass: collect (begin, stop) windows. The scan ends as soon as
        # a window reaches the last token, so no empty trailing window occurs.
        windows: list[tuple[int, int]] = []
        begin = 0
        while begin < total:
            stop = min(begin + size, total)
            windows.append((begin, stop))
            if stop == total:
                break
            begin += stride

        # Second pass: materialise chunks. Only chunks after the first carry
        # overlap tokens from their predecessor.
        return [
            Chunk(
                text=_ENCODING.decode(token_ids[lo:hi]),
                chunk_index=i,
                source_document=source,
                strategy_used="fixed",
                overlap_tokens=shared if i else 0,
            )
            for i, (lo, hi) in enumerate(windows)
        ]

    def preview(self, text: str, source: str) -> list[Chunk]:
        """Dry-run: show chunk boundaries without committing to storage."""
        return self.chunk(text, source)

    def stats(self, chunks: list[Chunk]) -> ChunkStats:
        """Compute statistics using tiktoken token counts."""
        if not chunks:
            return ChunkStats(
                total_chunks=0,
                avg_chunk_size=0,
                min_chunk_size=0,
                max_chunk_size=0,
                total_tokens=0,
            )

        token_sizes = [_token_count(c.text) for c in chunks]
        token_total = sum(token_sizes)
        return ChunkStats(
            total_chunks=len(token_sizes),
            avg_chunk_size=token_total // len(token_sizes),
            min_chunk_size=min(token_sizes),
            max_chunk_size=max(token_sizes),
            total_tokens=token_total,
        )
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""LLM-driven chunking strategy.
|
|
2
|
+
|
|
3
|
+
Uses a small LLM to identify meaningful boundary points in text.
|
|
4
|
+
The LLM receives numbered sentences and returns boundary indices as JSON.
|
|
5
|
+
Falls back to size-based splitting when the LLM response is unparseable.
|
|
6
|
+
PRD recommendation: Claude Haiku / GPT-4o-mini for cost efficiency.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
import tiktoken
|
|
13
|
+
|
|
14
|
+
from rag_forge_core.chunking.base import Chunk, ChunkStats, ChunkStrategy
|
|
15
|
+
from rag_forge_core.chunking.config import ChunkConfig
|
|
16
|
+
from rag_forge_core.generation.base import GenerationProvider
|
|
17
|
+
|
|
18
|
+
_ENCODING = tiktoken.get_encoding("cl100k_base")
|
|
19
|
+
_LOG = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
_BOUNDARY_PROMPT = """You are a document chunking assistant. Given the following numbered sentences, identify the indices where topic boundaries occur. A boundary means the content shifts to a different topic or subtopic.
|
|
22
|
+
|
|
23
|
+
Return a JSON array of sentence indices (0-based) where splits should happen. For example: [3, 7, 12] means split BEFORE sentences 3, 7, and 12.
|
|
24
|
+
|
|
25
|
+
If there are no clear boundaries, return an empty array: []
|
|
26
|
+
|
|
27
|
+
Sentences:
|
|
28
|
+
{sentences}"""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _token_count(text: str) -> int:
    """Count tokens in *text* using the module-level cl100k_base encoding."""
    return len(_ENCODING.encode(text))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _split_into_sentences(text: str) -> list[str]:
|
|
36
|
+
"""Split text into sentences for LLM analysis."""
|
|
37
|
+
# Normalise Windows line endings
|
|
38
|
+
text = text.replace("\r\n", "\n")
|
|
39
|
+
paragraphs = text.split("\n\n")
|
|
40
|
+
sentences: list[str] = []
|
|
41
|
+
for para in paragraphs:
|
|
42
|
+
para = para.strip()
|
|
43
|
+
if not para:
|
|
44
|
+
continue
|
|
45
|
+
parts = para.replace(". ", ".\n").replace("? ", "?\n").replace("! ", "!\n").split("\n")
|
|
46
|
+
for part in parts:
|
|
47
|
+
part = part.strip()
|
|
48
|
+
if part:
|
|
49
|
+
sentences.append(part)
|
|
50
|
+
return sentences
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class LLMDrivenChunker(ChunkStrategy):
    """Use an LLM to identify semantic boundaries in text.

    Sends numbered sentences to the LLM and asks for boundary indices.
    Falls back to size-based splitting on LLM failure or invalid response.
    """

    def __init__(self, config: ChunkConfig, generator: GenerationProvider) -> None:
        """Store the config and the LLM client used for boundary detection."""
        super().__init__(config)
        self._generator = generator

    def chunk(self, text: str, source: str) -> list[Chunk]:
        """Split *text* into chunks at LLM-identified topic boundaries.

        Returns an empty list for blank input; a single chunk when only one
        sentence is found (no LLM call needed in that case).
        """
        if not text.strip():
            return []

        sentences = _split_into_sentences(text)
        if not sentences:
            return []

        # Single sentence: nothing to group, skip the LLM round-trip.
        if len(sentences) == 1:
            return [
                Chunk(
                    text=sentences[0],
                    chunk_index=0,
                    source_document=source,
                    strategy_used="llm-driven",
                )
            ]

        boundaries = self._get_boundaries(sentences)
        groups = self._apply_boundaries(sentences, boundaries)

        # One chunk per sentence group, joined with single spaces.
        return [
            Chunk(
                text=" ".join(group),
                chunk_index=idx,
                source_document=source,
                strategy_used="llm-driven",
            )
            for idx, group in enumerate(groups)
        ]

    def preview(self, text: str, source: str) -> list[Chunk]:
        """Dry-run: identical to chunk() — note this still calls the LLM."""
        return self.chunk(text, source)

    def stats(self, chunks: list[Chunk]) -> ChunkStats:
        """Compute token-based statistics for *chunks* (all zeros when empty)."""
        if not chunks:
            return ChunkStats(
                total_chunks=0,
                avg_chunk_size=0,
                min_chunk_size=0,
                max_chunk_size=0,
                total_tokens=0,
            )
        sizes = [_token_count(c.text) for c in chunks]
        return ChunkStats(
            total_chunks=len(chunks),
            avg_chunk_size=sum(sizes) // len(sizes),
            min_chunk_size=min(sizes),
            max_chunk_size=max(sizes),
            total_tokens=sum(sizes),
        )

    def _get_boundaries(self, sentences: list[str]) -> list[int]:
        """Ask the LLM for boundary indices. Returns sorted list of split points."""
        numbered = "\n".join(f"[{i}] {s}" for i, s in enumerate(sentences))
        prompt = _BOUNDARY_PROMPT.format(sentences=numbered)

        try:
            response = self._generator.generate(
                system_prompt="You are a document analysis assistant. Respond only with valid JSON.",
                user_prompt=prompt,
            )
            boundaries = json.loads(response)
            if not isinstance(boundaries, list):
                _LOG.warning(
                    "LLM returned non-list: %s, falling back to size-based splitting",
                    type(boundaries),
                )
                return self._fallback_boundaries(sentences)
            # Empty list from LLM means "no boundaries" — return as-is (single group)
            if len(boundaries) == 0:
                return []
            # Deduplicate, sort, and drop out-of-range indices. int(b) may raise
            # ValueError/TypeError on junk elements — caught below as a parse
            # failure. Index 0 is excluded: a split before the first sentence
            # would create an empty leading group.
            valid = sorted({int(b) for b in boundaries if 0 < int(b) < len(sentences)})
            # NOTE(review): if the LLM returned only invalid indices we fall
            # back rather than treating it as "no boundaries".
            return valid if valid else self._fallback_boundaries(sentences)
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            _LOG.warning("LLM boundary parsing failed: %s, falling back", e)
            return self._fallback_boundaries(sentences)

    def _fallback_boundaries(self, sentences: list[str]) -> list[int]:
        """Size-based fallback: split every chunk_size tokens."""
        boundaries: list[int] = []
        current_tokens = 0
        for i, sentence in enumerate(sentences):
            current_tokens += _token_count(sentence)
            if current_tokens >= self.config.chunk_size and i > 0:
                boundaries.append(i)
                # The sentence that tipped the budget starts the next chunk,
                # so the running count restarts at its own token length.
                current_tokens = _token_count(sentence)
        return boundaries

    def _apply_boundaries(self, sentences: list[str], boundaries: list[int]) -> list[list[str]]:
        """Split sentences into groups at the given boundary indices.

        A boundary index b means "split BEFORE sentence b". Empty groups
        (possible with duplicate or edge boundaries) are filtered out.
        """
        if not boundaries:
            return [sentences]
        groups: list[list[str]] = []
        prev = 0
        for boundary in boundaries:
            groups.append(sentences[prev:boundary])
            prev = boundary
        groups.append(sentences[prev:])
        return [g for g in groups if g]
|