rag-forge-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. rag_forge_core-0.1.0/.gitignore +62 -0
  2. rag_forge_core-0.1.0/PKG-INFO +72 -0
  3. rag_forge_core-0.1.0/README.md +33 -0
  4. rag_forge_core-0.1.0/pyproject.toml +49 -0
  5. rag_forge_core-0.1.0/src/rag_forge_core/__init__.py +3 -0
  6. rag_forge_core-0.1.0/src/rag_forge_core/chunking/__init__.py +7 -0
  7. rag_forge_core-0.1.0/src/rag_forge_core/chunking/base.py +53 -0
  8. rag_forge_core-0.1.0/src/rag_forge_core/chunking/config.py +50 -0
  9. rag_forge_core-0.1.0/src/rag_forge_core/chunking/factory.py +71 -0
  10. rag_forge_core-0.1.0/src/rag_forge_core/chunking/fixed.py +87 -0
  11. rag_forge_core-0.1.0/src/rag_forge_core/chunking/llm_driven.py +163 -0
  12. rag_forge_core-0.1.0/src/rag_forge_core/chunking/recursive.py +128 -0
  13. rag_forge_core-0.1.0/src/rag_forge_core/chunking/semantic.py +143 -0
  14. rag_forge_core-0.1.0/src/rag_forge_core/chunking/structural.py +128 -0
  15. rag_forge_core-0.1.0/src/rag_forge_core/cli.py +664 -0
  16. rag_forge_core-0.1.0/src/rag_forge_core/context/__init__.py +23 -0
  17. rag_forge_core-0.1.0/src/rag_forge_core/context/cache_store.py +99 -0
  18. rag_forge_core-0.1.0/src/rag_forge_core/context/enricher.py +94 -0
  19. rag_forge_core-0.1.0/src/rag_forge_core/context/manager.py +45 -0
  20. rag_forge_core-0.1.0/src/rag_forge_core/context/semantic_cache.py +129 -0
  21. rag_forge_core-0.1.0/src/rag_forge_core/embedding/__init__.py +6 -0
  22. rag_forge_core-0.1.0/src/rag_forge_core/embedding/base.py +20 -0
  23. rag_forge_core-0.1.0/src/rag_forge_core/embedding/local_embedder.py +38 -0
  24. rag_forge_core-0.1.0/src/rag_forge_core/embedding/mock_embedder.py +33 -0
  25. rag_forge_core-0.1.0/src/rag_forge_core/embedding/openai_embedder.py +60 -0
  26. rag_forge_core-0.1.0/src/rag_forge_core/generation/__init__.py +6 -0
  27. rag_forge_core-0.1.0/src/rag_forge_core/generation/base.py +9 -0
  28. rag_forge_core-0.1.0/src/rag_forge_core/generation/claude_generator.py +37 -0
  29. rag_forge_core-0.1.0/src/rag_forge_core/generation/mock_generator.py +18 -0
  30. rag_forge_core-0.1.0/src/rag_forge_core/generation/openai_generator.py +38 -0
  31. rag_forge_core-0.1.0/src/rag_forge_core/ingestion/__init__.py +1 -0
  32. rag_forge_core-0.1.0/src/rag_forge_core/ingestion/pipeline.py +182 -0
  33. rag_forge_core-0.1.0/src/rag_forge_core/n8n_export.py +88 -0
  34. rag_forge_core-0.1.0/src/rag_forge_core/parsing/__init__.py +28 -0
  35. rag_forge_core-0.1.0/src/rag_forge_core/parsing/base.py +23 -0
  36. rag_forge_core-0.1.0/src/rag_forge_core/parsing/directory.py +67 -0
  37. rag_forge_core-0.1.0/src/rag_forge_core/parsing/html.py +56 -0
  38. rag_forge_core-0.1.0/src/rag_forge_core/parsing/markdown.py +74 -0
  39. rag_forge_core-0.1.0/src/rag_forge_core/parsing/pdf.py +41 -0
  40. rag_forge_core-0.1.0/src/rag_forge_core/parsing/plaintext.py +31 -0
  41. rag_forge_core-0.1.0/src/rag_forge_core/plugins/__init__.py +5 -0
  42. rag_forge_core-0.1.0/src/rag_forge_core/plugins/registry.py +100 -0
  43. rag_forge_core-0.1.0/src/rag_forge_core/query/__init__.py +5 -0
  44. rag_forge_core-0.1.0/src/rag_forge_core/query/agentic.py +162 -0
  45. rag_forge_core-0.1.0/src/rag_forge_core/query/engine.py +166 -0
  46. rag_forge_core-0.1.0/src/rag_forge_core/retrieval/__init__.py +28 -0
  47. rag_forge_core-0.1.0/src/rag_forge_core/retrieval/base.py +28 -0
  48. rag_forge_core-0.1.0/src/rag_forge_core/retrieval/config.py +43 -0
  49. rag_forge_core-0.1.0/src/rag_forge_core/retrieval/dense.py +46 -0
  50. rag_forge_core-0.1.0/src/rag_forge_core/retrieval/hybrid.py +86 -0
  51. rag_forge_core-0.1.0/src/rag_forge_core/retrieval/reranker.py +131 -0
  52. rag_forge_core-0.1.0/src/rag_forge_core/retrieval/sparse.py +143 -0
  53. rag_forge_core-0.1.0/src/rag_forge_core/security/__init__.py +50 -0
  54. rag_forge_core-0.1.0/src/rag_forge_core/security/adversarial.py +112 -0
  55. rag_forge_core-0.1.0/src/rag_forge_core/security/adversarial_corpus.json +47 -0
  56. rag_forge_core-0.1.0/src/rag_forge_core/security/citations.py +43 -0
  57. rag_forge_core-0.1.0/src/rag_forge_core/security/faithfulness.py +71 -0
  58. rag_forge_core-0.1.0/src/rag_forge_core/security/injection.py +90 -0
  59. rag_forge_core-0.1.0/src/rag_forge_core/security/input_guard.py +85 -0
  60. rag_forge_core-0.1.0/src/rag_forge_core/security/output_guard.py +102 -0
  61. rag_forge_core-0.1.0/src/rag_forge_core/security/pii.py +88 -0
  62. rag_forge_core-0.1.0/src/rag_forge_core/security/pii_scanner.py +46 -0
  63. rag_forge_core-0.1.0/src/rag_forge_core/security/rate_limiter.py +94 -0
  64. rag_forge_core-0.1.0/src/rag_forge_core/security/staleness.py +62 -0
  65. rag_forge_core-0.1.0/src/rag_forge_core/storage/__init__.py +6 -0
  66. rag_forge_core-0.1.0/src/rag_forge_core/storage/base.py +43 -0
  67. rag_forge_core-0.1.0/src/rag_forge_core/storage/qdrant.py +109 -0
  68. rag_forge_core-0.1.0/tests/conftest.py +1 -0
  69. rag_forge_core-0.1.0/tests/test_adversarial.py +61 -0
  70. rag_forge_core-0.1.0/tests/test_agentic_query.py +101 -0
  71. rag_forge_core-0.1.0/tests/test_cache_store.py +74 -0
  72. rag_forge_core-0.1.0/tests/test_cached_query.py +68 -0
  73. rag_forge_core-0.1.0/tests/test_chunker_factory.py +56 -0
  74. rag_forge_core-0.1.0/tests/test_chunking.py +80 -0
  75. rag_forge_core-0.1.0/tests/test_citations.py +42 -0
  76. rag_forge_core-0.1.0/tests/test_dense_retriever.py +74 -0
  77. rag_forge_core-0.1.0/tests/test_embedding.py +33 -0
  78. rag_forge_core-0.1.0/tests/test_enricher.py +112 -0
  79. rag_forge_core-0.1.0/tests/test_faithfulness.py +55 -0
  80. rag_forge_core-0.1.0/tests/test_fixed_chunker.py +62 -0
  81. rag_forge_core-0.1.0/tests/test_get_by_id.py +42 -0
  82. rag_forge_core-0.1.0/tests/test_hybrid_pipeline_integration.py +168 -0
  83. rag_forge_core-0.1.0/tests/test_hybrid_retriever.py +125 -0
  84. rag_forge_core-0.1.0/tests/test_injection.py +93 -0
  85. rag_forge_core-0.1.0/tests/test_input_guard.py +77 -0
  86. rag_forge_core-0.1.0/tests/test_instrumented_pipeline.py +96 -0
  87. rag_forge_core-0.1.0/tests/test_instrumented_query.py +79 -0
  88. rag_forge_core-0.1.0/tests/test_llm_driven_chunker.py +85 -0
  89. rag_forge_core-0.1.0/tests/test_n8n_export.py +37 -0
  90. rag_forge_core-0.1.0/tests/test_output_guard.py +75 -0
  91. rag_forge_core-0.1.0/tests/test_parse_chunk_cli.py +41 -0
  92. rag_forge_core-0.1.0/tests/test_parsing.py +135 -0
  93. rag_forge_core-0.1.0/tests/test_pii.py +76 -0
  94. rag_forge_core-0.1.0/tests/test_pii_scanner.py +50 -0
  95. rag_forge_core-0.1.0/tests/test_pipeline_integration.py +66 -0
  96. rag_forge_core-0.1.0/tests/test_plugin_registry.py +76 -0
  97. rag_forge_core-0.1.0/tests/test_query.py +83 -0
  98. rag_forge_core-0.1.0/tests/test_rate_limiter.py +74 -0
  99. rag_forge_core-0.1.0/tests/test_reranker.py +46 -0
  100. rag_forge_core-0.1.0/tests/test_retrieval_config.py +93 -0
  101. rag_forge_core-0.1.0/tests/test_security_integration.py +101 -0
  102. rag_forge_core-0.1.0/tests/test_semantic_cache.py +116 -0
  103. rag_forge_core-0.1.0/tests/test_semantic_chunker.py +93 -0
  104. rag_forge_core-0.1.0/tests/test_sparse_retriever.py +106 -0
  105. rag_forge_core-0.1.0/tests/test_staleness.py +66 -0
  106. rag_forge_core-0.1.0/tests/test_storage.py +48 -0
  107. rag_forge_core-0.1.0/tests/test_structural_chunker.py +68 -0
@@ -0,0 +1,62 @@
1
+ # Dependencies
2
+ node_modules/
3
+ .pnpm-store/
4
+
5
+ # Build outputs
6
+ dist/
7
+ build/
8
+ *.tsbuildinfo
9
+
10
+ # Turborepo
11
+ .turbo/
12
+
13
+ # Python
14
+ __pycache__/
15
+ *.py[cod]
16
+ *$py.class
17
+ *.egg-info/
18
+ *.egg
19
+ .venv/
20
+ .python-version-local
21
+
22
+ # Python tools
23
+ .mypy_cache/
24
+ .ruff_cache/
25
+ .pytest_cache/
26
+ htmlcov/
27
+ .coverage
28
+ .coverage.*
29
+
30
+ # Environment variables
31
+ .env
32
+ .env.local
33
+ .env.*.local
34
+
35
+ # IDE
36
+ .vscode/
37
+ .idea/
38
+ *.swp
39
+ *.swo
40
+ *~
41
+
42
+ # OS
43
+ .DS_Store
44
+ Thumbs.db
45
+ desktop.ini
46
+
47
+ # Test & coverage
48
+ coverage/
49
+ *.lcov
50
+
51
+ # Logs
52
+ *.log
53
+ npm-debug.log*
54
+ pnpm-debug.log*
55
+
56
+ .claude/
57
+
58
+ # Next.js
59
+ apps/*/.next
60
+ apps/*/out
61
+ apps/*/next-env.d.ts
62
+ .vercel
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: rag-forge-core
3
+ Version: 0.1.0
4
+ Summary: RAG pipeline primitives: ingestion, retrieval, context management, and security
5
+ Project-URL: Homepage, https://github.com/hallengray/rag-forge
6
+ Project-URL: Repository, https://github.com/hallengray/rag-forge
7
+ Project-URL: Issues, https://github.com/hallengray/rag-forge/issues
8
+ Project-URL: Documentation, https://github.com/hallengray/rag-forge#readme
9
+ Author: Femi Adedayo
10
+ License-Expression: MIT
11
+ Keywords: chunking,embedding,llm,pipeline,rag,retrieval-augmented-generation
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: beautifulsoup4>=4.12
21
+ Requires-Dist: bm25s>=0.2
22
+ Requires-Dist: lxml>=5.0
23
+ Requires-Dist: openai>=1.30
24
+ Requires-Dist: opentelemetry-api>=1.20
25
+ Requires-Dist: pydantic>=2.0
26
+ Requires-Dist: pymupdf>=1.24
27
+ Requires-Dist: qdrant-client>=1.9
28
+ Requires-Dist: rich>=13.0
29
+ Requires-Dist: tiktoken>=0.7
30
+ Provides-Extra: cohere
31
+ Requires-Dist: cohere>=5.0; extra == 'cohere'
32
+ Provides-Extra: local
33
+ Requires-Dist: sentence-transformers>=3.0; extra == 'local'
34
+ Provides-Extra: presidio
35
+ Requires-Dist: presidio-analyzer>=2.2; extra == 'presidio'
36
+ Provides-Extra: redis
37
+ Requires-Dist: redis>=5.0; extra == 'redis'
38
+ Description-Content-Type: text/markdown
39
+
40
+ # rag-forge-core
41
+
42
+ RAG pipeline primitives for the RAG-Forge toolkit: ingestion, chunking, retrieval, context management, and security.
43
+
44
+ ## Installation
45
+
46
+ ```bash
47
+ pip install rag-forge-core
48
+ ```
49
+
50
+ ## Usage
51
+
52
+ This package provides the building blocks used by the `rag-forge` CLI. For end-user usage, see the [main RAG-Forge documentation](https://github.com/hallengray/rag-forge#readme).
53
+
54
+ ```python
55
+ from rag_forge_core.chunking.factory import create_chunker
56
+ from rag_forge_core.chunking.config import ChunkConfig
57
+
58
+ chunker = create_chunker(ChunkConfig(strategy="recursive", chunk_size=512))
59
+ chunks = chunker.chunk("Some long document text...", source="doc.md")
60
+ ```
61
+
62
+ ## Modules
63
+
64
+ - `rag_forge_core.chunking` — Five chunking strategies (recursive, fixed, semantic, structural, llm-driven)
65
+ - `rag_forge_core.retrieval` — Dense, sparse, and hybrid retrieval with reranking
66
+ - `rag_forge_core.security` — InputGuard, OutputGuard, PII scanning, prompt injection detection
67
+ - `rag_forge_core.context` — Contextual enrichment and semantic caching
68
+ - `rag_forge_core.plugins` — Plugin registry for custom extensions
69
+
70
+ ## License
71
+
72
+ MIT
@@ -0,0 +1,33 @@
1
+ # rag-forge-core
2
+
3
+ RAG pipeline primitives for the RAG-Forge toolkit: ingestion, chunking, retrieval, context management, and security.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install rag-forge-core
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ This package provides the building blocks used by the `rag-forge` CLI. For end-user usage, see the [main RAG-Forge documentation](https://github.com/hallengray/rag-forge#readme).
14
+
15
+ ```python
16
+ from rag_forge_core.chunking.factory import create_chunker
17
+ from rag_forge_core.chunking.config import ChunkConfig
18
+
19
+ chunker = create_chunker(ChunkConfig(strategy="recursive", chunk_size=512))
20
+ chunks = chunker.chunk("Some long document text...", source="doc.md")
21
+ ```
22
+
23
+ ## Modules
24
+
25
+ - `rag_forge_core.chunking` — Five chunking strategies (recursive, fixed, semantic, structural, llm-driven)
26
+ - `rag_forge_core.retrieval` — Dense, sparse, and hybrid retrieval with reranking
27
+ - `rag_forge_core.security` — InputGuard, OutputGuard, PII scanning, prompt injection detection
28
+ - `rag_forge_core.context` — Contextual enrichment and semantic caching
29
+ - `rag_forge_core.plugins` — Plugin registry for custom extensions
30
+
31
+ ## License
32
+
33
+ MIT
@@ -0,0 +1,49 @@
1
+ [project]
2
+ name = "rag-forge-core"
3
+ version = "0.1.0"
4
+ description = "RAG pipeline primitives: ingestion, retrieval, context management, and security"
5
+ requires-python = ">=3.11"
6
+ license = "MIT"
7
+ authors = [{ name = "Femi Adedayo" }]
8
+ keywords = ["rag", "retrieval-augmented-generation", "llm", "pipeline", "chunking", "embedding"]
9
+ classifiers = [
10
+ "Development Status :: 3 - Alpha",
11
+ "Intended Audience :: Developers",
12
+ "License :: OSI Approved :: MIT License",
13
+ "Programming Language :: Python :: 3",
14
+ "Programming Language :: Python :: 3.11",
15
+ "Programming Language :: Python :: 3.12",
16
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
17
+ ]
18
+ readme = "README.md"
19
+ dependencies = [
20
+ "pydantic>=2.0",
21
+ "rich>=13.0",
22
+ "tiktoken>=0.7",
23
+ "pymupdf>=1.24",
24
+ "beautifulsoup4>=4.12",
25
+ "lxml>=5.0",
26
+ "openai>=1.30",
27
+ "qdrant-client>=1.9",
28
+ "bm25s>=0.2",
29
+ "opentelemetry-api>=1.20",
30
+ ]
31
+
32
+ [project.urls]
33
+ Homepage = "https://github.com/hallengray/rag-forge"
34
+ Repository = "https://github.com/hallengray/rag-forge"
35
+ Issues = "https://github.com/hallengray/rag-forge/issues"
36
+ Documentation = "https://github.com/hallengray/rag-forge#readme"
37
+
38
+ [project.optional-dependencies]
39
+ local = ["sentence-transformers>=3.0"]
40
+ cohere = ["cohere>=5.0"]
41
+ presidio = ["presidio-analyzer>=2.2"]
42
+ redis = ["redis>=5.0"]
43
+
44
+ [build-system]
45
+ requires = ["hatchling"]
46
+ build-backend = "hatchling.build"
47
+
48
+ [tool.hatch.build.targets.wheel]
49
+ packages = ["src/rag_forge_core"]
@@ -0,0 +1,3 @@
1
+ """RAG-Forge Core: RAG pipeline primitives for ingestion, retrieval, context, and security."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,7 @@
1
+ """Chunking strategies for document splitting."""
2
+
3
+ from rag_forge_core.chunking.base import ChunkStrategy
4
+ from rag_forge_core.chunking.config import ChunkConfig
5
+ from rag_forge_core.chunking.factory import UnsupportedStrategyError, create_chunker
6
+
7
+ __all__ = ["ChunkConfig", "ChunkStrategy", "UnsupportedStrategyError", "create_chunker"]
@@ -0,0 +1,53 @@
1
+ """Abstract base class for all chunking strategies."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+
6
+ from rag_forge_core.chunking.config import ChunkConfig
7
+
8
+
9
@dataclass
class Chunk:
    """A single chunk of text with metadata.

    Returned by every ``ChunkStrategy.chunk`` implementation; carries
    provenance (source document, producing strategy, position) so the
    evaluation engine can compare strategies over the same corpus.
    """

    # The chunk's text content.
    text: str
    # Zero-based position of this chunk within the source document.
    chunk_index: int
    # Identifier (e.g. path or name) of the document this chunk came from.
    source_document: str
    # Name of the strategy that produced this chunk (e.g. "fixed").
    strategy_used: str
    # Enclosing section heading, when the strategy tracks document structure.
    parent_section: str | None = None
    # Tokens shared with the previous chunk (0 for the first chunk).
    overlap_tokens: int = 0
    # Free-form extra metadata attached by the producing strategy.
    metadata: dict[str, str | int | float] | None = None
20
+
21
+
22
@dataclass
class ChunkStats:
    """Statistics about a chunking operation.

    All sizes are measured in tokens; the concrete strategies compute
    them with tiktoken's cl100k_base encoding.
    """

    # Number of chunks produced.
    total_chunks: int
    # Mean chunk size in tokens (integer division in the implementations).
    avg_chunk_size: int
    # Smallest chunk size in tokens.
    min_chunk_size: int
    # Largest chunk size in tokens.
    max_chunk_size: int
    # Sum of all chunk sizes in tokens.
    total_tokens: int
31
+
32
+
33
class ChunkStrategy(ABC):
    """Abstract base class that all chunking strategies must implement.

    Ensures strategies are interchangeable and the evaluation engine
    can compare performance across strategies on the same dataset.
    """

    def __init__(self, config: ChunkConfig) -> None:
        # Validated configuration shared by all strategies
        # (chunk size, overlap ratio, separators, thresholds).
        self.config = config

    @abstractmethod
    def chunk(self, text: str, source: str) -> list[Chunk]:
        """Split text into chunks according to the strategy.

        Args:
            text: Raw document text to split.
            source: Identifier of the originating document, recorded on
                each produced chunk.

        Returns:
            Ordered list of chunks covering the input text.
        """

    @abstractmethod
    def preview(self, text: str, source: str) -> list[Chunk]:
        """Dry-run: show chunk boundaries without committing to storage."""

    @abstractmethod
    def stats(self, chunks: list[Chunk]) -> ChunkStats:
        """Compute statistics about the chunking result."""
@@ -0,0 +1,50 @@
1
+ """Chunk configuration with fail-fast validation."""
2
+
3
+ from typing import Self
4
+
5
+ from pydantic import BaseModel, Field, model_validator
6
+
7
+
8
+ class ChunkConfig(BaseModel):
9
+ """Configuration for chunking strategies. Validated at init time (fail-fast)."""
10
+
11
+ strategy: str = Field(
12
+ default="recursive",
13
+ pattern=r"^(fixed|recursive|semantic|structural|llm-driven)$",
14
+ description="Chunking strategy: fixed, recursive, semantic, structural, llm-driven",
15
+ )
16
+ chunk_size: int = Field(
17
+ default=512,
18
+ ge=64,
19
+ le=8192,
20
+ description="Target chunk size in tokens",
21
+ )
22
+ overlap_ratio: float = Field(
23
+ default=0.1,
24
+ ge=0.0,
25
+ le=0.5,
26
+ description="Overlap ratio between consecutive chunks (0.0 to 0.5)",
27
+ )
28
+ separators: list[str] = Field(
29
+ default_factory=lambda: ["\n\n", "\n", ". ", " "],
30
+ description="Separator hierarchy for recursive splitting",
31
+ )
32
+ cosine_threshold: float = Field(
33
+ default=0.75,
34
+ ge=0.0,
35
+ le=1.0,
36
+ description="Cosine similarity threshold for semantic chunking",
37
+ )
38
+
39
+ @model_validator(mode="after")
40
+ def validate_overlap(self) -> Self:
41
+ overlap_tokens = int(self.chunk_size * self.overlap_ratio)
42
+ if overlap_tokens >= self.chunk_size:
43
+ msg = f"Overlap ({overlap_tokens} tokens) must be less than chunk_size ({self.chunk_size})"
44
+ raise ValueError(msg)
45
+ return self
46
+
47
+ @property
48
+ def overlap_tokens(self) -> int:
49
+ """Calculate the overlap in tokens."""
50
+ return int(self.chunk_size * self.overlap_ratio)
@@ -0,0 +1,71 @@
1
+ """Factory function for creating chunker instances by strategy name."""
2
+
3
+ from rag_forge_core.chunking.base import ChunkStrategy
4
+ from rag_forge_core.chunking.config import ChunkConfig
5
+ from rag_forge_core.embedding.base import EmbeddingProvider
6
+ from rag_forge_core.generation.base import GenerationProvider
7
+
8
+
9
class UnsupportedStrategyError(ValueError):
    """Raised when an unknown chunking strategy is requested.

    Subclasses ``ValueError`` so callers that catch the broader type
    keep working.
    """
11
+
12
+
13
+ def create_chunker(
14
+ config: ChunkConfig,
15
+ embedder: EmbeddingProvider | None = None,
16
+ generator: GenerationProvider | None = None,
17
+ ) -> ChunkStrategy:
18
+ """Create a chunker instance for the given strategy.
19
+
20
+ Args:
21
+ config: Chunk configuration with strategy name.
22
+ embedder: Required for "semantic" strategy.
23
+ generator: Required for "llm-driven" strategy.
24
+
25
+ Returns:
26
+ A ChunkStrategy instance ready to use.
27
+
28
+ Raises:
29
+ ValueError: If a required dependency is missing.
30
+ UnsupportedStrategyError: If the strategy name is unknown.
31
+ """
32
+ strategy = config.strategy
33
+
34
+ if strategy == "recursive":
35
+ from rag_forge_core.chunking.recursive import RecursiveChunker
36
+
37
+ return RecursiveChunker(config)
38
+
39
+ if strategy == "fixed":
40
+ from rag_forge_core.chunking.fixed import FixedSizeChunker
41
+
42
+ return FixedSizeChunker(config)
43
+
44
+ if strategy == "structural":
45
+ from rag_forge_core.chunking.structural import StructuralChunker
46
+
47
+ return StructuralChunker(config)
48
+
49
+ if strategy == "semantic":
50
+ if embedder is None:
51
+ msg = "Semantic chunking requires an embedder. Pass embedder= to create_chunker()."
52
+ raise ValueError(msg)
53
+ from rag_forge_core.chunking.semantic import SemanticChunker
54
+
55
+ return SemanticChunker(config=config, embedder=embedder)
56
+
57
+ if strategy == "llm-driven":
58
+ if generator is None:
59
+ msg = (
60
+ "LLM-driven chunking requires a generator. "
61
+ "Pass generator= to create_chunker()."
62
+ )
63
+ raise ValueError(msg)
64
+ from rag_forge_core.chunking.llm_driven import LLMDrivenChunker
65
+
66
+ return LLMDrivenChunker(config=config, generator=generator)
67
+
68
+ raise UnsupportedStrategyError(
69
+ f"Unknown chunking strategy: {strategy!r}. "
70
+ "Supported: 'recursive', 'fixed', 'structural', 'semantic', 'llm-driven'."
71
+ )
@@ -0,0 +1,87 @@
1
+ """Fixed-size chunking strategy.
2
+
3
+ Splits text by token count with configurable overlap.
4
+ Best for structured data and baseline comparisons.
5
+ PRD default: 512 tokens, 10-20% overlap.
6
+ """
7
+
8
+ import tiktoken
9
+
10
+ from rag_forge_core.chunking.base import Chunk, ChunkStats, ChunkStrategy
11
+ from rag_forge_core.chunking.config import ChunkConfig
12
+
13
+ _ENCODING = tiktoken.get_encoding("cl100k_base")
14
+
15
+
16
def _token_count(text: str) -> int:
    """Count tokens using tiktoken cl100k_base encoding."""
    token_ids = _ENCODING.encode(text)
    return len(token_ids)
19
+
20
+
21
class FixedSizeChunker(ChunkStrategy):
    """Split text into fixed-size token windows with overlap.

    Every window holds at most ``chunk_size`` tokens (the final one may be
    shorter). Consecutive windows share ``overlap_tokens`` tokens from the
    tail of the previous window so the model keeps context across chunk
    boundaries.
    """

    def __init__(self, config: ChunkConfig | None = None) -> None:
        super().__init__(config or ChunkConfig(strategy="fixed"))

    def chunk(self, text: str, source: str) -> list[Chunk]:
        """Split text into fixed-size token windows with overlap."""
        if not text.strip():
            return []

        token_ids = _ENCODING.encode(text)
        if not token_ids:
            return []

        window = self.config.chunk_size
        shared = self.config.overlap_tokens
        # Each new window begins `window - shared` tokens after the previous
        # one; the max() guard keeps the loop advancing even if shared were
        # ever configured to reach the window size.
        stride = max(1, window - shared)

        pieces: list[Chunk] = []
        lo = 0
        while True:
            hi = min(lo + window, len(token_ids))
            pieces.append(
                Chunk(
                    text=_ENCODING.decode(token_ids[lo:hi]),
                    chunk_index=len(pieces),
                    source_document=source,
                    strategy_used="fixed",
                    # The first window shares nothing with a predecessor.
                    overlap_tokens=shared if pieces else 0,
                )
            )
            if hi == len(token_ids):
                # Reached the end of the token stream; stop before emitting
                # a redundant window that would be fully contained in this one.
                break
            lo += stride

        return pieces

    def preview(self, text: str, source: str) -> list[Chunk]:
        """Dry-run: show chunk boundaries without committing to storage."""
        return self.chunk(text, source)

    def stats(self, chunks: list[Chunk]) -> ChunkStats:
        """Compute statistics using tiktoken token counts."""
        if not chunks:
            return ChunkStats(
                total_chunks=0,
                avg_chunk_size=0,
                min_chunk_size=0,
                max_chunk_size=0,
                total_tokens=0,
            )

        token_counts = [_token_count(piece.text) for piece in chunks]
        return ChunkStats(
            total_chunks=len(token_counts),
            avg_chunk_size=sum(token_counts) // len(token_counts),
            min_chunk_size=min(token_counts),
            max_chunk_size=max(token_counts),
            total_tokens=sum(token_counts),
        )
@@ -0,0 +1,163 @@
1
+ """LLM-driven chunking strategy.
2
+
3
+ Uses a small LLM to identify meaningful boundary points in text.
4
+ The LLM receives numbered sentences and returns boundary indices as JSON.
5
+ Falls back to size-based splitting when the LLM response is unparseable.
6
+ PRD recommendation: Claude Haiku / GPT-4o-mini for cost efficiency.
7
+ """
8
+
9
+ import json
10
+ import logging
11
+
12
+ import tiktoken
13
+
14
+ from rag_forge_core.chunking.base import Chunk, ChunkStats, ChunkStrategy
15
+ from rag_forge_core.chunking.config import ChunkConfig
16
+ from rag_forge_core.generation.base import GenerationProvider
17
+
18
+ _ENCODING = tiktoken.get_encoding("cl100k_base")
19
+ _LOG = logging.getLogger(__name__)
20
+
21
+ _BOUNDARY_PROMPT = """You are a document chunking assistant. Given the following numbered sentences, identify the indices where topic boundaries occur. A boundary means the content shifts to a different topic or subtopic.
22
+
23
+ Return a JSON array of sentence indices (0-based) where splits should happen. For example: [3, 7, 12] means split BEFORE sentences 3, 7, and 12.
24
+
25
+ If there are no clear boundaries, return an empty array: []
26
+
27
+ Sentences:
28
+ {sentences}"""
29
+
30
+
31
def _token_count(text: str) -> int:
    """Return the number of cl100k_base tokens in *text*."""
    token_ids = _ENCODING.encode(text)
    return len(token_ids)
33
+
34
+
35
+ def _split_into_sentences(text: str) -> list[str]:
36
+ """Split text into sentences for LLM analysis."""
37
+ # Normalise Windows line endings
38
+ text = text.replace("\r\n", "\n")
39
+ paragraphs = text.split("\n\n")
40
+ sentences: list[str] = []
41
+ for para in paragraphs:
42
+ para = para.strip()
43
+ if not para:
44
+ continue
45
+ parts = para.replace(". ", ".\n").replace("? ", "?\n").replace("! ", "!\n").split("\n")
46
+ for part in parts:
47
+ part = part.strip()
48
+ if part:
49
+ sentences.append(part)
50
+ return sentences
51
+
52
+
53
class LLMDrivenChunker(ChunkStrategy):
    """Use an LLM to identify semantic boundaries in text.

    Sends numbered sentences to the LLM and asks for boundary indices as a
    JSON array. Falls back to size-based splitting on LLM failure or an
    invalid response.
    """

    def __init__(self, config: ChunkConfig, generator: GenerationProvider) -> None:
        """Store the config and the generation provider used for boundary detection."""
        super().__init__(config)
        self._generator = generator

    def chunk(self, text: str, source: str) -> list[Chunk]:
        """Split *text* at LLM-identified topic boundaries.

        Args:
            text: Raw document text.
            source: Identifier recorded on each produced chunk.

        Returns:
            One chunk per sentence group; empty list for blank input.
        """
        if not text.strip():
            return []

        sentences = _split_into_sentences(text)
        if not sentences:
            return []

        if len(sentences) == 1:
            # Nothing to split — skip the LLM round-trip entirely.
            return [
                Chunk(
                    text=sentences[0],
                    chunk_index=0,
                    source_document=source,
                    strategy_used="llm-driven",
                )
            ]

        boundaries = self._get_boundaries(sentences)
        groups = self._apply_boundaries(sentences, boundaries)

        return [
            Chunk(
                text=" ".join(group),
                chunk_index=idx,
                source_document=source,
                strategy_used="llm-driven",
            )
            for idx, group in enumerate(groups)
        ]

    def preview(self, text: str, source: str) -> list[Chunk]:
        """Dry-run: same result as chunk(); nothing is persisted here."""
        return self.chunk(text, source)

    def stats(self, chunks: list[Chunk]) -> ChunkStats:
        """Compute token-count statistics for *chunks* (all zeros for an empty list)."""
        if not chunks:
            return ChunkStats(
                total_chunks=0,
                avg_chunk_size=0,
                min_chunk_size=0,
                max_chunk_size=0,
                total_tokens=0,
            )
        sizes = [_token_count(c.text) for c in chunks]
        return ChunkStats(
            total_chunks=len(chunks),
            avg_chunk_size=sum(sizes) // len(sizes),
            min_chunk_size=min(sizes),
            max_chunk_size=max(sizes),
            total_tokens=sum(sizes),
        )

    def _get_boundaries(self, sentences: list[str]) -> list[int]:
        """Ask the LLM for boundary indices. Returns sorted list of split points.

        Any transport/parsing failure or invalid payload degrades to the
        size-based fallback rather than raising.
        """
        numbered = "\n".join(f"[{i}] {s}" for i, s in enumerate(sentences))
        prompt = _BOUNDARY_PROMPT.format(sentences=numbered)

        try:
            response = self._generator.generate(
                system_prompt="You are a document analysis assistant. Respond only with valid JSON.",
                user_prompt=prompt,
            )
            raw = response.strip()
            if raw.startswith("```"):
                # Models often wrap JSON in a Markdown code fence; unwrap it
                # so a correct answer is not needlessly discarded.
                raw = raw.removeprefix("```json").removeprefix("```")
                raw = raw.removesuffix("```").strip()
            boundaries = json.loads(raw)
            if not isinstance(boundaries, list):
                _LOG.warning(
                    "LLM returned non-list: %s, falling back to size-based splitting",
                    type(boundaries),
                )
                return self._fallback_boundaries(sentences)
            # Empty list from LLM means "no boundaries" — return as-is (single group)
            if len(boundaries) == 0:
                return []
            # Convert each element once; keep only interior split points
            # (index 0 or out-of-range values are noise).
            indices = {int(b) for b in boundaries}
            valid = sorted(i for i in indices if 0 < i < len(sentences))
            return valid if valid else self._fallback_boundaries(sentences)
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            _LOG.warning("LLM boundary parsing failed: %s, falling back", e)
            return self._fallback_boundaries(sentences)

    def _fallback_boundaries(self, sentences: list[str]) -> list[int]:
        """Size-based fallback: split whenever the running token total reaches chunk_size."""
        boundaries: list[int] = []
        current_tokens = 0
        for i, sentence in enumerate(sentences):
            current_tokens += _token_count(sentence)
            if current_tokens >= self.config.chunk_size and i > 0:
                boundaries.append(i)
                # The boundary sentence starts the next group, so it seeds
                # the new running total.
                current_tokens = _token_count(sentence)
        return boundaries

    def _apply_boundaries(self, sentences: list[str], boundaries: list[int]) -> list[list[str]]:
        """Split sentences into groups, cutting BEFORE each boundary index."""
        if not boundaries:
            return [sentences]
        groups: list[list[str]] = []
        prev = 0
        for boundary in boundaries:
            groups.append(sentences[prev:boundary])
            prev = boundary
        groups.append(sentences[prev:])
        # Drop empty groups defensively (e.g. a duplicated boundary).
        return [g for g in groups if g]