chunkrank 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkrank-0.2.0/LICENCE +21 -0
- chunkrank-0.2.0/PKG-INFO +104 -0
- chunkrank-0.2.0/README.md +82 -0
- chunkrank-0.2.0/chunkrank/__init__.py +8 -0
- chunkrank-0.2.0/chunkrank/answerers.py +37 -0
- chunkrank-0.2.0/chunkrank/chunker.py +93 -0
- chunkrank-0.2.0/chunkrank/models.py +31 -0
- chunkrank-0.2.0/chunkrank/pipeline.py +14 -0
- chunkrank-0.2.0/chunkrank/ranker.py +56 -0
- chunkrank-0.2.0/chunkrank/registry/__init__.py +0 -0
- chunkrank-0.2.0/chunkrank/registry/model_registry.json +149 -0
- chunkrank-0.2.0/chunkrank/tokenizers.py +40 -0
- chunkrank-0.2.0/chunkrank/utils/__init__.py +0 -0
- chunkrank-0.2.0/pyproject.toml +25 -0
chunkrank-0.2.0/LICENCE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Amit Nautiyal
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
chunkrank-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chunkrank
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Model-Aware Chunking + Answer Ranking
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENCE
|
|
7
|
+
Author: Your Name
|
|
8
|
+
Author-email: you@example.com
|
|
9
|
+
Requires-Python: >=3.14,<4.0
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
13
|
+
Requires-Dist: deptry (>=0.24.0,<0.25.0)
|
|
14
|
+
Requires-Dist: mypy (>=1.19.1,<2.0.0)
|
|
15
|
+
Requires-Dist: numpy (>=1.26)
|
|
16
|
+
Requires-Dist: rank-bm25 (>=0.2.2)
|
|
17
|
+
Requires-Dist: regex (>=2023.10.3)
|
|
18
|
+
Requires-Dist: ruff (>=0.14.10,<0.15.0)
|
|
19
|
+
Requires-Dist: scikit-learn (>=1.5)
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# ChunkRank: Model-Aware Chunking + Answer Ranking
|
|
23
|
+
|
|
24
|
+
## Problem
|
|
25
|
+
When using LLMs, text often exceeds the model’s context window.
|
|
26
|
+
To handle this, text must be **chunked** into pieces that fit the model’s maximum token length.
|
|
27
|
+
|
|
28
|
+
Two challenges arise:
|
|
29
|
+
1. **Model-aware chunking**
|
|
30
|
+
Each model (OpenAI, Anthropic, Llama, Gemini, T5, BERT, BigBird, Longformer, etc.) has a different context length and tokenizer.
|
|
31
|
+
Current libraries require users to manually configure chunk sizes; no unified library automatically adapts to the chosen model.
|
|
32
|
+
|
|
33
|
+
2. **Answer consolidation & ranking**
|
|
34
|
+
Once text is chunked, a query may return multiple answers from different chunks.
|
|
35
|
+
A **ranking step** is needed to decide the best, most relevant answer.
|
|
36
|
+
Existing solutions (e.g., RAG frameworks) combine retrieval + generation, but there’s no standalone library that couples **chunking** and **answer re-ranking**.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Existing Libraries & Gaps
|
|
41
|
+
|
|
42
|
+
### Chunking
|
|
43
|
+
- **LangChain Text Splitters** → Token-based, works with `tiktoken`, but requires manual chunk size config.
|
|
44
|
+
- **LlamaIndex `TokenTextSplitter`** → Similar functionality, manual sizing.
|
|
45
|
+
- **Haystack `PreProcessor`** → Can split by tokens, overlap supported, but not model-aware by default.
|
|
46
|
+
- **semantic-text-splitter / semchunk** → Standalone, supports tiktoken/HF tokenizers, still needs user-specified chunk length.
|
|
47
|
+
|
|
48
|
+
**Gap:** None of these libraries automatically map a model → tokenizer → context window → chunk size.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
### Ranking
|
|
53
|
+
- **pygaggle** (Waterloo CAST) → neural re-ranker.
|
|
54
|
+
- **Tevatron** → dense retrieval + re-ranking toolkit.
|
|
55
|
+
- **Pyserini** (with pygaggle) → BM25 + neural re-rankers.
|
|
56
|
+
- **Haystack, LlamaIndex** → include ranking in RAG pipelines.
|
|
57
|
+
|
|
58
|
+
**Gap:** Ranking exists, but **not combined with chunking** in a single, simple package.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## What We Want to Build
|
|
63
|
+
A standalone Python library that:
|
|
64
|
+
|
|
65
|
+
1. **Model-Aware Chunking**
|
|
66
|
+
- User specifies a model name (e.g., `gpt-4o-mini`, `claude-3.5-sonnet`, `Llama-3.1-8B`).
|
|
67
|
+
- Library looks up the model’s max context window and tokenizer.
|
|
68
|
+
- Automatically chunks text into model-compatible pieces with optional overlap and reserve space.
|
|
69
|
+
|
|
70
|
+
2. **Answer Consolidation & Ranking**
|
|
71
|
+
- Given multiple answers from chunks, apply a re-ranking step to select the best one.
|
|
72
|
+
- Should integrate with existing ranking models (cross-encoder, bi-encoder, BM25 + re-ranker).
|
|
73
|
+
- Should work standalone, without needing a full RAG pipeline.
|
|
74
|
+
|
|
75
|
+
3. **Unified Workflow**
|
|
76
|
+
- `chunks = chunkrank.split(text, model="gpt-4o-mini")`
|
|
77
|
+
- `answers = chunkrank.answer(question, chunks)`
|
|
78
|
+
- `best = chunkrank.rank(answers)`
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Vision
|
|
83
|
+
- Lightweight, model-agnostic utility library.
|
|
84
|
+
- Bridges the gap between **text preparation** (chunking) and **answer quality** (ranking).
|
|
85
|
+
- Complements existing RAG frameworks but can also work independently.
|
|
86
|
+
- Easy to drop into pipelines: preprocessing for QA, summarization, or information extraction.
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Next Steps
|
|
91
|
+
1. Build the **model registry** (model → context window + tokenizer).
|
|
92
|
+
2. Implement **chunking strategies** (tokens, sentences, paragraphs).
|
|
93
|
+
3. Integrate a **re-ranking engine** (start with Hugging Face cross-encoder).
|
|
94
|
+
4. Package and release to PyPI with a simple API.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Community
|
|
99
|
+
|
|
100
|
+
- [Contributors](CONTRIBUTORS.md)
|
|
101
|
+
- [Maintainers](MAINTAINERS.md)
|
|
102
|
+
- [Contributing Guidelines](CONTRIBUTING.md)
|
|
103
|
+
|
|
104
|
+
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# ChunkRank: Model-Aware Chunking + Answer Ranking
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
When using LLMs, text often exceeds the model’s context window.
|
|
5
|
+
To handle this, text must be **chunked** into pieces that fit the model’s maximum token length.
|
|
6
|
+
|
|
7
|
+
Two challenges arise:
|
|
8
|
+
1. **Model-aware chunking**
|
|
9
|
+
Each model (OpenAI, Anthropic, Llama, Gemini, T5, BERT, BigBird, Longformer, etc.) has a different context length and tokenizer.
|
|
10
|
+
Current libraries require users to manually configure chunk sizes; no unified library automatically adapts to the chosen model.
|
|
11
|
+
|
|
12
|
+
2. **Answer consolidation & ranking**
|
|
13
|
+
Once text is chunked, a query may return multiple answers from different chunks.
|
|
14
|
+
A **ranking step** is needed to decide the best, most relevant answer.
|
|
15
|
+
Existing solutions (e.g., RAG frameworks) combine retrieval + generation, but there’s no standalone library that couples **chunking** and **answer re-ranking**.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Existing Libraries & Gaps
|
|
20
|
+
|
|
21
|
+
### Chunking
|
|
22
|
+
- **LangChain Text Splitters** → Token-based, works with `tiktoken`, but requires manual chunk size config.
|
|
23
|
+
- **LlamaIndex `TokenTextSplitter`** → Similar functionality, manual sizing.
|
|
24
|
+
- **Haystack `PreProcessor`** → Can split by tokens, overlap supported, but not model-aware by default.
|
|
25
|
+
- **semantic-text-splitter / semchunk** → Standalone, supports tiktoken/HF tokenizers, still needs user-specified chunk length.
|
|
26
|
+
|
|
27
|
+
**Gap:** None of these libraries automatically map a model → tokenizer → context window → chunk size.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
### Ranking
|
|
32
|
+
- **pygaggle** (Waterloo CAST) → neural re-ranker.
|
|
33
|
+
- **Tevatron** → dense retrieval + re-ranking toolkit.
|
|
34
|
+
- **Pyserini** (with pygaggle) → BM25 + neural re-rankers.
|
|
35
|
+
- **Haystack, LlamaIndex** → include ranking in RAG pipelines.
|
|
36
|
+
|
|
37
|
+
**Gap:** Ranking exists, but **not combined with chunking** in a single, simple package.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## What We Want to Build
|
|
42
|
+
A standalone Python library that:
|
|
43
|
+
|
|
44
|
+
1. **Model-Aware Chunking**
|
|
45
|
+
- User specifies a model name (e.g., `gpt-4o-mini`, `claude-3.5-sonnet`, `Llama-3.1-8B`).
|
|
46
|
+
- Library looks up the model’s max context window and tokenizer.
|
|
47
|
+
- Automatically chunks text into model-compatible pieces with optional overlap and reserve space.
|
|
48
|
+
|
|
49
|
+
2. **Answer Consolidation & Ranking**
|
|
50
|
+
- Given multiple answers from chunks, apply a re-ranking step to select the best one.
|
|
51
|
+
- Should integrate with existing ranking models (cross-encoder, bi-encoder, BM25 + re-ranker).
|
|
52
|
+
- Should work standalone, without needing a full RAG pipeline.
|
|
53
|
+
|
|
54
|
+
3. **Unified Workflow**
|
|
55
|
+
- `chunks = chunkrank.split(text, model="gpt-4o-mini")`
|
|
56
|
+
- `answers = chunkrank.answer(question, chunks)`
|
|
57
|
+
- `best = chunkrank.rank(answers)`
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Vision
|
|
62
|
+
- Lightweight, model-agnostic utility library.
|
|
63
|
+
- Bridges the gap between **text preparation** (chunking) and **answer quality** (ranking).
|
|
64
|
+
- Complements existing RAG frameworks but can also work independently.
|
|
65
|
+
- Easy to drop into pipelines: preprocessing for QA, summarization, or information extraction.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Next Steps
|
|
70
|
+
1. Build the **model registry** (model → context window + tokenizer).
|
|
71
|
+
2. Implement **chunking strategies** (tokens, sentences, paragraphs).
|
|
72
|
+
3. Integrate a **re-ranking engine** (start with Hugging Face cross-encoder).
|
|
73
|
+
4. Package and release to PyPI with a simple API.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Community
|
|
78
|
+
|
|
79
|
+
- [Contributors](CONTRIBUTORS.md)
|
|
80
|
+
- [Maintainers](MAINTAINERS.md)
|
|
81
|
+
- [Contributing Guidelines](CONTRIBUTING.md)
|
|
82
|
+
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import List, Optional, Tuple
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class LocalExtractiveAnswerer:
    """Pick the sentence from a context with the most word overlap with the question.

    A purely lexical, dependency-free extractive answerer: the "answer" is the
    single sentence sharing the most normalized words with the question.
    """

    # Minimum number of shared words required before a sentence counts as an answer.
    min_overlap: int = 2

    def answer(self, question: str, context: str) -> str:
        """Return the best-matching sentence of *context*, or "" if none overlaps enough."""
        candidates = _split_sentences(context)
        if not candidates:
            return ""

        query_words = _norm_words(question)
        best_sentence, best_score = "", 0

        for candidate in candidates:
            score = len(query_words & _norm_words(candidate))
            if score > best_score:
                best_sentence, best_score = candidate, score

        # Below the overlap threshold the match is considered noise.
        return best_sentence.strip() if best_score >= self.min_overlap else ""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _split_sentences(text: str) -> List[str]:
|
|
31
|
+
parts = re.split(r"(?<=[.!?])\s+", text.strip())
|
|
32
|
+
return [p.strip() for p in parts if p.strip()]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _norm_words(text: str) -> set[str]:
|
|
36
|
+
words = re.findall(r"[A-Za-z0-9']+", text.lower())
|
|
37
|
+
return set(words)
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import List, Literal, Optional
|
|
5
|
+
|
|
6
|
+
from .models import get_model_info
|
|
7
|
+
from .tokenizers import build_tokenizer
|
|
8
|
+
|
|
9
|
+
Strategy = Literal["tokens"] # keep it simple for now
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ChunkerConfig:
    """Configuration for a Chunker."""

    # Model name used to look up context window and tokenizer in the registry.
    model: str
    # Chunking strategy; only "tokens" is implemented in this version.
    strategy: Strategy = "tokens"
    # Number of tokens of overlap carried over between consecutive chunks.
    overlap_tokens: int = 0
    # Tokens to subtract from the context window (space for prompt/answer);
    # None means use the model's default_reserve from the registry.
    reserve_tokens: Optional[int] = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Chunker:
    """Splits text into chunks that fit a model's usable token window."""

    def __init__(self, config: ChunkerConfig):
        info = get_model_info(config.model)

        # Usable window = model context minus tokens reserved for prompt/answer.
        reserve = config.reserve_tokens if config.reserve_tokens is not None else info.default_reserve
        self.window = max(1, info.max_context - max(0, reserve))

        self.overlap = max(0, config.overlap_tokens)
        if self.overlap >= self.window:
            raise ValueError("overlap_tokens must be < usable window size")

        self.strategy = config.strategy
        self.tok = build_tokenizer(info.tokenizer, info.tokenizer_id)

    def split(self, text: str) -> List[str]:
        """Return the chunks of *text*; [] for empty or non-string input."""
        if not isinstance(text, str) or not text:
            return []

        if self.strategy != "tokens":
            raise NotImplementedError("Only 'tokens' strategy is implemented in this version.")

        return list(self._chunk_by_token_budget(text))

    def _chunk_by_token_budget(self, text: str):
        """
        Robust approach: grow a slice until token budget reached, then emit slice.
        Avoids needing tokenizer.decode() (so no None chunks).
        """
        start = 0
        n = len(text)

        # Fast path: already fits
        if self.tok.count(text) <= self.window:
            yield text
            return

        # Character-based upper bound for initial probe (roughly 4 chars/token)
        approx_chars = max(64, self.window * 4)

        while start < n:
            end = min(n, start + approx_chars)
            chunk = text[start:end]

            # If too big, shrink geometrically (10% at a time) until it fits.
            while end > start and self.tok.count(chunk) > self.window:
                end = start + max(1, (end - start) * 9 // 10)
                chunk = text[start:end]

            # If somehow cannot shrink (pathological), force a minimal progress
            if end <= start:
                end = min(n, start + 200)
                chunk = text[start:end]

            yield chunk

            if end >= n:
                break

            # Overlap handling (approx char backoff).
            # BUGFIX: always advance past the previous start. The original
            # used max(0, end - backoff_chars), so a large overlap (in chars)
            # relative to the emitted chunk could rewind `start` to (or before)
            # its old value and loop forever.
            if self.overlap > 0:
                backoff_chars = self.overlap * 4
                start = max(start + 1, end - backoff_chars)
            else:
                start = end
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def chunk_text(
    text: str,
    model: str,
    overlap_tokens: int = 0,
    reserve_tokens: Optional[int] = None,
) -> List[str]:
    """Convenience wrapper: chunk *text* for *model* in a single call."""
    config = ChunkerConfig(
        model=model,
        overlap_tokens=overlap_tokens,
        reserve_tokens=reserve_tokens,
    )
    chunker = Chunker(config)
    return chunker.split(text)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import importlib.resources
|
|
2
|
+
import json
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class ModelInfo:
    """Registry entry describing one model's context window and tokenizer."""

    # Canonical model name.
    name: str
    # Maximum context window, in tokens.
    max_context: int
    # Tokenizer backend identifier, e.g. "tiktoken" or "hf"; None for unknown.
    tokenizer: Optional[str]
    # Backend-specific tokenizer id (encoding name or HF repo id).
    tokenizer_id: Optional[str]
    # Tokens reserved out of the context window by default (prompt/answer space).
    default_reserve: int = 256
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def load_registry() -> dict[str, ModelInfo]:
    """Load the model registry from the bundled JSON file.

    Returns a mapping of model name -> ModelInfo.

    BUGFIX: the original annotated the return type as ``Dict[...]`` although
    ``Dict`` was never imported from typing; the builtin generic is used
    instead. ``importlib.resources.open_text()`` is also deprecated since
    Python 3.11, so ``files()`` is used in its place.
    """
    resource = importlib.resources.files("chunkrank.registry") / "model_registry.json"
    with resource.open("r", encoding="utf-8") as file:
        data = json.load(file)
    return {name: ModelInfo(**entry) for name, entry in data.items()}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_model_info(model: str) -> ModelInfo:
    """Look up *model* in the registry, falling back to a generous default.

    Unknown models are treated as having a 128k-token context with the
    tiktoken "o200k_base" encoding and a 512-token reserve.
    """
    registry = load_registry()
    entry = registry.get(model)
    if entry is not None:
        return entry
    return ModelInfo(
        name=model,
        max_context=128_000,
        tokenizer="tiktoken",
        tokenizer_id="o200k_base",
        default_reserve=512,
    )
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .chunker import Chunker, ChunkerConfig
|
|
2
|
+
from .ranker import Ranker
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ChunkRankPipeline:
    """End-to-end convenience pipeline: chunk text, answer per chunk, rank answers."""

    def __init__(self, model: str):
        self.chunker = Chunker(ChunkerConfig(model=model))
        self.ranker = Ranker()

    def process(self, question: str, text: str) -> str:
        """Return the best-ranked answer for *question* over *text*.

        Returns "" when *text* yields no chunks (e.g. empty input);
        previously this raised IndexError.
        """
        chunks = self.chunker.split(text)
        answers = [f"Answer from chunk {i}" for i, _ in enumerate(chunks, 1)]  # placeholder
        ranked = self.ranker.rank(question, answers)
        if not ranked:
            # BUGFIX: empty input used to crash on ranked[0][0].
            return ""
        return ranked[0][0]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from typing import List, Tuple
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
4
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
5
|
+
from rank_bm25 import BM25Okapi
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Ranker:
    """Ranks candidate answers against a question using BM25 or TF-IDF."""

    def __init__(self, method: str = "bm25"):
        # Either "bm25" (default) or "tfidf".
        self.method = method

    def rank(self, question: str, answers: List[str]) -> List[Tuple[str, float]]:
        """Return (answer, score) pairs sorted best-first; [] if no usable answers."""
        usable = [answer for answer in answers if isinstance(answer, str) and answer.strip()]
        if not usable:
            return []

        if self.method == "tfidf":
            return self._rank_tfidf(question, usable)
        if self.method == "bm25":
            return self._rank_bm25(question, usable)
        raise ValueError(f"Unknown ranking method: {self.method}")

    def _rank_tfidf(self, question: str, answers: List[str]) -> List[Tuple[str, float]]:
        """Score answers by cosine similarity of TF-IDF vectors with the question."""
        vectorizer = TfidfVectorizer(stop_words="english")
        matrix = vectorizer.fit_transform([question] + answers)

        question_vec = matrix[0]
        answer_vecs = matrix[1:]

        similarities = cosine_similarity(question_vec, answer_vecs)[0]
        pairs = zip(answers, similarities)
        return sorted(pairs, key=lambda pair: pair[1], reverse=True)

    def _rank_bm25(self, question: str, answers: List[str]) -> List[Tuple[str, float]]:
        """Score answers with BM25 over whitespace-tokenized text."""
        tokenized = [answer.split() for answer in answers if answer and answer.strip()]
        if not tokenized:
            return []
        scorer = BM25Okapi(tokenized)

        scores = scorer.get_scores(question.split())
        pairs = zip(answers, scores)
        return sorted(pairs, key=lambda pair: pair[1], reverse=True)

    def rank_texts(self, query: str, texts: List[str]) -> List[Tuple[str, float]]:
        """
        Rank raw texts (chunks) against a query.
        """
        return self.rank(query, texts)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def rank_answers(question: str, answers: List[str], method: str = "bm25") -> str:
    """Return the single best answer to *question* using the given ranking method.

    Returns "" when *answers* contains no usable (non-blank string) entries;
    previously this raised IndexError on an empty ranking.
    """
    ranker = Ranker(method=method)
    ranked = ranker.rank(question, answers)
    if not ranked:
        # BUGFIX: Ranker.rank returns [] for empty/blank input.
        return ""
    return ranked[0][0]
|
|
File without changes
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
{
|
|
2
|
+
"gpt-4o-mini": {
|
|
3
|
+
"name": "gpt-4o-mini",
|
|
4
|
+
"max_context": 128000,
|
|
5
|
+
"tokenizer": "tiktoken",
|
|
6
|
+
"tokenizer_id": "o200k_base",
|
|
7
|
+
"default_reserve": 512
|
|
8
|
+
},
|
|
9
|
+
"claude-3-5-sonnet": {
|
|
10
|
+
"name": "claude-3-5-sonnet",
|
|
11
|
+
"max_context": 200000,
|
|
12
|
+
"tokenizer": "tiktoken",
|
|
13
|
+
"tokenizer_id": "o200k_base",
|
|
14
|
+
"default_reserve": 512
|
|
15
|
+
},
|
|
16
|
+
"Llama-3.1-8B": {
|
|
17
|
+
"name": "Llama-3.1-8B",
|
|
18
|
+
"max_context": 128000,
|
|
19
|
+
"tokenizer": "hf",
|
|
20
|
+
"tokenizer_id": "meta-llama/Llama-3.1-8B",
|
|
21
|
+
"default_reserve": 512
|
|
22
|
+
},
|
|
23
|
+
"Llama-3.1-70B": {
|
|
24
|
+
"name": "Llama-3.1-70B",
|
|
25
|
+
"max_context": 128000,
|
|
26
|
+
"tokenizer": "hf",
|
|
27
|
+
"tokenizer_id": "meta-llama/Llama-3.1-70B",
|
|
28
|
+
"default_reserve": 1024
|
|
29
|
+
},
|
|
30
|
+
"mistral-7b": {
|
|
31
|
+
"name": "Mistral-7B",
|
|
32
|
+
"max_context": 32768,
|
|
33
|
+
"tokenizer": "hf",
|
|
34
|
+
"tokenizer_id": "mistralai/Mistral-7B-v0.2",
|
|
35
|
+
"default_reserve": 512
|
|
36
|
+
},
|
|
37
|
+
"mixtral-8x7b": {
|
|
38
|
+
"name": "Mixtral-8x7B",
|
|
39
|
+
"max_context": 65536,
|
|
40
|
+
"tokenizer": "hf",
|
|
41
|
+
"tokenizer_id": "mistralai/Mixtral-8x7B-Instruct",
|
|
42
|
+
"default_reserve": 1024
|
|
43
|
+
},
|
|
44
|
+
"gpt-neo-2.7B": {
|
|
45
|
+
"name": "GPT-Neo-2.7B",
|
|
46
|
+
"max_context": 2048,
|
|
47
|
+
"tokenizer": "hf",
|
|
48
|
+
"tokenizer_id": "EleutherAI/gpt-neo-2.7B",
|
|
49
|
+
"default_reserve": 128
|
|
50
|
+
},
|
|
51
|
+
"gpt-j-6B": {
|
|
52
|
+
"name": "GPT-J-6B",
|
|
53
|
+
"max_context": 4096,
|
|
54
|
+
"tokenizer": "hf",
|
|
55
|
+
"tokenizer_id": "EleutherAI/gpt-j-6B",
|
|
56
|
+
"default_reserve": 256
|
|
57
|
+
},
|
|
58
|
+
"bert-base-uncased": {
|
|
59
|
+
"name": "BERT Base Uncased",
|
|
60
|
+
"max_context": 512,
|
|
61
|
+
"tokenizer": "hf",
|
|
62
|
+
"tokenizer_id": "bert-base-uncased",
|
|
63
|
+
"default_reserve": 64
|
|
64
|
+
},
|
|
65
|
+
"bert-large-uncased": {
|
|
66
|
+
"name": "BERT Large Uncased",
|
|
67
|
+
"max_context": 512,
|
|
68
|
+
"tokenizer": "hf",
|
|
69
|
+
"tokenizer_id": "bert-large-uncased",
|
|
70
|
+
"default_reserve": 64
|
|
71
|
+
},
|
|
72
|
+
"distilbert-base-uncased": {
|
|
73
|
+
"name": "DistilBERT Base Uncased",
|
|
74
|
+
"max_context": 512,
|
|
75
|
+
"tokenizer": "hf",
|
|
76
|
+
"tokenizer_id": "distilbert-base-uncased",
|
|
77
|
+
"default_reserve": 64
|
|
78
|
+
},
|
|
79
|
+
"bigbird-roberta-base": {
|
|
80
|
+
"name": "BigBird RoBERTa Base",
|
|
81
|
+
"max_context": 4096,
|
|
82
|
+
"tokenizer": "hf",
|
|
83
|
+
"tokenizer_id": "google/bigbird-roberta-base",
|
|
84
|
+
"default_reserve": 256
|
|
85
|
+
},
|
|
86
|
+
"bigbird-roberta-large": {
|
|
87
|
+
"name": "BigBird RoBERTa Large",
|
|
88
|
+
"max_context": 4096,
|
|
89
|
+
"tokenizer": "hf",
|
|
90
|
+
"tokenizer_id": "google/bigbird-roberta-large",
|
|
91
|
+
"default_reserve": 256
|
|
92
|
+
},
|
|
93
|
+
"longformer-base-4096": {
|
|
94
|
+
"name": "Longformer Base 4096",
|
|
95
|
+
"max_context": 4096,
|
|
96
|
+
"tokenizer": "hf",
|
|
97
|
+
"tokenizer_id": "allenai/longformer-base-4096",
|
|
98
|
+
"default_reserve": 256
|
|
99
|
+
},
|
|
100
|
+
"longformer-large-4096": {
|
|
101
|
+
"name": "Longformer Large 4096",
|
|
102
|
+
"max_context": 4096,
|
|
103
|
+
"tokenizer": "hf",
|
|
104
|
+
"tokenizer_id": "allenai/longformer-large-4096",
|
|
105
|
+
"default_reserve": 256
|
|
106
|
+
},
|
|
107
|
+
"deberta-v3-base": {
|
|
108
|
+
"name": "DeBERTa v3 Base",
|
|
109
|
+
"max_context": 512,
|
|
110
|
+
"tokenizer": "hf",
|
|
111
|
+
"tokenizer_id": "microsoft/deberta-v3-base",
|
|
112
|
+
"default_reserve": 64
|
|
113
|
+
},
|
|
114
|
+
"deberta-v3-large": {
|
|
115
|
+
"name": "DeBERTa v3 Large",
|
|
116
|
+
"max_context": 512,
|
|
117
|
+
"tokenizer": "hf",
|
|
118
|
+
"tokenizer_id": "microsoft/deberta-v3-large",
|
|
119
|
+
"default_reserve": 64
|
|
120
|
+
},
|
|
121
|
+
"t5-base": {
|
|
122
|
+
"name": "T5 Base",
|
|
123
|
+
"max_context": 512,
|
|
124
|
+
"tokenizer": "hf",
|
|
125
|
+
"tokenizer_id": "t5-base",
|
|
126
|
+
"default_reserve": 64
|
|
127
|
+
},
|
|
128
|
+
"t5-large": {
|
|
129
|
+
"name": "T5 Large",
|
|
130
|
+
"max_context": 512,
|
|
131
|
+
"tokenizer": "hf",
|
|
132
|
+
"tokenizer_id": "t5-large",
|
|
133
|
+
"default_reserve": 64
|
|
134
|
+
},
|
|
135
|
+
"flan-t5-xl": {
|
|
136
|
+
"name": "FLAN-T5 XL",
|
|
137
|
+
"max_context": 2048,
|
|
138
|
+
"tokenizer": "hf",
|
|
139
|
+
"tokenizer_id": "google/flan-t5-xl",
|
|
140
|
+
"default_reserve": 128
|
|
141
|
+
},
|
|
142
|
+
"gemini-pro-placeholder": {
|
|
143
|
+
"name": "Gemini Pro",
|
|
144
|
+
"max_context": 128000,
|
|
145
|
+
"tokenizer": "sentencepiece",
|
|
146
|
+
"tokenizer_id": "google/gemini",
|
|
147
|
+
"default_reserve": 512
|
|
148
|
+
}
|
|
149
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
|
|
2
|
+
def _try_importing_tiktoken():
|
|
3
|
+
try:
|
|
4
|
+
import tiktoken
|
|
5
|
+
return tiktoken
|
|
6
|
+
except ImportError:
|
|
7
|
+
return None
|
|
8
|
+
|
|
9
|
+
def _try_importing_transformers():
|
|
10
|
+
try:
|
|
11
|
+
import transformers
|
|
12
|
+
return transformers
|
|
13
|
+
except ImportError:
|
|
14
|
+
return None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TokenizerAdapter:
    """Uniform wrapper around a tokenizer's encode function.

    BUGFIX: the original annotated ``encode_fn`` as ``Callable[[str], List[int]]``
    although neither ``Callable`` nor ``List`` was imported anywhere in this
    module; the annotations below use only builtin generics.
    """

    def __init__(self, encode_fn):
        # encode_fn: (str) -> list[int]
        self._encode = encode_fn

    def encode(self, text: str) -> list[int]:
        """Return the token ids for *text*."""
        return self._encode(text)

    def count(self, text: str) -> int:
        """Return the number of tokens in *text*."""
        return len(self._encode(text))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def build_tokenizer(backend: "str | None", tokenizer_id: "str | None") -> "TokenizerAdapter":
    """Build a TokenizerAdapter for the given backend.

    backend: "tiktoken", "hf", or None; tokenizer_id: backend-specific id
    (a tiktoken encoding name or a Hugging Face repo id).

    Falls back to a crude ~4-chars-per-token estimator when the requested
    backend (or its library) is unavailable.

    BUGFIX: the original annotated both parameters as ``Optional[str]``
    although ``Optional`` was never imported in this module; string
    annotations with PEP 604 unions are used instead.
    """
    if backend == "tiktoken":
        tiktoken = _try_importing_tiktoken()
        if tiktoken:
            enc = tiktoken.get_encoding(tokenizer_id or "o200k_base")
            return TokenizerAdapter(lambda s: enc.encode(s, disallowed_special=()))
    elif backend == "hf":
        transformers = _try_importing_transformers()
        if transformers:
            tok = transformers.AutoTokenizer.from_pretrained(tokenizer_id, use_fast=True)
            return TokenizerAdapter(lambda s: tok.encode(s, add_special_tokens=False))

    # Fallback: approximate one token per four characters.
    return TokenizerAdapter(lambda s: list(range(len(s) // 4)))
|
|
File without changes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "chunkrank"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Model-Aware Chunking + Answer Ranking"
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "Your Name", email = "you@example.com" }
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
license = { text = "MIT" }
|
|
10
|
+
requires-python = ">=3.14,<4.0"
|
|
11
|
+
dependencies = [
    # NOTE(review): mypy, deptry and ruff are development tools — consider
    # moving them to a dev dependency group rather than runtime dependencies.
    "mypy (>=1.19.1,<2.0.0)",
    "deptry (>=0.24.0,<0.25.0)",
    "ruff (>=0.14.10,<0.15.0)",
    "regex>=2023.10.3",
    "numpy>=1.26",
    "scikit-learn>=1.5",
    "rank-bm25>=0.2.2"
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
[build-system]
|
|
24
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
25
|
+
build-backend = "poetry.core.masonry.api"
|